[Midnightbsd-cvs] src [12314] trunk/sys/vm: sync with FreeBSD 11-stable
laffer1 at midnightbsd.org
Sat Feb 8 14:35:49 EST 2020
Revision: 12314
http://svnweb.midnightbsd.org/src/?rev=12314
Author: laffer1
Date: 2020-02-08 14:35:48 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable
Modified Paths:
--------------
trunk/sys/vm/_vm_radix.h
trunk/sys/vm/default_pager.c
trunk/sys/vm/device_pager.c
trunk/sys/vm/memguard.c
trunk/sys/vm/memguard.h
trunk/sys/vm/phys_pager.c
trunk/sys/vm/pmap.h
trunk/sys/vm/redzone.c
trunk/sys/vm/redzone.h
trunk/sys/vm/sg_pager.c
trunk/sys/vm/swap_pager.c
trunk/sys/vm/swap_pager.h
trunk/sys/vm/uma.h
trunk/sys/vm/uma_core.c
trunk/sys/vm/uma_dbg.c
trunk/sys/vm/uma_dbg.h
trunk/sys/vm/uma_int.h
trunk/sys/vm/vm.h
trunk/sys/vm/vm_extern.h
trunk/sys/vm/vm_fault.c
trunk/sys/vm/vm_glue.c
trunk/sys/vm/vm_init.c
trunk/sys/vm/vm_kern.c
trunk/sys/vm/vm_kern.h
trunk/sys/vm/vm_map.c
trunk/sys/vm/vm_map.h
trunk/sys/vm/vm_meter.c
trunk/sys/vm/vm_mmap.c
trunk/sys/vm/vm_object.c
trunk/sys/vm/vm_object.h
trunk/sys/vm/vm_page.c
trunk/sys/vm/vm_page.h
trunk/sys/vm/vm_pageout.c
trunk/sys/vm/vm_pageout.h
trunk/sys/vm/vm_pager.c
trunk/sys/vm/vm_pager.h
trunk/sys/vm/vm_param.h
trunk/sys/vm/vm_phys.c
trunk/sys/vm/vm_phys.h
trunk/sys/vm/vm_radix.c
trunk/sys/vm/vm_radix.h
trunk/sys/vm/vm_reserv.c
trunk/sys/vm/vm_reserv.h
trunk/sys/vm/vm_unix.c
trunk/sys/vm/vm_zeroidle.c
trunk/sys/vm/vnode_pager.c
trunk/sys/vm/vnode_pager.h
Added Paths:
-----------
trunk/sys/vm/vm_domain.c
trunk/sys/vm/vm_domain.h
trunk/sys/vm/vm_swapout.c
trunk/sys/vm/vm_swapout_dummy.c
Modified: trunk/sys/vm/_vm_radix.h
===================================================================
--- trunk/sys/vm/_vm_radix.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/_vm_radix.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $
+ * $FreeBSD: stable/11/sys/vm/_vm_radix.h 321513 2017-07-26 06:52:45Z kib $
*/
#ifndef __VM_RADIX_H_
@@ -37,20 +37,6 @@
*/
struct vm_radix {
uintptr_t rt_root;
- uint8_t rt_flags;
};
-#define RT_INSERT_INPROG 0x01
-#define RT_TRIE_MODIFIED 0x02
-
-#ifdef _KERNEL
-
-static __inline boolean_t
-vm_radix_is_empty(struct vm_radix *rtree)
-{
-
- return (rtree->rt_root == 0);
-}
-
-#endif /* _KERNEL */
#endif /* !__VM_RADIX_H_ */
Modified: trunk/sys/vm/default_pager.c
===================================================================
--- trunk/sys/vm/default_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/default_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,18 +28,10 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * The default pager is responsible for supplying backing store to unbacked
- * storage. The backing store is usually swap so we just fall through to
- * the swap routines. However, since swap metadata has not been assigned,
- * the swap routines assign and manage the swap backing store through the
- * vm_page->swapblk field. The object is only converted when the page is
- * physically freed after having been cleaned and even then vm_page->swapblk
- * is maintained whenever a resident page also has swap backing store.
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/default_pager.c 315473 2017-03-18 05:38:10Z alc $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -54,14 +46,16 @@
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
-static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
- vm_ooffset_t, struct ucred *);
-static void default_pager_dealloc(vm_object_t);
-static int default_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void default_pager_putpages(vm_object_t, vm_page_t *, int,
- boolean_t, int *);
-static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *,
- int *);
+static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
+ vm_ooffset_t, struct ucred *);
+static void default_pager_dealloc(vm_object_t);
+static int default_pager_getpages(vm_object_t, vm_page_t *, int,
+ int *, int *);
+static void default_pager_putpages(vm_object_t, vm_page_t *, int,
+ boolean_t, int *);
+static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *,
+ int *);
+
/*
* pagerops for OBJT_DEFAULT - "default pager".
*
@@ -84,7 +78,7 @@
};
/*
- * no_pager_alloc just returns an initialized object.
+ * Return an initialized object.
*/
static vm_object_t
default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -102,51 +96,41 @@
object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(round_page(offset + size)));
if (cred != NULL) {
- VM_OBJECT_WLOCK(object);
object->cred = cred;
object->charge = size;
- VM_OBJECT_WUNLOCK(object);
}
return (object);
}
/*
- * deallocate resources associated with default objects. The default objects
- * have no special resources allocated to them, but the vm_page's being used
- * in this object might. Still, we do not have to do anything - we will free
- * the swapblk in the underlying vm_page's when we free the vm_page or
- * garbage collect the vm_page cache list.
+ * Deallocate resources associated with the object.
*/
static void
-default_pager_dealloc(object)
- vm_object_t object;
+default_pager_dealloc(vm_object_t object)
{
- /*
- * OBJT_DEFAULT objects have no special resources allocated to them.
- */
+
+ /* Reserved swap is released by vm_object_destroy(). */
object->type = OBJT_DEAD;
}
/*
- * Load pages from backing store. Since OBJT_DEFAULT is converted to
- * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never
- * see a vm_page with assigned swap here.
+ * Load pages from backing store.
*/
static int
-default_pager_getpages(object, m, count, reqpage)
- vm_object_t object;
- vm_page_t *m;
- int count;
- int reqpage;
+default_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead)
{
- return VM_PAGER_FAIL;
+
+ /*
+ * Since an OBJT_DEFAULT object is converted to OBJT_SWAP by the first
+ * call to the putpages method, this function will never be called on
+ * a vm_page with assigned swap.
+ */
+ return (VM_PAGER_FAIL);
}
/*
- * Store pages to backing store. We should assign swap and initiate
- * I/O. We do not actually convert the object to OBJT_SWAP here. The
- * object will be converted when the written-out vm_page_t is moved from the
- * cache to the free list.
+ * Store pages to backing store.
*/
static void
default_pager_putpages(vm_object_t object, vm_page_t *m, int count,
@@ -153,28 +137,20 @@
int flags, int *rtvals)
{
+ /* The swap pager will convert the object to OBJT_SWAP. */
swappagerops.pgo_putpages(object, m, count, flags, rtvals);
}
/*
- * Tell us whether the backing store for the requested (object,index) is
- * synchronized. i.e. tell us whether we can throw the page away and
- * reload it later. So, for example, if we are in the process of writing
- * the page to its backing store, or if no backing store has been assigned,
- * it is not yet synchronized.
- *
- * It is possible to have fully-synchronized swap assigned without the
- * object having been converted. We just call swap_pager_haspage() to
- * deal with it since it must already deal with it plus deal with swap
- * meta-data structures.
+ * Tell us whether the requested (object,index) is available from the object's
+ * backing store.
*/
static boolean_t
-default_pager_haspage(object, pindex, before, after)
- vm_object_t object;
- vm_pindex_t pindex;
- int *before;
- int *after;
+default_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
{
- return FALSE;
+
+ /* An OBJT_DEFAULT object has no backing store. */
+ return (FALSE);
}
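
The new comments in this file spell out the default pager's contract: an OBJT_DEFAULT object owns no backing store, so its getpages and haspage methods simply report failure, while its putpages method falls through to the swap pager, which assigns swap and converts the object to OBJT_SWAP on the first pageout. The userspace sketch below only models that function-pointer delegation; all toy_* names and the printf output are illustrative and do not exist in the kernel.

/* Userspace model of the pager-ops delegation described above.
 * Only the shape of the dispatch mirrors the kernel's pagerops tables. */
#include <stdbool.h>
#include <stdio.h>

struct toy_pagerops {
	int  (*pgo_getpages)(void);
	void (*pgo_putpages)(void);
	bool (*pgo_haspage)(void);
};

/* "Swap pager": takes over backing-store management on first pageout. */
static void
toy_swap_putpages(void)
{
	printf("swap pager: assign swap blocks, convert object to OBJT_SWAP\n");
}

/* "Default pager": no backing store, so getpages/haspage fail and
 * putpages falls through to the swap pager. */
static int  toy_default_getpages(void) { return (-1); /* VM_PAGER_FAIL */ }
static bool toy_default_haspage(void)  { return (false); }
static void toy_default_putpages(void) { toy_swap_putpages(); }

static const struct toy_pagerops toy_defaultpagerops = {
	.pgo_getpages = toy_default_getpages,
	.pgo_putpages = toy_default_putpages,
	.pgo_haspage  = toy_default_haspage,
};

int
main(void)
{
	toy_defaultpagerops.pgo_putpages();
	return (0);
}
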
Modified: trunk/sys/vm/device_pager.c
===================================================================
--- trunk/sys/vm/device_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/device_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -36,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/device_pager.c 331722 2018-03-29 02:50:57Z eadler $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -47,6 +47,7 @@
#include <sys/mman.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -60,10 +61,12 @@
static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
static void dev_pager_dealloc(vm_object_t);
-static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static void dev_pager_free_page(vm_object_t object, vm_page_t m);
+static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last);
/* list of device pager objects */
static struct pagerlst dev_pager_object_list;
@@ -85,6 +88,7 @@
.pgo_getpages = dev_pager_getpages,
.pgo_putpages = dev_pager_putpages,
.pgo_haspage = dev_pager_haspage,
+ .pgo_populate = dev_pager_populate,
};
static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -128,6 +132,8 @@
if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE)
return (NULL);
+ KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL,
+ ("populate on unmanaged device pager"));
/*
* Offset should be page aligned.
@@ -135,8 +141,18 @@
if (foff & PAGE_MASK)
return (NULL);
+ /*
+ * Treat the mmap(2) file offset as an unsigned value for a
+ * device mapping. This, in effect, allows a user to pass all
+ * possible off_t values as the mapping cookie to the driver. At
+ * this point, we know that both foff and size are a multiple
+ * of the page size. Do a check to avoid wrap.
+ */
size = round_page(size);
- pindex = OFF_TO_IDX(foff + size);
+ pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+ if (pindex > OBJ_MAX_SIZE || pindex < UOFF_TO_IDX(foff) ||
+ pindex < UOFF_TO_IDX(size))
+ return (NULL);
if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
return (NULL);
@@ -169,6 +185,11 @@
*/
if (pindex > object->size)
object->size = pindex;
+ KASSERT(object->type == tp,
+ ("Inconsistent device pager type %p %d",
+ object, tp));
+ KASSERT(object->un_pager.devp.ops == ops,
+ ("Inconsistent devops %p %p", object, ops));
} else {
object = object1;
object1 = NULL;
@@ -175,12 +196,14 @@
object->handle = handle;
TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
pager_object_list);
- KASSERT(object->type == tp,
- ("Inconsistent device pager type %p %d", object, tp));
+ if (ops->cdev_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
}
} else {
if (pindex > object->size)
object->size = pindex;
+ KASSERT(object->type == tp,
+ ("Inconsistent device pager type %p %d", object, tp));
}
mtx_unlock(&dev_pager_mtx);
if (object1 != NULL) {
@@ -256,34 +279,35 @@
}
static int
-dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int reqpage)
+dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
{
- int error, i;
+ int error;
+ /* Since our haspage reports zero after/before, the count is 1. */
+ KASSERT(count == 1, ("%s: count %d", __func__, count));
VM_OBJECT_ASSERT_WLOCKED(object);
+ if (object->un_pager.devp.ops->cdev_pg_fault == NULL)
+ return (VM_PAGER_FAIL);
error = object->un_pager.devp.ops->cdev_pg_fault(object,
- IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]);
+ IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]);
VM_OBJECT_ASSERT_WLOCKED(object);
- for (i = 0; i < count; i++) {
- if (i != reqpage) {
- vm_page_lock(ma[i]);
- vm_page_free(ma[i]);
- vm_page_unlock(ma[i]);
- }
- }
-
if (error == VM_PAGER_OK) {
KASSERT((object->type == OBJT_DEVICE &&
- (ma[reqpage]->oflags & VPO_UNMANAGED) != 0) ||
+ (ma[0]->oflags & VPO_UNMANAGED) != 0) ||
(object->type == OBJT_MGTDEVICE &&
- (ma[reqpage]->oflags & VPO_UNMANAGED) == 0),
- ("Wrong page type %p %p", ma[reqpage], object));
+ (ma[0]->oflags & VPO_UNMANAGED) == 0),
+ ("Wrong page type %p %p", ma[0], object));
if (object->type == OBJT_DEVICE) {
TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
- ma[reqpage], plinks.q);
+ ma[0], plinks.q);
}
+ if (rbehind)
+ *rbehind = 0;
+ if (rahead)
+ *rahead = 0;
}
return (error);
@@ -290,6 +314,18 @@
}
static int
+dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+ vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (object->un_pager.devp.ops->cdev_pg_populate == NULL)
+ return (VM_PAGER_FAIL);
+ return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx,
+ fault_type, max_prot, first, last));
+}
+
+static int
old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
vm_page_t *mres)
{
@@ -355,8 +391,7 @@
*/
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_WLOCK(object);
- if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
- panic("old_dev_pager_fault: invalid page replacement");
+ vm_page_replace_checked(page, object, (*mres)->pindex, *mres);
vm_page_lock(*mres);
vm_page_free(*mres);
vm_page_unlock(*mres);
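
The wrap check added to dev_pager_alloc() converts the mmap(2) offset and the mapping size to page indices separately and rejects the request if the sum wraps or exceeds OBJ_MAX_SIZE. The standalone sketch below restates that test; the TOY_* constants and the UOFF_TO_IDX stand-in are illustrative assumptions, not the kernel's definitions.

/* Standalone sketch of the overflow check added above.  Only the check
 * itself (sum of page indices must not wrap and must stay within the
 * object size limit) mirrors the kernel change. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT   12                     /* assume 4 KB pages */
#define TOY_OBJ_MAX_SIZE ((uint64_t)1 << 40)    /* illustrative cap, in pages */

static uint64_t
toy_uoff_to_idx(uint64_t off)
{
	return (off >> TOY_PAGE_SHIFT);
}

static bool
toy_mapping_ok(uint64_t foff, uint64_t size)
{
	uint64_t pindex;

	/* foff and size are assumed page-aligned, as in the kernel path. */
	pindex = toy_uoff_to_idx(foff) + toy_uoff_to_idx(size);
	if (pindex > TOY_OBJ_MAX_SIZE || pindex < toy_uoff_to_idx(foff) ||
	    pindex < toy_uoff_to_idx(size))
		return (false);         /* wrapped or too large: reject */
	return (true);
}

int
main(void)
{
	printf("small mapping ok: %d\n", toy_mapping_ok(0x10000, 0x4000));
	printf("huge offset ok:   %d\n",
	    toy_mapping_ok(UINT64_MAX - 0xFFF, 0x2000));
	return (0);
}
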
Modified: trunk/sys/vm/memguard.c
===================================================================
--- trunk/sys/vm/memguard.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/memguard.c 331017 2018-03-15 19:08:33Z kevans $");
/*
* MemGuard is a simple replacement allocator for debugging only
@@ -50,6 +50,7 @@
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/uma.h>
@@ -68,9 +69,9 @@
* reserved for MemGuard.
*/
static u_int vm_memguard_divisor;
-SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&vm_memguard_divisor,
- 0, "(kmem_size/memguard_divisor) == memguard submap size");
+ 0, "(kmem_size/memguard_divisor) == memguard submap size");
/*
* Short description (ks_shortdesc) of memory type to monitor.
@@ -131,8 +132,7 @@
#define MG_GUARD_ALLLARGE 0x002
#define MG_GUARD_NOFREE 0x004
static int memguard_options = MG_GUARD_AROUND;
-TUNABLE_INT("vm.memguard.options", &memguard_options);
-SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW,
+SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RWTUN,
&memguard_options, 0,
"MemGuard options:\n"
"\t0x001 - add guard pages around each allocation\n"
@@ -148,8 +148,7 @@
static u_int memguard_frequency;
static u_long memguard_frequency_hits;
-TUNABLE_INT("vm.memguard.frequency", &memguard_frequency);
-SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RW,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RWTUN,
&memguard_frequency, 0, "Times in 100000 that MemGuard will randomly run");
SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD,
&memguard_frequency_hits, 0, "# times MemGuard randomly chose");
@@ -165,6 +164,7 @@
u_long mem_pgs, parent_size;
vm_memguard_divisor = 10;
+ /* CTFLAG_RDTUN doesn't work during the early boot process. */
TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor);
parent_size = vm_map_max(parent_map) - vm_map_min(parent_map) +
@@ -180,7 +180,7 @@
* This prevents memguard's page promotions from completely
* using up memory, since most malloc(9) calls are sub-page.
*/
- mem_pgs = cnt.v_page_count;
+ mem_pgs = vm_cnt.v_page_count;
memguard_physlimit = (mem_pgs / vm_memguard_divisor) * PAGE_SIZE;
/*
* We want as much KVA as we can take safely. Use at most our
Modified: trunk/sys/vm/memguard.h
===================================================================
--- trunk/sys/vm/memguard.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
+ * $FreeBSD: stable/11/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
*/
#ifndef _VM_MEMGUARD_H_
Modified: trunk/sys/vm/phys_pager.c
===================================================================
--- trunk/sys/vm/phys_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/phys_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/phys_pager.c 327785 2018-01-10 20:39:26Z markj $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -42,6 +42,7 @@
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
/* list of phys pager objects */
@@ -99,6 +100,7 @@
object = object1;
object1 = NULL;
object->handle = handle;
+ vm_object_set_flag(object, OBJ_POPULATE);
TAILQ_INSERT_TAIL(&phys_pager_object_list,
object, pager_object_list);
}
@@ -110,6 +112,7 @@
vm_object_deallocate(object1);
} else {
object = vm_object_allocate(OBJT_PHYS, pindex);
+ vm_object_set_flag(object, OBJ_POPULATE);
}
return (object);
@@ -134,7 +137,8 @@
* Fill as many pages as vm_fault has allocated for us.
*/
static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
{
int i;
@@ -149,35 +153,98 @@
("phys_pager_getpages: partially valid page %p", m[i]));
KASSERT(m[i]->dirty == 0,
("phys_pager_getpages: dirty page %p", m[i]));
- /* The requested page must remain busy, the others not. */
- if (i == reqpage) {
- vm_page_lock(m[i]);
- vm_page_flash(m[i]);
- vm_page_unlock(m[i]);
- } else
- vm_page_xunbusy(m[i]);
}
+ if (rbehind)
+ *rbehind = 0;
+ if (rahead)
+ *rahead = 0;
return (VM_PAGER_OK);
}
-static void
-phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
- int *rtvals)
-{
-
- panic("phys_pager_putpage called");
-}
-
/*
* Implement a pretty aggressive clustered getpages strategy. Hint that
* everything in an entire 4MB window should be prefaulted at once.
*
- * XXX 4MB (1024 slots per page table page) is convenient for x86,
+ * 4MB (1024 slots per page table page) is convenient for x86,
* but may not be for other arches.
*/
#ifndef PHYSCLUSTER
#define PHYSCLUSTER 1024
#endif
+static int phys_pager_cluster = PHYSCLUSTER;
+SYSCTL_INT(_vm, OID_AUTO, phys_pager_cluster, CTLFLAG_RWTUN,
+ &phys_pager_cluster, 0,
+ "prefault window size for phys pager");
+
+/*
+ * Max hint to vm_page_alloc() about the further allocation needs
+ * inside the phys_pager_populate() loop. The number of bits used to
+ * implement VM_ALLOC_COUNT() determines the hard limit on this value.
+ * That limit is currently 65535.
+ */
+#define PHYSALLOC 16
+
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
+ vm_pindex_t *last)
+{
+ vm_page_t m;
+ vm_pindex_t base, end, i;
+ int ahead;
+
+ base = rounddown(pidx, phys_pager_cluster);
+ end = base + phys_pager_cluster - 1;
+ if (end >= object->size)
+ end = object->size - 1;
+ if (*first > base)
+ base = *first;
+ if (end > *last)
+ end = *last;
+ *first = base;
+ *last = end;
+
+ for (i = base; i <= end; i++) {
+retry:
+ m = vm_page_lookup(object, i);
+ if (m == NULL) {
+ ahead = MIN(end - i, PHYSALLOC);
+ m = vm_page_alloc(object, i, VM_ALLOC_NORMAL |
+ VM_ALLOC_ZERO | VM_ALLOC_WAITFAIL |
+ VM_ALLOC_COUNT(ahead));
+ if (m == NULL)
+ goto retry;
+ if ((m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ m->valid = VM_PAGE_BITS_ALL;
+ } else if (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(m, "physb", true);
+ VM_OBJECT_WLOCK(object);
+ goto retry;
+ } else {
+ vm_page_xbusy(m);
+ if (m->valid != VM_PAGE_BITS_ALL)
+ vm_page_zero_invalid(m, TRUE);
+ }
+
+ KASSERT(m->valid == VM_PAGE_BITS_ALL,
+ ("phys_pager_populate: partially valid page %p", m));
+ KASSERT(m->dirty == 0,
+ ("phys_pager_populate: dirty page %p", m));
+ }
+ return (VM_PAGER_OK);
+}
+
+static void
+phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
+ int *rtvals)
+{
+
+ panic("phys_pager_putpage called");
+}
+
static boolean_t
phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
@@ -184,8 +251,8 @@
{
vm_pindex_t base, end;
- base = pindex & (~(PHYSCLUSTER - 1));
- end = base + (PHYSCLUSTER - 1);
+ base = rounddown(pindex, phys_pager_cluster);
+ end = base + phys_pager_cluster - 1;
if (before != NULL)
*before = pindex - base;
if (after != NULL)
@@ -200,4 +267,5 @@
.pgo_getpages = phys_pager_getpages,
.pgo_putpages = phys_pager_putpages,
.pgo_haspage = phys_pager_haspage,
+ .pgo_populate = phys_pager_populate,
};
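
Both phys_pager_populate() and the rewritten phys_pager_haspage() derive their window from the same arithmetic: round the faulting index down to a phys_pager_cluster boundary, extend to the end of that cluster, then clip against the object size and the caller's [*first, *last] range. The small userspace sketch below walks through that clamping; the toy_ names and sample sizes are illustrative assumptions.

/* Standalone sketch of the cluster window computation used above.
 * rounddown() mirrors the sys/param.h macro; everything else is a toy. */
#include <stdint.h>
#include <stdio.h>

#define rounddown(x, y) (((x) / (y)) * (y))

static void
toy_populate_window(uint64_t pidx, uint64_t objsize, uint64_t cluster,
    uint64_t *first, uint64_t *last)
{
	uint64_t base, end;

	base = rounddown(pidx, cluster);
	end = base + cluster - 1;
	if (end >= objsize)                /* do not run past the object */
		end = objsize - 1;
	if (*first > base)                 /* honor the caller's lower bound */
		base = *first;
	if (end > *last)                   /* honor the caller's upper bound */
		end = *last;
	*first = base;
	*last = end;
}

int
main(void)
{
	/* Fault on page 1500 of a 2000-page object, 1024-page clusters,
	 * caller allows pages 1200..1999. */
	uint64_t first = 1200, last = 1999;

	toy_populate_window(1500, 2000, 1024, &first, &last);
	printf("populate window: [%ju, %ju]\n", (uintmax_t)first,
	    (uintmax_t)last);              /* expect [1200, 1999] */
	return (0);
}
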
Modified: trunk/sys/vm/pmap.h
===================================================================
--- trunk/sys/vm/pmap.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/pmap.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/pmap.h 331722 2018-03-29 02:50:57Z eadler $
*/
/*
@@ -101,10 +101,22 @@
/*
* Flags for pmap_enter(). The bits in the low-order byte are reserved
* for the protection code (vm_prot_t) that describes the fault type.
+ * Bits 24 through 31 are reserved for the pmap's internal use.
*/
-#define PMAP_ENTER_NOSLEEP 0x0100
-#define PMAP_ENTER_WIRED 0x0200
+#define PMAP_ENTER_NOSLEEP 0x00000100
+#define PMAP_ENTER_WIRED 0x00000200
+#define PMAP_ENTER_RESERVED 0xFF000000
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced(). This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages. Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field. An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define PMAP_TS_REFERENCED_MAX 5
+
void pmap_activate(struct thread *td);
void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
int advice);
@@ -142,6 +154,8 @@
void pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void pmap_qenter(vm_offset_t, vm_page_t *, int);
void pmap_qremove(vm_offset_t, int);
+vm_offset_t pmap_quick_enter_page(vm_page_t);
+void pmap_quick_remove_page(vm_offset_t);
void pmap_release(pmap_t);
void pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
void pmap_remove_all(vm_page_t m);
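
The widened pmap_enter() flag values keep the vm_prot_t fault type in the low-order byte, the existing flags in bits 8 and 9, and reserve bits 24 through 31 for the pmap's internal use. The compile-time check below merely restates that layout; the flag values are copied from the hunk above, while TOY_PROT_MASK is an illustrative stand-in for "the low-order byte reserved for the protection code", not a kernel symbol.

/* Compile-time restatement of the pmap_enter() flag layout described
 * above.  Flag values are taken from the diff; TOY_PROT_MASK is a toy. */
#include <assert.h>

#define PMAP_ENTER_NOSLEEP	0x00000100
#define PMAP_ENTER_WIRED	0x00000200
#define PMAP_ENTER_RESERVED	0xFF000000
#define TOY_PROT_MASK		0x000000FF

_Static_assert((PMAP_ENTER_NOSLEEP & TOY_PROT_MASK) == 0,
    "NOSLEEP overlaps the protection byte");
_Static_assert((PMAP_ENTER_WIRED & TOY_PROT_MASK) == 0,
    "WIRED overlaps the protection byte");
_Static_assert(((PMAP_ENTER_NOSLEEP | PMAP_ENTER_WIRED) &
    PMAP_ENTER_RESERVED) == 0,
    "flag bits overlap the pmap-reserved byte");

int
main(void)
{
	return (0);
}
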
Modified: trunk/sys/vm/redzone.c
===================================================================
--- trunk/sys/vm/redzone.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/redzone.c 267992 2014-06-28 03:56:17Z hselasky $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -42,8 +42,7 @@
SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem,
0, "Extra memory allocated by redzone");
static int redzone_panic = 0;
-TUNABLE_INT("vm.redzone.panic", &redzone_panic);
-SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0,
+SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RWTUN, &redzone_panic, 0,
"Panic when buffer corruption is detected");
#define REDZONE_CHSIZE (16)
Modified: trunk/sys/vm/redzone.h
===================================================================
--- trunk/sys/vm/redzone.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
+ * $FreeBSD: stable/11/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
*/
#ifndef _VM_REDZONE_H_
Modified: trunk/sys/vm/sg_pager.c
===================================================================
--- trunk/sys/vm/sg_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/sg_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/sg_pager.c 331017 2018-03-15 19:08:33Z kevans $");
/*
* This pager manages OBJT_SG objects. These objects are backed by
@@ -39,6 +39,8 @@
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sglist.h>
+#include <sys/vmmeter.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
@@ -50,7 +52,7 @@
static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
static void sg_pager_dealloc(vm_object_t);
-static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
static void sg_pager_putpages(vm_object_t, vm_page_t *, int,
boolean_t, int *);
static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *,
@@ -97,8 +99,9 @@
* to map beyond that.
*/
size = round_page(size);
- pindex = OFF_TO_IDX(foff + size);
- if (pindex > npages)
+ pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+ if (pindex > npages || pindex < UOFF_TO_IDX(foff) ||
+ pindex < UOFF_TO_IDX(size))
return (NULL);
/*
@@ -136,7 +139,8 @@
}
static int
-sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
{
struct sglist *sg;
vm_page_t m_paddr, page;
@@ -146,11 +150,13 @@
size_t space;
int i;
+ /* Since our haspage reports zero after/before, the count is 1. */
+ KASSERT(count == 1, ("%s: count %d", __func__, count));
VM_OBJECT_ASSERT_WLOCKED(object);
sg = object->handle;
memattr = object->memattr;
VM_OBJECT_WUNLOCK(object);
- offset = m[reqpage]->pindex;
+ offset = m[0]->pindex;
/*
* Lookup the physical address of the requested page. An initial
@@ -179,7 +185,7 @@
}
/* Return a fake page for the requested page. */
- KASSERT(!(m[reqpage]->flags & PG_FICTITIOUS),
+ KASSERT(!(m[0]->flags & PG_FICTITIOUS),
("backing page for SG is fake"));
/* Construct a new fake page. */
@@ -186,19 +192,18 @@
page = vm_page_getfake(paddr, memattr);
VM_OBJECT_WLOCK(object);
TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q);
-
- /* Free the original pages and insert this fake page into the object. */
- for (i = 0; i < count; i++) {
- if (i == reqpage &&
- vm_page_replace(page, object, offset) != m[i])
- panic("sg_pager_getpages: invalid place replacement");
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- m[reqpage] = page;
+ vm_page_replace_checked(page, object, offset, m[0]);
+ vm_page_lock(m[0]);
+ vm_page_free(m[0]);
+ vm_page_unlock(m[0]);
+ m[0] = page;
page->valid = VM_PAGE_BITS_ALL;
+ if (rbehind)
+ *rbehind = 0;
+ if (rahead)
+ *rahead = 0;
+
return (VM_PAGER_OK);
}
Modified: trunk/sys/vm/swap_pager.c
===================================================================
--- trunk/sys/vm/swap_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -68,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/swap_pager.c 350355 2019-07-26 10:36:07Z kib $");
#include "opt_swap.h"
#include "opt_vm.h"
@@ -87,10 +87,12 @@
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
+#include <sys/pctrie.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
+#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/blist.h>
@@ -120,7 +122,7 @@
* The 64-page limit is due to the radix code (kern/subr_blist.c).
*/
#ifndef MAX_PAGEOUT_CLUSTER
-#define MAX_PAGEOUT_CLUSTER 16
+#define MAX_PAGEOUT_CLUSTER 32
#endif
#if !defined(SWB_NPAGES)
@@ -127,22 +129,17 @@
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
#endif
+#define SWAP_META_PAGES PCTRIE_COUNT
+
/*
- * The swblock structure maps an object and a small, fixed-size range
- * of page indices to disk addresses within a swap area.
- * The collection of these mappings is implemented as a hash table.
- * Unused disk addresses within a swap area are allocated and managed
- * using a blist.
+ * A swblk structure maps each page index within a
+ * SWAP_META_PAGES-aligned and sized range to the address of an
+ * on-disk swap block (or SWAPBLK_NONE). The collection of these
+ * mappings for an entire vm object is implemented as a pc-trie.
*/
-#define SWAP_META_PAGES (SWB_NPAGES * 2)
-#define SWAP_META_MASK (SWAP_META_PAGES - 1)
-
-struct swblock {
- struct swblock *swb_hnext;
- vm_object_t swb_object;
- vm_pindex_t swb_index;
- int swb_count;
- daddr_t swb_pages[SWAP_META_PAGES];
+struct swblk {
+ vm_pindex_t p;
+ daddr_t d[SWAP_META_PAGES];
};
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
@@ -151,7 +148,7 @@
static struct swdevt *swdevhd; /* Allocate from here next */
static int nswapdev; /* Number of swap devices */
int swap_pager_avail;
-static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+static struct sx swdev_syscall_lock; /* serialize swap(on|off) */
static vm_ooffset_t swap_total;
SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
@@ -160,7 +157,7 @@
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
+SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
static unsigned long swzone;
@@ -210,7 +207,7 @@
mtx_lock(&sw_dev_mtx);
r = swap_reserved + incr;
if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
- s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
+ s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count;
s *= PAGE_SIZE;
} else
s = 0;
@@ -223,16 +220,14 @@
mtx_unlock(&sw_dev_mtx);
if (res) {
- PROC_LOCK(curproc);
UIDINFO_VMSIZE_LOCK(uip);
if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
- uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
+ uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
res = 0;
else
uip->ui_vmsize += incr;
UIDINFO_VMSIZE_UNLOCK(uip);
- PROC_UNLOCK(curproc);
if (!res) {
mtx_lock(&sw_dev_mtx);
swap_reserved -= incr;
@@ -314,12 +309,10 @@
racct_sub_cred(cred, RACCT_SWAP, decr);
}
-static void swapdev_strategy(struct buf *, struct swdevt *sw);
-
#define SWM_FREE 0x02 /* free, period */
#define SWM_POP 0x04 /* pop out */
-int swap_pager_full = 2; /* swap space exhaustion (task killing) */
+static int swap_pager_full = 2; /* swap space exhaustion (task killing) */
static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
static int nsw_rcount; /* free read buffers */
static int nsw_wcount_sync; /* limit write buffers / synchronous */
@@ -327,17 +320,17 @@
static int nsw_wcount_async_max;/* assigned maximum */
static int nsw_cluster_max; /* maximum VOP I/O allowed */
-static struct swblock **swhash;
-static int swhash_mask;
-static struct mtx swhash_mtx;
+static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
+ "Maximum running async swap ops");
+static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
+ "Swap Fragmentation Info");
-static int swap_async_max = 4; /* maximum in-progress async I/O's */
static struct sx sw_alloc_sx;
-
-SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
- CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
-
/*
* "named" and "unnamed" anon region objects. Try to reduce the overhead
* of searching a named list by hashing it just a little.
@@ -348,9 +341,9 @@
#define NOBJLIST(handle) \
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
-static struct mtx sw_alloc_mtx; /* protect list manipulation */
static struct pagerlst swap_pager_object_list[NOBJLISTS];
-static uma_zone_t swap_zone;
+static uma_zone_t swblk_zone;
+static uma_zone_t swpctrie_zone;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
@@ -361,7 +354,10 @@
swap_pager_alloc(void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
static void swap_pager_dealloc(vm_object_t object);
-static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
+ int *);
+static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+ int *, pgo_getpages_iodone_t, void *);
static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
@@ -374,6 +370,7 @@
.pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */
.pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
.pgo_getpages = swap_pager_getpages, /* pagein */
+ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */
.pgo_putpages = swap_pager_putpages, /* pageout */
.pgo_haspage = swap_pager_haspage, /* get backing store status for page */
.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
@@ -391,7 +388,7 @@
static void swp_sizecheck(void);
static void swp_pager_async_iodone(struct buf *bp);
-static int swapongeom(struct thread *, struct vnode *);
+static int swapongeom(struct vnode *);
static int swaponvp(struct thread *, struct vnode *, u_long);
static int swapoff_one(struct swdevt *sp, struct ucred *cred);
@@ -404,22 +401,28 @@
/*
* Metadata functions
*/
-static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
-static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
+static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
+static void *
+swblk_trie_alloc(struct pctrie *ptree)
+{
+
+ return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
+ M_USE_RESERVE : 0)));
+}
+
static void
-swp_pager_free_nrpage(vm_page_t m)
+swblk_trie_free(struct pctrie *ptree, void *node)
{
- vm_page_lock(m);
- if (m->wire_count == 0)
- vm_page_free(m);
- vm_page_unlock(m);
+ uma_zfree(swpctrie_zone, node);
}
+PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
+
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
@@ -448,33 +451,6 @@
}
/*
- * SWP_PAGER_HASH() - hash swap meta data
- *
- * This is an helper function which hashes the swapblk given
- * the object and page index. It returns a pointer to a pointer
- * to the object, or a pointer to a NULL pointer if it could not
- * find a swapblk.
- */
-static struct swblock **
-swp_pager_hash(vm_object_t object, vm_pindex_t index)
-{
- struct swblock **pswap;
- struct swblock *swap;
-
- index &= ~(vm_pindex_t)SWAP_META_MASK;
- pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
- while ((swap = *pswap) != NULL) {
- if (swap->swb_object == object &&
- swap->swb_index == index
- ) {
- break;
- }
- pswap = &swap->swb_hnext;
- }
- return (pswap);
-}
-
-/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
@@ -491,9 +467,9 @@
for (i = 0; i < NOBJLISTS; ++i)
TAILQ_INIT(&swap_pager_object_list[i]);
- mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
sx_init(&sw_alloc_sx, "swspsx");
+ sx_init(&swdev_syscall_lock, "swsysc");
}
/*
@@ -539,21 +515,25 @@
mtx_unlock(&pbuf_mtx);
/*
- * Initialize our zone. Right now I'm just guessing on the number
- * we need based on the number of pages in the system. Each swblock
- * can hold 32 pages, so this is probably overkill. This reservation
- * is typically limited to around 32MB by default.
+ * Initialize our zone, taking the user's requested size or
+ * estimating the number we need based on the number of pages
+ * in the system.
*/
- n = cnt.v_page_count / 2;
- if (maxswzone && n > maxswzone / sizeof(struct swblock))
- n = maxswzone / sizeof(struct swblock);
+ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
+ vm_cnt.v_page_count / 2;
+ swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
+ pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ if (swpctrie_zone == NULL)
+ panic("failed to create swap pctrie zone.");
+ swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
+ NULL, NULL, _Alignof(struct swblk) - 1,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ if (swblk_zone == NULL)
+ panic("failed to create swap blk zone.");
n2 = n;
- swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
- if (swap_zone == NULL)
- panic("failed to create swap_zone.");
do {
- if (uma_zone_reserve_kva(swap_zone, n))
+ if (uma_zone_reserve_kva(swblk_zone, n))
break;
/*
* if the allocation failed, try a zone two thirds the
@@ -561,25 +541,50 @@
*/
n -= ((n + 2) / 3);
} while (n > 0);
- if (n2 != n)
- printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
+
+ /*
+ * Often uma_zone_reserve_kva() cannot reserve exactly the
+ * requested size. Account for the difference when
+ * calculating swap_maxpages.
+ */
+ n = uma_zone_get_max(swblk_zone);
+
+ if (n < n2)
+ printf("Swap blk zone entries changed from %lu to %lu.\n",
+ n2, n);
swap_maxpages = n * SWAP_META_PAGES;
- swzone = n * sizeof(struct swblock);
- n2 = n;
+ swzone = n * sizeof(struct swblk);
+ if (!uma_zone_reserve_kva(swpctrie_zone, n))
+ printf("Cannot reserve swap pctrie zone, "
+ "reduce kern.maxswzone.\n");
+}
+static vm_object_t
+swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
+ vm_ooffset_t offset)
+{
+ vm_object_t object;
+
+ if (cred != NULL) {
+ if (!swap_reserve_by_cred(size, cred))
+ return (NULL);
+ crhold(cred);
+ }
+
/*
- * Initialize our meta-data hash table. The swapper does not need to
- * be quite as efficient as the VM system, so we do not use an
- * oversized hash table.
- *
- * n: size of hash table, must be power of 2
- * swhash_mask: hash table index mask
+ * The un_pager.swp.swp_blks trie is initialized by
+ * vm_object_allocate() to ensure the correct order of
+ * visibility to other threads.
*/
- for (n = 1; n < n2 / 8; n *= 2)
- ;
- swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
- swhash_mask = n - 1;
- mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
+ object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset +
+ PAGE_MASK + size));
+
+ object->handle = handle;
+ if (cred != NULL) {
+ object->cred = cred;
+ object->charge = size;
+ }
+ return (object);
}
/*
@@ -587,13 +592,11 @@
* its metadata structures.
*
* This routine is called from the mmap and fork code to create a new
- * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
- * and then converting it with swp_pager_meta_build().
+ * OBJT_SWAP object.
*
- * This routine may block in vm_object_allocate() and create a named
- * object lookup race, so we must interlock.
- *
- * MPSAFE
+ * This routine must ensure that no live duplicate is created for
+ * the named object request, which is protected against by
+ * holding the sw_alloc_sx lock in case handle != NULL.
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -600,11 +603,8 @@
vm_ooffset_t offset, struct ucred *cred)
{
vm_object_t object;
- vm_pindex_t pindex;
- pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
- if (handle) {
- mtx_lock(&Giant);
+ if (handle != NULL) {
/*
* Reference existing named region or allocate new one. There
* should not be a race here against swp_pager_meta_build()
@@ -614,40 +614,16 @@
sx_xlock(&sw_alloc_sx);
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
if (object == NULL) {
- if (cred != NULL) {
- if (!swap_reserve_by_cred(size, cred)) {
- sx_xunlock(&sw_alloc_sx);
- mtx_unlock(&Giant);
- return (NULL);
- }
- crhold(cred);
+ object = swap_pager_alloc_init(handle, cred, size,
+ offset);
+ if (object != NULL) {
+ TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
+ object, pager_object_list);
}
- object = vm_object_allocate(OBJT_DEFAULT, pindex);
- VM_OBJECT_WLOCK(object);
- object->handle = handle;
- if (cred != NULL) {
- object->cred = cred;
- object->charge = size;
- }
- swp_pager_meta_build(object, 0, SWAPBLK_NONE);
- VM_OBJECT_WUNLOCK(object);
}
sx_xunlock(&sw_alloc_sx);
- mtx_unlock(&Giant);
} else {
- if (cred != NULL) {
- if (!swap_reserve_by_cred(size, cred))
- return (NULL);
- crhold(cred);
- }
- object = vm_object_allocate(OBJT_DEFAULT, pindex);
- VM_OBJECT_WLOCK(object);
- if (cred != NULL) {
- object->cred = cred;
- object->charge = size;
- }
- swp_pager_meta_build(object, 0, SWAPBLK_NONE);
- VM_OBJECT_WUNLOCK(object);
+ object = swap_pager_alloc_init(handle, cred, size, offset);
}
return (object);
}
@@ -666,17 +642,22 @@
swap_pager_dealloc(vm_object_t object)
{
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
+
/*
* Remove from list right away so lookups will fail if we block for
* pageout completion.
*/
if (object->handle != NULL) {
- mtx_lock(&sw_alloc_mtx);
- TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
- mtx_unlock(&sw_alloc_mtx);
+ VM_OBJECT_WUNLOCK(object);
+ sx_xlock(&sw_alloc_sx);
+ TAILQ_REMOVE(NOBJLIST(object->handle), object,
+ pager_object_list);
+ sx_xunlock(&sw_alloc_sx);
+ VM_OBJECT_WLOCK(object);
}
- VM_OBJECT_ASSERT_WLOCKED(object);
vm_object_pip_wait(object, "swpdea");
/*
@@ -763,11 +744,8 @@
mtx_unlock(&sw_dev_mtx);
if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
unmapped_buf_allowed) {
- bp->b_kvaalloc = bp->b_data;
bp->b_data = unmapped_buf;
- bp->b_kvabase = unmapped_buf;
bp->b_offset = 0;
- bp->b_flags |= B_UNMAPPED;
} else {
pmap_qenter((vm_offset_t)bp->b_data,
&bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
@@ -815,6 +793,36 @@
}
/*
+ * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats
+ */
+static int
+sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ struct swdevt *sp;
+ const char *devname;
+ int error;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ mtx_lock(&sw_dev_mtx);
+ TAILQ_FOREACH(sp, &swtailq, sw_list) {
+ if (vn_isdisk(sp->sw_vp, NULL))
+ devname = devtoname(sp->sw_vp->v_rdev);
+ else
+ devname = "[file]";
+ sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
+ blist_stats(sp->sw_blist, &sbuf);
+ }
+ mtx_unlock(&sw_dev_mtx);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
@@ -906,16 +914,19 @@
* If destroysource is set, we remove the source object from the
* swap_pager internal queue now.
*/
- if (destroysource) {
- if (srcobject->handle != NULL) {
- mtx_lock(&sw_alloc_mtx);
- TAILQ_REMOVE(
- NOBJLIST(srcobject->handle),
- srcobject,
- pager_object_list
- );
- mtx_unlock(&sw_alloc_mtx);
- }
+ if (destroysource && srcobject->handle != NULL) {
+ vm_object_pip_add(srcobject, 1);
+ VM_OBJECT_WUNLOCK(srcobject);
+ vm_object_pip_add(dstobject, 1);
+ VM_OBJECT_WUNLOCK(dstobject);
+ sx_xlock(&sw_alloc_sx);
+ TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
+ pager_object_list);
+ sx_xunlock(&sw_alloc_sx);
+ VM_OBJECT_WLOCK(dstobject);
+ vm_object_pip_wakeup(dstobject);
+ VM_OBJECT_WLOCK(srcobject);
+ vm_object_pip_wakeup(srcobject);
}
/*
@@ -970,7 +981,7 @@
/*
* Free left over swap blocks in source.
*
- * We have to revert the type to OBJT_DEFAULT so we do not accidently
+ * We have to revert the type to OBJT_DEFAULT so we do not accidentally
* double-remove the object from the swap queues.
*/
if (destroysource) {
@@ -993,22 +1004,21 @@
* page and return TRUE if it does, FALSE if it doesn't.
*
* If TRUE, we also try to determine how much valid, contiguous backing
- * store exists before and after the requested page within a reasonable
- * distance. We do not try to restrict it to the swap device stripe
- * (that is handled in getpages/putpages). It probably isn't worth
- * doing here.
+ * store exists before and after the requested page.
*/
static boolean_t
-swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
+swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
{
- daddr_t blk0;
+ daddr_t blk, blk0;
+ int i;
VM_OBJECT_ASSERT_LOCKED(object);
+
/*
* do we have good backing store at the requested index ?
*/
blk0 = swp_pager_meta_ctl(object, pindex, 0);
-
if (blk0 == SWAPBLK_NONE) {
if (before)
*before = 0;
@@ -1021,11 +1031,7 @@
* find backwards-looking contiguous good backing store
*/
if (before != NULL) {
- int i;
-
- for (i = 1; i < (SWB_NPAGES/2); ++i) {
- daddr_t blk;
-
+ for (i = 1; i < SWB_NPAGES; i++) {
if (i > pindex)
break;
blk = swp_pager_meta_ctl(object, pindex - i, 0);
@@ -1032,7 +1038,7 @@
if (blk != blk0 - i)
break;
}
- *before = (i - 1);
+ *before = i - 1;
}
/*
@@ -1039,16 +1045,12 @@
* find forward-looking contiguous good backing store
*/
if (after != NULL) {
- int i;
-
- for (i = 1; i < (SWB_NPAGES/2); ++i) {
- daddr_t blk;
-
+ for (i = 1; i < SWB_NPAGES; i++) {
blk = swp_pager_meta_ctl(object, pindex + i, 0);
if (blk != blk0 + i)
break;
}
- *after = (i - 1);
+ *after = i - 1;
}
return (TRUE);
}
@@ -1080,134 +1082,130 @@
}
/*
- * SWAP_PAGER_GETPAGES() - bring pages in from swap
+ * swap_pager_getpages() - bring pages in from swap
*
- * Attempt to retrieve (m, count) pages from backing store, but make
- * sure we retrieve at least m[reqpage]. We try to load in as large
- * a chunk surrounding m[reqpage] as is contiguous in swap and which
- * belongs to the same object.
+ * Attempt to page in the pages in array "ma" of length "count". The
+ * caller may optionally specify that additional pages preceding and
+ * succeeding the specified range be paged in. The number of such pages
+ * is returned in the "rbehind" and "rahead" parameters, and they will
+ * be in the inactive queue upon return.
*
- * The code is designed for asynchronous operation and
- * immediate-notification of 'reqpage' but tends not to be
- * used that way. Please do not optimize-out this algorithmic
- * feature, I intend to improve on it in the future.
- *
- * The parent has a single vm_object_pip_add() reference prior to
- * calling us and we should return with the same.
- *
- * The parent has BUSY'd the pages. We should return with 'm'
- * left busy, but the others adjusted.
+ * The pages in "ma" must be busied and will remain busied upon return.
*/
static int
-swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
{
struct buf *bp;
- vm_page_t mreq;
- int i;
- int j;
+ vm_page_t bm, mpred, msucc, p;
+ vm_pindex_t pindex;
daddr_t blk;
+ int i, maxahead, maxbehind, reqcount;
- mreq = m[reqpage];
+ reqcount = count;
- KASSERT(mreq->object == object,
- ("swap_pager_getpages: object mismatch %p/%p",
- object, mreq->object));
+ /*
+ * Determine the final number of read-behind pages and
+ * allocate them BEFORE releasing the object lock. Otherwise,
+ * there can be a problematic race with vm_object_split().
+ * Specifically, vm_object_split() might first transfer pages
+ * that precede ma[0] in the current object to a new object,
+ * and then this function incorrectly recreates those pages as
+ * read-behind pages in the current object.
+ */
+ if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
+ return (VM_PAGER_FAIL);
/*
- * Calculate range to retrieve. The pages have already been assigned
- * their swapblks. We require a *contiguous* range but we know it to
- * not span devices. If we do not supply it, bad things
- * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
- * loops are set up such that the case(s) are handled implicitly.
- *
- * The swp_*() calls must be made with the object locked.
+ * Clip the readahead and readbehind ranges to exclude resident pages.
*/
- blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
-
- for (i = reqpage - 1; i >= 0; --i) {
- daddr_t iblk;
-
- iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
- if (blk != iblk + (reqpage - i))
- break;
+ if (rahead != NULL) {
+ KASSERT(reqcount - 1 <= maxahead,
+ ("page count %d extends beyond swap block", reqcount));
+ *rahead = imin(*rahead, maxahead - (reqcount - 1));
+ pindex = ma[reqcount - 1]->pindex;
+ msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
+ if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
+ *rahead = msucc->pindex - pindex - 1;
}
- ++i;
+ if (rbehind != NULL) {
+ *rbehind = imin(*rbehind, maxbehind);
+ pindex = ma[0]->pindex;
+ mpred = TAILQ_PREV(ma[0], pglist, listq);
+ if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
+ *rbehind = pindex - mpred->pindex - 1;
+ }
- for (j = reqpage + 1; j < count; ++j) {
- daddr_t jblk;
+ bm = ma[0];
+ for (i = 0; i < count; i++)
+ ma[i]->oflags |= VPO_SWAPINPROG;
- jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
- if (blk != jblk - (j - reqpage))
- break;
- }
-
/*
- * free pages outside our collection range. Note: we never free
- * mreq, it must remain busy throughout.
+ * Allocate readahead and readbehind pages.
*/
- if (0 < i || j < count) {
- int k;
-
- for (k = 0; k < i; ++k)
- swp_pager_free_nrpage(m[k]);
- for (k = j; k < count; ++k)
- swp_pager_free_nrpage(m[k]);
+ if (rbehind != NULL) {
+ for (i = 1; i <= *rbehind; i++) {
+ p = vm_page_alloc(object, ma[0]->pindex - i,
+ VM_ALLOC_NORMAL);
+ if (p == NULL)
+ break;
+ p->oflags |= VPO_SWAPINPROG;
+ bm = p;
+ }
+ *rbehind = i - 1;
}
+ if (rahead != NULL) {
+ for (i = 0; i < *rahead; i++) {
+ p = vm_page_alloc(object,
+ ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
+ if (p == NULL)
+ break;
+ p->oflags |= VPO_SWAPINPROG;
+ }
+ *rahead = i;
+ }
+ if (rbehind != NULL)
+ count += *rbehind;
+ if (rahead != NULL)
+ count += *rahead;
- /*
- * Return VM_PAGER_FAIL if we have nothing to do. Return mreq
- * still busy, but the others unbusied.
- */
- if (blk == SWAPBLK_NONE)
- return (VM_PAGER_FAIL);
+ vm_object_pip_add(object, count);
- /*
- * Getpbuf() can sleep.
- */
+ pindex = bm->pindex;
+ blk = swp_pager_meta_ctl(object, pindex, 0);
+ KASSERT(blk != SWAPBLK_NONE,
+ ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
+
VM_OBJECT_WUNLOCK(object);
- /*
- * Get a swap buffer header to perform the IO
- */
bp = getpbuf(&nsw_rcount);
+ /* Pages cannot leave the object while busy. */
+ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
+ MPASS(p->pindex == bm->pindex + i);
+ bp->b_pages[i] = p;
+ }
+
bp->b_flags |= B_PAGING;
-
bp->b_iocmd = BIO_READ;
bp->b_iodone = swp_pager_async_iodone;
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
- bp->b_blkno = blk - (reqpage - i);
- bp->b_bcount = PAGE_SIZE * (j - i);
- bp->b_bufsize = PAGE_SIZE * (j - i);
- bp->b_pager.pg_reqpage = reqpage - i;
+ bp->b_blkno = blk;
+ bp->b_bcount = PAGE_SIZE * count;
+ bp->b_bufsize = PAGE_SIZE * count;
+ bp->b_npages = count;
+ bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
+ bp->b_pgafter = rahead != NULL ? *rahead : 0;
- VM_OBJECT_WLOCK(object);
- {
- int k;
-
- for (k = i; k < j; ++k) {
- bp->b_pages[k - i] = m[k];
- m[k]->oflags |= VPO_SWAPINPROG;
- }
- }
- bp->b_npages = j - i;
-
PCPU_INC(cnt.v_swapin);
- PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
+ PCPU_ADD(cnt.v_swappgsin, count);
/*
- * We still hold the lock on mreq, and our automatic completion routine
- * does not remove it.
- */
- vm_object_pip_add(object, bp->b_npages);
- VM_OBJECT_WUNLOCK(object);
-
- /*
* perform the I/O. NOTE!!! bp cannot be considered valid after
* this point because we automatically release it on completion.
* Instead, we look at the one page we are interested in which we
* still hold a lock on even through the I/O completion.
*
- * The other pages in our m[] array are also released on completion,
+ * The other pages in our ma[] array are also released on completion,
* so we cannot assume they are valid anymore either.
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1216,13 +1214,13 @@
swp_pager_strategy(bp);
/*
- * wait for the page we want to complete. VPO_SWAPINPROG is always
+ * Wait for the pages we want to complete. VPO_SWAPINPROG is always
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
- * is set in the meta-data.
+ * is set in the metadata for each page in the request.
*/
VM_OBJECT_WLOCK(object);
- while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
- mreq->oflags |= VPO_SWAPSLEEP;
+ while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
+ ma[0]->oflags |= VPO_SWAPSLEEP;
PCPU_INC(cnt.v_intrans);
if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
"swread", hz * 20)) {
@@ -1233,16 +1231,14 @@
}
/*
- * mreq is left busied after completion, but all the other pages
- * are freed. If we had an unrecoverable read error the page will
- * not be valid.
+ * If we had an unrecoverable read error pages will not be valid.
*/
- if (mreq->valid != VM_PAGE_BITS_ALL) {
- return (VM_PAGER_ERROR);
- } else {
- return (VM_PAGER_OK);
- }
+ for (i = 0; i < reqcount; i++)
+ if (ma[i]->valid != VM_PAGE_BITS_ALL)
+ return (VM_PAGER_ERROR);
+ return (VM_PAGER_OK);
+
/*
* A final note: in a low swap situation, we cannot deallocate swap
* and mark a page dirty here because the caller is likely to mark
@@ -1252,6 +1248,39 @@
}
/*
+ * swap_pager_getpages_async():
+ *
+ * Right now this is emulation of asynchronous operation on top of
+ * swap_pager_getpages().
+ */
+static int
+swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+ int r, error;
+
+ r = swap_pager_getpages(object, ma, count, rbehind, rahead);
+ VM_OBJECT_WUNLOCK(object);
+ switch (r) {
+ case VM_PAGER_OK:
+ error = 0;
+ break;
+ case VM_PAGER_ERROR:
+ error = EIO;
+ break;
+ case VM_PAGER_FAIL:
+ error = EINVAL;
+ break;
+ default:
+ panic("unhandled swap_pager_getpages() error %d", r);
+ }
+ (iodone)(arg, ma, count, error);
+ VM_OBJECT_WLOCK(object);
+
+ return (r);
+}
+
+/*
* swap_pager_putpages:
*
* Assign swap (if necessary) and initiate I/O on the specified pages.
@@ -1273,17 +1302,17 @@
* those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
* We need to unbusy the rest on I/O completion.
*/
-void
-swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+static void
+swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
int flags, int *rtvals)
{
int i, n;
boolean_t sync;
- if (count && m[0]->object != object) {
+ if (count && ma[0]->object != object) {
panic("swap_pager_putpages: object mismatch %p/%p",
object,
- m[0]->object
+ ma[0]->object
);
}
@@ -1307,39 +1336,6 @@
/*
* Step 2
*
- * Update nsw parameters from swap_async_max sysctl values.
- * Do not let the sysop crash the machine with bogus numbers.
- */
- mtx_lock(&pbuf_mtx);
- if (swap_async_max != nsw_wcount_async_max) {
- int n;
-
- /*
- * limit range
- */
- if ((n = swap_async_max) > nswbuf / 2)
- n = nswbuf / 2;
- if (n < 1)
- n = 1;
- swap_async_max = n;
-
- /*
- * Adjust difference ( if possible ). If the current async
- * count is too low, we may not be able to make the adjustment
- * at this time.
- */
- n -= nsw_wcount_async_max;
- if (nsw_wcount_async + n >= 0) {
- nsw_wcount_async += n;
- nsw_wcount_async_max += n;
- wakeup(&nsw_wcount_async);
- }
- }
- mtx_unlock(&pbuf_mtx);
-
- /*
- * Step 3
- *
* Assign swap blocks and issue I/O. We reallocate swap on the fly.
* The page is left dirty until the pageout operation completes
* successfully.
@@ -1394,7 +1390,7 @@
VM_OBJECT_WLOCK(object);
for (j = 0; j < n; ++j) {
- vm_page_t mreq = m[i+j];
+ vm_page_t mreq = ma[i+j];
swp_pager_meta_build(
mreq->object,
@@ -1402,8 +1398,6 @@
blk + j
);
MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
- rtvals[i+j] = VM_PAGER_OK;
-
mreq->oflags |= VPO_SWAPINPROG;
bp->b_pages[j] = mreq;
}
@@ -1419,6 +1413,16 @@
PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
/*
+ * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
+ * can call the async completion routine at the end of a
+ * synchronous I/O operation. Otherwise, our caller would
+ * perform duplicate unbusy and wakeup operations on the page
+ * and object, respectively.
+ */
+ for (j = 0; j < n; j++)
+ rtvals[i + j] = VM_PAGER_PEND;
+
+ /*
* asynchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1427,10 +1431,6 @@
bp->b_iodone = swp_pager_async_iodone;
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
-
- for (j = 0; j < n; ++j)
- rtvals[i+j] = VM_PAGER_PEND;
- /* restart outter loop */
continue;
}
@@ -1443,14 +1443,10 @@
swp_pager_strategy(bp);
/*
- * Wait for the sync I/O to complete, then update rtvals.
- * We just set the rtvals[] to VM_PAGER_PEND so we can call
- * our async completion routine at the end, thus avoiding a
- * double-free.
+ * Wait for the sync I/O to complete.
*/
bwait(bp, PVM, "swwrt");
- for (j = 0; j < n; ++j)
- rtvals[i+j] = VM_PAGER_PEND;
+
/*
* Now that we are through with the bp, we can call the
* normal async completion, which frees everything up.
@@ -1491,12 +1487,10 @@
/*
* remove the mapping for kernel virtual
*/
- if ((bp->b_flags & B_UNMAPPED) != 0) {
- bp->b_data = bp->b_kvaalloc;
- bp->b_kvabase = bp->b_kvaalloc;
- bp->b_flags &= ~B_UNMAPPED;
- } else
+ if (buf_mapped(bp))
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+ else
+ bp->b_data = bp->b_kvabase;
if (bp->b_npages) {
object = bp->b_pages[0]->object;
@@ -1529,33 +1523,11 @@
*/
if (bp->b_iocmd == BIO_READ) {
/*
- * When reading, reqpage needs to stay
- * locked for the parent, but all other
- * pages can be freed. We still want to
- * wakeup the parent waiting on the page,
- * though. ( also: pg_reqpage can be -1 and
- * not match anything ).
- *
- * We have to wake specifically requested pages
- * up too because we cleared VPO_SWAPINPROG and
- * someone may be waiting for that.
- *
* NOTE: for reads, m->dirty will probably
* be overridden by the original caller of
* getpages so don't play cute tricks here.
*/
m->valid = 0;
- if (i != bp->b_pager.pg_reqpage)
- swp_pager_free_nrpage(m);
- else {
- vm_page_lock(m);
- vm_page_flash(m);
- vm_page_unlock(m);
- }
- /*
- * If i == bp->b_pager.pg_reqpage, do not wake
- * the page up. The caller needs to.
- */
} else {
/*
* If a write error occurs, reactivate page
@@ -1562,7 +1534,7 @@
* so it doesn't clog the inactive list,
* then finish the I/O.
*/
- vm_page_dirty(m);
+ MPASS(m->dirty == VM_PAGE_BITS_ALL);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
@@ -1577,54 +1549,33 @@
* want to do that anyway, but it was an optimization
* that existed in the old swapper for a time before
* it got ripped out due to precisely this problem.
- *
- * If not the requested page then deactivate it.
- *
- * Note that the requested page, reqpage, is left
- * busied, but we still have to wake it up. The
- * other pages are released (unbusied) by
- * vm_page_xunbusy().
*/
KASSERT(!pmap_page_is_mapped(m),
("swp_pager_async_iodone: page %p is mapped", m));
- m->valid = VM_PAGE_BITS_ALL;
KASSERT(m->dirty == 0,
("swp_pager_async_iodone: page %p is dirty", m));
- /*
- * We have to wake specifically requested pages
- * up too because we cleared VPO_SWAPINPROG and
- * could be waiting for it in getpages. However,
- * be sure to not unbusy getpages specifically
- * requested page - getpages expects it to be
- * left busy.
- */
- if (i != bp->b_pager.pg_reqpage) {
- vm_page_lock(m);
- vm_page_deactivate(m);
- vm_page_unlock(m);
- vm_page_xunbusy(m);
- } else {
- vm_page_lock(m);
- vm_page_flash(m);
- vm_page_unlock(m);
- }
+ m->valid = VM_PAGE_BITS_ALL;
+ if (i < bp->b_pgbefore ||
+ i >= bp->b_npages - bp->b_pgafter)
+ vm_page_readahead_finish(m);
} else {
/*
* For write success, clear the dirty
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiters up ).
+ * A page is only written to swap after a period of
+ * inactivity. Therefore, we do not expect it to be
+ * reused.
*/
KASSERT(!pmap_page_is_write_mapped(m),
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
+ vm_page_lock(m);
+ vm_page_deactivate_noreuse(m);
+ vm_page_unlock(m);
vm_page_sunbusy(m);
- if (vm_page_count_severe()) {
- vm_page_lock(m);
- vm_page_try_to_cache(m);
- vm_page_unlock(m);
- }
}
}
@@ -1661,51 +1612,17 @@
}
/*
- * swap_pager_isswapped:
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
*
- * Return 1 if at least one page in the given object is paged
- * out to the given swap device.
+ * This routine dissociates the page at the given index within an object
+ * from its backing store, paging it in if it does not reside in memory.
+ * If the page is paged in, it is marked dirty and placed in the laundry
+ * queue. The page is marked dirty because it no longer has backing
+ * store. It is placed in the laundry queue because it has not been
+ * accessed recently. Otherwise, it would already reside in memory.
*
- * This routine may not sleep.
- */
-int
-swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
-{
- daddr_t index = 0;
- int bcount;
- int i;
-
- VM_OBJECT_ASSERT_WLOCKED(object);
- if (object->type != OBJT_SWAP)
- return (0);
-
- mtx_lock(&swhash_mtx);
- for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
- struct swblock *swap;
-
- if ((swap = *swp_pager_hash(object, index)) != NULL) {
- for (i = 0; i < SWAP_META_PAGES; ++i) {
- if (swp_pager_isondev(swap->swb_pages[i], sp)) {
- mtx_unlock(&swhash_mtx);
- return (1);
- }
- }
- }
- index += SWAP_META_PAGES;
- }
- mtx_unlock(&swhash_mtx);
- return (0);
-}
-
-/*
- * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
- *
- * This routine dissociates the page at the given index within a
- * swap block from its backing store, paging it in if necessary.
- * If the page is paged in, it is placed in the inactive queue,
- * since it had its backing store ripped out from under it.
- * We also attempt to swap in all other pages in the swap block,
- * we only guarantee that the one at the specified index is
+ * We also attempt to swap in all other pages in the swap block.
+ * However, we only guarantee that the one at the specified index is
* paged in.
*
* XXX - The code to page the whole block in doesn't work, so we
@@ -1719,7 +1636,7 @@
vm_object_pip_add(object, 1);
m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
if (m->valid == VM_PAGE_BITS_ALL) {
- vm_object_pip_subtract(object, 1);
+ vm_object_pip_wakeup(object);
vm_page_dirty(m);
vm_page_lock(m);
vm_page_activate(m);
@@ -1729,12 +1646,12 @@
return;
}
- if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
+ if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK)
panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
- vm_object_pip_subtract(object, 1);
+ vm_object_pip_wakeup(object);
vm_page_dirty(m);
vm_page_lock(m);
- vm_page_deactivate(m);
+ vm_page_launder(m);
vm_page_unlock(m);
vm_page_xunbusy(m);
vm_pager_page_unswapped(m);
@@ -1753,50 +1670,56 @@
static void
swap_pager_swapoff(struct swdevt *sp)
{
- struct swblock *swap;
- vm_object_t locked_obj, object;
- vm_pindex_t pindex;
- int i, j, retries;
+ struct swblk *sb;
+ vm_object_t object;
+ vm_pindex_t pi;
+ int i, retries;
- GIANT_REQUIRED;
+ sx_assert(&swdev_syscall_lock, SA_XLOCKED);
retries = 0;
- locked_obj = NULL;
full_rescan:
- mtx_lock(&swhash_mtx);
- for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
-restart:
- for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
- object = swap->swb_object;
- pindex = swap->swb_index;
- for (j = 0; j < SWAP_META_PAGES; ++j) {
- if (!swp_pager_isondev(swap->swb_pages[j], sp))
+ mtx_lock(&vm_object_list_mtx);
+ TAILQ_FOREACH(object, &vm_object_list, object_list) {
+ if (object->type != OBJT_SWAP)
+ continue;
+ mtx_unlock(&vm_object_list_mtx);
+ /* Depends on type-stability. */
+ VM_OBJECT_WLOCK(object);
+
+ /*
+ * Dead objects are eventually terminated on their own.
+ */
+ if ((object->flags & OBJ_DEAD) != 0)
+ goto next_obj;
+
+ /*
+ * Sync with fences placed after pctrie
+ * initialization. We must not access pctrie below
+ * unless we checked that our object is swap and not
+ * dead.
+ */
+ atomic_thread_fence_acq();
+ if (object->type != OBJT_SWAP)
+ goto next_obj;
+
+ for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
+ pi = sb->p + SWAP_META_PAGES;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] == SWAPBLK_NONE)
continue;
- if (locked_obj != object) {
- if (locked_obj != NULL)
- VM_OBJECT_WUNLOCK(locked_obj);
- locked_obj = object;
- if (!VM_OBJECT_TRYWLOCK(object)) {
- mtx_unlock(&swhash_mtx);
- /* Depends on type-stability. */
- VM_OBJECT_WLOCK(object);
- mtx_lock(&swhash_mtx);
- goto restart;
- }
- }
- MPASS(locked_obj == object);
- mtx_unlock(&swhash_mtx);
- swp_pager_force_pagein(object, pindex + j);
- mtx_lock(&swhash_mtx);
- goto restart;
+ if (swp_pager_isondev(sb->d[i], sp))
+ swp_pager_force_pagein(object,
+ sb->p + i);
}
}
+next_obj:
+ VM_OBJECT_WUNLOCK(object);
+ mtx_lock(&vm_object_list_mtx);
}
- mtx_unlock(&swhash_mtx);
- if (locked_obj != NULL) {
- VM_OBJECT_WUNLOCK(locked_obj);
- locked_obj = NULL;
- }
+ mtx_unlock(&vm_object_list_mtx);
+
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
@@ -1839,94 +1762,120 @@
static void
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
- static volatile int exhausted;
- struct swblock *swap;
- struct swblock **pswap;
- int idx;
+ static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
+ struct swblk *sb, *sb1;
+ vm_pindex_t modpi, rdpi;
+ int error, i;
VM_OBJECT_ASSERT_WLOCKED(object);
+
/*
* Convert default object to swap object if necessary
*/
if (object->type != OBJT_SWAP) {
+ pctrie_init(&object->un_pager.swp.swp_blks);
+
+ /*
+ * Ensure that swap_pager_swapoff()'s iteration over
+ * object_list does not see a garbage pctrie.
+ */
+ atomic_thread_fence_rel();
+
object->type = OBJT_SWAP;
- object->un_pager.swp.swp_bcount = 0;
-
- if (object->handle != NULL) {
- mtx_lock(&sw_alloc_mtx);
- TAILQ_INSERT_TAIL(
- NOBJLIST(object->handle),
- object,
- pager_object_list
- );
- mtx_unlock(&sw_alloc_mtx);
- }
+ KASSERT(object->handle == NULL, ("default pager with handle"));
}
- /*
- * Locate hash entry. If not found create, but if we aren't adding
- * anything just return. If we run out of space in the map we wait
- * and, since the hash table may have changed, retry.
- */
-retry:
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, pindex);
-
- if ((swap = *pswap) == NULL) {
- int i;
-
+ rdpi = rounddown(pindex, SWAP_META_PAGES);
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
+ if (sb == NULL) {
if (swapblk == SWAPBLK_NONE)
- goto done;
-
- swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
- (curproc == pageproc ? M_USE_RESERVE : 0));
- if (swap == NULL) {
- mtx_unlock(&swhash_mtx);
+ return;
+ for (;;) {
+ sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
+ pageproc ? M_USE_RESERVE : 0));
+ if (sb != NULL) {
+ sb->p = rdpi;
+ for (i = 0; i < SWAP_META_PAGES; i++)
+ sb->d[i] = SWAPBLK_NONE;
+ if (atomic_cmpset_int(&swblk_zone_exhausted,
+ 1, 0))
+ printf("swblk zone ok\n");
+ break;
+ }
VM_OBJECT_WUNLOCK(object);
- if (uma_zone_exhausted(swap_zone)) {
- if (atomic_cmpset_int(&exhausted, 0, 1))
- printf("swap zone exhausted, "
+ if (uma_zone_exhausted(swblk_zone)) {
+ if (atomic_cmpset_int(&swblk_zone_exhausted,
+ 0, 1))
+ printf("swap blk zone exhausted, "
"increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
- pause("swzonex", 10);
+ pause("swzonxb", 10);
} else
- VM_WAIT;
+ uma_zwait(swblk_zone);
VM_OBJECT_WLOCK(object);
- goto retry;
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rdpi);
+ if (sb != NULL)
+ /*
+ * Somebody swapped out a nearby page,
+ * allocating swblk at the rdpi index,
+ * while we dropped the object lock.
+ */
+ goto allocated;
}
+ for (;;) {
+ error = SWAP_PCTRIE_INSERT(
+ &object->un_pager.swp.swp_blks, sb);
+ if (error == 0) {
+ if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+ 1, 0))
+ printf("swpctrie zone ok\n");
+ break;
+ }
+ VM_OBJECT_WUNLOCK(object);
+ if (uma_zone_exhausted(swpctrie_zone)) {
+ if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+ 0, 1))
+ printf("swap pctrie zone exhausted, "
+ "increase kern.maxswzone\n");
+ vm_pageout_oom(VM_OOM_SWAPZ);
+ pause("swzonxp", 10);
+ } else
+ uma_zwait(swpctrie_zone);
+ VM_OBJECT_WLOCK(object);
+ sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rdpi);
+ if (sb1 != NULL) {
+ uma_zfree(swblk_zone, sb);
+ sb = sb1;
+ goto allocated;
+ }
+ }
+ }
+allocated:
+ MPASS(sb->p == rdpi);
- if (atomic_cmpset_int(&exhausted, 1, 0))
- printf("swap zone ok\n");
+ modpi = pindex % SWAP_META_PAGES;
+ /* Delete prior contents of metadata. */
+ if (sb->d[modpi] != SWAPBLK_NONE)
+ swp_pager_freeswapspace(sb->d[modpi], 1);
+ /* Enter block into metadata. */
+ sb->d[modpi] = swapblk;
- swap->swb_hnext = NULL;
- swap->swb_object = object;
- swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
- swap->swb_count = 0;
-
- ++object->un_pager.swp.swp_bcount;
-
- for (i = 0; i < SWAP_META_PAGES; ++i)
- swap->swb_pages[i] = SWAPBLK_NONE;
- }
-
/*
- * Delete prior contents of metadata
+ * Free the swblk if we end up with the empty page run.
*/
- idx = pindex & SWAP_META_MASK;
-
- if (swap->swb_pages[idx] != SWAPBLK_NONE) {
- swp_pager_freeswapspace(swap->swb_pages[idx], 1);
- --swap->swb_count;
+ if (swapblk == SWAPBLK_NONE) {
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ break;
+ }
+ if (i == SWAP_META_PAGES) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ rdpi);
+ uma_zfree(swblk_zone, sb);
+ }
}
-
- /*
- * Enter block into metadata
- */
- swap->swb_pages[idx] = swapblk;
- if (swapblk != SWAPBLK_NONE)
- ++swap->swb_count;
-done:
- mtx_unlock(&swhash_mtx);
}
/*
@@ -1940,41 +1889,39 @@
* with resident pages.
*/
static void
-swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
+swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
{
+ struct swblk *sb;
+ vm_pindex_t last;
+ int i;
+ bool empty;
- VM_OBJECT_ASSERT_LOCKED(object);
- if (object->type != OBJT_SWAP)
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (object->type != OBJT_SWAP || count == 0)
return;
- while (count > 0) {
- struct swblock **pswap;
- struct swblock *swap;
-
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, index);
-
- if ((swap = *pswap) != NULL) {
- daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
-
- if (v != SWAPBLK_NONE) {
- swp_pager_freeswapspace(v, 1);
- swap->swb_pages[index & SWAP_META_MASK] =
- SWAPBLK_NONE;
- if (--swap->swb_count == 0) {
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
- }
- }
- --count;
- ++index;
- } else {
- int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
- count -= n;
- index += n;
+ last = pindex + count - 1;
+ for (;;) {
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL || sb->p > last)
+ break;
+ empty = true;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] == SWAPBLK_NONE)
+ continue;
+ if (pindex <= sb->p + i && sb->p + i <= last) {
+ swp_pager_freeswapspace(sb->d[i], 1);
+ sb->d[i] = SWAPBLK_NONE;
+ } else
+ empty = false;
}
- mtx_unlock(&swhash_mtx);
+ pindex = sb->p + SWAP_META_PAGES;
+ if (empty) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ sb->p);
+ uma_zfree(swblk_zone, sb);
+ }
}
}
@@ -1987,9 +1934,8 @@
static void
swp_pager_meta_free_all(vm_object_t object)
{
- struct swblock **pswap, *swap;
- vm_pindex_t index;
- daddr_t v;
+ struct swblk *sb;
+ vm_pindex_t pindex;
int i;
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1996,27 +1942,15 @@
if (object->type != OBJT_SWAP)
return;
- index = 0;
- while (object->un_pager.swp.swp_bcount != 0) {
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, index);
- if ((swap = *pswap) != NULL) {
- for (i = 0; i < SWAP_META_PAGES; ++i) {
- v = swap->swb_pages[i];
- if (v != SWAPBLK_NONE) {
- --swap->swb_count;
- swp_pager_freeswapspace(v, 1);
- }
- }
- if (swap->swb_count != 0)
- panic(
- "swap_pager_meta_free_all: swb_count != 0");
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
+ for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pindex)) != NULL;) {
+ pindex = sb->p + SWAP_META_PAGES;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ swp_pager_freeswapspace(sb->d[i], 1);
}
- mtx_unlock(&swhash_mtx);
- index += SWAP_META_PAGES;
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
+ uma_zfree(swblk_zone, sb);
}
}
@@ -2030,9 +1964,6 @@
* was invalid. This routine will automatically free any invalid
* meta-data swapblks.
*
- * It is not possible to store invalid swapblks in the swap meta data
- * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
- *
* When acting on a busy resident page and paging is in progress, we
* have to wait until paging is complete but otherwise can act on the
* busy page.
@@ -2043,44 +1974,90 @@
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
{
- struct swblock **pswap;
- struct swblock *swap;
+ struct swblk *sb;
daddr_t r1;
- int idx;
+ int i;
- VM_OBJECT_ASSERT_LOCKED(object);
+ if ((flags & (SWM_FREE | SWM_POP)) != 0)
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ else
+ VM_OBJECT_ASSERT_LOCKED(object);
+
/*
- * The meta data only exists of the object is OBJT_SWAP
+ * The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
if (object->type != OBJT_SWAP)
return (SWAPBLK_NONE);
- r1 = SWAPBLK_NONE;
- mtx_lock(&swhash_mtx);
- pswap = swp_pager_hash(object, pindex);
+ sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (SWAPBLK_NONE);
+ r1 = sb->d[pindex % SWAP_META_PAGES];
+ if (r1 == SWAPBLK_NONE)
+ return (SWAPBLK_NONE);
+ if ((flags & (SWM_FREE | SWM_POP)) != 0) {
+ sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ break;
+ }
+ if (i == SWAP_META_PAGES) {
+ SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ uma_zfree(swblk_zone, sb);
+ }
+ }
+ if ((flags & SWM_FREE) != 0) {
+ swp_pager_freeswapspace(r1, 1);
+ r1 = SWAPBLK_NONE;
+ }
+ return (r1);
+}
- if ((swap = *pswap) != NULL) {
- idx = pindex & SWAP_META_MASK;
- r1 = swap->swb_pages[idx];
+/*
+ * Returns the least page index which is greater than or equal to the
+ * parameter pindex and for which there is a swap block allocated.
+ * Returns object's size if the object's type is not swap or if there
+ * are no allocated swap blocks for the object after the requested
+ * pindex.
+ */
+vm_pindex_t
+swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
+{
+ struct swblk *sb;
+ int i;
- if (r1 != SWAPBLK_NONE) {
- if (flags & SWM_FREE) {
- swp_pager_freeswapspace(r1, 1);
- r1 = SWAPBLK_NONE;
- }
- if (flags & (SWM_FREE|SWM_POP)) {
- swap->swb_pages[idx] = SWAPBLK_NONE;
- if (--swap->swb_count == 0) {
- *pswap = swap->swb_hnext;
- uma_zfree(swap_zone, swap);
- --object->un_pager.swp.swp_bcount;
- }
- }
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if (object->type != OBJT_SWAP)
+ return (object->size);
+
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ rounddown(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (object->size);
+ if (sb->p < pindex) {
+ for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ return (sb->p + i);
}
+ sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+ roundup(pindex, SWAP_META_PAGES));
+ if (sb == NULL)
+ return (object->size);
}
- mtx_unlock(&swhash_mtx);
- return (r1);
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->d[i] != SWAPBLK_NONE)
+ return (sb->p + i);
+ }
+
+ /*
+ * We get here if a swblk is present in the trie but it
+ * doesn't map any blocks.
+ */
+ MPASS(0);
+ return (object->size);
}
/*
@@ -2110,16 +2087,13 @@
if (error)
return (error);
- mtx_lock(&Giant);
- while (swdev_syscall_active)
- tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
- swdev_syscall_active = 1;
+ sx_xlock(&swdev_syscall_lock);
/*
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
- if (swap_zone == NULL) {
+ if (swblk_zone == NULL) {
error = ENOMEM;
goto done;
}
@@ -2134,7 +2108,7 @@
vp = nd.ni_vp;
if (vn_isdisk(vp, &error)) {
- error = swapongeom(td, vp);
+ error = swapongeom(vp);
} else if (vp->v_type == VREG &&
(vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
(error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
@@ -2148,9 +2122,7 @@
if (error)
vrele(vp);
done:
- swdev_syscall_active = 0;
- wakeup_one(&swdev_syscall_active);
- mtx_unlock(&Giant);
+ sx_xunlock(&swdev_syscall_lock);
return (error);
}
@@ -2157,15 +2129,16 @@
/*
* Check that the total amount of swap currently configured does not
* exceed half the theoretical maximum. If it does, print a warning
- * message and return -1; otherwise, return 0.
+ * message.
*/
-static int
-swapon_check_swzone(unsigned long npages)
+static void
+swapon_check_swzone(void)
{
- unsigned long maxpages;
+ unsigned long maxpages, npages;
+ npages = swap_total / PAGE_SIZE;
/* absolute maximum we can handle assuming 100% efficiency */
- maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES;
+ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
/* recommend using no more than half that amount */
if (npages > maxpages / 2) {
@@ -2174,9 +2147,7 @@
npages, maxpages / 2);
printf("warning: increase kern.maxswzone "
"or reduce amount of swap.\n");
- return (-1);
}
- return (0);
}
static void
@@ -2212,7 +2183,6 @@
sp->sw_vp = vp;
sp->sw_id = id;
sp->sw_dev = dev;
- sp->sw_flags = 0;
sp->sw_nblks = nblks;
sp->sw_used = 0;
sp->sw_strategy = strategy;
@@ -2244,7 +2214,7 @@
nswapdev++;
swap_pager_avail += nblks - 2;
swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
- swapon_check_swzone(swap_total / PAGE_SIZE);
+ swapon_check_swzone();
swp_sizecheck();
mtx_unlock(&sw_dev_mtx);
}
@@ -2280,10 +2250,7 @@
if (error)
return (error);
- mtx_lock(&Giant);
- while (swdev_syscall_active)
- tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
- swdev_syscall_active = 1;
+ sx_xlock(&swdev_syscall_lock);
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
td);
@@ -2305,9 +2272,7 @@
}
error = swapoff_one(sp, td->td_ucred);
done:
- swdev_syscall_active = 0;
- wakeup_one(&swdev_syscall_active);
- mtx_unlock(&Giant);
+ sx_xunlock(&swdev_syscall_lock);
return (error);
}
@@ -2319,7 +2284,7 @@
int error;
#endif
- mtx_assert(&Giant, MA_OWNED);
+ sx_assert(&swdev_syscall_lock, SA_XLOCKED);
#ifdef MAC
(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_system_check_swapoff(cred, sp->sw_vp);
@@ -2335,10 +2300,8 @@
* of data we will have to page back in, plus an epsilon so
* the system doesn't become critically low on swap space.
*/
- if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
- nblks + nswap_lowat) {
+ if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat)
return (ENOMEM);
- }
/*
* Prevent further allocations on this device.
@@ -2378,10 +2341,7 @@
const char *devname;
int error;
- mtx_lock(&Giant);
- while (swdev_syscall_active)
- tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
- swdev_syscall_active = 1;
+ sx_xlock(&swdev_syscall_lock);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
@@ -2401,9 +2361,7 @@
}
mtx_unlock(&sw_dev_mtx);
- swdev_syscall_active = 0;
- wakeup_one(&swdev_syscall_active);
- mtx_unlock(&Giant);
+ sx_xunlock(&swdev_syscall_lock);
}
void
@@ -2472,19 +2430,14 @@
SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
"Number of swap devices");
-SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
+SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_vm_swap_info,
"Swap statistics by device");
/*
- * vmspace_swap_count() - count the approximate swap usage in pages for a
- * vmspace.
- *
- * The map must be locked.
- *
- * Swap usage is determined by taking the proportional swap used by
- * VM objects backing the VM map. To make up for fractional losses,
- * if the VM object has any swap use at all the associated map entries
- * count for at least 1 swap page.
+ * Count the approximate swap usage in pages for a vmspace. The
+ * shadowed or not yet copied on write swap blocks are not accounted.
+ * The map must be locked.
*/
long
vmspace_swap_count(struct vmspace *vmspace)
@@ -2492,23 +2445,38 @@
vm_map_t map;
vm_map_entry_t cur;
vm_object_t object;
- long count, n;
+ struct swblk *sb;
+ vm_pindex_t e, pi;
+ long count;
+ int i;
map = &vmspace->vm_map;
count = 0;
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
- if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
- (object = cur->object.vm_object) != NULL) {
- VM_OBJECT_WLOCK(object);
- if (object->type == OBJT_SWAP &&
- object->un_pager.swp.swp_bcount != 0) {
- n = (cur->end - cur->start) / PAGE_SIZE;
- count += object->un_pager.swp.swp_bcount *
- SWAP_META_PAGES * n / object->size + 1;
+ if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+ continue;
+ object = cur->object.vm_object;
+ if (object == NULL || object->type != OBJT_SWAP)
+ continue;
+ VM_OBJECT_RLOCK(object);
+ if (object->type != OBJT_SWAP)
+ goto unlock;
+ pi = OFF_TO_IDX(cur->offset);
+ e = pi + OFF_TO_IDX(cur->end - cur->start);
+ for (;; pi = sb->p + SWAP_META_PAGES) {
+ sb = SWAP_PCTRIE_LOOKUP_GE(
+ &object->un_pager.swp.swp_blks, pi);
+ if (sb == NULL || sb->p >= e)
+ break;
+ for (i = 0; i < SWAP_META_PAGES; i++) {
+ if (sb->p + i < e &&
+ sb->d[i] != SWAPBLK_NONE)
+ count++;
}
- VM_OBJECT_WUNLOCK(object);
}
+unlock:
+ VM_OBJECT_RUNLOCK(object);
}
return (count);
}
@@ -2554,8 +2522,9 @@
}
/*
- * Remove a reference from the g_consumer. Post a close event if
- * all referneces go away.
+ * Remove a reference from the g_consumer. Post a close event if all
+ * references go away, since the function might be called from the
+ * biodone context.
*/
static void
swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
@@ -2628,7 +2597,7 @@
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
bio->bio_length = bp->b_bcount;
bio->bio_done = swapgeom_done;
- if ((bp->b_flags & B_UNMAPPED) != 0) {
+ if (!buf_mapped(bp)) {
bio->bio_ma = bp->b_pages;
bio->bio_data = unmapped_buf;
bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
@@ -2678,22 +2647,19 @@
cp = sw->sw_id;
sw->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
- /* XXX: direct call when Giant untangled */
+
+ /*
+ * swapgeom_close() may be called from the biodone context,
+ * where we cannot perform topology changes. Delegate the
+ * work to the events thread.
+ */
if (cp != NULL)
g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
}
-
-struct swh0h0 {
- struct cdev *dev;
- struct vnode *vp;
- int error;
-};
-
-static void
-swapongeom_ev(void *arg, int flags)
+static int
+swapongeom_locked(struct cdev *dev, struct vnode *vp)
{
- struct swh0h0 *swh;
struct g_provider *pp;
struct g_consumer *cp;
static struct g_geom *gp;
@@ -2701,20 +2667,15 @@
u_long nblks;
int error;
- swh = arg;
- swh->error = 0;
- pp = g_dev_getprovider(swh->dev);
- if (pp == NULL) {
- swh->error = ENODEV;
- return;
- }
+ pp = g_dev_getprovider(dev);
+ if (pp == NULL)
+ return (ENODEV);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
cp = sp->sw_id;
if (cp != NULL && cp->provider == pp) {
mtx_unlock(&sw_dev_mtx);
- swh->error = EBUSY;
- return;
+ return (EBUSY);
}
}
mtx_unlock(&sw_dev_mtx);
@@ -2721,44 +2682,41 @@
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap");
cp = g_new_consumer(gp);
- cp->index = 1; /* Number of active I/Os, plus one for being active. */
+ cp->index = 1; /* Number of active I/Os, plus one for being active. */
cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
/*
- * XXX: Everytime you think you can improve the margin for
+ * XXX: Every time you think you can improve the margin for
* footshooting, somebody depends on the ability to do so:
* savecore(8) wants to write to our swapdev so we cannot
* set an exclusive count :-(
*/
error = g_access(cp, 1, 1, 0);
- if (error) {
+ if (error != 0) {
g_detach(cp);
g_destroy_consumer(cp);
- swh->error = error;
- return;
+ return (error);
}
nblks = pp->mediasize / DEV_BSIZE;
- swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
- swapgeom_close, dev2udev(swh->dev),
+ swaponsomething(vp, cp, nblks, swapgeom_strategy,
+ swapgeom_close, dev2udev(dev),
(pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
- swh->error = 0;
+ return (0);
}
static int
-swapongeom(struct thread *td, struct vnode *vp)
+swapongeom(struct vnode *vp)
{
int error;
- struct swh0h0 swh;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-
- swh.dev = vp->v_rdev;
- swh.vp = vp;
- swh.error = 0;
- /* XXX: direct call when Giant untangled */
- error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
- if (!error)
- error = swh.error;
+ if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) {
+ error = ENOENT;
+ } else {
+ g_topology_lock();
+ error = swapongeom_locked(vp->v_rdev, vp);
+ g_topology_unlock();
+ }
VOP_UNLOCK(vp, 0);
return (error);
}
@@ -2833,3 +2791,40 @@
NODEV, 0);
return (0);
}
+
+static int
+sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
+{
+ int error, new, n;
+
+ new = nsw_wcount_async_max;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (new > nswbuf / 2 || new < 1)
+ return (EINVAL);
+
+ mtx_lock(&pbuf_mtx);
+ while (nsw_wcount_async_max != new) {
+ /*
+ * Adjust difference. If the current async count is too low,
+ * we will need to squeeze our update slowly in. Sleep with a
+ * higher priority than getpbuf() to finish faster.
+ */
+ n = new - nsw_wcount_async_max;
+ if (nsw_wcount_async + n >= 0) {
+ nsw_wcount_async += n;
+ nsw_wcount_async_max += n;
+ wakeup(&nsw_wcount_async);
+ } else {
+ nsw_wcount_async_max -= nsw_wcount_async;
+ nsw_wcount_async = 0;
+ msleep(&nsw_wcount_async, &pbuf_mtx, PSWP,
+ "swpsysctl", 0);
+ }
+ }
+ mtx_unlock(&pbuf_mtx);
+
+ return (0);
+}
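
The swap_pager.c rework above replaces the global swhash table with a per-object pctrie of struct swblk, where each swblk covers SWAP_META_PAGES consecutive page indices and is keyed by the index rounded down to the start of that run. A minimal standalone sketch of the lookup arithmetic follows; the types, the SWAP_META_PAGES value, and the trie_lookup callback are simplified stand-ins rather than the kernel interfaces.

/*
 * Illustrative sketch of the swblk addressing used above: the pctrie key
 * is the page index rounded down to a SWAP_META_PAGES run, and the slot
 * within the run is pindex % SWAP_META_PAGES.  Simplified stand-in types;
 * the real code uses SWAP_PCTRIE_LOOKUP() on swp_blks.
 */
#include <stddef.h>
#include <stdint.h>

typedef uint64_t pindex_sketch_t;	/* stands in for vm_pindex_t */
typedef int64_t blk_sketch_t;		/* stands in for daddr_t */

#define SKETCH_META_PAGES	32	/* assumed SWAP_META_PAGES value */
#define SKETCH_BLK_NONE		((blk_sketch_t)-1)

struct swblk_sketch {
	pindex_sketch_t p;			/* first index in the run */
	blk_sketch_t d[SKETCH_META_PAGES];	/* one swap block per index */
};

/* Hypothetical lookup mirroring swp_pager_meta_ctl()'s arithmetic. */
static blk_sketch_t
swblk_lookup_sketch(struct swblk_sketch *(*trie_lookup)(pindex_sketch_t),
    pindex_sketch_t pindex)
{
	struct swblk_sketch *sb;

	sb = trie_lookup(pindex - pindex % SKETCH_META_PAGES);
	if (sb == NULL)
		return (SKETCH_BLK_NONE);
	return (sb->d[pindex % SKETCH_META_PAGES]);
}

Keying whole runs instead of single pages keeps the trie small, and it lets swp_pager_meta_free() and swp_pager_meta_ctl() in the hunks above release a swblk outright once every slot in it has returned to SWAPBLK_NONE.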
Modified: trunk/sys/vm/swap_pager.h
===================================================================
--- trunk/sys/vm/swap_pager.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90
- * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $
+ * $FreeBSD: stable/11/sys/vm/swap_pager.h 331722 2018-03-29 02:50:57Z eadler $
*/
#ifndef _VM_SWAP_PAGER_H_
@@ -74,15 +74,14 @@
#ifdef _KERNEL
-extern int swap_pager_full;
extern int swap_pager_avail;
struct xswdev;
int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
+vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex);
void swap_pager_freespace(vm_object_t, vm_pindex_t, vm_size_t);
void swap_pager_swap_init(void);
-int swap_pager_isswapped(vm_object_t, struct swdevt *);
int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
void swap_pager_status(int *total, int *used);
void swapoff_all(void);
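
The header now exports swap_pager_find_least(), which, per its comment in swap_pager.c above, returns the least swap-backed page index at or after the given one, or the object size when none remains. Below is a hedged sketch of how a caller could walk every swap-backed index of an object; the walker and its visit callback are hypothetical, and object locking is elided.

/*
 * Hypothetical walker over an object's swap-backed page indices using the
 * newly exported swap_pager_find_least().  The visit callback is made up
 * and object locking is elided.
 */
#include <sys/param.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>

static void
visit_swapped_indices(vm_object_t object,
    void (*visit)(vm_object_t, vm_pindex_t))
{
	vm_pindex_t pi;

	for (pi = swap_pager_find_least(object, 0); pi < object->size;
	    pi = swap_pager_find_least(object, pi + 1))
		visit(object, pi);
}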
Modified: trunk/sys/vm/uma.h
===================================================================
--- trunk/sys/vm/uma.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $
+ * $FreeBSD: stable/11/sys/vm/uma.h 338389 2018-08-29 17:58:01Z markj $
*
*/
@@ -263,8 +263,8 @@
* information in the vm_page.
*/
#define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */
-#define UMA_ZONE_REFCNT 0x0400 /* Allocate refcnts in slabs */
-#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets */
+#define UMA_ZONE_NOBUCKET 0x0400 /* Do not use buckets. */
+#define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets. */
#define UMA_ZONE_CACHESPREAD 0x1000 /*
* Spread memory start locations across
* all possible cache lines. May
@@ -277,7 +277,7 @@
* mini-dumps.
*/
#define UMA_ZONE_PCPU 0x8000 /*
- * Allocates mp_ncpus slabs sized to
+ * Allocates mp_maxid + 1 slabs sized to
* sizeof(struct pcpu).
*/
@@ -288,7 +288,7 @@
*/
#define UMA_ZONE_INHERIT \
(UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \
- UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
+ UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -367,6 +367,11 @@
}
/*
+ * Wait until the specified zone can allocate an item.
+ */
+void uma_zwait(uma_zone_t zone);
+
+/*
* XXX The rest of the prototypes in this header are h0h0 magic for the VM.
* If you think you need to use it for a normal zone you're probably incorrect.
*/
@@ -523,6 +528,19 @@
void uma_zone_set_warning(uma_zone_t zone, const char *warning);
/*
+ * Sets a function to run when limit is reached
+ *
+ * Arguments:
+ * zone The zone to which this applies
+ * fx The function to run
+ *
+ * Returns:
+ * Nothing
+ */
+typedef void (*uma_maxaction_t)(uma_zone_t, int);
+void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t);
+
+/*
* Obtains the approximate current number of items allocated from a zone
*
* Arguments:
@@ -612,21 +630,6 @@
void uma_prealloc(uma_zone_t zone, int itemcnt);
/*
- * Used to lookup the reference counter allocated for an item
- * from a UMA_ZONE_REFCNT zone. For UMA_ZONE_REFCNT zones,
- * reference counters are allocated for items and stored in
- * the underlying slab header.
- *
- * Arguments:
- * zone The UMA_ZONE_REFCNT zone to which the item belongs.
- * item The address of the item for which we want a refcnt.
- *
- * Returns:
- * A pointer to a uint32_t reference counter.
- */
-uint32_t *uma_find_refcnt(uma_zone_t zone, void *item);
-
-/*
* Used to determine if a fixed-size zone is exhausted.
*
* Arguments:
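
uma.h gains uma_zwait() and the uma_zone_set_maxaction() hook, which the uma_core.c change below runs from taskqueue_thread when a capped zone fills up. The following sketch shows one way a consumer might use them; the zone, its item size, the limit, the lock, and the callback are all invented for illustration.

/*
 * Illustrative consumer of the new interfaces.  The zone, its item size,
 * the limit, the lock, and the callback are all hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <vm/uma.h>

static uma_zone_t example_zone;

/* Runs from taskqueue_thread once the capped zone hits its limit. */
static void
example_zone_full(uma_zone_t zone __unused, int pending __unused)
{
	printf("example zone reached its limit\n");
}

static void
example_zone_init(void)
{
	example_zone = uma_zcreate("example", 128, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	uma_zone_set_max(example_zone, 1024);
	uma_zone_set_maxaction(example_zone, example_zone_full);
}

/*
 * Retry pattern similar to swp_pager_meta_build() above: allocate with
 * M_NOWAIT under the caller's lock and only block after dropping it.
 */
static void *
example_alloc_retry(struct mtx *lock)
{
	void *item;

	while ((item = uma_zalloc(example_zone, M_NOWAIT)) == NULL) {
		mtx_unlock(lock);
		uma_zwait(example_zone);
		mtx_lock(lock);
	}
	return (item);
}

The retry loop mirrors the pattern swp_pager_meta_build() adopts above: try the allocation with M_NOWAIT while holding the lock, and only block in uma_zwait() after dropping it.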
Modified: trunk/sys/vm/uma_core.c
===================================================================
--- trunk/sys/vm/uma_core.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_core.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,7 +32,7 @@
*
* This allocator is intended to replace the multitude of similar object caches
* in the standard FreeBSD kernel. The intent is to be flexible as well as
- * effecient. A primary design goal is to return unused memory to the rest of
+ * efficient. A primary design goal is to return unused memory to the rest of
* the system. This will make the system as a whole more flexible due to the
* ability to move memory to subsystems which most need it instead of leaving
* pools of reserved memory unused.
@@ -49,7 +49,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_core.c 357046 2020-01-23 14:14:38Z markj $");
/* I should really use ktr.. */
/*
@@ -75,10 +75,12 @@
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
+#include <sys/taskqueue.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
@@ -112,7 +114,6 @@
/* This is the zone from which all of uma_slab_t's are allocated. */
static uma_zone_t slabzone;
-static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */
/*
* The initial hash tables come out of this zone so they can be allocated
@@ -138,7 +139,7 @@
LIST_HEAD_INITIALIZER(uma_cachezones);
/* This RW lock protects the keg list */
-static struct rwlock_padalign uma_rwlock;
+static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
/* Linked list of boot time pages */
static LIST_HEAD(,uma_slab) uma_boot_pages =
@@ -153,14 +154,9 @@
static int booted = 0;
#define UMA_STARTUP 1
#define UMA_STARTUP2 2
+#define UMA_SHUTDOWN 3
/*
- * Only mbuf clusters use ref zones. Just provide enough references
- * to support the one user. New code should not use the ref facility.
- */
-static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
-
-/*
* This is the handle used to schedule events that need to happen
* outside of the allocation fast path.
*/
@@ -248,11 +244,12 @@
static void keg_large_init(uma_keg_t keg);
static void zone_foreach(void (*zfunc)(uma_zone_t));
static void zone_timeout(uma_zone_t zone);
-static int hash_alloc(struct uma_hash *);
+static int hash_alloc(struct uma_hash *, u_int);
static int hash_expand(struct uma_hash *, struct uma_hash *);
static void hash_free(struct uma_hash *hash);
static void uma_timeout(void *);
static void uma_startup3(void);
+static void uma_shutdown(void);
static void *zone_alloc_item(uma_zone_t, void *, int);
static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
static void bucket_enable(void);
@@ -276,6 +273,11 @@
static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
+#ifdef INVARIANTS
+static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
+static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
+#endif
+
SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
@@ -285,8 +287,7 @@
0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
static int zone_warnings = 1;
-TUNABLE_INT("vm.zone_warnings", &zone_warnings);
-SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
+SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
"Warn when UMA zones becomes full");
/*
@@ -433,6 +434,14 @@
printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
}
+static inline void
+zone_maxaction(uma_zone_t zone)
+{
+
+ if (zone->uz_maxaction.ta_func != NULL)
+ taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
+}
+
static void
zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
{
@@ -471,6 +480,7 @@
static void
keg_timeout(uma_keg_t keg)
{
+ u_int slabs;
KEG_LOCK(keg);
/*
@@ -481,7 +491,8 @@
* may be a little aggressive. Should I allow for two collisions max?
*/
if (keg->uk_flags & UMA_ZONE_HASH &&
- keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
+ (slabs = keg->uk_pages / keg->uk_ppera) >
+ keg->uk_hash.uh_hashsize) {
struct uma_hash newhash;
struct uma_hash oldhash;
int ret;
@@ -492,9 +503,8 @@
* I have to do everything in stages and check for
* races.
*/
- newhash = keg->uk_hash;
KEG_UNLOCK(keg);
- ret = hash_alloc(&newhash);
+ ret = hash_alloc(&newhash, 1 << fls(slabs));
KEG_LOCK(keg);
if (ret) {
if (hash_expand(&keg->uk_hash, &newhash)) {
@@ -526,19 +536,16 @@
* hash A new hash structure with the old hash size in uh_hashsize
*
* Returns:
- * 1 on sucess and 0 on failure.
+ * 1 on success and 0 on failure.
*/
static int
-hash_alloc(struct uma_hash *hash)
+hash_alloc(struct uma_hash *hash, u_int size)
{
- int oldsize;
- int alloc;
+ size_t alloc;
- oldsize = hash->uh_hashsize;
-
- /* We're just going to go to a power of two greater */
- if (oldsize) {
- hash->uh_hashsize = oldsize * 2;
+ KASSERT(powerof2(size), ("hash size must be power of 2"));
+ if (size > UMA_HASH_SIZE_INIT) {
+ hash->uh_hashsize = size;
alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
M_UMAHASH, M_NOWAIT);
@@ -575,8 +582,8 @@
hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
{
uma_slab_t slab;
- int hval;
- int i;
+ u_int hval;
+ u_int idx;
if (!newhash->uh_slab_hash)
return (0);
@@ -589,10 +596,10 @@
* full rehash.
*/
- for (i = 0; i < oldhash->uh_hashsize; i++)
- while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
- slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
- SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
+ for (idx = 0; idx < oldhash->uh_hashsize; idx++)
+ while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
+ slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
+ SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
hval = UMA_HASH(newhash, slab->us_data);
SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
slab, us_hlink);
@@ -840,8 +847,7 @@
keg_drain(uma_keg_t keg)
{
struct slabhead freeslabs = { 0 };
- uma_slab_t slab;
- uma_slab_t n;
+ uma_slab_t slab, tmp;
/*
* We don't want to take pages from statically allocated kegs at this
@@ -857,15 +863,10 @@
if (keg->uk_free == 0)
goto finished;
- slab = LIST_FIRST(&keg->uk_free_slab);
- while (slab) {
- n = LIST_NEXT(slab, us_link);
-
- /* We have no where to free these to */
- if (slab->us_flags & UMA_SLAB_BOOT) {
- slab = n;
+ LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
+ /* We have nowhere to free these to. */
+ if (slab->us_flags & UMA_SLAB_BOOT)
continue;
- }
LIST_REMOVE(slab, us_link);
keg->uk_pages -= keg->uk_ppera;
@@ -875,8 +876,6 @@
UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
-
- slab = n;
}
finished:
KEG_UNLOCK(keg);
@@ -939,7 +938,6 @@
static uma_slab_t
keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
{
- uma_slabrefcnt_t slabref;
uma_alloc allocf;
uma_slab_t slab;
uint8_t *mem;
@@ -1002,11 +1000,6 @@
#ifdef INVARIANTS
BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
#endif
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- slabref = (uma_slabrefcnt_t)slab;
- for (i = 0; i < keg->uk_ipers; i++)
- slabref->us_refcnt[i] = 0;
- }
if (keg->uk_init != NULL) {
for (i = 0; i < keg->uk_ipers; i++)
@@ -1135,7 +1128,9 @@
npages = howmany(bytes, PAGE_SIZE);
while (npages > 0) {
p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
- VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+ ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+ VM_ALLOC_NOWAIT));
if (p != NULL) {
/*
* Since the page does not belong to an object, its
@@ -1145,17 +1140,12 @@
npages--;
continue;
}
- if (wait & M_WAITOK) {
- VM_WAIT;
- continue;
- }
-
/*
* Page allocation failed, free intermediate pages and
* exit.
*/
TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
- vm_page_unwire(p, 0);
+ vm_page_unwire(p, PQ_NONE);
vm_page_free(p);
}
return (NULL);
@@ -1229,7 +1219,7 @@
u_int slabsize;
if (keg->uk_flags & UMA_ZONE_PCPU) {
- u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+ u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
slabsize = sizeof(struct pcpu);
keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
@@ -1255,15 +1245,20 @@
keg->uk_rsize < sizeof(struct pcpu),
("%s: size %u too large", __func__, keg->uk_rsize));
- if (keg->uk_flags & UMA_ZONE_REFCNT)
- rsize += sizeof(uint32_t);
-
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
shsize = 0;
else
shsize = sizeof(struct uma_slab);
- keg->uk_ipers = (slabsize - shsize) / rsize;
+ if (rsize <= slabsize - shsize)
+ keg->uk_ipers = (slabsize - shsize) / rsize;
+ else {
+ /* Handle special case when we have 1 item per slab, so
+ * alignment requirement can be relaxed. */
+ KASSERT(keg->uk_size <= slabsize - shsize,
+ ("%s: size %u greater than slab", __func__, keg->uk_size));
+ keg->uk_ipers = 1;
+ }
KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
@@ -1337,21 +1332,24 @@
keg->uk_ipers = 1;
keg->uk_rsize = keg->uk_size;
- /* We can't do OFFPAGE if we're internal, bail out here. */
- if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
- return;
-
/* Check whether we have enough space to not do OFFPAGE. */
if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
shsize = sizeof(struct uma_slab);
- if (keg->uk_flags & UMA_ZONE_REFCNT)
- shsize += keg->uk_ipers * sizeof(uint32_t);
if (shsize & UMA_ALIGN_PTR)
shsize = (shsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
- keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
+ /*
+ * We can't do OFFPAGE if we're internal, in which case
+ * we need an extra page per allocation to contain the
+ * slab header.
+ */
+ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ else
+ keg->uk_ppera++;
+ }
}
if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
@@ -1433,7 +1431,7 @@
if (arg->flags & UMA_ZONE_ZINIT)
keg->uk_init = zero_init;
- if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
+ if (arg->flags & UMA_ZONE_MALLOC)
keg->uk_flags |= UMA_ZONE_VTOSLAB;
if (arg->flags & UMA_ZONE_PCPU)
@@ -1445,13 +1443,6 @@
if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
keg_cachespread_init(keg);
- } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
- if (keg->uk_size >
- (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
- sizeof(uint32_t)))
- keg_large_init(keg);
- else
- keg_small_init(keg);
} else {
if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
keg_large_init(keg);
@@ -1459,15 +1450,8 @@
keg_small_init(keg);
}
- if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- if (keg->uk_ipers > uma_max_ipers_ref)
- panic("Too many ref items per zone: %d > %d\n",
- keg->uk_ipers, uma_max_ipers_ref);
- keg->uk_slabzone = slabrefzone;
- } else
- keg->uk_slabzone = slabzone;
- }
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ keg->uk_slabzone = slabzone;
/*
* If we haven't booted yet we need allocations to go through the
@@ -1504,10 +1488,6 @@
/* Size of the slab struct and free list */
totsize = sizeof(struct uma_slab);
- /* Size of the reference counts. */
- if (keg->uk_flags & UMA_ZONE_REFCNT)
- totsize += keg->uk_ipers * sizeof(uint32_t);
-
if (totsize & UMA_ALIGN_PTR)
totsize = (totsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
@@ -1521,8 +1501,6 @@
* sure here anyway.
*/
totsize = keg->uk_pgoff + sizeof(struct uma_slab);
- if (keg->uk_flags & UMA_ZONE_REFCNT)
- totsize += keg->uk_ipers * sizeof(uint32_t);
if (totsize > PAGE_SIZE * keg->uk_ppera) {
printf("zone %s ipers %d rsize %d size %d\n",
zone->uz_name, keg->uk_ipers, keg->uk_rsize,
@@ -1532,7 +1510,7 @@
}
if (keg->uk_flags & UMA_ZONE_HASH)
- hash_alloc(&keg->uk_hash);
+ hash_alloc(&keg->uk_hash, 0);
#ifdef UMA_DEBUG
printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
@@ -1667,10 +1645,15 @@
}
out:
- if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
+ KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
+ (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
+ ("Invalid zone flag combination"));
+ if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
+ zone->uz_count = BUCKET_MAX;
+ else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
+ zone->uz_count = 0;
+ else
zone->uz_count = bucket_select(zone->uz_size);
- else
- zone->uz_count = BUCKET_MAX;
zone->uz_count_min = zone->uz_count;
return (0);
@@ -1785,7 +1768,6 @@
{
struct uma_zctor_args args;
uma_slab_t slab;
- u_int slabsize;
int i;
#ifdef UMA_DEBUG
@@ -1835,9 +1817,6 @@
zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
#ifdef UMA_DEBUG
- printf("Initializing pcpu cache locks.\n");
-#endif
-#ifdef UMA_DEBUG
printf("Creating slab and hash zones.\n");
#endif
@@ -1847,18 +1826,6 @@
NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
- /*
- * We also create a zone for the bigger slabs with reference
- * counts in them, to accomodate UMA_ZONE_REFCNT zones.
- */
- slabsize = sizeof(struct uma_slab_refcnt);
- slabsize += uma_max_ipers_ref * sizeof(uint32_t);
- slabrefzone = uma_zcreate("UMA RCntSlabs",
- slabsize,
- NULL, NULL, NULL, NULL,
- UMA_ALIGN_PTR,
- UMA_ZFLAG_INTERNAL);
-
hashzone = uma_zcreate("UMA Hash",
sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
NULL, NULL, NULL, NULL,
@@ -1885,10 +1852,6 @@
#endif
}
-/*
- * Initialize our callout handle
- *
- */
static void
uma_startup3(void)
@@ -1901,8 +1864,18 @@
#ifdef UMA_DEBUG
printf("UMA startup3 complete.\n");
#endif
+
+ EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
+ EVENTHANDLER_PRI_FIRST);
}
+static void
+uma_shutdown(void)
+{
+
+ booted = UMA_SHUTDOWN;
+}
+
static uma_keg_t
uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
int align, uint32_t flags)
@@ -1948,6 +1921,20 @@
args.dtor = dtor;
args.uminit = uminit;
args.fini = fini;
+#ifdef INVARIANTS
+ /*
+ * If a zone is being created with an empty constructor and
+ * destructor, pass UMA constructor/destructor which checks for
+ * memory use after free.
+ */
+ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
+ ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
+ args.ctor = trash_ctor;
+ args.dtor = trash_dtor;
+ args.uminit = trash_init;
+ args.fini = trash_fini;
+ }
+#endif
args.align = align;
args.flags = flags;
args.keg = NULL;
@@ -2070,15 +2057,8 @@
error = EINVAL;
goto out;
}
+
/*
- * Both must either be refcnt, or not be refcnt.
- */
- if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
- (master->uz_flags & UMA_ZONE_REFCNT)) {
- error = EINVAL;
- goto out;
- }
- /*
* The underlying object must be the same size. rsize
* may be different.
*/
@@ -2114,11 +2094,28 @@
uma_zdestroy(uma_zone_t zone)
{
+ /*
+ * Large slabs are expensive to reclaim, so don't bother doing
+ * unnecessary work if we're shutting down.
+ */
+ if (booted == UMA_SHUTDOWN &&
+ zone->uz_fini == NULL &&
+ zone->uz_release == (uma_release)zone_release)
+ return;
sx_slock(&uma_drain_lock);
zone_free_item(zones, zone, NULL, SKIP_NONE);
sx_sunlock(&uma_drain_lock);
}
+void
+uma_zwait(uma_zone_t zone)
+{
+ void *item;
+
+ item = uma_zalloc_arg(zone, NULL, M_WAITOK);
+ uma_zfree(zone, item);
+}
+
/* See uma.h */
void *
uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
@@ -2129,6 +2126,9 @@
int lockfail;
int cpu;
+ /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -2140,20 +2140,17 @@
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"uma_zalloc_arg: zone \"%s\"", zone->uz_name);
}
+ KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+ ("uma_zalloc_arg: called with spinlock or critical section held"));
+
#ifdef DEBUG_MEMGUARD
if (memguard_cmp_zone(zone)) {
item = memguard_alloc(zone->uz_size, flags);
if (item != NULL) {
- /*
- * Avoid conflict with the use-after-free
- * protecting infrastructure from INVARIANTS.
- */
if (zone->uz_init != NULL &&
- zone->uz_init != mtrash_init &&
zone->uz_init(item, zone->uz_size, flags) != 0)
return (NULL);
if (zone->uz_ctor != NULL &&
- zone->uz_ctor != mtrash_ctor &&
zone->uz_ctor(item, zone->uz_size, udata,
flags) != 0) {
zone->uz_fini(item, zone->uz_size);
@@ -2289,7 +2286,7 @@
/*
* Now lets just fill a bucket and put it on the free list. If that
- * works we'll restart the allocation from the begining and it
+ * works we'll restart the allocation from the beginning and it
* will use the just filled bucket.
*/
bucket = zone_alloc_bucket(zone, udata, flags);
@@ -2370,6 +2367,7 @@
if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
zone->uz_flags |= UMA_ZFLAG_FULL;
zone_log_warning(zone);
+ zone_maxaction(zone);
}
if (flags & M_NOWAIT)
break;
@@ -2489,6 +2487,7 @@
zone->uz_flags |= UMA_ZFLAG_FULL;
zone->uz_sleeps++;
zone_log_warning(zone);
+ zone_maxaction(zone);
msleep(zone, zone->uz_lockptr, PVM,
"zonelimit", hz/100);
zone->uz_flags &= ~UMA_ZFLAG_FULL;
@@ -2668,6 +2667,9 @@
int lockfail;
int cpu;
+ /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+ random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
#ifdef UMA_DEBUG_ALLOC_1
printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
@@ -2674,14 +2676,17 @@
CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
zone->uz_name);
+ KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+ ("uma_zfree_arg: called with spinlock or critical section held"));
+
/* uma_zfree(..., NULL) does nothing, to match free(9). */
if (item == NULL)
return;
#ifdef DEBUG_MEMGUARD
if (is_memguard_addr(item)) {
- if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
+ if (zone->uz_dtor != NULL)
zone->uz_dtor(item, zone->uz_size, udata);
- if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
+ if (zone->uz_fini != NULL)
zone->uz_fini(item, zone->uz_size);
memguard_free(item);
return;
@@ -2988,6 +2993,16 @@
}
/* See uma.h */
+void
+uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
+{
+
+ ZONE_LOCK(zone);
+ TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
int
uma_zone_get_cur(uma_zone_t zone)
{
@@ -3176,26 +3191,6 @@
}
/* See uma.h */
-uint32_t *
-uma_find_refcnt(uma_zone_t zone, void *item)
-{
- uma_slabrefcnt_t slabref;
- uma_slab_t slab;
- uma_keg_t keg;
- uint32_t *refcnt;
- int idx;
-
- slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
- slabref = (uma_slabrefcnt_t)slab;
- keg = slab->us_keg;
- KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
- ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
- idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
- refcnt = &slabref->us_refcnt[idx];
- return refcnt;
-}
-
-/* See uma.h */
static void
uma_reclaim_locked(bool kmem_danger)
{
@@ -3216,7 +3211,6 @@
* zones are drained. We have to do the same for buckets.
*/
zone_drain(slabzone);
- zone_drain(slabrefzone);
bucket_zone_drain();
}
@@ -3309,9 +3303,10 @@
static void
uma_zero_item(void *item, uma_zone_t zone)
{
+ int i;
if (zone->uz_flags & UMA_ZONE_PCPU) {
- for (int i = 0; i < mp_ncpus; i++)
+ CPU_FOREACH(i)
bzero(zpcpu_get_cpu(item, i), zone->uz_size);
} else
bzero(item, zone->uz_size);
@@ -3447,7 +3442,7 @@
{
struct uma_stream_header ush;
struct uma_type_header uth;
- struct uma_percpu_stat ups;
+ struct uma_percpu_stat *ups;
uma_bucket_t bucket;
struct sbuf sbuf;
uma_cache_t cache;
@@ -3461,6 +3456,8 @@
if (error != 0)
return (error);
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
+ ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
count = 0;
rw_rlock(&uma_rwlock);
@@ -3509,7 +3506,6 @@
uth.uth_frees = z->uz_frees;
uth.uth_fails = z->uz_fails;
uth.uth_sleeps = z->uz_sleeps;
- (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
/*
* While it is not normally safe to access the cache
* bucket pointers while not on the CPU that owns the
@@ -3518,30 +3514,31 @@
* accept the possible race associated with bucket
* exchange during monitoring.
*/
- for (i = 0; i < (mp_maxid + 1); i++) {
- bzero(&ups, sizeof(ups));
- if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
- goto skip;
- if (CPU_ABSENT(i))
- goto skip;
+ for (i = 0; i < mp_maxid + 1; i++) {
+ bzero(&ups[i], sizeof(*ups));
+ if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
+ CPU_ABSENT(i))
+ continue;
cache = &z->uz_cpu[i];
if (cache->uc_allocbucket != NULL)
- ups.ups_cache_free +=
+ ups[i].ups_cache_free +=
cache->uc_allocbucket->ub_cnt;
if (cache->uc_freebucket != NULL)
- ups.ups_cache_free +=
+ ups[i].ups_cache_free +=
cache->uc_freebucket->ub_cnt;
- ups.ups_allocs = cache->uc_allocs;
- ups.ups_frees = cache->uc_frees;
-skip:
- (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
+ ups[i].ups_allocs = cache->uc_allocs;
+ ups[i].ups_frees = cache->uc_frees;
}
ZONE_UNLOCK(z);
+ (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
+ for (i = 0; i < mp_maxid + 1; i++)
+ (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
}
}
rw_runlock(&uma_rwlock);
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
+ free(ups, M_TEMP);
return (error);
}
@@ -3549,16 +3546,13 @@
sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
{
uma_zone_t zone = *(uma_zone_t *)arg1;
- int error, max, old;
+ int error, max;
- old = max = uma_zone_get_max(zone);
+ max = uma_zone_get_max(zone);
error = sysctl_handle_int(oidp, &max, 0, req);
if (error || !req->newptr)
return (error);
- if (max < old)
- return (EINVAL);
-
uma_zone_set_max(zone, max);
return (0);
@@ -3574,6 +3568,102 @@
return (sysctl_handle_int(oidp, &cur, 0, req));
}
+#ifdef INVARIANTS
+static uma_slab_t
+uma_dbg_getslab(uma_zone_t zone, void *item)
+{
+ uma_slab_t slab;
+ uma_keg_t keg;
+ uint8_t *mem;
+
+ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
+ if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
+ slab = vtoslab((vm_offset_t)mem);
+ } else {
+ /*
+ * It is safe to return the slab here even though the
+ * zone is unlocked because the item's allocation state
+ * essentially holds a reference.
+ */
+ ZONE_LOCK(zone);
+ keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ slab = hash_sfind(&keg->uk_hash, mem);
+ else
+ slab = (uma_slab_t)(mem + keg->uk_pgoff);
+ ZONE_UNLOCK(zone);
+ }
+
+ return (slab);
+}
+
+/*
+ * Set up the slab's freei data such that uma_dbg_free can function.
+ *
+ */
+static void
+uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+ uma_keg_t keg;
+ int freei;
+
+ if (zone_first_keg(zone) == NULL)
+ return;
+ if (slab == NULL) {
+ slab = uma_dbg_getslab(zone, item);
+ if (slab == NULL)
+ panic("uma: item %p did not belong to zone %s\n",
+ item, zone->uz_name);
+ }
+ keg = slab->us_keg;
+ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+ if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+ panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
+ BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+
+ return;
+}
+
+/*
+ * Verifies freed addresses. Checks for alignment, valid slab membership
+ * and duplicate frees.
+ *
+ */
+static void
+uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+ uma_keg_t keg;
+ int freei;
+
+ if (zone_first_keg(zone) == NULL)
+ return;
+ if (slab == NULL) {
+ slab = uma_dbg_getslab(zone, item);
+ if (slab == NULL)
+ panic("uma: Freed item %p did not belong to zone %s\n",
+ item, zone->uz_name);
+ }
+ keg = slab->us_keg;
+ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+ if (freei >= keg->uk_ipers)
+ panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
+
+ if (((freei * keg->uk_rsize) + slab->us_data) != item)
+ panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
+
+ if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+ panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
+
+ BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+}
+#endif /* INVARIANTS */
+
#ifdef DDB
DB_SHOW_COMMAND(uma, db_show_uma)
{
@@ -3631,4 +3721,4 @@
return;
}
}
-#endif
+#endif /* DDB */
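
The uma_dbg_alloc()/uma_dbg_free() hooks that now live in uma_core.c track each item's slot in the per-slab us_debugfree bitset, so a duplicate allocation, a free of a misaligned or foreign pointer, or a double free panics immediately when INVARIANTS is enabled. A rough userland sketch of that bookkeeping with a plain 32-bit mask; the slab layout, sizes and the use of assert() in place of panic() are illustrative only.

/*
 * One bit per item slot: set on allocation, cleared on free, with the
 * same sanity checks the INVARIANTS hooks above perform.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ITEMS_PER_SLAB	32
#define RSIZE		64		/* rounded item size */

static unsigned char slab_base[ITEMS_PER_SLAB * RSIZE];
static uint32_t alloc_mask;		/* bit set => item is allocated */

static void
dbg_alloc(void *item)
{
	int idx = (int)(((unsigned char *)item - slab_base) / RSIZE);

	assert((alloc_mask & (1u << idx)) == 0);	/* "duplicate alloc" */
	alloc_mask |= 1u << idx;
}

static void
dbg_free(void *item)
{
	size_t off = (size_t)((unsigned char *)item - slab_base);
	int idx = (int)(off / RSIZE);

	assert(idx < ITEMS_PER_SLAB);			/* "invalid free" */
	assert(off % RSIZE == 0);			/* "unaligned free" */
	assert((alloc_mask & (1u << idx)) != 0);	/* "duplicate free" */
	alloc_mask &= ~(1u << idx);
}

int
main(void)
{
	void *p = slab_base + 3 * RSIZE;

	dbg_alloc(p);
	dbg_free(p);
	printf("slab bookkeeping ok\n");
	return (0);
}
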
Modified: trunk/sys/vm/uma_dbg.c
===================================================================
--- trunk/sys/vm/uma_dbg.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,8 +32,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_dbg.c 301176 2016-06-01 22:31:35Z markj $");
+#include "opt_vm.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
@@ -50,6 +52,7 @@
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>
+#include <vm/memguard.h>
static const uint32_t uma_junk = 0xdeadc0de;
@@ -58,7 +61,6 @@
* prior to subsequent reallocation.
*
* Complies with standard ctor arg/return
- *
*/
int
trash_ctor(void *mem, int size, void *arg, int flags)
@@ -66,12 +68,22 @@
int cnt;
uint32_t *p;
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(mem))
+ return (0);
+#endif
+
cnt = size / sizeof(uma_junk);
for (p = mem; cnt > 0; cnt--, p++)
if (*p != uma_junk) {
+#ifdef INVARIANTS
+ panic("Memory modified after free %p(%d) val=%x @ %p\n",
+ mem, size, *p, p);
+#else
printf("Memory modified after free %p(%d) val=%x @ %p\n",
mem, size, *p, p);
+#endif
return (0);
}
return (0);
@@ -89,6 +101,11 @@
int cnt;
uint32_t *p;
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(mem))
+ return;
+#endif
+
cnt = size / sizeof(uma_junk);
for (p = mem; cnt > 0; cnt--, p++)
@@ -127,6 +144,11 @@
uint32_t *p = mem;
int cnt;
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(mem))
+ return (0);
+#endif
+
size -= sizeof(struct malloc_type *);
ksp = (struct malloc_type **)mem;
ksp += size / sizeof(struct malloc_type *);
@@ -154,6 +176,11 @@
int cnt;
uint32_t *p;
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(mem))
+ return;
+#endif
+
size -= sizeof(struct malloc_type *);
cnt = size / sizeof(uma_junk);
@@ -172,6 +199,11 @@
{
struct malloc_type **ksp;
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(mem))
+ return (0);
+#endif
+
mtrash_dtor(mem, size, NULL);
ksp = (struct malloc_type **)mem;
@@ -192,100 +224,3 @@
{
(void)mtrash_ctor(mem, size, NULL, 0);
}
-
-#ifdef INVARIANTS
-static uma_slab_t
-uma_dbg_getslab(uma_zone_t zone, void *item)
-{
- uma_slab_t slab;
- uma_keg_t keg;
- uint8_t *mem;
-
- mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
- if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
- slab = vtoslab((vm_offset_t)mem);
- } else {
- /*
- * It is safe to return the slab here even though the
- * zone is unlocked because the item's allocation state
- * essentially holds a reference.
- */
- ZONE_LOCK(zone);
- keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
- if (keg->uk_flags & UMA_ZONE_HASH)
- slab = hash_sfind(&keg->uk_hash, mem);
- else
- slab = (uma_slab_t)(mem + keg->uk_pgoff);
- ZONE_UNLOCK(zone);
- }
-
- return (slab);
-}
-
-/*
- * Set up the slab's freei data such that uma_dbg_free can function.
- *
- */
-void
-uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
-{
- uma_keg_t keg;
- int freei;
-
- if (zone_first_keg(zone) == NULL)
- return;
- if (slab == NULL) {
- slab = uma_dbg_getslab(zone, item);
- if (slab == NULL)
- panic("uma: item %p did not belong to zone %s\n",
- item, zone->uz_name);
- }
- keg = slab->us_keg;
- freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
- if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
- panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
- item, zone, zone->uz_name, slab, freei);
- BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-
- return;
-}
-
-/*
- * Verifies freed addresses. Checks for alignment, valid slab membership
- * and duplicate frees.
- *
- */
-void
-uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
-{
- uma_keg_t keg;
- int freei;
-
- if (zone_first_keg(zone) == NULL)
- return;
- if (slab == NULL) {
- slab = uma_dbg_getslab(zone, item);
- if (slab == NULL)
- panic("uma: Freed item %p did not belong to zone %s\n",
- item, zone->uz_name);
- }
- keg = slab->us_keg;
- freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
- if (freei >= keg->uk_ipers)
- panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
- item, zone, zone->uz_name, slab, freei);
-
- if (((freei * keg->uk_rsize) + slab->us_data) != item)
- panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
- item, zone, zone->uz_name, slab, freei);
-
- if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
- panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
- item, zone, zone->uz_name, slab, freei);
-
- BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-}
-
-#endif /* INVARIANTS */
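
The substantive change to uma_dbg.c itself is that trash_ctor() now panics under INVARIANTS when it finds the 0xdeadc0de free pattern disturbed, instead of only printing a warning, and that all of the trash/mtrash hooks skip MemGuard-owned addresses. A small userland sketch of the underlying poison-on-free / verify-on-realloc scheme; abort() stands in for panic() and the buffer is made up.

/*
 * Fill freed memory with a junk pattern and verify it is still intact
 * when the memory is handed out again.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static const uint32_t junk = 0xdeadc0de;

static void
trash_fill(void *mem, size_t size)		/* roughly trash_dtor() */
{
	uint32_t *p = mem;
	size_t cnt;

	for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
		*p = junk;
}

static void
trash_check(void *mem, size_t size)		/* roughly trash_ctor() */
{
	uint32_t *p = mem;
	size_t cnt;

	for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
		if (*p != junk) {
			fprintf(stderr,
			    "memory modified after free %p val=%x @ %p\n",
			    mem, *p, (void *)p);
			abort();	/* panic() under INVARIANTS */
		}
}

int
main(void)
{
	uint32_t buf[16];

	trash_fill(buf, sizeof(buf));
	/* Uncommenting the next line would trip the check. */
	/* buf[5] = 0; */
	trash_check(buf, sizeof(buf));
	printf("free pattern intact\n");
	return (0);
}
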
Modified: trunk/sys/vm/uma_dbg.h
===================================================================
--- trunk/sys/vm/uma_dbg.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $
+ * $FreeBSD: stable/11/sys/vm/uma_dbg.h 295221 2016-02-03 22:02:36Z glebius $
*
*/
@@ -50,7 +50,4 @@
int mtrash_init(void *mem, int size, int flags);
void mtrash_fini(void *mem, int size);
-void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
-void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
-
#endif /* VM_UMA_DBG_H */
Modified: trunk/sys/vm/uma_int.h
===================================================================
--- trunk/sys/vm/uma_int.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_int.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,10 +25,13 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $
+ * $FreeBSD: stable/11/sys/vm/uma_int.h 344363 2019-02-20 14:12:25Z pfg $
*
*/
+#include <sys/_bitset.h>
+#include <sys/_task.h>
+
/*
* This file includes definitions, structures, prototypes, and inlines that
* should not be used outside of the actual implementation of UMA.
@@ -109,6 +112,8 @@
#define UMA_SLAB_SHIFT PAGE_SHIFT /* Number of bits PAGE_MASK */
#define UMA_BOOT_PAGES 64 /* Pages allocated for startup */
+#define UMA_BOOT_PAGES_ZONES 32 /* Multiplier for pages to reserve */
+ /* if uma_zone > PAGE_SIZE */
/* Max waste percentage before going to off page slab management */
#define UMA_MAX_WASTE 10
@@ -140,8 +145,8 @@
struct uma_hash {
struct slabhead *uh_slab_hash; /* Hash table for slabs */
- int uh_hashsize; /* Current size of the hash table */
- int uh_hashmask; /* Mask used during hashing */
+ u_int uh_hashsize; /* Current size of the hash table */
+ u_int uh_hashmask; /* Mask used during hashing */
};
/*
@@ -207,7 +212,7 @@
vm_offset_t uk_kva; /* Zone base KVA */
uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
- uint16_t uk_pgoff; /* Offset to uma_slab struct */
+ uint32_t uk_pgoff; /* Offset to uma_slab struct */
uint16_t uk_ppera; /* pages per allocation from backend */
uint16_t uk_ipers; /* Items per slab */
uint32_t uk_flags; /* Internal flags */
@@ -248,17 +253,7 @@
#define us_link us_type._us_link
#define us_size us_type._us_size
-/*
- * The slab structure for UMA_ZONE_REFCNT zones for whose items we
- * maintain reference counters in the slab for.
- */
-struct uma_slab_refcnt {
- struct uma_slab us_head; /* slab header data */
- uint32_t us_refcnt[0]; /* Actually larger. */
-};
-
typedef struct uma_slab * uma_slab_t;
-typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
struct uma_klink {
@@ -303,10 +298,12 @@
uint16_t uz_count; /* Amount of items in full bucket */
uint16_t uz_count_min; /* Minimal amount of items there */
- /* The next three fields are used to print a rate-limited warnings. */
+ /* The next two fields are used to print a rate-limited warning. */
const char *uz_warning; /* Warning to print on failure */
struct timeval uz_ratecheck; /* Warnings rate-limiting */
+ struct task uz_maxaction; /* Task to run when at limit */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@@ -390,7 +387,7 @@
hash_sfind(struct uma_hash *hash, uint8_t *data)
{
uma_slab_t slab;
- int hval;
+ u_int hval;
hval = UMA_HASH(hash, data);
@@ -421,7 +418,7 @@
/*
* The following two functions may be defined by architecture specific code
- * if they can provide more effecient allocation functions. This is useful
+ * if they can provide more efficient allocation functions. This is useful
* for using direct mapped addresses.
*/
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
Modified: trunk/sys/vm/vm.h
===================================================================
--- trunk/sys/vm/vm.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -56,7 +56,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm.h 331921 2018-04-03 09:38:53Z kib $
*/
#ifndef VM_H
@@ -79,7 +79,9 @@
#define VM_PROT_WRITE ((vm_prot_t) 0x02)
#define VM_PROT_EXECUTE ((vm_prot_t) 0x04)
#define VM_PROT_COPY ((vm_prot_t) 0x08) /* copy-on-read */
-#define VM_PROT_FAULT_LOOKUP ((vm_prot_t) 0x010)
+#define VM_PROT_PRIV_FLAG ((vm_prot_t) 0x10)
+#define VM_PROT_FAULT_LOOKUP VM_PROT_PRIV_FLAG
+#define VM_PROT_QUICK_NOFAULT VM_PROT_PRIV_FLAG /* same to save bits */
#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE)
@@ -112,8 +114,9 @@
typedef int boolean_t;
/*
- * The exact set of memory attributes is machine dependent. However, every
- * machine is required to define VM_MEMATTR_DEFAULT.
+ * The exact set of memory attributes is machine dependent. However,
+ * every machine is required to define VM_MEMATTR_DEFAULT and
+ * VM_MEMATTR_UNCACHEABLE.
*/
typedef char vm_memattr_t; /* memory attribute codes */
Added: trunk/sys/vm/vm_domain.c
===================================================================
--- trunk/sys/vm/vm_domain.c (rev 0)
+++ trunk/sys/vm/vm_domain.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,401 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_domain.c 312714 2017-01-24 19:39:24Z mjg $");
+
+#include "opt_vm.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#ifdef VM_NUMA_ALLOC
+#include <sys/proc.h>
+#endif
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/vmmeter.h>
+#include <sys/seq.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+#include <vm/vm_domain.h>
+
+#ifdef VM_NUMA_ALLOC
+static __inline int
+vm_domain_rr_selectdomain(int skip_domain)
+{
+ struct thread *td;
+
+ td = curthread;
+
+ td->td_dom_rr_idx++;
+ td->td_dom_rr_idx %= vm_ndomains;
+
+ /*
+ * If skip_domain is provided then skip over that
+ * domain. This is intended for round robin variants
+ * which first try a fixed domain.
+ */
+ if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) {
+ td->td_dom_rr_idx++;
+ td->td_dom_rr_idx %= vm_ndomains;
+ }
+ return (td->td_dom_rr_idx);
+}
+#endif
+
+/*
+ * This implements a very simple set of VM domain memory allocation
+ * policies and iterators.
+ */
+
+/*
+ * A VM domain policy represents a desired VM domain policy.
+ * Iterators implement searching through VM domains in a specific
+ * order.
+ */
+
+/*
+ * When setting a policy, the caller must establish their own
+ * exclusive write protection for the contents of the domain
+ * policy.
+ */
+int
+vm_domain_policy_init(struct vm_domain_policy *vp)
+{
+
+ bzero(vp, sizeof(*vp));
+ vp->p.policy = VM_POLICY_NONE;
+ vp->p.domain = -1;
+ return (0);
+}
+
+int
+vm_domain_policy_set(struct vm_domain_policy *vp,
+ vm_domain_policy_type_t vt, int domain)
+{
+
+ seq_write_begin(&vp->seq);
+ vp->p.policy = vt;
+ vp->p.domain = domain;
+ seq_write_end(&vp->seq);
+ return (0);
+}
+
+/*
+ * Take a local copy of a policy.
+ *
+ * The destination policy isn't write-barriered; this is used
+ * for doing local copies into something that isn't shared.
+ */
+void
+vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src)
+{
+ seq_t seq;
+
+ for (;;) {
+ seq = seq_read(&src->seq);
+ *dst = *src;
+ if (seq_consistent(&src->seq, seq))
+ return;
+ }
+}
+
+/*
+ * Take a write-barrier copy of a policy.
+ *
+ * The destination policy is write-barriered; this is used
+ * for doing copies into policies that may be read by other
+ * threads.
+ */
+void
+vm_domain_policy_copy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src)
+{
+ seq_t seq;
+ struct vm_domain_policy d;
+
+ for (;;) {
+ seq = seq_read(&src->seq);
+ d = *src;
+ if (seq_consistent(&src->seq, seq)) {
+ seq_write_begin(&dst->seq);
+ dst->p.domain = d.p.domain;
+ dst->p.policy = d.p.policy;
+ seq_write_end(&dst->seq);
+ return;
+ }
+ }
+}
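
Both copy routines above are lockless readers of a seq(9)-protected policy: they take a snapshot and retry if seq_consistent() reports that a writer raced with the copy. A minimal userland sketch of that retry loop using C11 atomics; the sequence field and the helpers below are stand-ins for seq(9), not the kernel interface.

/*
 * Seqlock-style reader: an odd sequence number means a write is in
 * progress, and a changed sequence number means the copy must be
 * retried.
 */
#include <stdatomic.h>
#include <stdio.h>

struct policy {
	atomic_uint seq;		/* even: stable, odd: being written */
	int policy;
	int domain;
};

static unsigned
pol_read_begin(struct policy *p)
{
	unsigned s;

	while (((s = atomic_load_explicit(&p->seq,
	    memory_order_acquire)) & 1u) != 0)
		;			/* wait out an in-progress write */
	return (s);
}

static int
pol_read_ok(struct policy *p, unsigned s)
{
	atomic_thread_fence(memory_order_acquire);
	return (atomic_load_explicit(&p->seq, memory_order_relaxed) == s);
}

static void
policy_localcopy(struct policy *dst, struct policy *src)
{
	unsigned s;

	for (;;) {
		s = pol_read_begin(src);
		/* These loads may race with a writer; the check catches it. */
		dst->policy = src->policy;
		dst->domain = src->domain;
		if (pol_read_ok(src, s))
			return;
	}
}

int
main(void)
{
	struct policy src, dst;

	atomic_init(&src.seq, 0);
	src.policy = 1;
	src.domain = 2;
	policy_localcopy(&dst, &src);
	printf("policy=%d domain=%d\n", dst.policy, dst.domain);
	return (0);
}
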
+
+int
+vm_domain_policy_validate(const struct vm_domain_policy *vp)
+{
+
+ switch (vp->p.policy) {
+ case VM_POLICY_NONE:
+ case VM_POLICY_ROUND_ROBIN:
+ case VM_POLICY_FIRST_TOUCH:
+ case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+ if (vp->p.domain == -1)
+ return (0);
+ return (-1);
+ case VM_POLICY_FIXED_DOMAIN:
+ case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+#ifdef VM_NUMA_ALLOC
+ if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains)
+ return (0);
+#else
+ if (vp->p.domain == 0)
+ return (0);
+#endif
+ return (-1);
+ default:
+ return (-1);
+ }
+ return (-1);
+}
+
+int
+vm_domain_policy_cleanup(struct vm_domain_policy *vp)
+{
+
+ /* For now, empty */
+ return (0);
+}
+
+int
+vm_domain_iterator_init(struct vm_domain_iterator *vi)
+{
+
+ /* Nothing to do for now */
+ return (0);
+}
+
+/*
+ * Manually setup an iterator with the given details.
+ */
+int
+vm_domain_iterator_set(struct vm_domain_iterator *vi,
+ vm_domain_policy_type_t vt, int domain)
+{
+
+#ifdef VM_NUMA_ALLOC
+ switch (vt) {
+ case VM_POLICY_FIXED_DOMAIN:
+ vi->policy = VM_POLICY_FIXED_DOMAIN;
+ vi->domain = domain;
+ vi->n = 1;
+ break;
+ case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+ vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN;
+ vi->domain = domain;
+ vi->n = vm_ndomains;
+ break;
+ case VM_POLICY_FIRST_TOUCH:
+ vi->policy = VM_POLICY_FIRST_TOUCH;
+ vi->domain = PCPU_GET(domain);
+ vi->n = 1;
+ break;
+ case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+ vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN;
+ vi->domain = PCPU_GET(domain);
+ vi->n = vm_ndomains;
+ break;
+ case VM_POLICY_ROUND_ROBIN:
+ default:
+ vi->policy = VM_POLICY_ROUND_ROBIN;
+ vi->domain = -1;
+ vi->n = vm_ndomains;
+ break;
+ }
+#else
+ vi->domain = 0;
+ vi->n = 1;
+#endif
+ return (0);
+}
+
+/*
+ * Setup an iterator based on the given policy.
+ */
+static inline void
+_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+ const struct vm_domain_policy *vt)
+{
+
+#ifdef VM_NUMA_ALLOC
+ /*
+ * Initialise the iterator.
+ *
+ * For first-touch, the initial domain is set
+ * via the current thread CPU domain.
+ *
+ * For fixed-domain, it's assumed that the
+ * caller has initialised the specific domain
+ * it is after.
+ */
+ switch (vt->p.policy) {
+ case VM_POLICY_FIXED_DOMAIN:
+ vi->policy = vt->p.policy;
+ vi->domain = vt->p.domain;
+ vi->n = 1;
+ break;
+ case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+ vi->policy = vt->p.policy;
+ vi->domain = vt->p.domain;
+ vi->n = vm_ndomains;
+ break;
+ case VM_POLICY_FIRST_TOUCH:
+ vi->policy = vt->p.policy;
+ vi->domain = PCPU_GET(domain);
+ vi->n = 1;
+ break;
+ case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+ vi->policy = vt->p.policy;
+ vi->domain = PCPU_GET(domain);
+ vi->n = vm_ndomains;
+ break;
+ case VM_POLICY_ROUND_ROBIN:
+ default:
+ /*
+ * Default to round-robin policy.
+ */
+ vi->policy = VM_POLICY_ROUND_ROBIN;
+ vi->domain = -1;
+ vi->n = vm_ndomains;
+ break;
+ }
+#else
+ vi->domain = 0;
+ vi->n = 1;
+#endif
+}
+
+void
+vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+ const struct vm_domain_policy *vt)
+{
+ seq_t seq;
+ struct vm_domain_policy vt_lcl;
+
+ for (;;) {
+ seq = seq_read(&vt->seq);
+ vt_lcl = *vt;
+ if (seq_consistent(&vt->seq, seq)) {
+ _vm_domain_iterator_set_policy(vi, &vt_lcl);
+ return;
+ }
+ }
+}
+
+/*
+ * Return the next VM domain to use.
+ *
+ * Returns 0 w/ domain set to the next domain to use, or
+ * -1 to indicate no more domains are available.
+ */
+int
+vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
+{
+
+ /* General catch-all */
+ if (vi->n <= 0)
+ return (-1);
+
+#ifdef VM_NUMA_ALLOC
+ switch (vi->policy) {
+ case VM_POLICY_FIXED_DOMAIN:
+ case VM_POLICY_FIRST_TOUCH:
+ *domain = vi->domain;
+ vi->n--;
+ break;
+ case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+ case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+ /*
+ * XXX TODO: skip over the rr'ed domain
+ * if it equals the one we started with.
+ */
+ if (vi->n == vm_ndomains)
+ *domain = vi->domain;
+ else
+ *domain = vm_domain_rr_selectdomain(vi->domain);
+ vi->n--;
+ break;
+ case VM_POLICY_ROUND_ROBIN:
+ default:
+ *domain = vm_domain_rr_selectdomain(-1);
+ vi->n--;
+ break;
+ }
+#else
+ *domain = 0;
+ vi->n--;
+#endif
+
+ return (0);
+}
+
+/*
+ * Returns 1 if the iteration is done, or 0 if it has not.
+ *
+ * This can only be called after at least one loop through
+ * the iterator. Ie, it's designed to be used as a tail
+ * check of a loop, not the head check of a loop.
+ */
+int
+vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
+{
+
+ return (vi->n <= 0);
+}
+
+int
+vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
+{
+
+ return (0);
+}
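
Since vm_domain_iterator_isdone() is documented as a tail check, the iterator is evidently meant to be driven from a do/while loop: set the policy, ask _run() for a candidate domain, attempt the allocation, and only then test _isdone(). A toy userland sketch of that calling pattern; the miniature iterator below mimics only the round-robin case and ND is a made-up domain count.

/*
 * Round-robin domain iterator consumed with a tail check, mirroring
 * the vm_domain_iterator_run()/_isdone() calling convention above.
 */
#include <stdio.h>

#define ND 4				/* stand-in for vm_ndomains */

struct it { int next, n; };

static void
it_set_rr(struct it *vi)
{
	vi->next = 0;
	vi->n = ND;
}

static int
it_run(struct it *vi, int *domain)
{
	if (vi->n <= 0)
		return (-1);
	*domain = vi->next;
	vi->next = (vi->next + 1) % ND;
	vi->n--;
	return (0);
}

static int
it_isdone(struct it *vi)
{
	return (vi->n <= 0);
}

int
main(void)
{
	struct it vi;
	int domain;

	it_set_rr(&vi);
	do {
		if (it_run(&vi, &domain) != 0)
			break;
		printf("try allocating from domain %d\n", domain);
		/* A real consumer would break out on success. */
	} while (!it_isdone(&vi));
	return (0);
}
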
Property changes on: trunk/sys/vm/vm_domain.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/vm/vm_domain.h
===================================================================
--- trunk/sys/vm/vm_domain.h (rev 0)
+++ trunk/sys/vm/vm_domain.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,67 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD: stable/11/sys/vm/vm_domain.h 285387 2015-07-11 15:21:37Z adrian $
+ */
+#ifndef __VM_DOMAIN_H__
+#define __VM_DOMAIN_H__
+
+#include <sys/_vm_domain.h>
+
+struct vm_domain_iterator {
+ vm_domain_policy_type_t policy;
+ int domain;
+ int n;
+};
+
+/*
+ * TODO: check to see if these should just become inline functions
+ * at some point.
+ */
+extern int vm_domain_policy_init(struct vm_domain_policy *vp);
+extern int vm_domain_policy_set(struct vm_domain_policy *vp,
+ vm_domain_policy_type_t vt, int domain);
+extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp);
+extern void vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src);
+extern void vm_domain_policy_copy(struct vm_domain_policy *dst,
+ const struct vm_domain_policy *src);
+extern int vm_domain_policy_validate(const struct vm_domain_policy *vp);
+
+extern int vm_domain_iterator_init(struct vm_domain_iterator *vi);
+extern int vm_domain_iterator_set(struct vm_domain_iterator *vi,
+ vm_domain_policy_type_t vt, int domain);
+extern void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+ const struct vm_domain_policy *vt);
+extern int vm_domain_iterator_run(struct vm_domain_iterator *vi,
+ int *domain);
+extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi);
+extern int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi);
+
+#endif /* __VM_DOMAIN_H__ */
Property changes on: trunk/sys/vm/vm_domain.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/vm/vm_extern.h
===================================================================
--- trunk/sys/vm/vm_extern.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_extern.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,7 +28,7 @@
* SUCH DAMAGE.
*
* @(#)vm_extern.h 8.2 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_extern.h 337262 2018-08-03 15:42:39Z markj $
*/
#ifndef _VM_EXTERN_H_
@@ -41,6 +41,8 @@
struct vmem;
#ifdef _KERNEL
+struct cdev;
+struct cdevsw;
/* These operate on kernel virtual addresses only. */
vm_offset_t kva_alloc(vm_size_t);
@@ -64,6 +66,7 @@
void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
/* Bootstrapping. */
+void kmem_bootstrap_free(vm_offset_t, vm_size_t);
vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
boolean_t);
void kmem_init(vm_offset_t, vm_offset_t);
@@ -70,7 +73,6 @@
void kmem_init_zero_region(void);
void kmeminit(void);
-void swapout_procs(int);
int kernacc(void *, int, int);
int useracc(void *, int, int);
int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
@@ -82,10 +84,18 @@
int fault_flags, vm_page_t *m_hold);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
-int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *,
+ struct vmspace *, int);
void vm_waitproc(struct proc *);
-int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
+int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
+ objtype_t, void *, vm_ooffset_t);
+int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
+ vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
int vm_mmap_to_errno(int rv);
+int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
+ int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
+int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
+ struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
void vm_set_page_size(void);
void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
typedef int (*pmap_pinit_t)(struct pmap *pmap);
@@ -97,6 +107,7 @@
struct vmspace *vmspace_acquire_ref(struct proc *);
void vmspace_free(struct vmspace *);
void vmspace_exitfree(struct proc *);
+void vmspace_switch_aio(struct vmspace *);
void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
int vslock(void *, size_t);
void vsunlock(void *, size_t);
@@ -104,6 +115,5 @@
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
int vm_thread_new(struct thread *td, int pages);
-int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
#endif /* _KERNEL */
#endif /* !_VM_EXTERN_H_ */
Modified: trunk/sys/vm/vm_fault.c
===================================================================
--- trunk/sys/vm/vm_fault.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_fault.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -73,7 +73,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_fault.c 345572 2019-03-27 11:03:07Z kib $");
#include "opt_ktrace.h"
#include "opt_vm.h"
@@ -82,7 +82,9 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
+#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
@@ -107,14 +109,11 @@
#define PFBAK 4
#define PFFOR 4
-static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
-
-#define VM_FAULT_READ_BEHIND 8
+#define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT)
#define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
-#define VM_FAULT_NINCR (VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
-#define VM_FAULT_SUM (VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
-#define VM_FAULT_CACHE_BEHIND (VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
+#define VM_FAULT_DONTNEED_MIN 1048576
+
struct faultstate {
vm_page_t m;
vm_object_t object;
@@ -124,14 +123,15 @@
vm_pindex_t first_pindex;
vm_map_t map;
vm_map_entry_t entry;
- int lookup_still_valid;
int map_generation;
+ bool lookup_still_valid;
struct vnode *vp;
};
-static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
+ int ahead);
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
- int faultcount, int reqpage);
+ int backward, int forward, bool obj_locked);
static inline void
release_page(struct faultstate *fs)
@@ -150,7 +150,7 @@
if (fs->lookup_still_valid) {
vm_map_lookup_done(fs->map, fs->entry);
- fs->lookup_still_valid = FALSE;
+ fs->lookup_still_valid = false;
}
}
@@ -237,14 +237,15 @@
* written NOW so dirty it explicitly to save on
* pmap_is_modified() calls later.
*
- * Also tell the backing pager, if any, that it should remove
- * any swap backing since the page is now dirty.
+ * Also, since the page is now dirty, we can possibly tell
+ * the pager to release any swap backing the page. Calling
+ * the pager requires a write lock on the object.
*/
if (need_dirty)
vm_page_dirty(m);
if (!set_wd)
vm_page_unlock(m);
- if (need_dirty)
+ else if (need_dirty)
vm_pager_page_unswapped(m);
}
@@ -267,8 +268,12 @@
vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
{
- vm_page_t m;
- int rv;
+ vm_page_t m, m_map;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+ vm_page_t m_super;
+ int flags;
+#endif
+ int psind, rv;
MPASS(fs->vp == NULL);
m = vm_page_lookup(fs->first_object, fs->first_pindex);
@@ -276,20 +281,204 @@
if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
return (KERN_FAILURE);
- rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
- PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+ m_map = m;
+ psind = 0;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+ if ((m->flags & PG_FICTITIOUS) == 0 &&
+ (m_super = vm_reserv_to_superpage(m)) != NULL &&
+ rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
+ roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
+ (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
+ (pagesizes[m_super->psind] - 1)) &&
+ pmap_ps_enabled(fs->map->pmap)) {
+ flags = PS_ALL_VALID;
+ if ((prot & VM_PROT_WRITE) != 0) {
+ /*
+ * Create a superpage mapping allowing write access
+ * only if none of the constituent pages are busy and
+ * all of them are already dirty (except possibly for
+ * the page that was faulted on).
+ */
+ flags |= PS_NONE_BUSY;
+ if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
+ flags |= PS_ALL_DIRTY;
+ }
+ if (vm_page_ps_test(m_super, flags, m)) {
+ m_map = m_super;
+ psind = m_super->psind;
+ vaddr = rounddown2(vaddr, pagesizes[psind]);
+ /* Preset the modified bit for dirty superpages. */
+ if ((flags & PS_ALL_DIRTY) != 0)
+ fault_type |= VM_PROT_WRITE;
+ }
+ }
+#endif
+ rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
+ PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
if (rv != KERN_SUCCESS)
return (rv);
vm_fault_fill_hold(m_hold, m);
vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+ if (psind == 0 && !wired)
+ vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
VM_OBJECT_RUNLOCK(fs->first_object);
- if (!wired)
- vm_fault_prefault(fs, vaddr, 0, 0);
vm_map_lookup_done(fs->map, fs->entry);
curthread->td_ru.ru_minflt++;
return (KERN_SUCCESS);
}
+static void
+vm_fault_restore_map_lock(struct faultstate *fs)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+ MPASS(fs->first_object->paging_in_progress > 0);
+
+ if (!vm_map_trylock_read(fs->map)) {
+ VM_OBJECT_WUNLOCK(fs->first_object);
+ vm_map_lock_read(fs->map);
+ VM_OBJECT_WLOCK(fs->first_object);
+ }
+ fs->lookup_still_valid = true;
+}
+
+static void
+vm_fault_populate_check_page(vm_page_t m)
+{
+
+ /*
+ * Check each page to ensure that the pager is obeying the
+ * interface: the page must be installed in the object, fully
+ * valid, and exclusively busied.
+ */
+ MPASS(m != NULL);
+ MPASS(m->valid == VM_PAGE_BITS_ALL);
+ MPASS(vm_page_xbusied(m));
+}
+
+static void
+vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
+ vm_pindex_t last)
+{
+ vm_page_t m;
+ vm_pindex_t pidx;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ MPASS(first <= last);
+ for (pidx = first, m = vm_page_lookup(object, pidx);
+ pidx <= last; pidx++, m = vm_page_next(m)) {
+ vm_fault_populate_check_page(m);
+ vm_page_lock(m);
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
+ }
+}
+
+static int
+vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
+ int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
+{
+ vm_page_t m;
+ vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
+ int rv;
+
+ MPASS(fs->object == fs->first_object);
+ VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+ MPASS(fs->first_object->paging_in_progress > 0);
+ MPASS(fs->first_object->backing_object == NULL);
+ MPASS(fs->lookup_still_valid);
+
+ pager_first = OFF_TO_IDX(fs->entry->offset);
+ pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
+ unlock_map(fs);
+ unlock_vp(fs);
+
+ /*
+ * Call the pager (driver) populate() method.
+ *
+ * There is no guarantee that the method will be called again
+ * if the current fault is for read, and a future fault is
+ * for write. Report the entry's maximum allowed protection
+ * to the driver.
+ */
+ rv = vm_pager_populate(fs->first_object, fs->first_pindex,
+ fault_type, fs->entry->max_protection, &pager_first, &pager_last);
+
+ VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+ if (rv == VM_PAGER_BAD) {
+ /*
+ * VM_PAGER_BAD is the backdoor for a pager to request
+ * normal fault handling.
+ */
+ vm_fault_restore_map_lock(fs);
+ if (fs->map->timestamp != fs->map_generation)
+ return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+ return (KERN_NOT_RECEIVER);
+ }
+ if (rv != VM_PAGER_OK)
+ return (KERN_FAILURE); /* AKA SIGSEGV */
+
+ /* Ensure that the driver is obeying the interface. */
+ MPASS(pager_first <= pager_last);
+ MPASS(fs->first_pindex <= pager_last);
+ MPASS(fs->first_pindex >= pager_first);
+ MPASS(pager_last < fs->first_object->size);
+
+ vm_fault_restore_map_lock(fs);
+ if (fs->map->timestamp != fs->map_generation) {
+ vm_fault_populate_cleanup(fs->first_object, pager_first,
+ pager_last);
+ return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+ }
+
+ /*
+ * The map is unchanged after our last unlock. Process the fault.
+ *
+ * The range [pager_first, pager_last] that is given to the
+ * pager is only a hint. The pager may populate any range
+ * within the object that includes the requested page index.
+ * In case the pager expanded the range, clip it to fit into
+ * the map entry.
+ */
+ map_first = OFF_TO_IDX(fs->entry->offset);
+ if (map_first > pager_first) {
+ vm_fault_populate_cleanup(fs->first_object, pager_first,
+ map_first - 1);
+ pager_first = map_first;
+ }
+ map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
+ if (map_last < pager_last) {
+ vm_fault_populate_cleanup(fs->first_object, map_last + 1,
+ pager_last);
+ pager_last = map_last;
+ }
+ for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
+ pidx <= pager_last; pidx++, m = vm_page_next(m)) {
+ vm_fault_populate_check_page(m);
+ vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags,
+ true);
+ VM_OBJECT_WUNLOCK(fs->first_object);
+ pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) -
+ fs->entry->offset, m, prot, fault_type | (wired ?
+ PMAP_ENTER_WIRED : 0), 0);
+ VM_OBJECT_WLOCK(fs->first_object);
+ if (pidx == fs->first_pindex)
+ vm_fault_fill_hold(m_hold, m);
+ vm_page_lock(m);
+ if ((fault_flags & VM_FAULT_WIRE) != 0) {
+ KASSERT(wired, ("VM_FAULT_WIRE && !wired"));
+ vm_page_wire(m);
+ } else {
+ vm_page_activate(m);
+ }
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
+ }
+ curthread->td_ru.ru_majflt++;
+ return (KERN_SUCCESS);
+}
+
/*
* vm_fault:
*
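
vm_fault_populate() above treats the [pager_first, pager_last] range returned by the pager's populate() method as a hint and clips it to the page indices the map entry can actually map, releasing whatever falls outside before entering the surviving pages. A small userland sketch of that clipping with made-up index values; release_range() stands in for vm_fault_populate_cleanup().

/*
 * Clip the pager-provided index range to the range backed by the map
 * entry, dropping the excess on either side.
 */
#include <stdio.h>

static void
release_range(long first, long last)
{
	if (first <= last)
		printf("deactivate pages %ld..%ld\n", first, last);
}

int
main(void)
{
	long pager_first = 10, pager_last = 41;	/* what the pager produced */
	long map_first = 16, map_last = 31;	/* what the entry covers */

	if (map_first > pager_first) {
		release_range(pager_first, map_first - 1);
		pager_first = map_first;
	}
	if (map_last < pager_last) {
		release_range(map_last + 1, pager_last);
		pager_last = map_last;
	}
	printf("map pages %ld..%ld\n", pager_first, pager_last);
	return (0);
}
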
@@ -334,21 +523,23 @@
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
- vm_prot_t prot;
- long ahead, behind;
- int alloc_req, era, faultcount, nera, reqpage, result;
- boolean_t dead, is_first_object_locked, wired;
- vm_object_t next_object;
- vm_page_t marray[VM_FAULT_READ_MAX];
- int hardfault;
struct faultstate fs;
struct vnode *vp;
- int locked, error;
+ vm_object_t next_object, retry_object;
+ vm_offset_t e_end, e_start;
+ vm_pindex_t retry_pindex;
+ vm_prot_t prot, retry_prot;
+ int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
+ int locked, nera, result, rv;
+ u_char behavior;
+ boolean_t wired; /* Passed by reference. */
+ bool dead, hardfault, is_first_object_locked;
- hardfault = 0;
PCPU_INC(cnt.v_vm_faults);
fs.vp = NULL;
- faultcount = reqpage = 0;
+ faultcount = 0;
+ nera = -1;
+ hardfault = false;
RetryFault:;
@@ -415,10 +606,10 @@
(fs.first_object->type != OBJT_VNODE &&
(fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
(fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
- result = vm_fault_soft_fast(&fs, vaddr, prot,
- fault_type, fault_flags, wired, m_hold);
- if (result == KERN_SUCCESS)
- return (result);
+ rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type,
+ fault_flags, wired, m_hold);
+ if (rv == KERN_SUCCESS)
+ return (rv);
}
if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
VM_OBJECT_RUNLOCK(fs.first_object);
@@ -435,13 +626,12 @@
* they will stay around as well.
*
* Bump the paging-in-progress count to prevent size changes (e.g.
- * truncation operations) during I/O. This must be done after
- * obtaining the vnode lock in order to avoid possible deadlocks.
+ * truncation operations) during I/O.
*/
vm_object_reference_locked(fs.first_object);
vm_object_pip_add(fs.first_object, 1);
- fs.lookup_still_valid = TRUE;
+ fs.lookup_still_valid = true;
fs.first_m = NULL;
@@ -534,11 +724,13 @@
goto readrest;
break;
}
+ KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m));
/*
- * Page is not resident. If this is the search termination
- * or the pager might contain the page, allocate a new page.
- * Default objects are zero-fill, there is no real pager.
+ * Page is not resident. If the pager might contain the page
+ * or this is the beginning of the search, allocate a new
+ * page. (Default objects are zero-fill, so there is no real
+ * pager for them.)
*/
if (fs.object->type != OBJT_DEFAULT ||
fs.object == fs.first_object) {
@@ -547,6 +739,30 @@
return (KERN_PROTECTION_FAILURE);
}
+ if (fs.object == fs.first_object &&
+ (fs.first_object->flags & OBJ_POPULATE) != 0 &&
+ fs.first_object->shadow_count == 0) {
+ rv = vm_fault_populate(&fs, vaddr, prot,
+ fault_type, fault_flags, wired, m_hold);
+ switch (rv) {
+ case KERN_SUCCESS:
+ case KERN_FAILURE:
+ unlock_and_deallocate(&fs);
+ return (rv);
+ case KERN_RESOURCE_SHORTAGE:
+ unlock_and_deallocate(&fs);
+ goto RetryFault;
+ case KERN_NOT_RECEIVER:
+ /*
+ * Pager's populate() method
+ * returned VM_PAGER_BAD.
+ */
+ break;
+ default:
+ panic("inconsistent return codes");
+ }
+ }
+
/*
* Allocate a new page for this object/offset pair.
*
@@ -555,14 +771,10 @@
* there, and allocation can fail, causing
* restart and new reading of the p_flag.
*/
- fs.m = NULL;
if (!vm_page_count_severe() || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
- if ((fs.object->flags & OBJ_COLORED) == 0) {
- fs.object->flags |= OBJ_COLORED;
- fs.object->pg_color = atop(vaddr) -
- fs.pindex;
- }
+ vm_object_color(fs.object, atop(vaddr) -
+ fs.pindex);
#endif
alloc_req = P_KILLED(curproc) ?
VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
@@ -576,80 +788,113 @@
unlock_and_deallocate(&fs);
VM_WAITPFAULT;
goto RetryFault;
- } else if (fs.m->valid == VM_PAGE_BITS_ALL)
- break;
+ }
}
readrest:
/*
- * We have found a valid page or we have allocated a new page.
- * The page thus may not be valid or may not be entirely
- * valid.
+ * At this point, we have either allocated a new page or found
+ * an existing page that is only partially valid.
*
- * Attempt to fault-in the page if there is a chance that the
- * pager has it, and potentially fault in additional pages
- * at the same time. For default objects simply provide
- * zero-filled pages.
+ * We hold a reference on the current object and the page is
+ * exclusive busied.
*/
- if (fs.object->type != OBJT_DEFAULT) {
- int rv;
- u_char behavior = vm_map_entry_behavior(fs.entry);
- if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
- P_KILLED(curproc)) {
- behind = 0;
- ahead = 0;
+ /*
+ * If the pager for the current object might have the page,
+ * then determine the number of additional pages to read and
+ * potentially reprioritize previously read pages for earlier
+ * reclamation. These operations should only be performed
+ * once per page fault. Even if the current pager doesn't
+ * have the page, the number of additional pages to read will
+ * apply to subsequent objects in the shadow chain.
+ */
+ if (fs.object->type != OBJT_DEFAULT && nera == -1 &&
+ !P_KILLED(curproc)) {
+ KASSERT(fs.lookup_still_valid, ("map unlocked"));
+ era = fs.entry->read_ahead;
+ behavior = vm_map_entry_behavior(fs.entry);
+ if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
+ nera = 0;
} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
- behind = 0;
- ahead = atop(fs.entry->end - vaddr) - 1;
- if (ahead > VM_FAULT_READ_AHEAD_MAX)
- ahead = VM_FAULT_READ_AHEAD_MAX;
- if (fs.pindex == fs.entry->next_read)
- vm_fault_cache_behind(&fs,
- VM_FAULT_READ_MAX);
- } else {
+ nera = VM_FAULT_READ_AHEAD_MAX;
+ if (vaddr == fs.entry->next_read)
+ vm_fault_dontneed(&fs, vaddr, nera);
+ } else if (vaddr == fs.entry->next_read) {
/*
- * If this is a sequential page fault, then
- * arithmetically increase the number of pages
- * in the read-ahead window. Otherwise, reset
- * the read-ahead window to its smallest size.
+ * This is a sequential fault. Arithmetically
+ * increase the requested number of pages in
+ * the read-ahead window. The requested
+ * number of pages is "# of sequential faults
+ * x (read ahead min + 1) + read ahead min"
*/
- behind = atop(vaddr - fs.entry->start);
- if (behind > VM_FAULT_READ_BEHIND)
- behind = VM_FAULT_READ_BEHIND;
- ahead = atop(fs.entry->end - vaddr) - 1;
- era = fs.entry->read_ahead;
- if (fs.pindex == fs.entry->next_read) {
- nera = era + behind;
+ nera = VM_FAULT_READ_AHEAD_MIN;
+ if (era > 0) {
+ nera += era + 1;
if (nera > VM_FAULT_READ_AHEAD_MAX)
nera = VM_FAULT_READ_AHEAD_MAX;
- behind = 0;
- if (ahead > nera)
- ahead = nera;
- if (era == VM_FAULT_READ_AHEAD_MAX)
- vm_fault_cache_behind(&fs,
- VM_FAULT_CACHE_BEHIND);
- } else if (ahead > VM_FAULT_READ_AHEAD_MIN)
- ahead = VM_FAULT_READ_AHEAD_MIN;
- if (era != ahead)
- fs.entry->read_ahead = ahead;
+ }
+ if (era == VM_FAULT_READ_AHEAD_MAX)
+ vm_fault_dontneed(&fs, vaddr, nera);
+ } else {
+ /*
+ * This is a non-sequential fault.
+ */
+ nera = 0;
}
+ if (era != nera) {
+ /*
+ * A read lock on the map suffices to update
+ * the read ahead count safely.
+ */
+ fs.entry->read_ahead = nera;
+ }
/*
- * Call the pager to retrieve the data, if any, after
- * releasing the lock on the map. We hold a ref on
- * fs.object and the pages are exclusive busied.
+ * Prepare for unlocking the map. Save the map
+ * entry's start and end addresses, which are used to
+ * optimize the size of the pager operation below.
+ * Even if the map entry's addresses change after
+ * unlocking the map, using the saved addresses is
+ * safe.
*/
+ e_start = fs.entry->start;
+ e_end = fs.entry->end;
+ }
+
+ /*
+ * Call the pager to retrieve the page if there is a chance
+ * that the pager has it, and potentially retrieve additional
+ * pages at the same time.
+ */
+ if (fs.object->type != OBJT_DEFAULT) {
+ /*
+ * Release the map lock before locking the vnode or
+ * sleeping in the pager. (If the current object has
+ * a shadow, then an earlier iteration of this loop
+ * may have already unlocked the map.)
+ */
unlock_map(&fs);
if (fs.object->type == OBJT_VNODE &&
(vp = fs.object->handle) != fs.vp) {
+ /*
+ * Perform an unlock in case the desired vnode
+ * changed while the map was unlocked during a
+ * retry.
+ */
unlock_vp(&fs);
+
locked = VOP_ISLOCKED(vp);
-
if (locked != LK_EXCLUSIVE)
locked = LK_SHARED;
- /* Do not sleep for vnode lock while fs.m is busy */
+
+ /*
+ * We must not sleep acquiring the vnode lock
+ * while we have the page exclusive busied or
+ * the object's paging-in-progress count
+ * incremented. Otherwise, we could deadlock.
+ */
error = vget(vp, locked | LK_CANRECURSE |
LK_NOWAIT, curthread);
if (error != 0) {
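
The read-ahead logic above grows the window exactly as its comment describes: on a non-sequential fault nera drops back to zero, while on a sequential fault it restarts at the minimum and, if there was a previous window ("era"), adds era + 1, clamped at the maximum, i.e. roughly "# of sequential faults x (read ahead min + 1) + read ahead min". A tiny userland sketch that iterates the recurrence; RA_MIN and RA_MAX are example values, not the kernel's VM_FAULT_READ_AHEAD_* constants.

/*
 * Read-ahead window growth across consecutive sequential faults.
 */
#include <stdio.h>

#define RA_MIN	7
#define RA_MAX	31

int
main(void)
{
	int era = 0, nera, fault;

	for (fault = 1; fault <= 6; fault++) {
		nera = RA_MIN;
		if (era > 0) {
			nera += era + 1;
			if (nera > RA_MAX)
				nera = RA_MAX;
		}
		printf("sequential fault %d: read ahead %d pages\n",
		    fault, nera);
		era = nera;	/* the kernel keeps this in fs.entry->read_ahead */
	}
	return (0);
}
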
@@ -670,88 +915,85 @@
("vm_fault: vnode-backed object mapped by system map"));
/*
- * now we find out if any other pages should be paged
- * in at this time this routine checks to see if the
- * pages surrounding this fault reside in the same
- * object as the page for this fault. If they do,
- * then they are faulted in also into the object. The
- * array "marray" returned contains an array of
- * vm_page_t structs where one of them is the
- * vm_page_t passed to the routine. The reqpage
- * return value is the index into the marray for the
- * vm_page_t passed to the routine.
- *
- * fs.m plus the additional pages are exclusive busied.
+ * Page in the requested page and hint the pager,
+ * that it may bring up surrounding pages.
*/
- faultcount = vm_fault_additional_pages(
- fs.m, behind, ahead, marray, &reqpage);
-
- rv = faultcount ?
- vm_pager_get_pages(fs.object, marray, faultcount,
- reqpage) : VM_PAGER_FAIL;
-
+ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
+ P_KILLED(curproc)) {
+ behind = 0;
+ ahead = 0;
+ } else {
+ /* Is this a sequential fault? */
+ if (nera > 0) {
+ behind = 0;
+ ahead = nera;
+ } else {
+ /*
+ * Request a cluster of pages that is
+ * aligned to a VM_FAULT_READ_DEFAULT
+ * page offset boundary within the
+ * object. Alignment to a page offset
+ * boundary is more likely to coincide
+ * with the underlying file system
+ * block than alignment to a virtual
+ * address boundary.
+ */
+ cluster_offset = fs.pindex %
+ VM_FAULT_READ_DEFAULT;
+ behind = ulmin(cluster_offset,
+ atop(vaddr - e_start));
+ ahead = VM_FAULT_READ_DEFAULT - 1 -
+ cluster_offset;
+ }
+ ahead = ulmin(ahead, atop(e_end - vaddr) - 1);
+ }
+ rv = vm_pager_get_pages(fs.object, &fs.m, 1,
+ &behind, &ahead);
if (rv == VM_PAGER_OK) {
- /*
- * Found the page. Leave it busy while we play
- * with it.
- */
-
- /*
- * Relookup in case pager changed page. Pager
- * is responsible for disposition of old page
- * if moved.
- */
- fs.m = vm_page_lookup(fs.object, fs.pindex);
- if (!fs.m) {
- unlock_and_deallocate(&fs);
- goto RetryFault;
- }
-
- hardfault++;
+ faultcount = behind + 1 + ahead;
+ hardfault = true;
break; /* break to PAGE HAS BEEN FOUND */
}
- /*
- * Remove the bogus page (which does not exist at this
- * object/offset); before doing so, we must get back
- * our object lock to preserve our invariant.
- *
- * Also wake up any other process that may want to bring
- * in this page.
- *
- * If this is the top-level object, we must leave the
- * busy page to prevent another process from rushing
- * past us, and inserting the page in that object at
- * the same time that we are.
- */
if (rv == VM_PAGER_ERROR)
printf("vm_fault: pager read error, pid %d (%s)\n",
curproc->p_pid, curproc->p_comm);
+
/*
- * Data outside the range of the pager or an I/O error
+ * If an I/O error occurred or the requested page was
+ * outside the range of the pager, clean up and return
+ * an error.
*/
- /*
- * XXX - the check for kernel_map is a kludge to work
- * around having the machine panic on a kernel space
- * fault w/ I/O error.
- */
- if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
- (rv == VM_PAGER_BAD)) {
+ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
vm_page_lock(fs.m);
- vm_page_free(fs.m);
+ if (fs.m->wire_count == 0)
+ vm_page_free(fs.m);
+ else
+ vm_page_xunbusy_maybelocked(fs.m);
vm_page_unlock(fs.m);
fs.m = NULL;
unlock_and_deallocate(&fs);
- return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
+ return (rv == VM_PAGER_ERROR ? KERN_FAILURE :
+ KERN_PROTECTION_FAILURE);
}
+
+ /*
+ * The requested page does not exist at this object/
+ * offset. Remove the invalid page from the object,
+ * waking up anyone waiting for it, and continue on to
+ * the next object. However, if this is the top-level
+ * object, we must leave the busy page in place to
+ * prevent another process from rushing past us, and
+ * inserting the page in that object at the same time
+ * that we are.
+ */
if (fs.object != fs.first_object) {
vm_page_lock(fs.m);
- vm_page_free(fs.m);
+ if (fs.m->wire_count == 0)
+ vm_page_free(fs.m);
+ else
+ vm_page_xunbusy_maybelocked(fs.m);
vm_page_unlock(fs.m);
fs.m = NULL;
- /*
- * XXX - we cannot just fall out at this
- * point, m has been freed and is invalid!
- */
}
}
@@ -766,7 +1008,6 @@
* Move on to the next object. Lock the next object before
* unlocking the current one.
*/
- fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
next_object = fs.object->backing_object;
if (next_object == NULL) {
/*
@@ -804,6 +1045,8 @@
vm_object_pip_add(next_object, 1);
if (fs.object != fs.first_object)
vm_object_pip_wakeup(fs.object);
+ fs.pindex +=
+ OFF_TO_IDX(fs.object->backing_object_offset);
VM_OBJECT_WUNLOCK(fs.object);
fs.object = next_object;
}
@@ -836,7 +1079,7 @@
* dirty in the first object so that it will go out
* to swap when needed.
*/
- is_first_object_locked = FALSE;
+ is_first_object_locked = false;
if (
/*
* Only one shadow object
@@ -860,22 +1103,15 @@
* We don't chase down the shadow chain
*/
fs.object == fs.first_object->backing_object) {
- /*
- * get rid of the unnecessary page
- */
+ vm_page_lock(fs.m);
+ vm_page_remove(fs.m);
+ vm_page_unlock(fs.m);
vm_page_lock(fs.first_m);
+ vm_page_replace_checked(fs.m, fs.first_object,
+ fs.first_pindex, fs.first_m);
vm_page_free(fs.first_m);
vm_page_unlock(fs.first_m);
- /*
- * grab the page and put it into the
- * process'es object. The page is
- * automatically made dirty.
- */
- if (vm_page_rename(fs.m, fs.first_object,
- fs.first_pindex)) {
- unlock_and_deallocate(&fs);
- goto RetryFault;
- }
+ vm_page_dirty(fs.m);
#if VM_NRESERVLEVEL > 0
/*
* Rename the reservation.
@@ -884,6 +1120,10 @@
fs.object, OFF_TO_IDX(
fs.first_object->backing_object_offset));
#endif
+ /*
+ * Removing the page from the backing object
+ * unbusied it.
+ */
vm_page_xbusy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;
@@ -905,7 +1145,7 @@
vm_page_unlock(fs.first_m);
vm_page_lock(fs.m);
- vm_page_unwire(fs.m, FALSE);
+ vm_page_unwire(fs.m, PQ_INACTIVE);
vm_page_unlock(fs.m);
}
/*
@@ -939,16 +1179,12 @@
* lookup.
*/
if (!fs.lookup_still_valid) {
- vm_object_t retry_object;
- vm_pindex_t retry_pindex;
- vm_prot_t retry_prot;
-
if (!vm_map_trylock_read(fs.map)) {
release_page(&fs);
unlock_and_deallocate(&fs);
goto RetryFault;
}
- fs.lookup_still_valid = TRUE;
+ fs.lookup_still_valid = true;
if (fs.map->timestamp != fs.map_generation) {
result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
&fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
@@ -986,20 +1222,23 @@
* write-enabled after all.
*/
prot &= retry_prot;
+ fault_type &= retry_prot;
+ if (prot == 0) {
+ release_page(&fs);
+ unlock_and_deallocate(&fs);
+ goto RetryFault;
+ }
}
}
+
/*
- * If the page was filled by a pager, update the map entry's
- * last read offset. Since the pager does not return the
- * actual set of pages that it read, this update is based on
- * the requested set. Typically, the requested and actual
- * sets are the same.
- *
- * XXX The following assignment modifies the map
- * without holding a write lock on it.
+ * If the page was filled by a pager, save the virtual address that
+ * should be faulted on next under a sequential access pattern to the
+ * map entry. A read lock on the map suffices to update this address
+ * safely.
*/
if (hardfault)
- fs.entry->next_read = fs.pindex + faultcount - reqpage;
+ fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
vm_page_assert_xbusied(fs.m);
@@ -1022,7 +1261,9 @@
fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
wired == 0)
- vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
+ vm_fault_prefault(&fs, vaddr,
+ faultcount > 0 ? behind : PFBAK,
+ faultcount > 0 ? ahead : PFFOR, false);
VM_OBJECT_WLOCK(fs.object);
vm_page_lock(fs.m);
@@ -1049,6 +1290,21 @@
if (hardfault) {
PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+ if (racct_enable && fs.object->type == OBJT_VNODE) {
+ PROC_LOCK(curproc);
+ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ PAGE_SIZE + behind * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_READBPS,
+ PAGE_SIZE + ahead * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif
} else
curthread->td_ru.ru_minflt++;
@@ -1056,15 +1312,26 @@
}
/*
- * Speed up the reclamation of up to "distance" pages that precede the
- * faulting pindex within the first object of the shadow chain.
+ * Speed up the reclamation of pages that precede the faulting pindex within
+ * the first object of the shadow chain. Essentially, perform the equivalent
+ * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
+ * the faulting pindex by the cluster size when the pages read by vm_fault()
+ * cross a cluster-size boundary. The cluster size is the greater of the
+ * smallest superpage size and VM_FAULT_DONTNEED_MIN.
+ *
+ * When "fs->first_object" is a shadow object, the pages in the backing object
+ * that precede the faulting pindex are deactivated by vm_fault(). So, this
+ * function must only be concerned with pages in the first object.
*/
static void
-vm_fault_cache_behind(const struct faultstate *fs, int distance)
+vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
{
+ vm_map_entry_t entry;
vm_object_t first_object, object;
- vm_page_t m, m_prev;
- vm_pindex_t pindex;
+ vm_offset_t end, start;
+ vm_page_t m, m_next;
+ vm_pindex_t pend, pstart;
+ vm_size_t size;
object = fs->object;
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1076,32 +1343,44 @@
VM_OBJECT_WLOCK(object);
}
}
- /* Neither fictitious nor unmanaged pages can be cached. */
+ /* Neither fictitious nor unmanaged pages can be reclaimed. */
if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
- if (fs->first_pindex < distance)
- pindex = 0;
- else
- pindex = fs->first_pindex - distance;
- if (pindex < OFF_TO_IDX(fs->entry->offset))
- pindex = OFF_TO_IDX(fs->entry->offset);
- m = first_object != object ? fs->first_m : fs->m;
- vm_page_assert_xbusied(m);
- m_prev = vm_page_prev(m);
- while ((m = m_prev) != NULL && m->pindex >= pindex &&
- m->valid == VM_PAGE_BITS_ALL) {
- m_prev = vm_page_prev(m);
- if (vm_page_busied(m))
- continue;
- vm_page_lock(m);
- if (m->hold_count == 0 && m->wire_count == 0) {
- pmap_remove_all(m);
- vm_page_aflag_clear(m, PGA_REFERENCED);
- if (m->dirty != 0)
- vm_page_deactivate(m);
- else
- vm_page_cache(m);
+ size = VM_FAULT_DONTNEED_MIN;
+ if (MAXPAGESIZES > 1 && size < pagesizes[1])
+ size = pagesizes[1];
+ end = rounddown2(vaddr, size);
+ if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
+ (entry = fs->entry)->start < end) {
+ if (end - entry->start < size)
+ start = entry->start;
+ else
+ start = end - size;
+ pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
+ pstart = OFF_TO_IDX(entry->offset) + atop(start -
+ entry->start);
+ m_next = vm_page_find_least(first_object, pstart);
+ pend = OFF_TO_IDX(entry->offset) + atop(end -
+ entry->start);
+ while ((m = m_next) != NULL && m->pindex < pend) {
+ m_next = TAILQ_NEXT(m, listq);
+ if (m->valid != VM_PAGE_BITS_ALL ||
+ vm_page_busied(m))
+ continue;
+
+ /*
+ * Don't clear PGA_REFERENCED, since it would
+ * likely represent a reference by a different
+ * process.
+ *
+ * Typically, at this point, prefetched pages
+ * are still in the inactive queue. Only
+ * pages that triggered page faults are in the
+ * active queue.
+ */
+ vm_page_lock(m);
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
}
- vm_page_unlock(m);
}
}
if (first_object != object)
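A minimal userspace sketch of the cluster arithmetic that vm_fault_dontneed() applies above. The constants (4 KB pages, a 2 MB superpage for pagesizes[1], 1 MB for VM_FAULT_DONTNEED_MIN) and the sample addresses are assumptions for illustration only.

#include <stdio.h>

#define PAGE_SIZE        4096UL
#define SUPERPAGE_SIZE   (2UL * 1024 * 1024)    /* assumed pagesizes[1] */
#define DONTNEED_MIN     (1UL * 1024 * 1024)    /* assumed VM_FAULT_DONTNEED_MIN */
#define rounddown2(x, y) ((x) & ~((y) - 1))     /* y must be a power of two */

int
main(void)
{
	unsigned long entry_start = 0x10000000UL; /* hypothetical map entry start */
	unsigned long vaddr = 0x10bf8000UL;       /* hypothetical fault address */
	int ahead = 7;                            /* read-ahead pages */

	unsigned long size = DONTNEED_MIN > SUPERPAGE_SIZE ?
	    DONTNEED_MIN : SUPERPAGE_SIZE;        /* cluster size */
	unsigned long end = rounddown2(vaddr, size);
	unsigned long start;

	/* The advice fires only when the pages just read reach the end of
	 * the cluster, i.e. the read crosses a cluster-size boundary. */
	if (vaddr - end >= size - PAGE_SIZE - ahead * PAGE_SIZE &&
	    entry_start < end) {
		start = (end - entry_start < size) ? entry_start : end - size;
		printf("MADV_DONTNEED window: [%#lx, %#lx)\n", start, end);
	} else
		printf("no cluster boundary crossed, nothing to do\n");
	return (0);
}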
@@ -1116,7 +1395,7 @@
*/
static void
vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
- int faultcount, int reqpage)
+ int backward, int forward, bool obj_locked)
{
pmap_t pmap;
vm_map_entry_t entry;
@@ -1124,19 +1403,12 @@
vm_offset_t addr, starta;
vm_pindex_t pindex;
vm_page_t m;
- int backward, forward, i;
+ int i;
pmap = fs->map->pmap;
if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
return;
- if (faultcount > 0) {
- backward = reqpage;
- forward = faultcount - reqpage - 1;
- } else {
- backward = PFBAK;
- forward = PFFOR;
- }
entry = fs->entry;
if (addra < backward * PAGE_SIZE) {
@@ -1169,7 +1441,8 @@
pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
lobject = entry->object.vm_object;
- VM_OBJECT_RLOCK(lobject);
+ if (!obj_locked)
+ VM_OBJECT_RLOCK(lobject);
while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
lobject->type == OBJT_DEFAULT &&
(backing_object = lobject->backing_object) != NULL) {
@@ -1177,17 +1450,20 @@
0, ("vm_fault_prefault: unaligned object offset"));
pindex += lobject->backing_object_offset >> PAGE_SHIFT;
VM_OBJECT_RLOCK(backing_object);
- VM_OBJECT_RUNLOCK(lobject);
+ if (!obj_locked || lobject != entry->object.vm_object)
+ VM_OBJECT_RUNLOCK(lobject);
lobject = backing_object;
}
if (m == NULL) {
- VM_OBJECT_RUNLOCK(lobject);
+ if (!obj_locked || lobject != entry->object.vm_object)
+ VM_OBJECT_RUNLOCK(lobject);
break;
}
if (m->valid == VM_PAGE_BITS_ALL &&
(m->flags & PG_FICTITIOUS) == 0)
pmap_enter_quick(pmap, addr, m, entry->protection);
- VM_OBJECT_RUNLOCK(lobject);
+ if (!obj_locked || lobject != entry->object.vm_object)
+ VM_OBJECT_RUNLOCK(lobject);
}
}
@@ -1252,7 +1528,18 @@
* page was mapped at the specified virtual address or that
* mapping had insufficient permissions. Attempt to fault in
* and hold these pages.
+ *
+ * If vm_fault_disable_pagefaults() was called,
+ * i.e., TDP_NOFAULTING is set, we must not sleep nor
+ * acquire MD VM locks, which means we must not call
+ * vm_fault_hold(). Some (out of tree) callers mark
+ * too wide a code area with vm_fault_disable_pagefaults()
+ * already; use the VM_PROT_QUICK_NOFAULT flag to request
+ * the proper behaviour explicitly.
*/
+ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
+ (curthread->td_pflags & TDP_NOFAULTING) != 0)
+ goto error;
for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
if (*mp == NULL && vm_fault_hold(map, va, prot,
VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
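A hedged sketch, kernel context only, of the calling convention the comment above describes: a thread that has disabled page faults asks only for already-resident pages and handles the failure itself. The user address uva and the single-page length are hypothetical; the routines shown (vm_fault_disable_pagefaults(), vm_fault_quick_hold_pages(), vm_fault_enable_pagefaults()) are the existing interfaces the comment refers to.

	vm_page_t ma[1];
	int count, save;

	save = vm_fault_disable_pagefaults();     /* sets TDP_NOFAULTING */
	count = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, uva,
	    PAGE_SIZE, VM_PROT_READ | VM_PROT_QUICK_NOFAULT, ma, 1);
	vm_fault_enable_pagefaults(save);
	if (count == -1) {
		/* Page not resident or not mapped with sufficient rights;
		 * retry later from a context that is allowed to fault. */
	}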
@@ -1315,11 +1602,12 @@
* actually shadow anything - we copy the pages directly.)
*/
dst_object = vm_object_allocate(OBJT_DEFAULT,
- OFF_TO_IDX(dst_entry->end - dst_entry->start));
+ atop(dst_entry->end - dst_entry->start));
#if VM_NRESERVLEVEL > 0
dst_object->flags |= OBJ_COLORED;
dst_object->pg_color = atop(dst_entry->start);
#endif
+ dst_object->charge = dst_entry->end - dst_entry->start;
}
VM_OBJECT_WLOCK(dst_object);
@@ -1328,7 +1616,6 @@
if (src_object != dst_object) {
dst_entry->object.vm_object = dst_object;
dst_entry->offset = 0;
- dst_object->charge = dst_entry->end - dst_entry->start;
}
if (fork_charge != NULL) {
KASSERT(dst_entry->cred == NULL,
@@ -1336,7 +1623,9 @@
dst_object->cred = curthread->td_ucred;
crhold(dst_object->cred);
*fork_charge += dst_object->charge;
- } else if (dst_object->cred == NULL) {
+ } else if ((dst_object->type == OBJT_DEFAULT ||
+ dst_object->type == OBJT_SWAP) &&
+ dst_object->cred == NULL) {
KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
dst_entry));
dst_object->cred = dst_entry->cred;
@@ -1361,7 +1650,7 @@
* range, copying each page from the source object to the
* destination object. Since the source is wired, those pages
* must exist. In contrast, the destination is pageable.
- * Since the destination object does share any backing storage
+ * Since the destination object doesn't share any backing storage
* with the source object, all of its pages must be dirtied,
* regardless of whether they can be written.
*/
@@ -1417,15 +1706,19 @@
}
pmap_copy_page(src_m, dst_m);
VM_OBJECT_RUNLOCK(object);
- dst_m->valid = VM_PAGE_BITS_ALL;
- dst_m->dirty = VM_PAGE_BITS_ALL;
+ dst_m->dirty = dst_m->valid = src_m->valid;
} else {
dst_m = src_m;
if (vm_page_sleep_if_busy(dst_m, "fltupg"))
goto again;
+ if (dst_m->pindex >= dst_object->size)
+ /*
+ * We are upgrading. Index can occur
+ * out of bounds if the object type is
+ * vnode and the file was truncated.
+ */
+ break;
vm_page_xbusy(dst_m);
- KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
- ("invalid dst page %p", dst_m));
}
VM_OBJECT_WUNLOCK(dst_object);
@@ -1433,9 +1726,18 @@
* Enter it in the pmap. If a wired, copy-on-write
* mapping is being replaced by a write-enabled
* mapping, then wire that new mapping.
+ *
+ * The page can be invalid if the user called
+ * msync(MS_INVALIDATE) or truncated the backing vnode
+ * or shared memory object. In this case, do not
+ * insert it into pmap, but still do the copy so that
+ * all copies of the wired map entry have similar
+ * backing pages.
*/
- pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
- access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+ if (dst_m->valid == VM_PAGE_BITS_ALL) {
+ pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
+ access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+ }
/*
* Mark it no longer busy, and put it on the active list.
@@ -1445,7 +1747,7 @@
if (upgrade) {
if (src_m != dst_m) {
vm_page_lock(src_m);
- vm_page_unwire(src_m, 0);
+ vm_page_unwire(src_m, PQ_INACTIVE);
vm_page_unlock(src_m);
vm_page_lock(dst_m);
vm_page_wire(dst_m);
@@ -1468,134 +1770,7 @@
}
}
-
/*
- * This routine checks around the requested page for other pages that
- * might be able to be faulted in. This routine brackets the viable
- * pages for the pages to be paged in.
- *
- * Inputs:
- * m, rbehind, rahead
- *
- * Outputs:
- * marray (array of vm_page_t), reqpage (index of requested page)
- *
- * Return value:
- * number of pages in marray
- */
-static int
-vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
- vm_page_t m;
- int rbehind;
- int rahead;
- vm_page_t *marray;
- int *reqpage;
-{
- int i,j;
- vm_object_t object;
- vm_pindex_t pindex, startpindex, endpindex, tpindex;
- vm_page_t rtm;
- int cbehind, cahead;
-
- VM_OBJECT_ASSERT_WLOCKED(m->object);
-
- object = m->object;
- pindex = m->pindex;
- cbehind = cahead = 0;
-
- /*
- * if the requested page is not available, then give up now
- */
- if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
- return 0;
- }
-
- if ((cbehind == 0) && (cahead == 0)) {
- *reqpage = 0;
- marray[0] = m;
- return 1;
- }
-
- if (rahead > cahead) {
- rahead = cahead;
- }
-
- if (rbehind > cbehind) {
- rbehind = cbehind;
- }
-
- /*
- * scan backward for the read behind pages -- in memory
- */
- if (pindex > 0) {
- if (rbehind > pindex) {
- rbehind = pindex;
- startpindex = 0;
- } else {
- startpindex = pindex - rbehind;
- }
-
- if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
- rtm->pindex >= startpindex)
- startpindex = rtm->pindex + 1;
-
- /* tpindex is unsigned; beware of numeric underflow. */
- for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
- tpindex < pindex; i++, tpindex--) {
-
- rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
- VM_ALLOC_IFNOTCACHED);
- if (rtm == NULL) {
- /*
- * Shift the allocated pages to the
- * beginning of the array.
- */
- for (j = 0; j < i; j++) {
- marray[j] = marray[j + tpindex + 1 -
- startpindex];
- }
- break;
- }
-
- marray[tpindex - startpindex] = rtm;
- }
- } else {
- startpindex = 0;
- i = 0;
- }
-
- marray[i] = m;
- /* page offset of the required page */
- *reqpage = i;
-
- tpindex = pindex + 1;
- i++;
-
- /*
- * scan forward for the read ahead pages
- */
- endpindex = tpindex + rahead;
- if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
- endpindex = rtm->pindex;
- if (endpindex > object->size)
- endpindex = object->size;
-
- for (; tpindex < endpindex; i++, tpindex++) {
-
- rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
- VM_ALLOC_IFNOTCACHED);
- if (rtm == NULL) {
- break;
- }
-
- marray[i] = rtm;
- }
-
- /* return number of pages */
- return i;
-}
-
-/*
* Block entry into the machine-independent layer's page fault handler by
* the calling thread. Subsequent calls to vm_fault() by that thread will
* return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of
Modified: trunk/sys/vm/vm_glue.c
===================================================================
--- trunk/sys/vm/vm_glue.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_glue.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_glue.c 341467 2018-12-04 15:04:48Z emaste $");
#include "opt_vm.h"
#include "opt_kstack_pages.h"
@@ -102,13 +102,6 @@
#include <machine/cpu.h>
-#ifndef NO_SWAPPING
-static int swapout(struct proc *);
-static void swapclear(struct proc *);
-static void vm_thread_swapin(struct thread *td);
-static void vm_thread_swapout(struct thread *td);
-#endif
-
/*
* MPSAFE
*
@@ -119,9 +112,7 @@
* space.
*/
int
-kernacc(addr, len, rw)
- void *addr;
- int len, rw;
+kernacc(void *addr, int len, int rw)
{
boolean_t rv;
vm_offset_t saddr, eaddr;
@@ -130,7 +121,7 @@
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to kernacc (%x)\n", rw));
- if ((vm_offset_t)addr + len > kernel_map->max_offset ||
+ if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
(vm_offset_t)addr + len < (vm_offset_t)addr)
return (FALSE);
@@ -150,12 +141,10 @@
* the associated vm_map_entry range. It does not determine whether the
* contents of the memory is actually readable or writable. vmapbuf(),
* vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
- * used in conjuction with this call.
+ * used in conjunction with this call.
*/
int
-useracc(addr, len, rw)
- void *addr;
- int len, rw;
+useracc(void *addr, int len, int rw)
{
boolean_t rv;
vm_prot_t prot;
@@ -201,16 +190,21 @@
* Also, the sysctl code, which is the only present user
* of vslock(), does a hard loop on EAGAIN.
*/
- if (npages + cnt.v_wire_count > vm_page_max_wired)
+ if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#endif
error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+ if (error == KERN_SUCCESS) {
+ curthread->td_vslock_sz += len;
+ return (0);
+ }
+
/*
* Return EFAULT on error to match copy{in,out}() behaviour
* rather than returning ENOMEM like mlock() would.
*/
- return (error == KERN_SUCCESS ? 0 : EFAULT);
+ return (EFAULT);
}
void
@@ -218,6 +212,8 @@
{
/* Rely on the parameter sanity checks performed by vslock(). */
+ MPASS(curthread->td_vslock_sz >= len);
+ curthread->td_vslock_sz -= len;
(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
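A hedged sketch (kernel context, not part of this change) of the pattern vslock()/vsunlock() exist for, as used by sysctl handlers: wire the user buffer, copy into it without risking a fault-driven sleep, then unwire. The buffer names and length are hypothetical.

	int error;

	error = vslock(uaddr, len);         /* wires the range, bumps td_vslock_sz */
	if (error != 0)
		return (error);             /* EFAULT, matching copyout() behaviour */
	error = copyout(kbuf, uaddr, len);  /* cannot be blocked by a hard fault */
	vsunlock(uaddr, len);               /* unwires, drops td_vslock_sz */
	return (error);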
@@ -231,19 +227,16 @@
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
{
- vm_page_t m, ma[1];
+ vm_page_t m;
vm_pindex_t pindex;
int rv;
VM_OBJECT_WLOCK(object);
pindex = OFF_TO_IDX(offset);
- m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
+ m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
if (m->valid != VM_PAGE_BITS_ALL) {
- ma[0] = m;
- rv = vm_pager_get_pages(object, ma, 1, 0);
- m = vm_page_lookup(object, pindex);
- if (m == NULL)
- goto out;
+ vm_page_xbusy(m);
+ rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
if (rv != VM_PAGER_OK) {
vm_page_lock(m);
vm_page_free(m);
@@ -251,8 +244,8 @@
m = NULL;
goto out;
}
+ vm_page_xunbusy(m);
}
- vm_page_xunbusy(m);
vm_page_lock(m);
vm_page_hold(m);
vm_page_activate(m);
@@ -312,10 +305,6 @@
SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
"");
-#ifndef KSTACK_MAX_PAGES
-#define KSTACK_MAX_PAGES 32
-#endif
-
/*
* Create the kernel stack (including pcb for i386) for a new thread.
* This routine directly affects the fork perf for a process and
@@ -326,17 +315,17 @@
{
vm_object_t ksobj;
vm_offset_t ks;
- vm_page_t m, ma[KSTACK_MAX_PAGES];
+ vm_page_t ma[KSTACK_MAX_PAGES];
struct kstack_cache_entry *ks_ce;
int i;
/* Bounds check */
if (pages <= 1)
- pages = KSTACK_PAGES;
+ pages = kstack_pages;
else if (pages > KSTACK_MAX_PAGES)
pages = KSTACK_MAX_PAGES;
- if (pages == KSTACK_PAGES) {
+ if (pages == kstack_pages) {
mtx_lock(&kstack_cache_mtx);
if (kstack_cache != NULL) {
ks_ce = kstack_cache;
@@ -345,7 +334,7 @@
td->td_kstack_obj = ks_ce->ksobj;
td->td_kstack = (vm_offset_t)ks_ce;
- td->td_kstack_pages = KSTACK_PAGES;
+ td->td_kstack_pages = kstack_pages;
return (1);
}
mtx_unlock(&kstack_cache_mtx);
@@ -395,15 +384,10 @@
* page of stack.
*/
VM_OBJECT_WLOCK(ksobj);
- for (i = 0; i < pages; i++) {
- /*
- * Get a kernel stack page.
- */
- m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
- ma[i] = m;
- m->valid = VM_PAGE_BITS_ALL;
- }
+ (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
+ VM_ALLOC_WIRED, ma, pages);
+ for (i = 0; i < pages; i++)
+ ma[i]->valid = VM_PAGE_BITS_ALL;
VM_OBJECT_WUNLOCK(ksobj);
pmap_qenter(ks, ma, pages);
return (1);
@@ -423,7 +407,7 @@
if (m == NULL)
panic("vm_thread_dispose: kstack already missing?");
vm_page_lock(m);
- vm_page_unwire(m, 0);
+ vm_page_unwire(m, PQ_NONE);
vm_page_free(m);
vm_page_unlock(m);
}
@@ -449,7 +433,7 @@
ks = td->td_kstack;
td->td_kstack = 0;
td->td_kstack_pages = 0;
- if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) {
+ if (pages == kstack_pages && kstacks <= kstack_cache_size) {
ks_ce = (struct kstack_cache_entry *)ks;
ks_ce->ksobj = ksobj;
mtx_lock(&kstack_cache_mtx);
@@ -476,7 +460,7 @@
ks_ce = ks_ce->next_ks_entry;
vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
- KSTACK_PAGES);
+ kstack_pages);
}
}
@@ -536,78 +520,7 @@
}
#endif /* KSTACK_USAGE_PROF */
-#ifndef NO_SWAPPING
/*
- * Allow a thread's kernel stack to be paged out.
- */
-static void
-vm_thread_swapout(struct thread *td)
-{
- vm_object_t ksobj;
- vm_page_t m;
- int i, pages;
-
- cpu_thread_swapout(td);
- pages = td->td_kstack_pages;
- ksobj = td->td_kstack_obj;
- pmap_qremove(td->td_kstack, pages);
- VM_OBJECT_WLOCK(ksobj);
- for (i = 0; i < pages; i++) {
- m = vm_page_lookup(ksobj, i);
- if (m == NULL)
- panic("vm_thread_swapout: kstack already missing?");
- vm_page_dirty(m);
- vm_page_lock(m);
- vm_page_unwire(m, 0);
- vm_page_unlock(m);
- }
- VM_OBJECT_WUNLOCK(ksobj);
-}
-
-/*
- * Bring the kernel stack for a specified thread back in.
- */
-static void
-vm_thread_swapin(struct thread *td)
-{
- vm_object_t ksobj;
- vm_page_t ma[KSTACK_MAX_PAGES];
- int i, j, k, pages, rv;
-
- pages = td->td_kstack_pages;
- ksobj = td->td_kstack_obj;
- VM_OBJECT_WLOCK(ksobj);
- for (i = 0; i < pages; i++)
- ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
- VM_ALLOC_WIRED);
- for (i = 0; i < pages; i++) {
- if (ma[i]->valid != VM_PAGE_BITS_ALL) {
- vm_page_assert_xbusied(ma[i]);
- vm_object_pip_add(ksobj, 1);
- for (j = i + 1; j < pages; j++) {
- if (ma[j]->valid != VM_PAGE_BITS_ALL)
- vm_page_assert_xbusied(ma[j]);
- if (ma[j]->valid == VM_PAGE_BITS_ALL)
- break;
- }
- rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0);
- if (rv != VM_PAGER_OK)
- panic("vm_thread_swapin: cannot get kstack for proc: %d",
- td->td_proc->p_pid);
- vm_object_pip_wakeup(ksobj);
- for (k = i; k < j; k++)
- ma[k] = vm_page_lookup(ksobj, k);
- vm_page_xunbusy(ma[i]);
- } else if (vm_page_xbusied(ma[i]))
- vm_page_xunbusy(ma[i]);
- }
- VM_OBJECT_WUNLOCK(ksobj);
- pmap_qenter(td->td_kstack, ma, pages);
- cpu_thread_swapin(td);
-}
-#endif /* !NO_SWAPPING */
-
-/*
* Implement fork's actions on an address space.
* Here we arrange for the address space to be copied or referenced,
* allocate a user struct (pcb and kernel stack), then call the
@@ -616,12 +529,8 @@
* to user mode to avoid stack copying and relocation problems.
*/
int
-vm_forkproc(td, p2, td2, vm2, flags)
- struct thread *td;
- struct proc *p2;
- struct thread *td2;
- struct vmspace *vm2;
- int flags;
+vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
+ struct vmspace *vm2, int flags)
{
struct proc *p1 = td->td_proc;
int error;
@@ -667,7 +576,7 @@
}
/*
- * Called after process has been wait(2)'ed apon and is being reaped.
+ * Called after process has been wait(2)'ed upon and is being reaped.
* The idea is to reclaim resources that we could not reclaim while
* the process was still executing.
*/
@@ -680,414 +589,8 @@
}
void
-faultin(p)
- struct proc *p;
-{
-#ifdef NO_SWAPPING
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- if ((p->p_flag & P_INMEM) == 0)
- panic("faultin: proc swapped out with NO_SWAPPING!");
-#else /* !NO_SWAPPING */
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
- /*
- * If another process is swapping in this process,
- * just wait until it finishes.
- */
- if (p->p_flag & P_SWAPPINGIN) {
- while (p->p_flag & P_SWAPPINGIN)
- msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
- return;
- }
- if ((p->p_flag & P_INMEM) == 0) {
- /*
- * Don't let another thread swap process p out while we are
- * busy swapping it in.
- */
- ++p->p_lock;
- p->p_flag |= P_SWAPPINGIN;
- PROC_UNLOCK(p);
-
- /*
- * We hold no lock here because the list of threads
- * can not change while all threads in the process are
- * swapped out.
- */
- FOREACH_THREAD_IN_PROC(p, td)
- vm_thread_swapin(td);
- PROC_LOCK(p);
- swapclear(p);
- p->p_swtick = ticks;
-
- wakeup(&p->p_flag);
-
- /* Allow other threads to swap p out now. */
- --p->p_lock;
- }
-#endif /* NO_SWAPPING */
-}
-
-/*
- * This swapin algorithm attempts to swap-in processes only if there
- * is enough space for them. Of course, if a process waits for a long
- * time, it will be swapped in anyway.
- */
-void
-swapper(void)
-{
- struct proc *p;
- struct thread *td;
- struct proc *pp;
- int slptime;
- int swtime;
- int ppri;
- int pri;
-
-loop:
- if (vm_page_count_min()) {
- VM_WAIT;
- goto loop;
- }
-
- pp = NULL;
- ppri = INT_MIN;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW ||
- p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
- PROC_UNLOCK(p);
- continue;
- }
- swtime = (ticks - p->p_swtick) / hz;
- FOREACH_THREAD_IN_PROC(p, td) {
- /*
- * An otherwise runnable thread of a process
- * swapped out has only the TDI_SWAPPED bit set.
- *
- */
- thread_lock(td);
- if (td->td_inhibitors == TDI_SWAPPED) {
- slptime = (ticks - td->td_slptick) / hz;
- pri = swtime + slptime;
- if ((td->td_flags & TDF_SWAPINREQ) == 0)
- pri -= p->p_nice * 8;
- /*
- * if this thread is higher priority
- * and there is enough space, then select
- * this process instead of the previous
- * selection.
- */
- if (pri > ppri) {
- pp = p;
- ppri = pri;
- }
- }
- thread_unlock(td);
- }
- PROC_UNLOCK(p);
- }
- sx_sunlock(&allproc_lock);
-
- /*
- * Nothing to do, back to sleep.
- */
- if ((p = pp) == NULL) {
- tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
- goto loop;
- }
- PROC_LOCK(p);
-
- /*
- * Another process may be bringing or may have already
- * brought this process in while we traverse all threads.
- * Or, this process may even be being swapped out again.
- */
- if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
- PROC_UNLOCK(p);
- goto loop;
- }
-
- /*
- * We would like to bring someone in. (only if there is space).
- * [What checks the space? ]
- */
- faultin(p);
- PROC_UNLOCK(p);
- goto loop;
-}
-
-void
kick_proc0(void)
{
wakeup(&proc0);
}
-
-#ifndef NO_SWAPPING
-
-/*
- * Swap_idle_threshold1 is the guaranteed swapped in time for a process
- */
-static int swap_idle_threshold1 = 2;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
- &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
-
-/*
- * Swap_idle_threshold2 is the time that a process can be idle before
- * it will be swapped out, if idle swapping is enabled.
- */
-static int swap_idle_threshold2 = 10;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
- &swap_idle_threshold2, 0, "Time before a process will be swapped out");
-
-/*
- * First, if any processes have been sleeping or stopped for at least
- * "swap_idle_threshold1" seconds, they are swapped out. If, however,
- * no such processes exist, then the longest-sleeping or stopped
- * process is swapped out. Finally, and only as a last resort, if
- * there are no sleeping or stopped processes, the longest-resident
- * process is swapped out.
- */
-void
-swapout_procs(action)
-int action;
-{
- struct proc *p;
- struct thread *td;
- int didswap = 0;
-
-retry:
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- struct vmspace *vm;
- int minslptime = 100000;
- int slptime;
-
- /*
- * Watch out for a process in
- * creation. It may have no
- * address space or lock yet.
- */
- if (p->p_state == PRS_NEW)
- continue;
- /*
- * An aio daemon switches its
- * address space while running.
- * Perform a quick check whether
- * a process has P_SYSTEM.
- */
- if ((p->p_flag & P_SYSTEM) != 0)
- continue;
- /*
- * Do not swapout a process that
- * is waiting for VM data
- * structures as there is a possible
- * deadlock. Test this first as
- * this may block.
- *
- * Lock the map until swapout
- * finishes, or a thread of this
- * process may attempt to alter
- * the map.
- */
- vm = vmspace_acquire_ref(p);
- if (vm == NULL)
- continue;
- if (!vm_map_trylock(&vm->vm_map))
- goto nextproc1;
-
- PROC_LOCK(p);
- if (p->p_lock != 0 ||
- (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
- ) != 0) {
- goto nextproc;
- }
- /*
- * only aiod changes vmspace, however it will be
- * skipped because of the if statement above checking
- * for P_SYSTEM
- */
- if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
- goto nextproc;
-
- switch (p->p_state) {
- default:
- /* Don't swap out processes in any sort
- * of 'special' state. */
- break;
-
- case PRS_NORMAL:
- /*
- * do not swapout a realtime process
- * Check all the thread groups..
- */
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (PRI_IS_REALTIME(td->td_pri_class)) {
- thread_unlock(td);
- goto nextproc;
- }
- slptime = (ticks - td->td_slptick) / hz;
- /*
- * Guarantee swap_idle_threshold1
- * time in memory.
- */
- if (slptime < swap_idle_threshold1) {
- thread_unlock(td);
- goto nextproc;
- }
-
- /*
- * Do not swapout a process if it is
- * waiting on a critical event of some
- * kind or there is a thread whose
- * pageable memory may be accessed.
- *
- * This could be refined to support
- * swapping out a thread.
- */
- if (!thread_safetoswapout(td)) {
- thread_unlock(td);
- goto nextproc;
- }
- /*
- * If the system is under memory stress,
- * or if we are swapping
- * idle processes >= swap_idle_threshold2,
- * then swap the process out.
- */
- if (((action & VM_SWAP_NORMAL) == 0) &&
- (((action & VM_SWAP_IDLE) == 0) ||
- (slptime < swap_idle_threshold2))) {
- thread_unlock(td);
- goto nextproc;
- }
-
- if (minslptime > slptime)
- minslptime = slptime;
- thread_unlock(td);
- }
-
- /*
- * If the pageout daemon didn't free enough pages,
- * or if this process is idle and the system is
- * configured to swap proactively, swap it out.
- */
- if ((action & VM_SWAP_NORMAL) ||
- ((action & VM_SWAP_IDLE) &&
- (minslptime > swap_idle_threshold2))) {
- if (swapout(p) == 0)
- didswap++;
- PROC_UNLOCK(p);
- vm_map_unlock(&vm->vm_map);
- vmspace_free(vm);
- sx_sunlock(&allproc_lock);
- goto retry;
- }
- }
-nextproc:
- PROC_UNLOCK(p);
- vm_map_unlock(&vm->vm_map);
-nextproc1:
- vmspace_free(vm);
- continue;
- }
- sx_sunlock(&allproc_lock);
- /*
- * If we swapped something out, and another process needed memory,
- * then wakeup the sched process.
- */
- if (didswap)
- wakeup(&proc0);
-}
-
-static void
-swapclear(p)
- struct proc *p;
-{
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- td->td_flags |= TDF_INMEM;
- td->td_flags &= ~TDF_SWAPINREQ;
- TD_CLR_SWAPPED(td);
- if (TD_CAN_RUN(td))
- if (setrunnable(td)) {
-#ifdef INVARIANTS
- /*
- * XXX: We just cleared TDI_SWAPPED
- * above and set TDF_INMEM, so this
- * should never happen.
- */
- panic("not waking up swapper");
-#endif
- }
- thread_unlock(td);
- }
- p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
- p->p_flag |= P_INMEM;
-}
-
-static int
-swapout(p)
- struct proc *p;
-{
- struct thread *td;
-
- PROC_LOCK_ASSERT(p, MA_OWNED);
-#if defined(SWAP_DEBUG)
- printf("swapping out %d\n", p->p_pid);
-#endif
-
- /*
- * The states of this process and its threads may have changed
- * by now. Assuming that there is only one pageout daemon thread,
- * this process should still be in memory.
- */
- KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
- ("swapout: lost a swapout race?"));
-
- /*
- * remember the process resident count
- */
- p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
- /*
- * Check and mark all threads before we proceed.
- */
- p->p_flag &= ~P_INMEM;
- p->p_flag |= P_SWAPPINGOUT;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!thread_safetoswapout(td)) {
- thread_unlock(td);
- swapclear(p);
- return (EBUSY);
- }
- td->td_flags &= ~TDF_INMEM;
- TD_SET_SWAPPED(td);
- thread_unlock(td);
- }
- td = FIRST_THREAD_IN_PROC(p);
- ++td->td_ru.ru_nswap;
- PROC_UNLOCK(p);
-
- /*
- * This list is stable because all threads are now prevented from
- * running. The list is only modified in the context of a running
- * thread in this process.
- */
- FOREACH_THREAD_IN_PROC(p, td)
- vm_thread_swapout(td);
-
- PROC_LOCK(p);
- p->p_flag &= ~P_SWAPPINGOUT;
- p->p_swtick = ticks;
- return (0);
-}
-#endif /* !NO_SWAPPING */
Modified: trunk/sys/vm/vm_init.c
===================================================================
--- trunk/sys/vm/vm_init.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_init.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_init.c 338484 2018-09-05 21:28:33Z kib $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -75,6 +75,7 @@
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/selinfo.h>
+#include <sys/smp.h>
#include <sys/pipe.h>
#include <sys/bio.h>
#include <sys/buf.h>
@@ -91,11 +92,6 @@
long physmem;
-static int exec_map_entries = 16;
-TUNABLE_INT("vm.exec_map_entries", &exec_map_entries);
-SYSCTL_INT(_vm, OID_AUTO, exec_map_entries, CTLFLAG_RD, &exec_map_entries, 0,
- "Maximum number of simultaneous execs");
-
/*
* System initialization
*/
@@ -197,8 +193,8 @@
* Discount the physical memory larger than the size of kernel_map
* to avoid eating up all of KVA space.
*/
- physmem_est = lmin(physmem, btoc(kernel_map->max_offset -
- kernel_map->min_offset));
+ physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) -
+ vm_map_min(kernel_map)));
v = kern_vfs_bio_buffer_alloc(v, physmem_est);
@@ -231,12 +227,15 @@
/*
* Allocate the buffer arena.
+ *
+ * Enable the quantum cache if we have more than 4 cpus. This
+ * avoids lock contention at the expense of some fragmentation.
*/
size = (long)nbuf * BKVASIZE;
kmi->buffer_sva = firstaddr;
kmi->buffer_eva = kmi->buffer_sva + size;
vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
- PAGE_SIZE, 0, 0);
+ PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0);
firstaddr += size;
/*
@@ -259,10 +258,19 @@
panic("Clean map calculation incorrect");
/*
- * Allocate the pageable submaps.
+ * Allocate the pageable submaps. We may cache an exec map entry per
+ * CPU, so we need to reserve space for at least ncpu+1
+ * entries to avoid deadlock. The exec map is also used by some image
+ * activators, so we leave a fixed number of pages for their use.
*/
+#ifdef __LP64__
+ exec_map_entries = 8 * mp_ncpus;
+#else
+ exec_map_entries = 2 * mp_ncpus + 4;
+#endif
+ exec_map_entry_size = round_page(PATH_MAX + ARG_MAX);
exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
- exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
+ exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE);
pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
FALSE);
}
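The sizing above is easiest to see with concrete numbers. The standalone sketch below assumes 4 KB pages, PATH_MAX = 1024, ARG_MAX = 256 KB and an 8-CPU LP64 machine; all of these are illustrative values, not taken from this change.

#include <stdio.h>

#define PAGE_SIZE         4096UL
#define ASSUMED_PATH_MAX  1024UL
#define ASSUMED_ARG_MAX   262144UL
#define ROUND_PAGE(x)     (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int
main(void)
{
	unsigned long ncpus = 8;                       /* hypothetical */
	unsigned long entries = 8 * ncpus;             /* the __LP64__ branch above */
	unsigned long esize = ROUND_PAGE(ASSUMED_PATH_MAX + ASSUMED_ARG_MAX);
	unsigned long total = entries * esize + 64 * PAGE_SIZE;

	printf("exec_map: %lu entries of %lu KiB each -> %lu KiB of KVA\n",
	    entries, esize / 1024, total / 1024);
	return (0);
}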
Modified: trunk/sys/vm/vm_kern.c
===================================================================
--- trunk/sys/vm/vm_kern.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_kern.c 340660 2018-11-20 01:12:21Z markj $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -85,6 +85,8 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -98,6 +100,9 @@
/* NB: Used by kernel debuggers. */
const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
+u_int exec_map_entry_size;
+u_int exec_map_entries;
+
SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
@@ -160,8 +165,7 @@
vm_paddr_t high, vm_memattr_t memattr)
{
vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
- vm_offset_t addr, i;
- vm_ooffset_t offset;
+ vm_offset_t addr, i, offset;
vm_page_t m;
int pflags, tries;
@@ -170,16 +174,21 @@
return (0);
offset = addr - VM_MIN_KERNEL_ADDRESS;
pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+ pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+ pflags |= VM_ALLOC_NOWAIT;
VM_OBJECT_WLOCK(object);
for (i = 0; i < size; i += PAGE_SIZE) {
tries = 0;
retry:
- m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
+ m = vm_page_alloc_contig(object, atop(offset + i),
pflags, 1, low, high, PAGE_SIZE, 0, memattr);
if (m == NULL) {
VM_OBJECT_WUNLOCK(object);
if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
- vm_pageout_grow_cache(tries, low, high);
+ if (!vm_page_reclaim_contig(pflags, 1,
+ low, high, PAGE_SIZE, 0) &&
+ (flags & M_WAITOK) != 0)
+ VM_WAIT;
VM_OBJECT_WLOCK(object);
tries++;
goto retry;
@@ -212,9 +221,9 @@
vm_memattr_t memattr)
{
vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
- vm_offset_t addr, tmp;
- vm_ooffset_t offset;
+ vm_offset_t addr, offset, tmp;
vm_page_t end_m, m;
+ u_long npages;
int pflags, tries;
size = round_page(size);
@@ -222,15 +231,20 @@
return (0);
offset = addr - VM_MIN_KERNEL_ADDRESS;
pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+ pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+ pflags |= VM_ALLOC_NOWAIT;
+ npages = atop(size);
VM_OBJECT_WLOCK(object);
tries = 0;
retry:
- m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
- atop(size), low, high, alignment, boundary, memattr);
+ m = vm_page_alloc_contig(object, atop(offset), pflags,
+ npages, low, high, alignment, boundary, memattr);
if (m == NULL) {
VM_OBJECT_WUNLOCK(object);
if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
- vm_pageout_grow_cache(tries, low, high);
+ if (!vm_page_reclaim_contig(pflags, npages, low, high,
+ alignment, boundary) && (flags & M_WAITOK) != 0)
+ VM_WAIT;
VM_OBJECT_WLOCK(object);
tries++;
goto retry;
@@ -238,7 +252,7 @@
vmem_free(vmem, addr, size);
return (0);
}
- end_m = m + atop(size);
+ end_m = m + npages;
tmp = addr;
for (; m < end_m; m++) {
if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
@@ -322,7 +336,7 @@
kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
{
vm_offset_t offset, i;
- vm_page_t m;
+ vm_page_t m, mpred;
int pflags;
KASSERT(object == kmem_object || object == kernel_object,
@@ -330,11 +344,17 @@
offset = addr - VM_MIN_KERNEL_ADDRESS;
pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+ pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+ if (flags & M_WAITOK)
+ pflags |= VM_ALLOC_WAITFAIL;
+ i = 0;
VM_OBJECT_WLOCK(object);
- for (i = 0; i < size; i += PAGE_SIZE) {
retry:
- m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags);
+ mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
+ for (; i < size; i += PAGE_SIZE, mpred = m) {
+ m = vm_page_alloc_after(object, atop(offset + i), pflags,
+ mpred);
/*
* Ran out of space, free everything up and return. Don't need
@@ -342,12 +362,9 @@
* aren't on any queues.
*/
if (m == NULL) {
+ if ((flags & M_NOWAIT) == 0)
+ goto retry;
VM_OBJECT_WUNLOCK(object);
- if ((flags & M_NOWAIT) == 0) {
- VM_WAIT;
- VM_OBJECT_WLOCK(object);
- goto retry;
- }
kmem_unback(object, addr, i);
return (KERN_NO_SPACE);
}
@@ -376,8 +393,8 @@
void
kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
{
- vm_page_t m;
- vm_offset_t i, offset;
+ vm_page_t m, next;
+ vm_offset_t end, offset;
KASSERT(object == kmem_object || object == kernel_object,
("kmem_unback: only supports kernel objects."));
@@ -384,10 +401,12 @@
pmap_remove(kernel_pmap, addr, addr + size);
offset = addr - VM_MIN_KERNEL_ADDRESS;
+ end = offset + size;
VM_OBJECT_WLOCK(object);
- for (i = 0; i < size; i += PAGE_SIZE) {
- m = vm_page_lookup(object, OFF_TO_IDX(offset + i));
- vm_page_unwire(m, 0);
+ for (m = vm_page_lookup(object, atop(offset)); offset < end;
+ offset += PAGE_SIZE, m = next) {
+ next = vm_page_next(m);
+ vm_page_unwire(m, PQ_NONE);
vm_page_free(m);
}
VM_OBJECT_WUNLOCK(object);
@@ -443,8 +462,8 @@
map->needs_wakeup = TRUE;
vm_map_unlock_and_wait(map, 0);
}
- vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
- VM_PROT_ALL, MAP_ACC_CHARGED);
+ vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW,
+ MAP_ACC_CHARGED);
vm_map_unlock(map);
return (addr);
}
@@ -520,6 +539,43 @@
vm_map_unlock(m);
}
+/*
+ * kmem_bootstrap_free:
+ *
+ * Free pages backing preloaded data (e.g., kernel modules) to the
+ * system. Currently only supported on platforms that create a
+ * vm_phys segment for preloaded data.
+ */
+void
+kmem_bootstrap_free(vm_offset_t start, vm_size_t size)
+{
+#if defined(__i386__) || defined(__amd64__)
+ struct vm_domain *vmd;
+ vm_offset_t end, va;
+ vm_paddr_t pa;
+ vm_page_t m;
+
+ end = trunc_page(start + size);
+ start = round_page(start);
+
+ for (va = start; va < end; va += PAGE_SIZE) {
+ pa = pmap_kextract(va);
+ m = PHYS_TO_VM_PAGE(pa);
+
+ vmd = vm_phys_domain(m);
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_phys_free_pages(m, 0);
+ vmd->vmd_page_count++;
+ vm_phys_freecnt_adj(m, 1);
+ mtx_unlock(&vm_page_queue_free_mtx);
+
+ vm_cnt.v_page_count++;
+ }
+ pmap_remove(kernel_pmap, start, end);
+ (void)vmem_add(kernel_arena, start, end - start, M_WAITOK);
+#endif
+}
+
#ifdef DIAGNOSTIC
/*
* Allow userspace to directly trigger the VM drain routine for testing
Modified: trunk/sys/vm/vm_kern.h
===================================================================
--- trunk/sys/vm/vm_kern.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,11 +58,11 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $
+ * $FreeBSD: stable/11/sys/vm/vm_kern.h 331722 2018-03-29 02:50:57Z eadler $
*/
#ifndef _VM_VM_KERN_H_
-#define _VM_VM_KERN_H_ 1
+#define _VM_VM_KERN_H_
/* Kernel memory management definitions. */
extern vm_map_t kernel_map;
@@ -75,5 +75,7 @@
extern struct vmem *memguard_arena;
extern vm_offset_t swapbkva;
extern u_long vm_kmem_size;
+extern u_int exec_map_entries;
+extern u_int exec_map_entry_size;
-#endif /* _VM_VM_KERN_H_ */
+#endif /* _VM_VM_KERN_H_ */
Modified: trunk/sys/vm/vm_map.c
===================================================================
--- trunk/sys/vm/vm_map.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 355049 2019-11-24 06:54:17Z dougm $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -136,6 +136,8 @@
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
vm_map_entry_t gap_entry);
+static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
+ vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
@@ -277,12 +279,7 @@
struct vmspace *vm;
vm = uma_zalloc(vmspace_zone, M_WAITOK);
-
KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
-
- if (pinit == NULL)
- pinit = &pmap_pinit;
-
if (!pinit(vmspace_pmap(vm))) {
uma_zfree(vmspace_zone, vm);
return (NULL);
@@ -333,8 +330,8 @@
* Delete all of the mappings and pages they hold, then call
* the pmap module to reclaim anything left.
*/
- (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
- vm->vm_map.max_offset);
+ (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
+ vm_map_max(&vm->vm_map));
pmap_release(vmspace_pmap(vm));
vm->vm_map.pmap = NULL;
@@ -346,7 +343,7 @@
{
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "vmspace_free() called with non-sleepable lock held");
+ "vmspace_free() called");
if (vm->vm_refcnt == 0)
panic("vmspace_free: attempt to free already freed vmspace");
@@ -452,7 +449,48 @@
return (vm);
}
+/*
+ * Switch between vmspaces in an AIO kernel process.
+ *
+ * The new vmspace is either the vmspace of a user process obtained
+ * from an active AIO request or the initial vmspace of the AIO kernel
+ * process (when it is idling). Because user processes will block to
+ * drain any active AIO requests before proceeding in exit() or
+ * execve(), the reference count for vmspaces from AIO requests can
+ * never be 0. Similarly, AIO kernel processes hold an extra
+ * reference on their initial vmspace for the life of the process. As
+ * a result, the 'newvm' vmspace always has a non-zero reference
+ * count. This permits an additional reference on 'newvm' to be
+ * acquired via a simple atomic increment rather than the loop in
+ * vmspace_acquire_ref() above.
+ */
void
+vmspace_switch_aio(struct vmspace *newvm)
+{
+ struct vmspace *oldvm;
+
+ /* XXX: Need some way to assert that this is an aio daemon. */
+
+ KASSERT(newvm->vm_refcnt > 0,
+ ("vmspace_switch_aio: newvm unreferenced"));
+
+ oldvm = curproc->p_vmspace;
+ if (oldvm == newvm)
+ return;
+
+ /*
+ * Point to the new address space and refer to it.
+ */
+ curproc->p_vmspace = newvm;
+ atomic_add_int(&newvm->vm_refcnt, 1);
+
+ /* Activate the new mapping. */
+ pmap_activate(curthread);
+
+ vmspace_free(oldvm);
+}
+
+void
_vm_map_lock(vm_map_t map, const char *file, int line)
{
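A hedged sketch (kernel AIO-daemon context, not literal patch code) of how vmspace_switch_aio() is meant to be used; job_vmspace is a hypothetical name for the vmspace held by an active request.

	struct vmspace *myvm, *uservm;

	myvm = curproc->p_vmspace;      /* the daemon's own, long-lived vmspace */
	uservm = job_vmspace;           /* referenced by an in-flight AIO request */

	vmspace_switch_aio(uservm);     /* adopt the submitter's address space */
	/* ... service the request: copyin()/copyout() now operate on the
	 * user process's mappings ... */
	vmspace_switch_aio(myvm);       /* return to the idle vmspace */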
@@ -748,8 +786,8 @@
map->needs_wakeup = FALSE;
map->system_map = 0;
map->pmap = pmap;
- map->min_offset = min;
- map->max_offset = max;
+ map->header.end = min;
+ map->header.start = max;
map->flags = 0;
map->root = NULL;
map->timestamp = 0;
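The two header assignments above are what later allow the explicit '&map->header' checks to be dropped from the traversal loops in this patch: the header entry now doubles as a sentinel. A minimal sketch of the resulting invariant (not literal patch code):

	/* header.end == vm_map_min(map) and header.start == vm_map_max(map).
	 * When a traversal wraps around to &map->header, header->start is at
	 * least as large as any valid "end" argument, so the comparison below
	 * terminates the loop without an "entry != &map->header" test. */
	for (current = entry; current->start < end; current = current->next) {
		/* operate on [current->start, current->end) */
	}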
@@ -952,12 +990,10 @@
"vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
map->nentries, entry, after_where);
VM_MAP_ASSERT_LOCKED(map);
- KASSERT(after_where == &map->header ||
- after_where->end <= entry->start,
+ KASSERT(after_where->end <= entry->start,
("vm_map_entry_link: prev end %jx new start %jx overlap",
(uintmax_t)after_where->end, (uintmax_t)entry->start));
- KASSERT(after_where->next == &map->header ||
- entry->end <= after_where->next->start,
+ KASSERT(entry->end <= after_where->next->start,
("vm_map_entry_link: new end %jx next start %jx overlap",
(uintmax_t)entry->end, (uintmax_t)after_where->next->start));
@@ -979,8 +1015,7 @@
entry->right = map->root;
entry->left = NULL;
}
- entry->adj_free = (entry->next == &map->header ? map->max_offset :
- entry->next->start) - entry->end;
+ entry->adj_free = entry->next->start - entry->end;
vm_map_entry_set_max_free(entry);
map->root = entry;
}
@@ -999,8 +1034,7 @@
else {
root = vm_map_entry_splay(entry->start, entry->left);
root->right = entry->right;
- root->adj_free = (entry->next == &map->header ? map->max_offset :
- entry->next->start) - root->end;
+ root->adj_free = entry->next->start - root->end;
vm_map_entry_set_max_free(root);
}
map->root = root;
@@ -1036,8 +1070,7 @@
if (entry != map->root)
map->root = vm_map_entry_splay(entry->start, map->root);
- entry->adj_free = (entry->next == &map->header ? map->max_offset :
- entry->next->start) - entry->end;
+ entry->adj_free = entry->next->start - entry->end;
vm_map_entry_set_max_free(entry);
}
@@ -1152,7 +1185,8 @@
/*
* Check that the start and end points are not bogus.
*/
- if (start < map->min_offset || end > map->max_offset || start >= end)
+ if (start < vm_map_min(map) || end > vm_map_max(map) ||
+ start >= end)
return (KERN_INVALID_ADDRESS);
/*
@@ -1167,7 +1201,7 @@
/*
* Assert that the next entry doesn't overlap the end point.
*/
- if (prev_entry->next != &map->header && prev_entry->next->start < end)
+ if (prev_entry->next->start < end)
return (KERN_NO_SPACE);
if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
@@ -1295,7 +1329,7 @@
new_entry->wired_count = 0;
new_entry->wiring_thread = NULL;
new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
- new_entry->next_read = OFF_TO_IDX(offset);
+ new_entry->next_read = start;
KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
("overcommit: vm_map_insert leaks vm_map %p", new_entry));
@@ -1352,9 +1386,8 @@
* Request must fit within min/max VM address and must avoid
* address wrap.
*/
- if (start < map->min_offset)
- start = map->min_offset;
- if (start + length > map->max_offset || start + length < start)
+ start = MAX(start, vm_map_min(map));
+ if (start + length > vm_map_max(map) || start + length < start)
return (1);
/* Empty tree means wide open address space. */
@@ -1456,6 +1489,8 @@
KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
object == NULL,
("vm_map_find: non-NULL backing object for stack"));
+ MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
+ (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
(object->flags & OBJ_COLORED) == 0))
find_space = VMFS_ANY_SPACE;
@@ -1496,6 +1531,14 @@
}
start = *addr;
+ } else if ((cow & MAP_REMAP) != 0) {
+ if (start < vm_map_min(map) ||
+ start + length > vm_map_max(map) ||
+ start + length <= length) {
+ result = KERN_INVALID_ADDRESS;
+ break;
+ }
+ vm_map_delete(map, start, start + length);
}
if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
result = vm_map_stack_locked(map, start, length,
@@ -1549,7 +1592,7 @@
*
* The map must be locked.
*
- * This routine guarentees that the passed entry remains valid (though
+ * This routine guarantees that the passed entry remains valid (though
* possibly extended). When merging, this routine may delete one or
* both neighbors.
*/
@@ -1655,6 +1698,8 @@
vm_map_entry_t new_entry;
VM_MAP_ASSERT_LOCKED(map);
+ KASSERT(entry->end > start && entry->start < start,
+ ("_vm_map_clip_start: invalid clip of entry %p", entry));
/*
* Split off the front portion -- note that we must insert the new
@@ -1740,6 +1785,8 @@
vm_map_entry_t new_entry;
VM_MAP_ASSERT_LOCKED(map);
+ KASSERT(entry->start < end && entry->end > end,
+ ("_vm_map_clip_end: invalid clip of entry %p", entry));
/*
* If there is no object backing this entry, we might as well create
@@ -1856,11 +1903,9 @@
* limited number of page mappings are created at the low-end of the
* specified address range. (For this purpose, a superpage mapping
* counts as one page mapping.) Otherwise, all resident pages within
- * the specified address range are mapped. Because these mappings are
- * being created speculatively, cached pages are not reactivated and
- * mapped.
+ * the specified address range are mapped.
*/
-void
+static void
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
{
@@ -1910,7 +1955,7 @@
* free pages allocating pv entries.
*/
if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
- cnt.v_free_count < cnt.v_free_reserved) ||
+ vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
((flags & MAP_PREFAULT_PARTIAL) != 0 &&
tmpidx >= threshold)) {
psize = tmpidx;
@@ -1926,7 +1971,7 @@
(pagesizes[p->psind] - 1)) == 0) {
mask = atop(pagesizes[p->psind]) - 1;
if (tmpidx + mask < psize &&
- vm_page_ps_is_valid(p)) {
+ vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
p += mask;
threshold += mask;
}
@@ -1955,7 +2000,7 @@
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_prot_t new_prot, boolean_t set_max)
{
- vm_map_entry_t current, entry;
+ vm_map_entry_t current, entry, in_tran;
vm_object_t obj;
struct ucred *cred;
vm_prot_t old_prot;
@@ -1963,8 +2008,18 @@
if (start == end)
return (KERN_SUCCESS);
+again:
+ in_tran = NULL;
vm_map_lock(map);
+ /*
+ * Ensure that we are not concurrently wiring pages. vm_map_wire() may
+ * need to fault pages into the map and will drop the map lock while
+ * doing so, and the VM object may end up in an inconsistent state if we
+ * update the protection on the map entry in between faults.
+ */
+ vm_map_wait_busy(map);
+
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
@@ -1976,8 +2031,7 @@
/*
* Make a first pass to check for protection violations.
*/
- for (current = entry; current != &map->header && current->start < end;
- current = current->next) {
+ for (current = entry; current->start < end; current = current->next) {
if ((current->eflags & MAP_ENTRY_GUARD) != 0)
continue;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -1988,15 +2042,29 @@
vm_map_unlock(map);
return (KERN_PROTECTION_FAILURE);
}
+ if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
+ in_tran = current;
}
/*
+ * Postpone the operation until all in-transition map entries have
+ * stabilized. An in-transition entry might already have its pages
+ * wired and wired_count incremented, but not yet have its
+ * MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
+ * vm_fault_copy_entry() in the final loop below.
+ */
+ if (in_tran != NULL) {
+ in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+ vm_map_unlock_and_wait(map, 0);
+ goto again;
+ }
+
+ /*
* Do an accounting pass for private read-only mappings that
* now will do cow due to allowed write (e.g. debugger sets
* breakpoint on text segment)
*/
- for (current = entry; current != &map->header && current->start < end;
- current = current->next) {
+ for (current = entry; current->start < end; current = current->next) {
vm_map_clip_end(map, current, end);
@@ -2050,8 +2118,7 @@
* Go back and fix up protections. [Note that clipping is not
* necessary the second time.]
*/
- for (current = entry; current != &map->header && current->start < end;
- current = current->next) {
+ for (current = entry; current->start < end; current = current->next) {
if ((current->eflags & MAP_ENTRY_GUARD) != 0)
continue;
@@ -2160,10 +2227,8 @@
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
- for (current = entry;
- (current != &map->header) && (current->start < end);
- current = current->next
- ) {
+ for (current = entry; current->start < end;
+ current = current->next) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
@@ -2207,15 +2272,25 @@
* Since we don't clip the vm_map_entry, we have to clip
* the vm_object pindex and count.
*/
- for (current = entry;
- (current != &map->header) && (current->start < end);
- current = current->next
- ) {
+ for (current = entry; current->start < end;
+ current = current->next) {
vm_offset_t useEnd, useStart;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
+ /*
+ * MADV_FREE would otherwise rewind time to
+ * the creation of the shadow object. Because
+ * we hold the VM map read-locked, neither the
+ * entry's object nor the presence of a
+ * backing object can change.
+ */
+ if (behav == MADV_FREE &&
+ current->object.vm_object != NULL &&
+ current->object.vm_object->backing_object != NULL)
+ continue;
+
pstart = OFF_TO_IDX(current->offset);
pend = pstart + atop(current->end - current->start);
useStart = current->start;
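For reference, a minimal userspace program exercising the madvise(MADV_FREE) behaviour that the backing-object check above preserves; it is purely illustrative and independent of this change.

#include <sys/mman.h>
#include <err.h>
#include <string.h>

int
main(void)
{
	size_t len = 1 << 20;
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0xa5, len);                   /* dirty the anonymous pages */
	if (madvise(p, len, MADV_FREE) != 0)    /* contents are now disposable */
		err(1, "madvise");
	return (0);
}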
@@ -2306,7 +2381,7 @@
vm_map_clip_start(map, entry, start);
} else
entry = temp_entry->next;
- while ((entry != &map->header) && (entry->start < end)) {
+ while (entry->start < end) {
vm_map_clip_end(map, entry, end);
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
new_inheritance != VM_INHERIT_ZERO)
@@ -2348,7 +2423,7 @@
}
last_timestamp = map->timestamp;
entry = first_entry;
- while (entry != &map->header && entry->start < end) {
+ while (entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
@@ -2411,8 +2486,7 @@
* If VM_MAP_WIRE_HOLESOK was specified, skip this check.
*/
if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
- (entry->end < end && (entry->next == &map->header ||
- entry->next->start > entry->end))) {
+ (entry->end < end && entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
@@ -2438,8 +2512,7 @@
else
KASSERT(result, ("vm_map_unwire: lookup failed"));
}
- for (entry = first_entry; entry != &map->header && entry->start < end;
- entry = entry->next) {
+ for (entry = first_entry; entry->start < end; entry = entry->next) {
/*
* If VM_MAP_WIRE_HOLESOK was specified, an empty
* space in the unwired region could have been mapped
@@ -2553,7 +2626,7 @@
}
last_timestamp = map->timestamp;
entry = first_entry;
- while (entry != &map->header && entry->start < end) {
+ while (entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
@@ -2690,8 +2763,7 @@
*/
next_entry:
if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
- entry->end < end && (entry->next == &map->header ||
- entry->next->start > entry->end)) {
+ entry->end < end && entry->next->start > entry->end) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
@@ -2708,8 +2780,7 @@
else
KASSERT(result, ("vm_map_wire: lookup failed"));
}
- for (entry = first_entry; entry != &map->header && entry->start < end;
- entry = entry->next) {
+ for (entry = first_entry; entry->start < end; entry = entry->next) {
/*
* If VM_MAP_WIRE_HOLESOK was specified, an empty
* space in the unwired region could have been mapped
@@ -2813,15 +2884,13 @@
/*
* Make a first pass to check for user-wired memory and holes.
*/
- for (current = entry; current != &map->header && current->start < end;
- current = current->next) {
+ for (current = entry; current->start < end; current = current->next) {
if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ARGUMENT);
}
if (end > current->end &&
- (current->next == &map->header ||
- current->end != current->next->start)) {
+ current->end != current->next->start) {
vm_map_unlock_read(map);
return (KERN_INVALID_ADDRESS);
}
@@ -2835,7 +2904,7 @@
* Make a second pass, cleaning/uncaching pages from the indicated
* objects as we go.
*/
- for (current = entry; current != &map->header && current->start < end;) {
+ for (current = entry; current->start < end;) {
offset = current->offset + (start - current->start);
size = (end <= current->end ? end : current->end) - start;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2912,7 +2981,7 @@
{
vm_object_t object;
vm_pindex_t offidxstart, offidxend, count, size1;
- vm_ooffset_t size;
+ vm_size_t size;
vm_map_entry_unlink(map, entry);
object = entry->object.vm_object;
@@ -2938,7 +3007,7 @@
KASSERT(entry->cred == NULL || object->cred == NULL ||
(entry->eflags & MAP_ENTRY_NEEDS_COPY),
("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
- count = OFF_TO_IDX(size);
+ count = atop(size);
offidxstart = OFF_TO_IDX(entry->offset);
offidxend = offidxstart + count;
VM_OBJECT_WLOCK(object);
@@ -3012,7 +3081,7 @@
/*
* Step through all entries in this region
*/
- while ((entry != &map->header) && (entry->start < end)) {
+ while (entry->start < end) {
vm_map_entry_t next;
/*
@@ -3058,11 +3127,17 @@
* Unwire before removing addresses from the pmap; otherwise,
* unwiring will put the entries back in the pmap.
*/
- if (entry->wired_count != 0) {
+ if (entry->wired_count != 0)
vm_map_entry_unwire(map, entry);
- }
- pmap_remove(map->pmap, entry->start, entry->end);
+ /*
+ * Remove mappings for the pages, but only if the
+ * mappings could exist. For instance, it does not
+ * make sense to call pmap_remove() for guard entries.
+ */
+ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
+ entry->object.vm_object != NULL)
+ pmap_remove(map->pmap, entry->start, entry->end);
/*
* Delete the entry only after removing all pmap
@@ -3120,8 +3195,6 @@
entry = tmp_entry;
while (start < end) {
- if (entry == &map->header)
- return (FALSE);
/*
* No holes allowed!
*/
@@ -3325,7 +3398,8 @@
old_map = &vm1->vm_map;
/* Copy immutable fields of vm1 to vm2. */
- vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
+ vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
+ pmap_pinit);
if (vm2 == NULL)
return (NULL);
vm2->vm_taddr = vm1->vm_taddr;
@@ -3529,9 +3603,7 @@
growsize = sgrowsiz;
init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
vm_map_lock(map);
- PROC_LOCK(curproc);
- vmemlim = lim_cur(curproc, RLIMIT_VMEM);
- PROC_UNLOCK(curproc);
+ vmemlim = lim_cur(curthread, RLIMIT_VMEM);
/* If we would blow our VMEM resource limit, no go */
if (map->size + init_ssize > vmemlim) {
rv = KERN_NO_SPACE;
@@ -3572,7 +3644,8 @@
addrbos + max_ssize > vm_map_max(map) ||
addrbos + max_ssize <= addrbos)
return (KERN_INVALID_ADDRESS);
- sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+ sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+ (vm_size_t)stack_guard_page * PAGE_SIZE;
if (sgp >= max_ssize)
return (KERN_INVALID_ARGUMENT);
@@ -3585,10 +3658,9 @@
return (KERN_NO_SPACE);
/*
- * If we can't accomodate max_ssize in the current mapping, no go.
+ * If we can't accommodate max_ssize in the current mapping, no go.
*/
- if ((prev_entry->next != &map->header) &&
- (prev_entry->next->start < addrbos + max_ssize))
+ if (prev_entry->next->start < addrbos + max_ssize)
return (KERN_NO_SPACE);
/*
@@ -3624,11 +3696,25 @@
KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
(new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
("new entry lacks MAP_ENTRY_GROWS_UP"));
+ if (gap_bot == gap_top)
+ return (KERN_SUCCESS);
rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
- if (rv != KERN_SUCCESS)
+ if (rv == KERN_SUCCESS) {
+ /*
+ * Gap can never successfully handle a fault, so
+ * read-ahead logic is never used for it. Re-use
+ * next_read of the gap entry to store
+ * stack_guard_page for vm_map_growstack().
+ */
+ if (orient == MAP_STACK_GROWS_DOWN)
+ new_entry->prev->next_read = sgp;
+ else
+ new_entry->next->next_read = sgp;
+ } else {
(void)vm_map_delete(map, bot, top);
+ }
return (rv);
}
@@ -3663,17 +3749,15 @@
* debugger or AIO daemon. The reason is that the wrong
* resource limits are applied.
*/
- if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
+ if (p != initproc && (map != &p->p_vmspace->vm_map ||
+ p->p_textvp == NULL))
return (KERN_FAILURE);
MPASS(!map->system_map);
- guard = stack_guard_page * PAGE_SIZE;
- PROC_LOCK(p);
- lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
- stacklim = lim_cur(p, RLIMIT_STACK);
- vmemlim = lim_cur(p, RLIMIT_VMEM);
- PROC_UNLOCK(p);
+ lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
+ stacklim = lim_cur(curthread, RLIMIT_STACK);
+ vmemlim = lim_cur(curthread, RLIMIT_VMEM);
retry:
/* If addr is not in a hole for a stack grow area, no need to grow. */
if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
@@ -3697,6 +3781,8 @@
} else {
return (KERN_FAILURE);
}
+ guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+ gap_entry->next_read;
max_grow = gap_entry->end - gap_entry->start;
if (guard > max_grow)
return (KERN_NO_SPACE);
@@ -3844,9 +3930,7 @@
if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
vm_map_unlock(map);
vm_map_wire(map, grow_start, grow_start + grow_amount,
- (p->p_flag & P_SYSTEM)
- ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
- : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
vm_map_lock_read(map);
} else
vm_map_lock_downgrade(map);
@@ -3883,7 +3967,7 @@
KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
("vmspace_exec recursed"));
- newvmspace = vmspace_alloc(minuser, maxuser, NULL);
+ newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
if (newvmspace == NULL)
return (ENOMEM);
newvmspace->vm_swrss = oldvmspace->vm_swrss;
@@ -4125,7 +4209,7 @@
* Return the object/offset from this entry. If the entry was
* copy-on-write or empty, it has been fixed up.
*/
- *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+ *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
*object = entry->object.vm_object;
*out_prot = prot;
@@ -4206,7 +4290,7 @@
* Return the object/offset from this entry. If the entry was
* copy-on-write or empty, it has been fixed up.
*/
- *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+ *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
*object = entry->object.vm_object;
*out_prot = prot;
@@ -4228,6 +4312,27 @@
vm_map_unlock_read(map);
}
+vm_offset_t
+vm_map_max_KBI(const struct vm_map *map)
+{
+
+ return (vm_map_max(map));
+}
+
+vm_offset_t
+vm_map_min_KBI(const struct vm_map *map)
+{
+
+ return (vm_map_min(map));
+}
+
+pmap_t
+vm_map_pmap_KBI(vm_map_t map)
+{
+
+ return (map->pmap);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
Modified: trunk/sys/vm/vm_map.h
===================================================================
--- trunk/sys/vm/vm_map.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_map.h 343426 2019-01-25 11:46:07Z kib $
*/
/*
@@ -105,6 +105,7 @@
vm_offset_t start; /* start address */
vm_offset_t end; /* end address */
vm_offset_t pad0;
+ vm_offset_t next_read; /* vaddr of the next sequential read */
vm_size_t adj_free; /* amount of adjacent free space */
vm_size_t max_free; /* max free space in subtree */
union vm_map_object object; /* object I point to */
@@ -115,7 +116,6 @@
vm_inherit_t inheritance; /* inheritance */
uint8_t read_ahead; /* pages in the read-ahead window */
int wired_count; /* can be paged if = 0 */
- vm_pindex_t next_read; /* index of the next sequential read */
struct ucred *cred; /* tmp storage for creator ref */
struct thread *wiring_thread;
};
@@ -173,15 +173,26 @@
* A map is a set of map entries. These map entries are
* organized both as a binary search tree and as a doubly-linked
* list. Both structures are ordered based upon the start and
- * end addresses contained within each map entry. Sleator and
- * Tarjan's top-down splay algorithm is employed to control
- * height imbalance in the binary search tree.
+ * end addresses contained within each map entry.
*
- * List of locks
+ * Counterintuitively, the map's min offset value is stored in
+ * map->header.end, and its max offset value is stored in
+ * map->header.start.
+ *
+ * The list header has max start value and min end value to act
+ * as sentinels for sequential search of the doubly-linked list.
+ * Sleator and Tarjan's top-down splay algorithm is employed to
+ * control height imbalance in the binary search tree.
+ *
+ * List of locks
* (c) const until freed
*/
struct vm_map {
struct vm_map_entry header; /* List of entries */
+/*
+ map min_offset header.end (c)
+ map max_offset header.start (c)
+*/
struct sx lock; /* Lock for map data */
struct mtx system_mtx;
int nentries; /* Number of entries */
@@ -192,8 +203,6 @@
vm_flags_t flags; /* flags for this vm_map */
vm_map_entry_t root; /* Root of a binary search tree */
pmap_t pmap; /* (c) Physical map */
-#define min_offset header.start /* (c) */
-#define max_offset header.end /* (c) */
int busy;
};
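
This header-as-sentinel arrangement is what lets the loops elsewhere in this commit (vm_map_delete, vm_map_check_protection, mincore's entry walk, the hwpmc scan in munmap) drop their explicit "entry != &map->header" tests: the header stores the map's maximum offset in header.start, so a forward scan that wraps around to the header fails the "start < end" comparison on its own. A small self-contained sketch of the idea, not the kernel structures:

#include <stdio.h>

struct entry {
	unsigned long start, end;
	struct entry *next;
};

int
main(void)
{
	struct entry e1 = { 0x1000, 0x2000, NULL };
	struct entry e2 = { 0x3000, 0x4000, NULL };
	/* header.start plays the role of the map's maximum offset. */
	struct entry header = { ~0UL, 0, &e1 };
	struct entry *ep;

	e1.next = &e2;
	e2.next = &header;	/* the list is circular through the header */

	/* Scan [0, 0x5000): no "ep != &header" test is needed, because
	 * header.start can never be below a caller-supplied end address. */
	for (ep = header.next; ep->start < 0x5000; ep = ep->next)
		printf("[%#lx, %#lx)\n", ep->start, ep->end);
	return (0);
}
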
@@ -204,16 +213,23 @@
#define MAP_BUSY_WAKEUP 0x02
#ifdef _KERNEL
+#ifdef KLD_MODULE
+#define vm_map_max(map) vm_map_max_KBI((map))
+#define vm_map_min(map) vm_map_min_KBI((map))
+#define vm_map_pmap(map) vm_map_pmap_KBI((map))
+#else
static __inline vm_offset_t
vm_map_max(const struct vm_map *map)
{
- return (map->max_offset);
+
+ return (map->header.start);
}
static __inline vm_offset_t
vm_map_min(const struct vm_map *map)
{
- return (map->min_offset);
+
+ return (map->header.end);
}
static __inline pmap_t
@@ -227,6 +243,7 @@
{
map->flags = (map->flags | set) & ~clear;
}
+#endif /* KLD_MODULE */
#endif /* _KERNEL */
/*
@@ -287,6 +304,9 @@
void vm_map_busy(vm_map_t map);
void vm_map_unbusy(vm_map_t map);
void vm_map_wait_busy(vm_map_t map);
+vm_offset_t vm_map_max_KBI(const struct vm_map *map);
+vm_offset_t vm_map_min_KBI(const struct vm_map *map);
+pmap_t vm_map_pmap_KBI(vm_map_t map);
#define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE)
#define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
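
The inline accessors for the kernel build paired with out-of-line *_KBI functions for the KLD_MODULE build form a binary-interface shim: modules compiled against this header never dereference struct vm_map directly, so its layout (including the min/max offsets that now live in the header entry) can change without invalidating modules built against the older header. A sketch of the same pattern with made-up names, presented as a header fragment rather than runnable code:

/* Hypothetical header "foo.h" illustrating the KBI pattern. */
struct foo {
	int limit;
	/* fields past this point may be rearranged between releases */
};

#ifdef KLD_MODULE
#define	foo_limit(f)	foo_limit_KBI((f))	/* modules call out of line */
#else
static inline int
foo_limit(const struct foo *f)
{

	return (f->limit);			/* kernel proper inlines the access */
}
#endif

int foo_limit_KBI(const struct foo *f);		/* defined once in foo.c */
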
@@ -306,9 +326,8 @@
#endif /* _KERNEL */
-/* XXX: number of kernel maps and entries to statically allocate */
+/* XXX: number of kernel maps to statically allocate */
#define MAX_KMAP 10
-#define MAX_KMAPENT 128
/*
* Copy-on-write flags for vm_map operations
@@ -324,6 +343,7 @@
#define MAP_DISABLE_COREDUMP 0x0100
#define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */
#define MAP_VN_WRITECOUNT 0x0400
+#define MAP_REMAP 0x0800
#define MAP_STACK_GROWS_DOWN 0x1000
#define MAP_STACK_GROWS_UP 0x2000
#define MAP_ACC_CHARGED 0x4000
@@ -389,15 +409,13 @@
vm_pindex_t *, vm_prot_t *, boolean_t *);
void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
-void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
- vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
+void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry);
void vm_map_startup (void);
int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
-void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
int flags);
Modified: trunk/sys/vm/vm_meter.c
===================================================================
--- trunk/sys/vm/vm_meter.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_meter.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_meter.c 331722 2018-03-29 02:50:57Z eadler $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -54,24 +54,20 @@
#include <vm/vm_object.h>
#include <sys/sysctl.h>
-struct vmmeter cnt;
+struct vmmeter vm_cnt;
SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
- CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold");
+ CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold");
SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
- CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages");
+ CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages");
SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
- CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock");
+ CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock");
SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
- CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive");
-SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min,
- CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue");
-SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max,
- CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue");
+ CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive");
SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
- CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
+ CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
- CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point");
+ CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point");
static int
sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
@@ -140,7 +136,7 @@
else
total.t_sl++;
if (td->td_wchan ==
- &cnt.v_free_count)
+ &vm_cnt.v_free_count)
total.t_pw++;
}
break;
@@ -209,13 +205,13 @@
}
}
mtx_unlock(&vm_object_list_mtx);
- total.t_free = cnt.v_free_count + cnt.v_cache_count;
+ total.t_free = vm_cnt.v_free_count;
return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
}
/*
- * vcnt() - accumulate statistics from all cpus and the global cnt
- * structure.
+ * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt
+ * structure.
*
* The vmmeter structure is now per-cpu as well as global. Those
* statistics which can be kept on a per-cpu basis (to avoid cache
@@ -222,23 +218,31 @@
* stalls between cpus) can be moved to the per-cpu vmmeter. Remaining
* statistics, such as v_free_reserved, are left in the global
* structure.
- *
- * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
*/
-static int
-vcnt(SYSCTL_HANDLER_ARGS)
+u_int
+vm_meter_cnt(size_t offset)
{
- int count = *(int *)arg1;
- int offset = (char *)arg1 - (char *)&cnt;
+ struct pcpu *pcpu;
+ u_int count;
int i;
+ count = *(u_int *)((char *)&vm_cnt + offset);
CPU_FOREACH(i) {
- struct pcpu *pcpu = pcpu_find(i);
- count += *(int *)((char *)&pcpu->pc_cnt + offset);
+ pcpu = pcpu_find(i);
+ count += *(u_int *)((char *)&pcpu->pc_cnt + offset);
}
- return (SYSCTL_OUT(req, &count, sizeof(int)));
+ return (count);
}
+static int
+cnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ u_int count;
+
+ count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt);
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
0, sizeof(struct vmtotal), vmtotal, "S,vmtotal",
"System virtual memory statistics");
@@ -251,8 +255,8 @@
#define VM_STATS(parent, var, descr) \
SYSCTL_PROC(parent, OID_AUTO, var, \
- CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \
- "IU", descr)
+ CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \
+ cnt_sysctl, "IU", descr)
#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr)
#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr)
@@ -276,9 +280,10 @@
VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
VM_STATS_VM(v_intrans, "In transit page faults");
-VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon");
VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls");
VM_STATS_VM(v_tcached, "Total pages cached");
VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
@@ -293,9 +298,8 @@
VM_STATS_VM(v_active_count, "Active pages");
VM_STATS_VM(v_inactive_target, "Desired inactive pages");
VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_laundry_count, "Pages eligible for laundering");
VM_STATS_VM(v_cache_count, "Pages on cache queue");
-VM_STATS_VM(v_cache_min, "Min pages on cache queue");
-VM_STATS_VM(v_cache_max, "Max pages on cached queue");
VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
VM_STATS_VM(v_forks, "Number of fork() calls");
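
The VM_STATS counters above stay reachable from userland under vm.stats.*; only the backing symbol moved from cnt to vm_cnt. A short example reading one of them with sysctlbyname(3) (the node name is assumed from the stock FreeBSD counter set):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned int free_count;
	size_t len = sizeof(free_count);

	if (sysctlbyname("vm.stats.vm.v_free_count", &free_count, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("free pages: %u\n", free_count);
	return (0);
}
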
Modified: trunk/sys/vm/vm_mmap.c
===================================================================
--- trunk/sys/vm/vm_mmap.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_mmap.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -42,10 +42,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_mmap.c 356634 2020-01-11 15:06:06Z kevans $");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
+#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -74,6 +75,7 @@
#include <sys/sysent.h>
#include <sys/vmmeter.h>
+#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
@@ -93,21 +95,16 @@
#endif
int old_mlock = 0;
-SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
+SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
"Do not apply RLIMIT_MEMLOCK on mlockall");
-TUNABLE_INT("vm.old_mlock", &old_mlock);
+static int mincore_mapped = 1;
+SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
+ "mincore reports mappings, not residency");
#ifdef MAP_32BIT
#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
#endif
-static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
- int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
-static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
- int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
-static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
- int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
-
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
@@ -177,34 +174,48 @@
#endif
int
-sys_mmap(td, uap)
- struct thread *td;
- struct mmap_args *uap;
+sys_mmap(struct thread *td, struct mmap_args *uap)
{
-#ifdef HWPMC_HOOKS
- struct pmckern_map_in pkm;
-#endif
+
+ return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+ uap->flags, uap->fd, uap->pos));
+}
+
+int
+kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags,
+ int fd, off_t pos)
+{
+
+ return (kern_mmap_fpcheck(td, addr0, size, prot, flags, fd, pos, NULL));
+}
+
+/*
+ * When mmap'ing a file, check_fp_fn may be used for the caller to do any
+ * last-minute validation based on the referenced file in a non-racy way.
+ */
+int
+kern_mmap_fpcheck(struct thread *td, uintptr_t addr0, size_t size, int prot,
+ int flags, int fd, off_t pos, mmap_check_fp_fn check_fp_fn)
+{
+ struct vmspace *vms;
struct file *fp;
- struct vnode *vp;
vm_offset_t addr;
- vm_size_t size, pageoff;
- vm_prot_t cap_maxprot, prot, maxprot;
- void *handle;
- objtype_t handle_type;
- int align, error, flags;
- off_t pos;
- struct vmspace *vms = td->td_proc->p_vmspace;
+ vm_size_t pageoff;
+ vm_prot_t cap_maxprot;
+ int align, error;
cap_rights_t rights;
- addr = (vm_offset_t) uap->addr;
- size = uap->len;
- prot = uap->prot & VM_PROT_ALL;
- flags = uap->flags;
- pos = uap->pos;
-
+ vms = td->td_proc->p_vmspace;
fp = NULL;
+ AUDIT_ARG_FD(fd);
+ addr = addr0;
/*
+ * Ignore old flags that used to be defined but did not do anything.
+ */
+ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
+
+ /*
* Enforce the constraints.
* Mapping of length 0 is only allowed for old binaries.
* Anonymous mapping shall specify -1 as filedescriptor and
@@ -214,8 +225,8 @@
* pos.
*/
if (!SV_CURPROC_FLAG(SV_AOUT)) {
- if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
- ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
+ if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
+ ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
return (EINVAL);
} else {
if ((flags & MAP_ANON) != 0)
@@ -223,15 +234,28 @@
}
if (flags & MAP_STACK) {
- if ((uap->fd != -1) ||
+ if ((fd != -1) ||
((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
return (EINVAL);
flags |= MAP_ANON;
pos = 0;
}
+ if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
+ MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
+ MAP_PREFAULT_READ | MAP_GUARD |
+#ifdef MAP_32BIT
+ MAP_32BIT |
+#endif
+ MAP_ALIGNMENT_MASK)) != 0)
+ return (EINVAL);
if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
return (EINVAL);
- if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
+ if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
+ return (EINVAL);
+ if (prot != PROT_NONE &&
+ (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
+ return (EINVAL);
+ if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
return (EINVAL);
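
Two behavioural points fall out of this block: mmap() now rejects flag bits it does not recognize (and, a bit further down, a missing MAP_SHARED/MAP_PRIVATE for binaries built after P_OSREL_MAP_FSTRICT), and a MAP_GUARD request must use PROT_NONE, fd -1, offset 0, and none of the sharing/anon/stack flags. A short userland example that reserves an address range with a guard mapping, assuming a libc and kernel that expose MAP_GUARD:

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	void *p;

	/* Reserve 1 MiB that faults on any access and that later
	 * non-MAP_FIXED allocations will not be placed over. */
	p = mmap(NULL, 1024 * 1024, PROT_NONE, MAP_GUARD, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_GUARD)");
		return (1);
	}
	printf("guard region at %p\n", p);
	return (0);
}
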
@@ -295,28 +319,32 @@
* There should really be a pmap call to determine a reasonable
* location.
*/
- PROC_LOCK(td->td_proc);
if (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr +
- lim_max(td->td_proc, RLIMIT_DATA))))
+ lim_max(td, RLIMIT_DATA))))
addr = round_page((vm_offset_t)vms->vm_daddr +
- lim_max(td->td_proc, RLIMIT_DATA));
- PROC_UNLOCK(td->td_proc);
+ lim_max(td, RLIMIT_DATA));
}
- if ((flags & MAP_GUARD) != 0) {
- handle = NULL;
- handle_type = OBJT_DEFAULT;
- maxprot = VM_PROT_NONE;
- cap_maxprot = VM_PROT_NONE;
+ if (size == 0) {
+ /*
+ * Return success without mapping anything for old
+ * binaries that request a page-aligned mapping of
+ * length 0. For modern binaries, this function
+ * returns an error earlier.
+ */
+ error = 0;
+ } else if ((flags & MAP_GUARD) != 0) {
+ error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
+ VM_PROT_NONE, flags, NULL, pos, FALSE, td);
} else if ((flags & MAP_ANON) != 0) {
/*
* Mapping blank space is trivial.
+ *
+ * This relies on VM_PROT_* matching PROT_*.
*/
- handle = NULL;
- handle_type = OBJT_DEFAULT;
- maxprot = VM_PROT_ALL;
- cap_maxprot = VM_PROT_ALL;
+ error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
+ VM_PROT_ALL, flags, NULL, pos, FALSE, td);
} else {
/*
* Mapping file, get fp for validation and don't let the
@@ -333,94 +361,24 @@
}
if (prot & PROT_EXEC)
cap_rights_set(&rights, CAP_MMAP_X);
- error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+ error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
if (error != 0)
goto done;
- if (fp->f_type == DTYPE_SHM) {
- handle = fp->f_data;
- handle_type = OBJT_SWAP;
- maxprot = VM_PROT_NONE;
-
- /* FREAD should always be set. */
- if (fp->f_flag & FREAD)
- maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
- if (fp->f_flag & FWRITE)
- maxprot |= VM_PROT_WRITE;
- goto map;
- }
- if (fp->f_type != DTYPE_VNODE) {
- error = ENODEV;
+ if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
+ td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
+ error = EINVAL;
goto done;
}
-#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
- defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
- /*
- * POSIX shared-memory objects are defined to have
- * kernel persistence, and are not defined to support
- * read(2)/write(2) -- or even open(2). Thus, we can
- * use MAP_ASYNC to trade on-disk coherence for speed.
- * The shm_open(3) library routine turns on the FPOSIXSHM
- * flag to request this behavior.
- */
- if (fp->f_flag & FPOSIXSHM)
- flags |= MAP_NOSYNC;
-#endif
- vp = fp->f_vnode;
- /*
- * Ensure that file and memory protections are
- * compatible. Note that we only worry about
- * writability if mapping is shared; in this case,
- * current and max prot are dictated by the open file.
- * XXX use the vnode instead? Problem is: what
- * credentials do we use for determination? What if
- * proc does a setuid?
- */
- if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
- maxprot = VM_PROT_NONE;
- else
- maxprot = VM_PROT_EXECUTE;
- if (fp->f_flag & FREAD) {
- maxprot |= VM_PROT_READ;
- } else if (prot & PROT_READ) {
- error = EACCES;
- goto done;
- }
- /*
- * If we are sharing potential changes (either via
- * MAP_SHARED or via the implicit sharing of character
- * device mappings), and we are trying to get write
- * permission although we opened it without asking
- * for it, bail out.
- */
- if ((flags & MAP_SHARED) != 0) {
- if ((fp->f_flag & FWRITE) != 0) {
- maxprot |= VM_PROT_WRITE;
- } else if ((prot & PROT_WRITE) != 0) {
- error = EACCES;
+ if (check_fp_fn != NULL) {
+ error = check_fp_fn(fp, prot, cap_maxprot, flags);
+ if (error != 0)
goto done;
- }
- } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
- maxprot |= VM_PROT_WRITE;
- cap_maxprot |= VM_PROT_WRITE;
}
- handle = (void *)vp;
- handle_type = OBJT_VNODE;
+ /* This relies on VM_PROT_* matching PROT_*. */
+ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
+ cap_maxprot, flags, pos, td);
}
-map:
- td->td_fpop = fp;
- maxprot &= cap_maxprot;
- error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
- flags, handle_type, handle, pos);
- td->td_fpop = NULL;
-#ifdef HWPMC_HOOKS
- /* inform hwpmc(4) if an executable is being mapped */
- if (error == 0 && handle_type == OBJT_VNODE &&
- (prot & PROT_EXEC)) {
- pkm.pm_file = handle;
- pkm.pm_address = (uintptr_t) addr;
- PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
- }
-#endif
+
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
@@ -430,19 +388,15 @@
return (error);
}
+#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
- struct mmap_args oargs;
- oargs.addr = uap->addr;
- oargs.len = uap->len;
- oargs.prot = uap->prot;
- oargs.flags = uap->flags;
- oargs.fd = uap->fd;
- oargs.pos = uap->pos;
- return (sys_mmap(td, &oargs));
+ return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+ uap->flags, uap->fd, uap->pos));
}
+#endif
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
@@ -456,11 +410,8 @@
};
#endif
int
-ommap(td, uap)
- struct thread *td;
- struct ommap_args *uap;
+ommap(struct thread *td, struct ommap_args *uap)
{
- struct mmap_args nargs;
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
@@ -471,6 +422,7 @@
PROT_WRITE | PROT_READ,
PROT_EXEC | PROT_WRITE | PROT_READ,
};
+ int flags, prot;
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
@@ -477,30 +429,27 @@
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
- nargs.addr = uap->addr;
- nargs.len = uap->len;
- nargs.prot = cvtbsdprot[uap->prot & 0x7];
+ prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
- nargs.prot != 0)
- nargs.prot |= PROT_EXEC;
+ prot != 0)
+ prot |= PROT_EXEC;
#endif
#endif
- nargs.flags = 0;
+ flags = 0;
if (uap->flags & OMAP_ANON)
- nargs.flags |= MAP_ANON;
+ flags |= MAP_ANON;
if (uap->flags & OMAP_COPY)
- nargs.flags |= MAP_COPY;
+ flags |= MAP_COPY;
if (uap->flags & OMAP_SHARED)
- nargs.flags |= MAP_SHARED;
+ flags |= MAP_SHARED;
else
- nargs.flags |= MAP_PRIVATE;
+ flags |= MAP_PRIVATE;
if (uap->flags & OMAP_FIXED)
- nargs.flags |= MAP_FIXED;
- nargs.fd = uap->fd;
- nargs.pos = uap->pos;
- return (sys_mmap(td, &nargs));
+ flags |= MAP_FIXED;
+ return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
+ uap->fd, uap->pos));
}
#endif /* COMPAT_43 */
@@ -513,20 +462,21 @@
};
#endif
int
-sys_msync(td, uap)
- struct thread *td;
- struct msync_args *uap;
+sys_msync(struct thread *td, struct msync_args *uap)
{
+
+ return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
+}
+
+int
+kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
+{
vm_offset_t addr;
- vm_size_t size, pageoff;
- int flags;
+ vm_size_t pageoff;
vm_map_t map;
int rv;
- addr = (vm_offset_t) uap->addr;
- size = uap->len;
- flags = uap->flags;
-
+ addr = addr0;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
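
The sys_msync()/kern_msync() split above is repeated for munmap, mprotect, madvise, mincore, mlock and munlock below: the syscall handler only unpacks the uap structure, while a kern_*() function taking scalar arguments carries the logic, so compat shims (the freebsd6_mmap wrapper above is one) can call in without fabricating a uap; kern_mmap additionally gains a kern_mmap_fpcheck() variant whose check_fp_fn callback lets the caller vet the resolved file before the mapping is created. A sketch of the shape of that split, with hypothetical names:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical illustration of the syscall/kern_ layering. */
struct frobnicate_args {
	void	*addr;
	size_t	 len;
	int	 flags;
};

static int
kern_frobnicate(uintptr_t addr0, size_t len, int flags)
{

	/* all argument checking and real work happens here */
	(void)addr0; (void)len; (void)flags;
	return (0);
}

int
sys_frobnicate(struct frobnicate_args *uap)
{

	/* thin unpacking shim; compat code calls kern_frobnicate() directly */
	return (kern_frobnicate((uintptr_t)uap->addr, uap->len, uap->flags));
}
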
@@ -565,23 +515,28 @@
};
#endif
int
-sys_munmap(td, uap)
- struct thread *td;
- struct munmap_args *uap;
+sys_munmap(struct thread *td, struct munmap_args *uap)
{
+
+ return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
+{
#ifdef HWPMC_HOOKS
struct pmckern_map_out pkm;
vm_map_entry_t entry;
+ bool pmc_handled;
#endif
vm_offset_t addr;
- vm_size_t size, pageoff;
+ vm_size_t pageoff;
vm_map_t map;
- addr = (vm_offset_t) uap->addr;
- size = uap->len;
if (size == 0)
return (EINVAL);
+ addr = addr0;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
@@ -597,20 +552,23 @@
return (EINVAL);
vm_map_lock(map);
#ifdef HWPMC_HOOKS
- /*
- * Inform hwpmc if the address range being unmapped contains
- * an executable region.
- */
- pkm.pm_address = (uintptr_t) NULL;
- if (vm_map_lookup_entry(map, addr, &entry)) {
- for (;
- entry != &map->header && entry->start < addr + size;
- entry = entry->next) {
- if (vm_map_check_protection(map, entry->start,
- entry->end, VM_PROT_EXECUTE) == TRUE) {
- pkm.pm_address = (uintptr_t) addr;
- pkm.pm_size = (size_t) size;
- break;
+ pmc_handled = false;
+ if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
+ pmc_handled = true;
+ /*
+ * Inform hwpmc if the address range being unmapped contains
+ * an executable region.
+ */
+ pkm.pm_address = (uintptr_t) NULL;
+ if (vm_map_lookup_entry(map, addr, &entry)) {
+ for (; entry->start < addr + size;
+ entry = entry->next) {
+ if (vm_map_check_protection(map, entry->start,
+ entry->end, VM_PROT_EXECUTE) == TRUE) {
+ pkm.pm_address = (uintptr_t) addr;
+ pkm.pm_size = (size_t) size;
+ break;
+ }
}
}
}
@@ -618,14 +576,16 @@
vm_map_delete(map, addr, addr + size);
#ifdef HWPMC_HOOKS
- /* downgrade the lock to prevent a LOR with the pmc-sx lock */
- vm_map_lock_downgrade(map);
- if (pkm.pm_address != (uintptr_t) NULL)
- PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
- vm_map_unlock_read(map);
-#else
- vm_map_unlock(map);
+ if (__predict_false(pmc_handled)) {
+ /* downgrade the lock to prevent a LOR with the pmc-sx lock */
+ vm_map_lock_downgrade(map);
+ if (pkm.pm_address != (uintptr_t) NULL)
+ PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
+ vm_map_unlock_read(map);
+ } else
#endif
+ vm_map_unlock(map);
+
/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
return (0);
}
@@ -638,22 +598,30 @@
};
#endif
int
-sys_mprotect(td, uap)
- struct thread *td;
- struct mprotect_args *uap;
+sys_mprotect(struct thread *td, struct mprotect_args *uap)
{
+
+ return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
+}
+
+int
+kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
+{
vm_offset_t addr;
- vm_size_t size, pageoff;
- vm_prot_t prot;
+ vm_size_t pageoff;
- addr = (vm_offset_t) uap->addr;
- size = uap->len;
- prot = uap->prot & VM_PROT_ALL;
-
+ addr = addr0;
+ prot = (prot & VM_PROT_ALL);
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ if (((addr + size) & 0xffffffff) < addr)
+ return (EINVAL);
+ } else
+#endif
if (addr + size < addr)
return (EINVAL);
@@ -715,8 +683,15 @@
int
sys_madvise(struct thread *td, struct madvise_args *uap)
{
- vm_offset_t start, end;
+
+ return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
+}
+
+int
+kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
+{
vm_map_t map;
+ vm_offset_t addr, end, start;
int flags;
/*
@@ -723,7 +698,7 @@
* Check for our special case, advising the swap pager we are
* "immortal."
*/
- if (uap->behav == MADV_PROTECT) {
+ if (behav == MADV_PROTECT) {
flags = PPROT_SET;
return (kern_procctl(td, P_PID, td->td_proc->p_pid,
PROC_SPROTECT, &flags));
@@ -732,7 +707,7 @@
/*
* Check for illegal behavior
*/
- if (uap->behav < 0 || uap->behav > MADV_CORE)
+ if (behav < 0 || behav > MADV_CORE)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
@@ -739,10 +714,10 @@
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
map = &td->td_proc->p_vmspace->vm_map;
- if ((vm_offset_t)uap->addr < vm_map_min(map) ||
- (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
+ addr = addr0;
+ if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
return (EINVAL);
- if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
+ if ((addr + len) < addr)
return (EINVAL);
/*
@@ -749,10 +724,10 @@
* Since this routine is only advisory, we default to conservative
* behavior.
*/
- start = trunc_page((vm_offset_t) uap->addr);
- end = round_page((vm_offset_t) uap->addr + uap->len);
+ start = trunc_page(addr);
+ end = round_page(addr + len);
- if (vm_map_madvise(map, start, end, uap->behav))
+ if (vm_map_madvise(map, start, end, behav))
return (EINVAL);
return (0);
}
@@ -768,11 +743,17 @@
int
sys_mincore(struct thread *td, struct mincore_args *uap)
{
+
+ return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
+}
+
+int
+kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
+{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
pmap_t pmap;
vm_map_t map;
- char *vec;
int error = 0;
int vecindex, lastvecindex;
vm_map_entry_t current;
@@ -789,17 +770,12 @@
* Make sure that the addresses presented are valid for user
* mode.
*/
- first_addr = addr = trunc_page((vm_offset_t) uap->addr);
- end = addr + (vm_size_t)round_page(uap->len);
+ first_addr = addr = trunc_page(addr0);
+ end = addr + (vm_size_t)round_page(len);
map = &td->td_proc->p_vmspace->vm_map;
if (end > vm_map_max(map) || end < addr)
return (ENOMEM);
- /*
- * Address of byte vector
- */
- vec = uap->vec;
-
pmap = vmspace_pmap(td->td_proc->p_vmspace);
vm_map_lock_read(map);
@@ -817,16 +793,12 @@
* up the pages elsewhere.
*/
lastvecindex = -1;
- for (current = entry;
- (current != &map->header) && (current->start < end);
- current = current->next) {
+ for (current = entry; current->start < end; current = current->next) {
/*
* check for contiguity
*/
- if (current->end < end &&
- (entry->next == &map->header ||
- current->next->start > current->end)) {
+ if (current->end < end && current->next->start > current->end) {
vm_map_unlock_read(map);
return (ENOMEM);
}
@@ -862,8 +834,17 @@
retry:
m = NULL;
mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
- if (locked_pa != 0) {
+ if (mincore_mapped) {
/*
+ * We only care about this pmap's
+ * mapping of the page, if any.
+ */
+ if (locked_pa != 0) {
+ vm_page_unlock(PHYS_TO_VM_PAGE(
+ locked_pa));
+ }
+ } else if (locked_pa != 0) {
+ /*
* The page is mapped by this process but not
* both accessed and modified. It is also
* managed. Acquire the object lock so that
@@ -905,9 +886,6 @@
pindex = OFF_TO_IDX(current->offset +
(addr - current->start));
m = vm_page_lookup(object, pindex);
- if (m == NULL &&
- vm_page_is_cached(object, pindex))
- mincoreinfo = MINCORE_INCORE;
if (m != NULL && m->valid == 0)
m = NULL;
if (m != NULL)
@@ -945,7 +923,7 @@
/*
* calculate index into user supplied byte vector
*/
- vecindex = OFF_TO_IDX(addr - first_addr);
+ vecindex = atop(addr - first_addr);
/*
* If we have skipped map entries, we need to make sure that
@@ -991,7 +969,7 @@
/*
* Zero the last entries in the byte vector.
*/
- vecindex = OFF_TO_IDX(end - first_addr);
+ vecindex = atop(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
++lastvecindex;
error = subyte(vec + lastvecindex, 0);
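
The byte-vector index is now computed with atop() as well, and the new vm.mincore_mapped sysctl declared near the top of this file chooses what each byte reports: with the default of 1, mincore(2) describes this pmap's mappings; with 0, it reports residency of the underlying pages. A small userland example of the interface:

#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	size_t len = 4 * (size_t)pagesz;
	char *p, vec[4];

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	p[0] = 1;			/* touch only the first page */
	if (mincore(p, len, vec) == -1) {
		perror("mincore");
		return (1);
	}
	for (size_t i = 0; i < 4; i++)
		printf("page %zu: %s\n", i,
		    (vec[i] & MINCORE_INCORE) ? "incore" : "not incore");
	return (0);
}
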
@@ -1023,11 +1001,12 @@
sys_mlock(struct thread *td, struct mlock_args *uap)
{
- return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+ return (kern_mlock(td->td_proc, td->td_ucred,
+ __DECONST(uintptr_t, uap->addr), uap->len));
}
int
-vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
vm_offset_t addr, end, last, start;
vm_size_t npages, size;
@@ -1038,7 +1017,7 @@
error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
if (error)
return (error);
- addr = (vm_offset_t)addr0;
+ addr = addr0;
size = len;
last = addr + size;
start = trunc_page(addr);
@@ -1051,12 +1030,12 @@
map = &proc->p_vmspace->vm_map;
PROC_LOCK(proc);
nsize = ptoa(npages + pmap_wired_count(map->pmap));
- if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
+ if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(proc);
return (ENOMEM);
}
PROC_UNLOCK(proc);
- if (npages + cnt.v_wire_count > vm_page_max_wired)
+ if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef RACCT
if (racct_enable) {
@@ -1106,7 +1085,7 @@
*/
if (!old_mlock && uap->how & MCL_CURRENT) {
PROC_LOCK(td->td_proc);
- if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+ if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
@@ -1195,12 +1174,16 @@
};
#endif
int
-sys_munlock(td, uap)
- struct thread *td;
- struct munlock_args *uap;
+sys_munlock(struct thread *td, struct munlock_args *uap)
{
+
+ return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
+{
vm_offset_t addr, end, last, start;
- vm_size_t size;
#ifdef RACCT
vm_map_t map;
#endif
@@ -1209,8 +1192,7 @@
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
- addr = (vm_offset_t)uap->addr;
- size = uap->len;
+ addr = addr0;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
@@ -1235,9 +1217,6 @@
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on vnodes.
- *
- * For VCHR vnodes, the vnode lock is held over the call to
- * vm_mmap_cdev() to keep vp->v_rdev valid.
*/
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
@@ -1247,7 +1226,7 @@
{
struct vattr va;
vm_object_t obj;
- vm_offset_t foff;
+ vm_ooffset_t foff;
struct ucred *cred;
int error, flags, locktype;
@@ -1258,6 +1237,7 @@
locktype = LK_SHARED;
if ((error = vget(vp, locktype, td)) != 0)
return (error);
+ AUDIT_ARG_VNODE1(vp);
foff = *foffp;
flags = *flagsp;
obj = vp->v_object;
@@ -1284,12 +1264,6 @@
*writecounted = TRUE;
vnode_pager_update_writecount(obj, 0, objsize);
}
- } else if (vp->v_type == VCHR) {
- error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
- vp->v_rdev, foffp, objp);
- if (error == 0)
- goto mark_atime;
- goto done;
} else {
error = EINVAL;
goto done;
@@ -1297,13 +1271,14 @@
if ((error = VOP_GETATTR(vp, &va, cred)))
goto done;
#ifdef MAC
- error = mac_vnode_check_mmap(cred, vp, prot, flags);
+ /* This relies on VM_PROT_* matching PROT_*. */
+ error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
if (error != 0)
goto done;
#endif
if ((flags & MAP_SHARED) != 0) {
if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
- if (prot & PROT_WRITE) {
+ if (prot & VM_PROT_WRITE) {
error = EPERM;
goto done;
}
@@ -1318,22 +1293,26 @@
objsize = round_page(va.va_size);
if (va.va_nlink == 0)
flags |= MAP_NOSYNC;
- if (obj->type == OBJT_VNODE)
+ if (obj->type == OBJT_VNODE) {
obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
cred);
- else {
+ if (obj == NULL) {
+ error = ENOMEM;
+ goto done;
+ }
+ } else {
KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
("wrong object type"));
- vm_object_reference(obj);
+ VM_OBJECT_WLOCK(obj);
+ vm_object_reference_locked(obj);
+#if VM_NRESERVLEVEL > 0
+ vm_object_color(obj, 0);
+#endif
+ VM_OBJECT_WUNLOCK(obj);
}
- if (obj == NULL) {
- error = ENOMEM;
- goto done;
- }
*objp = obj;
*flagsp = flags;
-mark_atime:
vfs_mark_atime(vp, cred);
done:
@@ -1352,21 +1331,18 @@
* operations on cdevs.
*/
int
-vm_mmap_cdev(struct thread *td, vm_size_t objsize,
- vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
- struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
+vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
+ vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
+ vm_ooffset_t *foff, vm_object_t *objp)
{
vm_object_t obj;
- struct cdevsw *dsw;
- int error, flags, ref;
+ int error, flags;
flags = *flagsp;
- dsw = dev_refthread(cdev, &ref);
- if (dsw == NULL)
- return (ENXIO);
if (dsw->d_flags & D_MMAP_ANON) {
- dev_relthread(cdev, ref);
+ *objp = NULL;
+ *foff = 0;
*maxprotp = VM_PROT_ALL;
*flagsp |= MAP_ANON;
return (0);
@@ -1375,24 +1351,18 @@
* cdevs do not provide private mappings of any kind.
*/
if ((*maxprotp & VM_PROT_WRITE) == 0 &&
- (prot & PROT_WRITE) != 0) {
- dev_relthread(cdev, ref);
+ (prot & VM_PROT_WRITE) != 0)
return (EACCES);
- }
- if (flags & (MAP_PRIVATE|MAP_COPY)) {
- dev_relthread(cdev, ref);
+ if (flags & (MAP_PRIVATE|MAP_COPY))
return (EINVAL);
- }
/*
* Force device mappings to be shared.
*/
flags |= MAP_SHARED;
#ifdef MAC_XXX
- error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
- if (error != 0) {
- dev_relthread(cdev, ref);
+ error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
+ if (error != 0)
return (error);
- }
#endif
/*
* First, try d_mmap_single(). If that is not implemented
@@ -1404,7 +1374,6 @@
* XXX assumes VM_PROT_* == PROT_*
*/
error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
- dev_relthread(cdev, ref);
if (error != ENODEV)
return (error);
obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
@@ -1417,65 +1386,96 @@
}
/*
- * vm_mmap_shm()
+ * vm_mmap()
*
- * MPSAFE
- *
- * Helper function for vm_mmap. Perform sanity check specific for mmap
- * operations on shm file descriptors.
+ * Internal version of mmap used by exec, sys5 shared memory, and
+ * various device drivers. Handle is either a vnode pointer, a
+ * character device, or NULL for MAP_ANON.
*/
int
-vm_mmap_shm(struct thread *td, vm_size_t objsize,
- vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
- struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
+vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+ vm_prot_t maxprot, int flags,
+ objtype_t handle_type, void *handle,
+ vm_ooffset_t foff)
{
+ vm_object_t object;
+ struct thread *td = curthread;
int error;
+ boolean_t writecounted;
- if ((*flagsp & MAP_SHARED) != 0 &&
- (*maxprotp & VM_PROT_WRITE) == 0 &&
- (prot & PROT_WRITE) != 0)
- return (EACCES);
-#ifdef MAC
- error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
- if (error != 0)
- return (error);
-#endif
- error = shm_mmap(shmfd, objsize, foff, objp);
+ if (size == 0)
+ return (EINVAL);
+
+ size = round_page(size);
+ object = NULL;
+ writecounted = FALSE;
+
+ /*
+ * Lookup/allocate object.
+ */
+ switch (handle_type) {
+ case OBJT_DEVICE: {
+ struct cdevsw *dsw;
+ struct cdev *cdev;
+ int ref;
+
+ cdev = handle;
+ dsw = dev_refthread(cdev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
+ dsw, &foff, &object);
+ dev_relthread(cdev, ref);
+ break;
+ }
+ case OBJT_VNODE:
+ error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
+ handle, &foff, &object, &writecounted);
+ break;
+ case OBJT_DEFAULT:
+ if (handle == NULL) {
+ error = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ error = EINVAL;
+ break;
+ }
if (error)
return (error);
- return (0);
+
+ error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+ foff, writecounted, td);
+ if (error != 0 && object != NULL) {
+ /*
+ * If this mapping was accounted for in the vnode's
+ * writecount, then undo that now.
+ */
+ if (writecounted)
+ vnode_pager_release_writecount(object, 0, size);
+ vm_object_deallocate(object);
+ }
+ return (error);
}
/*
- * vm_mmap()
- *
- * MPSAFE
- *
- * Internal version of mmap. Currently used by mmap, exec, and sys5
- * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
+ * Internal version of mmap that maps a specific VM object into an
+ * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
*/
int
-vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
- vm_prot_t maxprot, int flags,
- objtype_t handle_type, void *handle,
- vm_ooffset_t foff)
+vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+ vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
+ boolean_t writecounted, struct thread *td)
{
boolean_t curmap, fitit;
vm_offset_t max_addr;
- vm_object_t object = NULL;
- struct thread *td = curthread;
int docow, error, findspace, rv;
- boolean_t writecounted;
- if (size == 0)
- return (0);
-
- size = round_page(size);
-
curmap = map == &td->td_proc->p_vmspace->vm_map;
if (curmap) {
PROC_LOCK(td->td_proc);
- if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
+ if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
@@ -1485,7 +1485,7 @@
}
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
if (ptoa(pmap_wired_count(map->pmap)) + size >
- lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+ lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
racct_set_force(td->td_proc, RACCT_VMEM,
map->size);
PROC_UNLOCK(td->td_proc);
@@ -1505,11 +1505,11 @@
/*
* We currently can only deal with page aligned file offsets.
- * The check is here rather than in the syscall because the
- * kernel calls this function internally for other mmaping
- * operations (such as in exec) and non-aligned offsets will
- * cause pmap inconsistencies...so we want to be sure to
- * disallow this in all cases.
+ * The mmap() system call already enforces this by subtracting
+ * the page offset from the file offset, but checking here
+ * catches errors in device drivers (e.g. d_single_mmap()
+ * callbacks) and other internal mapping requests (such as in
+ * exec).
*/
if (foff & PAGE_MASK)
return (EINVAL);
@@ -1522,44 +1522,11 @@
return (EINVAL);
fitit = FALSE;
}
- writecounted = FALSE;
- /*
- * Lookup/allocate object.
- */
- switch (handle_type) {
- case OBJT_DEVICE:
- error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
- handle, &foff, &object);
- break;
- case OBJT_VNODE:
- error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
- handle, &foff, &object, &writecounted);
- break;
- case OBJT_SWAP:
- error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
- handle, foff, &object);
- break;
- case OBJT_DEFAULT:
- if (handle == NULL) {
- error = 0;
- break;
- }
- /* FALLTHROUGH */
- default:
- error = EINVAL;
- break;
- }
- if (error)
- return (error);
if (flags & MAP_ANON) {
- object = NULL;
+ if (object != NULL || foff != 0)
+ return (EINVAL);
docow = 0;
- /*
- * Unnamed anonymous regions always start at 0.
- */
- if (handle == 0)
- foff = 0;
} else if (flags & MAP_PREFAULT_READ)
docow = MAP_PREFAULT;
else
@@ -1600,15 +1567,9 @@
max_addr = MAP_32BIT_MAX_ADDR;
#endif
if (curmap) {
- vm_offset_t min_addr;
-
- PROC_LOCK(td->td_proc);
- min_addr = round_page((vm_offset_t)td->td_proc->
- p_vmspace->vm_daddr + lim_max(td->td_proc,
- RLIMIT_DATA));
- PROC_UNLOCK(td->td_proc);
rv = vm_map_find_min(map, object, foff, addr, size,
- min_addr, max_addr,
+ round_page((vm_offset_t)td->td_proc->p_vmspace->
+ vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
findspace, prot, maxprot, docow);
} else {
rv = vm_map_find(map, object, foff, addr, size,
@@ -1629,19 +1590,6 @@
VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
}
- } else {
- /*
- * If this mapping was accounted for in the vnode's
- * writecount, then undo that now.
- */
- if (writecounted)
- vnode_pager_release_writecount(object, 0, size);
- /*
- * Lose the object reference. Will destroy the
- * object if it's an unnamed anonymous mapping
- * or named anonymous without other references.
- */
- vm_object_deallocate(object);
}
return (vm_mmap_to_errno(rv));
}
Modified: trunk/sys/vm/vm_object.c
===================================================================
--- trunk/sys/vm/vm_object.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_object.c 352331 2019-09-14 13:35:48Z kib $");
#include "opt_vm.h"
@@ -74,6 +74,7 @@
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
+#include <sys/pctrie.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h> /* for curproc, pageproc */
@@ -179,9 +180,6 @@
("object %p has reservations",
object));
#endif
- KASSERT(vm_object_cache_is_empty(object),
- ("object %p has cached pages",
- object));
KASSERT(object->paging_in_progress == 0,
("object %p paging_in_progress = %d",
object, object->paging_in_progress));
@@ -203,19 +201,16 @@
vm_object_t object;
object = (vm_object_t)mem;
- bzero(&object->lock, sizeof(object->lock));
- rw_init_flags(&object->lock, "vm object", RW_DUPOK);
+ rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW);
/* These are true for any object that has been freed */
object->type = OBJT_DEAD;
object->ref_count = 0;
- object->rtree.rt_root = 0;
- object->rtree.rt_flags = 0;
+ vm_radix_init(&object->rtree);
object->paging_in_progress = 0;
object->resident_page_count = 0;
object->shadow_count = 0;
- object->cache.rt_root = 0;
- object->cache.rt_flags = 0;
+ object->flags = OBJ_DEAD;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -231,6 +226,16 @@
LIST_INIT(&object->shadow_head);
object->type = type;
+ if (type == OBJT_SWAP)
+ pctrie_init(&object->un_pager.swp.swp_blks);
+
+ /*
+ * Ensure that swap_pager_swapoff() iteration over object_list
+ * sees up to date type and pctrie head if it observed
+ * non-dead object.
+ */
+ atomic_thread_fence_rel();
+
switch (type) {
case OBJT_DEAD:
panic("_vm_object_allocate: can't create OBJT_DEAD");
@@ -266,6 +271,7 @@
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
+ umtx_shm_object_init(object);
}
/*
@@ -280,8 +286,8 @@
mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
rw_init(&kernel_object->lock, "kernel vm object");
- _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
- kernel_object);
+ _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+ VM_MIN_KERNEL_ADDRESS), kernel_object);
#if VM_NRESERVLEVEL > 0
kernel_object->flags |= OBJ_COLORED;
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -288,8 +294,8 @@
#endif
rw_init(&kmem_object->lock, "kmem vm object");
- _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
- kmem_object);
+ _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+ VM_MIN_KERNEL_ADDRESS), kmem_object);
#if VM_NRESERVLEVEL > 0
kmem_object->flags |= OBJ_COLORED;
kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -308,7 +314,7 @@
#endif
vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- vm_radix_init();
+ vm_radix_zinit();
}
void
@@ -472,11 +478,14 @@
KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
#ifdef INVARIANTS
if (object->ref_count == 0) {
- vprint("vm_object_vndeallocate", vp);
+ vn_printf(vp, "vm_object_vndeallocate ");
panic("vm_object_vndeallocate: bad object reference count");
}
#endif
+ if (!umtx_shm_vnobj_persistent && object->ref_count == 1)
+ umtx_shm_object_terminated(object);
+
/*
* The test for text of vp vnode does not need a bypass to
* reach right VV_TEXT there, since it is obtained from
@@ -649,6 +658,7 @@
return;
}
doterm:
+ umtx_shm_object_terminated(object);
temp = object->backing_object;
if (temp != NULL) {
KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
@@ -697,6 +707,89 @@
}
/*
+ * vm_object_terminate_pages removes any remaining pageable pages
+ * from the object and resets the object to an empty state.
+ */
+static void
+vm_object_terminate_pages(vm_object_t object)
+{
+ vm_page_t p, p_next;
+ struct mtx *mtx, *mtx1;
+ struct vm_pagequeue *pq, *pq1;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ mtx = NULL;
+ pq = NULL;
+
+ /*
+ * Free any remaining pageable pages. This also removes them from the
+ * paging queues. However, don't free wired pages, just remove them
+ * from the object. Rather than incrementally removing each page from
+ * the object, the page and object are reset to any empty state.
+ */
+ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
+ vm_page_assert_unbusied(p);
+ if ((object->flags & OBJ_UNMANAGED) == 0) {
+ /*
+ * vm_page_free_prep() only needs the page
+ * lock for managed pages.
+ */
+ mtx1 = vm_page_lockptr(p);
+ if (mtx1 != mtx) {
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+ if (pq != NULL) {
+ vm_pagequeue_unlock(pq);
+ pq = NULL;
+ }
+ mtx = mtx1;
+ mtx_lock(mtx);
+ }
+ }
+ p->object = NULL;
+ if (p->wire_count != 0)
+ goto unlist;
+ PCPU_INC(cnt.v_pfree);
+ p->flags &= ~PG_ZERO;
+ if (p->queue != PQ_NONE) {
+ KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
+ "page %p is not queued", p));
+ pq1 = vm_page_pagequeue(p);
+ if (pq != pq1) {
+ if (pq != NULL)
+ vm_pagequeue_unlock(pq);
+ pq = pq1;
+ vm_pagequeue_lock(pq);
+ }
+ }
+ if (vm_page_free_prep(p, true))
+ continue;
+unlist:
+ TAILQ_REMOVE(&object->memq, p, listq);
+ }
+ if (pq != NULL)
+ vm_pagequeue_unlock(pq);
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+
+ vm_page_free_phys_pglist(&object->memq);
+
+ /*
+ * If the object contained any pages, then reset it to an empty state.
+ * None of the object's fields, including "resident_page_count", were
+ * modified by the preceding loop.
+ */
+ if (object->resident_page_count != 0) {
+ vm_radix_reclaim_allnodes(&object->rtree);
+ TAILQ_INIT(&object->memq);
+ object->resident_page_count = 0;
+ if (object->type == OBJT_VNODE)
+ vdrop(object->handle);
+ }
+}
+
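
vm_object_terminate_pages() batches its locking: it remembers the last page lock and page-queue lock it acquired, switches only when the next page needs a different one, and then frees the collected physical pages in a single pass via vm_page_free_phys_pglist(). The same idiom in a self-contained form, with a hypothetical lock-per-bucket layout:

#include <stdio.h>

#define NLOCKS	4
static int lock_switches;

static int *
lockptr(int item)
{
	static int locks[NLOCKS];

	return (&locks[item % NLOCKS]);		/* items hash to one of a few locks */
}

static void lock(int *l)   { lock_switches++; (void)l; }
static void unlock(int *l) { (void)l; }

int
main(void)
{
	int items[] = { 0, 4, 8, 1, 5, 2 };	/* the first three share a lock */
	int *held = NULL, *want;

	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		want = lockptr(items[i]);
		if (want != held) {		/* drop/reacquire only when needed */
			if (held != NULL)
				unlock(held);
			held = want;
			lock(held);
		}
		/* ... process items[i] under the held lock ... */
	}
	if (held != NULL)
		unlock(held);
	printf("%d lock switches for 6 items\n", lock_switches);
	return (0);
}
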
+/*
* vm_object_terminate actually destroys the specified object, freeing
* up all previously used resources.
*
@@ -706,7 +799,6 @@
void
vm_object_terminate(vm_object_t object)
{
- vm_page_t p, p_next;
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -749,48 +841,13 @@
("vm_object_terminate: object with references, ref_count=%d",
object->ref_count));
- /*
- * Free any remaining pageable pages. This also removes them from the
- * paging queues. However, don't free wired pages, just remove them
- * from the object. Rather than incrementally removing each page from
- * the object, the page and object are reset to any empty state.
- */
- TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
- vm_page_assert_unbusied(p);
- vm_page_lock(p);
- /*
- * Optimize the page's removal from the object by resetting
- * its "object" field. Specifically, if the page is not
- * wired, then the effect of this assignment is that
- * vm_page_free()'s call to vm_page_remove() will return
- * immediately without modifying the page or the object.
- */
- p->object = NULL;
- if (p->wire_count == 0) {
- vm_page_free(p);
- PCPU_INC(cnt.v_pfree);
- }
- vm_page_unlock(p);
- }
- /*
- * If the object contained any pages, then reset it to an empty state.
- * None of the object's fields, including "resident_page_count", were
- * modified by the preceding loop.
- */
- if (object->resident_page_count != 0) {
- vm_radix_reclaim_allnodes(&object->rtree);
- TAILQ_INIT(&object->memq);
- object->resident_page_count = 0;
- if (object->type == OBJT_VNODE)
- vdrop(object->handle);
- }
+ if ((object->flags & OBJ_PG_DTOR) == 0)
+ vm_object_terminate_pages(object);
#if VM_NRESERVLEVEL > 0
if (__predict_false(!LIST_EMPTY(&object->rvq)))
vm_reserv_break_all(object);
#endif
- if (__predict_false(!vm_object_cache_is_empty(object)))
- vm_page_cache_free(object, 0, 0);
KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
object->type == OBJT_SWAP,
@@ -1027,13 +1084,13 @@
* I/O.
*/
if (object->type == OBJT_VNODE &&
- (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
- vp = object->handle;
+ (object->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+ ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
VM_OBJECT_WUNLOCK(object);
(void) vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (syncio && !invalidate && offset == 0 &&
- OFF_TO_IDX(size) == object->size) {
+ atop(size) == object->size) {
/*
* If syncing the whole mapping of the file,
* it is faster to schedule all the writes in
@@ -1080,6 +1137,33 @@
}
/*
+ * Determine whether the given advice can be applied to the object. Advice is
+ * not applied to unmanaged pages since they never belong to page queues, and
+ * since MADV_FREE is destructive, it can apply only to anonymous pages that
+ * have been mapped at most once.
+ */
+static bool
+vm_object_advice_applies(vm_object_t object, int advice)
+{
+
+ if ((object->flags & OBJ_UNMANAGED) != 0)
+ return (false);
+ if (advice != MADV_FREE)
+ return (true);
+ return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) &&
+ (object->flags & OBJ_ONEMAPPING) != 0);
+}
+
+static void
+vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex,
+ vm_size_t size)
+{
+
+ if (advice == MADV_FREE && object->type == OBJT_SWAP)
+ swap_pager_freespace(object, pindex, size);
+}
+
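
vm_object_advice_applies() above encodes the user-visible rule: MADV_FREE acts only on anonymous (default or swap-backed) memory mapped in a single place (OBJ_ONEMAPPING), and any swap space backing non-resident pages in the range is released as a side effect. From userland that looks like the following, assuming the usual madvise(2) interface:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	size_t len = 1024 * 1024;
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	memset(p, 0xa5, len);		/* dirty the anonymous pages */

	/* Declare the contents disposable: the pages may be reclaimed
	 * without being written to swap, and later reads return either
	 * the old data (if not yet reclaimed) or zeroes. */
	if (madvise(p, len, MADV_FREE) == -1)
		perror("madvise(MADV_FREE)");
	return (0);
}
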
+/*
* vm_object_madvise:
*
* Implements the madvise function at the object/page level.
@@ -1102,103 +1186,109 @@
*/
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
- int advise)
+ int advice)
{
vm_pindex_t tpindex;
vm_object_t backing_object, tobject;
- vm_page_t m;
+ vm_page_t m, tm;
if (object == NULL)
return;
+
+relookup:
VM_OBJECT_WLOCK(object);
- /*
- * Locate and adjust resident pages
- */
- for (; pindex < end; pindex += 1) {
-relookup:
+ if (!vm_object_advice_applies(object, advice)) {
+ VM_OBJECT_WUNLOCK(object);
+ return;
+ }
+ for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) {
tobject = object;
- tpindex = pindex;
-shadowlookup:
+
/*
- * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
- * and those pages must be OBJ_ONEMAPPING.
+ * If the next page isn't resident in the top-level object, we
+ * need to search the shadow chain. When applying MADV_FREE, we
+ * take care to release any swap space used to store
+ * non-resident pages.
*/
- if (advise == MADV_FREE) {
- if ((tobject->type != OBJT_DEFAULT &&
- tobject->type != OBJT_SWAP) ||
- (tobject->flags & OBJ_ONEMAPPING) == 0) {
- goto unlock_tobject;
- }
- } else if ((tobject->flags & OBJ_UNMANAGED) != 0)
- goto unlock_tobject;
- m = vm_page_lookup(tobject, tpindex);
- if (m == NULL && advise == MADV_WILLNEED) {
+ if (m == NULL || pindex < m->pindex) {
/*
- * If the page is cached, reactivate it.
+ * Optimize a common case: if the top-level object has
+ * no backing object, we can skip over the non-resident
+ * range in constant time.
*/
- m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
- VM_ALLOC_NOBUSY);
+ if (object->backing_object == NULL) {
+ tpindex = (m != NULL && m->pindex < end) ?
+ m->pindex : end;
+ vm_object_madvise_freespace(object, advice,
+ pindex, tpindex - pindex);
+ if ((pindex = tpindex) == end)
+ break;
+ goto next_page;
+ }
+
+ tpindex = pindex;
+ do {
+ vm_object_madvise_freespace(tobject, advice,
+ tpindex, 1);
+ /*
+ * Prepare to search the next object in the
+ * chain.
+ */
+ backing_object = tobject->backing_object;
+ if (backing_object == NULL)
+ goto next_pindex;
+ VM_OBJECT_WLOCK(backing_object);
+ tpindex +=
+ OFF_TO_IDX(tobject->backing_object_offset);
+ if (tobject != object)
+ VM_OBJECT_WUNLOCK(tobject);
+ tobject = backing_object;
+ if (!vm_object_advice_applies(tobject, advice))
+ goto next_pindex;
+ } while ((tm = vm_page_lookup(tobject, tpindex)) ==
+ NULL);
+ } else {
+next_page:
+ tm = m;
+ m = TAILQ_NEXT(m, listq);
}
- if (m == NULL) {
- /*
- * There may be swap even if there is no backing page
- */
- if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
- swap_pager_freespace(tobject, tpindex, 1);
- /*
- * next object
- */
- backing_object = tobject->backing_object;
- if (backing_object == NULL)
- goto unlock_tobject;
- VM_OBJECT_WLOCK(backing_object);
- tpindex += OFF_TO_IDX(tobject->backing_object_offset);
- if (tobject != object)
- VM_OBJECT_WUNLOCK(tobject);
- tobject = backing_object;
- goto shadowlookup;
- } else if (m->valid != VM_PAGE_BITS_ALL)
- goto unlock_tobject;
+
/*
* If the page is not in a normal state, skip it.
*/
- vm_page_lock(m);
- if (m->hold_count != 0 || m->wire_count != 0) {
- vm_page_unlock(m);
- goto unlock_tobject;
+ if (tm->valid != VM_PAGE_BITS_ALL)
+ goto next_pindex;
+ vm_page_lock(tm);
+ if (tm->hold_count != 0 || tm->wire_count != 0) {
+ vm_page_unlock(tm);
+ goto next_pindex;
}
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("vm_object_madvise: page %p is fictitious", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("vm_object_madvise: page %p is not managed", m));
- if (vm_page_busied(m)) {
- if (advise == MADV_WILLNEED) {
+ KASSERT((tm->flags & PG_FICTITIOUS) == 0,
+ ("vm_object_madvise: page %p is fictitious", tm));
+ KASSERT((tm->oflags & VPO_UNMANAGED) == 0,
+ ("vm_object_madvise: page %p is not managed", tm));
+ if (vm_page_busied(tm)) {
+ if (object != tobject)
+ VM_OBJECT_WUNLOCK(tobject);
+ VM_OBJECT_WUNLOCK(object);
+ if (advice == MADV_WILLNEED) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
- * likely to reclaim it.
+ * likely to reclaim it.
*/
- vm_page_aflag_set(m, PGA_REFERENCED);
+ vm_page_aflag_set(tm, PGA_REFERENCED);
}
- if (object != tobject)
- VM_OBJECT_WUNLOCK(object);
- VM_OBJECT_WUNLOCK(tobject);
- vm_page_busy_sleep(m, "madvpo", false);
- VM_OBJECT_WLOCK(object);
+ vm_page_busy_sleep(tm, "madvpo", false);
goto relookup;
}
- if (advise == MADV_WILLNEED) {
- vm_page_activate(m);
- } else {
- vm_page_advise(m, advise);
- }
- vm_page_unlock(m);
- if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
- swap_pager_freespace(tobject, tpindex, 1);
-unlock_tobject:
+ vm_page_advise(tm, advice);
+ vm_page_unlock(tm);
+ vm_object_madvise_freespace(tobject, advice, tm->pindex, 1);
+next_pindex:
if (tobject != object)
VM_OBJECT_WUNLOCK(tobject);
- }
+ }
VM_OBJECT_WUNLOCK(object);
}
@@ -1368,11 +1458,11 @@
goto retry;
}
- /* vm_page_rename() will handle dirty and cache. */
+ /* vm_page_rename() will dirty the page. */
if (vm_page_rename(m, new_object, idx)) {
VM_OBJECT_WUNLOCK(new_object);
VM_OBJECT_WUNLOCK(orig_object);
- VM_WAIT;
+ vm_radix_wait();
VM_OBJECT_WLOCK(orig_object);
VM_OBJECT_WLOCK(new_object);
goto retry;
@@ -1403,19 +1493,6 @@
swap_pager_copy(orig_object, new_object, offidxstart, 0);
TAILQ_FOREACH(m, &new_object->memq, listq)
vm_page_xunbusy(m);
-
- /*
- * Transfer any cached pages from orig_object to new_object.
- * If swap_pager_copy() found swapped out pages within the
- * specified range of orig_object, then it changed
- * new_object's type to OBJT_SWAP when it transferred those
- * pages to new_object. Otherwise, new_object's type
- * should still be OBJT_DEFAULT and orig_object should not
- * contain any cached pages within the specified range.
- */
- if (__predict_false(!vm_object_cache_is_empty(orig_object)))
- vm_page_cache_transfer(orig_object, offidxstart,
- new_object);
}
VM_OBJECT_WUNLOCK(orig_object);
VM_OBJECT_WUNLOCK(new_object);
@@ -1425,12 +1502,11 @@
VM_OBJECT_WLOCK(new_object);
}
-#define OBSC_TEST_ALL_SHADOWED 0x0001
#define OBSC_COLLAPSE_NOWAIT 0x0002
#define OBSC_COLLAPSE_WAIT 0x0004
static vm_page_t
-vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
int op)
{
vm_object_t backing_object;
@@ -1448,8 +1524,9 @@
vm_page_lock(p);
VM_OBJECT_WUNLOCK(object);
VM_OBJECT_WUNLOCK(backing_object);
+ /* The page is only NULL when rename fails. */
if (p == NULL)
- VM_WAIT;
+ vm_radix_wait();
else
vm_page_busy_sleep(p, "vmocol", false);
VM_OBJECT_WLOCK(object);
@@ -1458,192 +1535,195 @@
}
static bool
-vm_object_backing_scan(vm_object_t object, int op)
+vm_object_scan_all_shadowed(vm_object_t object)
{
vm_object_t backing_object;
- vm_page_t next, p, pp;
- vm_pindex_t backing_offset_index, new_pindex;
+ vm_page_t p, pp;
+ vm_pindex_t backing_offset_index, new_pindex, pi, ps;
VM_OBJECT_ASSERT_WLOCKED(object);
VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
backing_object = object->backing_object;
- backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
/*
- * Initial conditions
+ * Initial conditions:
+ *
+ * We do not want to have to test for the existence of swap
+ * pages in the backing object. XXX but with the new swapper this
+ * would be pretty easy to do.
*/
- if (op & OBSC_TEST_ALL_SHADOWED) {
+ if (backing_object->type != OBJT_DEFAULT &&
+ backing_object->type != OBJT_SWAP)
+ return (false);
+
+ pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+ p = vm_page_find_least(backing_object, pi);
+ ps = swap_pager_find_least(backing_object, pi);
+
+ /*
+ * Only check pages inside the parent object's range and
+ * inside the parent object's mapping of the backing object.
+ */
+ for (;; pi++) {
+ if (p != NULL && p->pindex < pi)
+ p = TAILQ_NEXT(p, listq);
+ if (ps < pi)
+ ps = swap_pager_find_least(backing_object, pi);
+ if (p == NULL && ps >= backing_object->size)
+ break;
+ else if (p == NULL)
+ pi = ps;
+ else
+ pi = MIN(p->pindex, ps);
+
+ new_pindex = pi - backing_offset_index;
+ if (new_pindex >= object->size)
+ break;
+
/*
- * We do not want to have to test for the existence of cache
- * or swap pages in the backing object. XXX but with the
- * new swapper this would be pretty easy to do.
+ * See if the parent has the page or if the parent's object
+ * pager has the page. If the parent has the page but the page
+ * is not valid, the parent's object pager must have the page.
*
- * XXX what about anonymous MAP_SHARED memory that hasn't
- * been ZFOD faulted yet? If we do not test for this, the
- * shadow test may succeed! XXX
+ * If this fails, the parent does not completely shadow the
+ * object and we might as well give up now.
*/
- if (backing_object->type != OBJT_DEFAULT) {
+ pp = vm_page_lookup(object, new_pindex);
+ if ((pp == NULL || pp->valid == 0) &&
+ !vm_pager_has_page(object, new_pindex, NULL, NULL))
return (false);
- }
}
- if (op & OBSC_COLLAPSE_WAIT) {
+ return (true);
+}
+
+static bool
+vm_object_collapse_scan(vm_object_t object, int op)
+{
+ vm_object_t backing_object;
+ vm_page_t next, p, pp;
+ vm_pindex_t backing_offset_index, new_pindex;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
+
+ backing_object = object->backing_object;
+ backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+
+ /*
+ * Initial conditions
+ */
+ if ((op & OBSC_COLLAPSE_WAIT) != 0)
vm_object_set_flag(backing_object, OBJ_DEAD);
- }
/*
* Our scan
*/
- p = TAILQ_FIRST(&backing_object->memq);
- while (p) {
+ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
next = TAILQ_NEXT(p, listq);
new_pindex = p->pindex - backing_offset_index;
- if (op & OBSC_TEST_ALL_SHADOWED) {
- /*
- * Ignore pages outside the parent object's range
- * and outside the parent object's mapping of the
- * backing object.
- *
- * Note that we do not busy the backing object's
- * page.
- */
- if (p->pindex < backing_offset_index ||
- new_pindex >= object->size) {
- p = next;
- continue;
- }
- /*
- * See if the parent has the page or if the parent's
- * object pager has the page. If the parent has the
- * page but the page is not valid, the parent's
- * object pager must have the page.
- *
- * If this fails, the parent does not completely shadow
- * the object and we might as well give up now.
- */
-
- pp = vm_page_lookup(object, new_pindex);
- if ((pp == NULL || pp->valid == 0) &&
- !vm_pager_has_page(object, new_pindex, NULL, NULL))
- return (false);
- }
-
/*
* Check for busy page
*/
- if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
- if (vm_page_busied(p)) {
- p = vm_object_backing_scan_wait(object, p,
- next, op);
- continue;
- }
+ if (vm_page_busied(p)) {
+ next = vm_object_collapse_scan_wait(object, p, next, op);
+ continue;
+ }
- KASSERT(p->object == backing_object,
- ("vm_object_backing_scan: object mismatch"));
+ KASSERT(p->object == backing_object,
+ ("vm_object_collapse_scan: object mismatch"));
- if (p->pindex < backing_offset_index ||
- new_pindex >= object->size) {
- if (backing_object->type == OBJT_SWAP)
- swap_pager_freespace(backing_object,
- p->pindex, 1);
+ if (p->pindex < backing_offset_index ||
+ new_pindex >= object->size) {
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object, p->pindex,
+ 1);
- /*
- * Page is out of the parent object's range, we
- * can simply destroy it.
- */
- vm_page_lock(p);
- KASSERT(!pmap_page_is_mapped(p),
- ("freeing mapped page %p", p));
- if (p->wire_count == 0)
- vm_page_free(p);
- else
- vm_page_remove(p);
- vm_page_unlock(p);
- p = next;
- continue;
- }
+ /*
+ * Page is out of the parent object's range, we can
+ * simply destroy it.
+ */
+ vm_page_lock(p);
+ KASSERT(!pmap_page_is_mapped(p),
+ ("freeing mapped page %p", p));
+ if (p->wire_count == 0)
+ vm_page_free(p);
+ else
+ vm_page_remove(p);
+ vm_page_unlock(p);
+ continue;
+ }
- pp = vm_page_lookup(object, new_pindex);
- if (pp != NULL && vm_page_busied(pp)) {
- /*
- * The page in the parent is busy and
- * possibly not (yet) valid. Until
- * its state is finalized by the busy
- * bit owner, we can't tell whether it
- * shadows the original page.
- * Therefore, we must either skip it
- * and the original (backing_object)
- * page or wait for its state to be
- * finalized.
- *
- * This is due to a race with vm_fault()
- * where we must unbusy the original
- * (backing_obj) page before we can
- * (re)lock the parent. Hence we can
- * get here.
- */
- p = vm_object_backing_scan_wait(object, pp,
- next, op);
- continue;
- }
+ pp = vm_page_lookup(object, new_pindex);
+ if (pp != NULL && vm_page_busied(pp)) {
+ /*
+ * The page in the parent is busy and possibly not
+ * (yet) valid. Until its state is finalized by the
+ * busy bit owner, we can't tell whether it shadows the
+ * original page. Therefore, we must either skip it
+ * and the original (backing_object) page or wait for
+ * its state to be finalized.
+ *
+ * This is due to a race with vm_fault() where we must
+ * unbusy the original (backing_obj) page before we can
+ * (re)lock the parent. Hence we can get here.
+ */
+ next = vm_object_collapse_scan_wait(object, pp, next,
+ op);
+ continue;
+ }
- KASSERT(pp == NULL || pp->valid != 0,
- ("unbusy invalid page %p", pp));
+ KASSERT(pp == NULL || pp->valid != 0,
+ ("unbusy invalid page %p", pp));
- if (pp != NULL || vm_pager_has_page(object,
- new_pindex, NULL, NULL)) {
- /*
- * The page already exists in the
- * parent OR swap exists for this
- * location in the parent. Leave the
- * parent's page alone. Destroy the
- * original page from the backing
- * object.
- */
- if (backing_object->type == OBJT_SWAP)
- swap_pager_freespace(backing_object,
- p->pindex, 1);
- vm_page_lock(p);
- KASSERT(!pmap_page_is_mapped(p),
- ("freeing mapped page %p", p));
- if (p->wire_count == 0)
- vm_page_free(p);
- else
- vm_page_remove(p);
- vm_page_unlock(p);
- p = next;
- continue;
- }
-
+ if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
+ NULL)) {
/*
- * Page does not exist in parent, rename the
- * page from the backing object to the main object.
- *
- * If the page was mapped to a process, it can remain
- * mapped through the rename.
- * vm_page_rename() will handle dirty and cache.
+ * The page already exists in the parent OR swap exists
+ * for this location in the parent. Leave the parent's
+ * page alone. Destroy the original page from the
+ * backing object.
*/
- if (vm_page_rename(p, object, new_pindex)) {
- p = vm_object_backing_scan_wait(object, NULL,
- next, op);
- continue;
- }
-
- /* Use the old pindex to free the right page. */
if (backing_object->type == OBJT_SWAP)
- swap_pager_freespace(backing_object,
- new_pindex + backing_offset_index, 1);
+ swap_pager_freespace(backing_object, p->pindex,
+ 1);
+ vm_page_lock(p);
+ KASSERT(!pmap_page_is_mapped(p),
+ ("freeing mapped page %p", p));
+ if (p->wire_count == 0)
+ vm_page_free(p);
+ else
+ vm_page_remove(p);
+ vm_page_unlock(p);
+ continue;
+ }
+ /*
+ * Page does not exist in parent, rename the page from the
+ * backing object to the main object.
+ *
+ * If the page was mapped to a process, it can remain mapped
+ * through the rename. vm_page_rename() will dirty the page.
+ */
+ if (vm_page_rename(p, object, new_pindex)) {
+ next = vm_object_collapse_scan_wait(object, NULL, next,
+ op);
+ continue;
+ }
+
+ /* Use the old pindex to free the right page. */
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ new_pindex + backing_offset_index, 1);
+
#if VM_NRESERVLEVEL > 0
- /*
- * Rename the reservation.
- */
- vm_reserv_rename(p, object, backing_object,
- backing_offset_index);
+ /*
+ * Rename the reservation.
+ */
+ vm_reserv_rename(p, object, backing_object,
+ backing_offset_index);
#endif
- }
- p = next;
}
return (true);
}
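
The shadow-coverage test split out above into vm_object_scan_all_shadowed() walks two sorted sources in lockstep, the backing object's resident pages and its swap blocks, visiting only indices present in at least one of them. The sketch below shows just that merge pattern with generic callbacks; all names here are illustrative, not kernel interfaces:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /*
     * next_a()/next_b() return the smallest populated index >= i from each
     * source (think resident pages and swap blocks), or SIZE_MAX when the
     * source is exhausted; shadowed() is the per-index check.
     */
    bool
    all_shadowed(size_t limit, size_t (*next_a)(size_t),
        size_t (*next_b)(size_t), bool (*shadowed)(size_t))
    {
        size_t a, b, i;

        for (i = 0;; i++) {
            a = next_a(i);
            b = next_b(i);
            if (a == SIZE_MAX && b == SIZE_MAX)
                break;              /* nothing left in either source */
            i = a < b ? a : b;      /* jump to the next populated index */
            if (i >= limit)
                break;              /* outside the parent's range */
            if (!shadowed(i))       /* parent has neither page nor pager copy */
                return (false);
        }
        return (true);
    }

The kernel version additionally applies the backing-object offset to each index and answers the per-index check with a vm_page_lookup() on the parent plus a vm_pager_has_page() fallback, as shown in the hunk above.
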
@@ -1665,7 +1745,7 @@
if (backing_object->ref_count != 1)
return;
- vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
+ vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
}
/*
@@ -1698,8 +1778,8 @@
VM_OBJECT_WLOCK(backing_object);
if (backing_object->handle != NULL ||
(backing_object->type != OBJT_DEFAULT &&
- backing_object->type != OBJT_SWAP) ||
- (backing_object->flags & OBJ_DEAD) ||
+ backing_object->type != OBJT_SWAP) ||
+ (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 ||
object->handle != NULL ||
(object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP) ||
@@ -1722,7 +1802,7 @@
* all the resident pages in the entire backing object.
*
* This is ignoring pager-backed pages such as swap pages.
- * vm_object_backing_scan fails the shadowing test in this
+ * vm_object_collapse_scan fails the shadowing test in this
* case.
*/
if (backing_object->ref_count == 1) {
@@ -1731,9 +1811,9 @@
/*
* If there is exactly one reference to the backing
- * object, we can collapse it into the parent.
+ * object, we can collapse it into the parent.
*/
- vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
+ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
#if VM_NRESERVLEVEL > 0
/*
@@ -1759,13 +1839,6 @@
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
-
- /*
- * Free any cached pages from backing_object.
- */
- if (__predict_false(
- !vm_object_cache_is_empty(backing_object)))
- vm_page_cache_free(backing_object, 0, 0);
}
/*
* Object now shadows whatever backing_object did.
@@ -1814,8 +1887,7 @@
* there is nothing we can do so we give up.
*/
if (object->resident_page_count != object->size &&
- !vm_object_backing_scan(object,
- OBSC_TEST_ALL_SHADOWED)) {
+ !vm_object_scan_all_shadowed(object)) {
VM_OBJECT_WUNLOCK(backing_object);
break;
}
@@ -1889,6 +1961,8 @@
int options)
{
vm_page_t p, next;
+ struct mtx *mtx;
+ struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1895,10 +1969,12 @@
(options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
("vm_object_page_remove: illegal options for object %p", object));
if (object->resident_page_count == 0)
- goto skipmemq;
+ return;
vm_object_pip_add(object, 1);
+ TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
+ mtx = NULL;
/*
* Here, the variable "p" is either (1) the page with the least pindex
@@ -1915,7 +1991,7 @@
* however, be invalidated if the option OBJPR_CLEANONLY is
* not specified.
*/
- vm_page_lock(p);
+ vm_page_change_lock(p, &mtx);
if (vm_page_xbusied(p)) {
VM_OBJECT_WUNLOCK(object);
vm_page_busy_sleep(p, "vmopax", true);
@@ -1923,13 +1999,14 @@
goto again;
}
if (p->wire_count != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 &&
+ object->ref_count != 0)
pmap_remove_all(p);
if ((options & OBJPR_CLEANONLY) == 0) {
p->valid = 0;
vm_page_undirty(p);
}
- goto next;
+ continue;
}
if (vm_page_busied(p)) {
VM_OBJECT_WUNLOCK(object);
@@ -1940,33 +2017,34 @@
KASSERT((p->flags & PG_FICTITIOUS) == 0,
("vm_object_page_remove: page %p is fictitious", p));
if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 &&
+ object->ref_count != 0)
pmap_remove_write(p);
- if (p->dirty)
- goto next;
+ if (p->dirty != 0)
+ continue;
}
- if ((options & OBJPR_NOTMAPPED) == 0)
+ if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
- vm_page_free(p);
-next:
- vm_page_unlock(p);
+ p->flags &= ~PG_ZERO;
+ if (vm_page_free_prep(p, false))
+ TAILQ_INSERT_TAIL(&pgl, p, listq);
}
+ if (mtx != NULL)
+ mtx_unlock(mtx);
+ vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
-skipmemq:
- if (__predict_false(!vm_object_cache_is_empty(object)))
- vm_page_cache_free(object, start, end);
}
/*
- * vm_object_page_cache:
+ * vm_object_page_noreuse:
*
- * For the given object, attempt to move the specified clean
- * pages to the cache queue. If a page is wired for any reason,
- * then it will not be changed. Pages are specified by the given
- * range ["start", "end"). As a special case, if "end" is zero,
- * then the range extends from "start" to the end of the object.
- * Any mappings to the specified pages are removed before the
- * pages are moved to the cache queue.
+ * For the given object, attempt to move the specified pages to
+ * the head of the inactive queue. This bypasses regular LRU
+ * operation and allows the pages to be reused quickly under memory
+ * pressure. If a page is wired for any reason, then it will not
+ * be queued. Pages are specified by the range ["start", "end").
+ * As a special case, if "end" is zero, then the range extends from
+ * "start" to the end of the object.
*
* This operation should only be performed on objects that
* contain non-fictitious, managed pages.
@@ -1974,14 +2052,14 @@
* The object must be locked.
*/
void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
- struct mtx *mtx, *new_mtx;
+ struct mtx *mtx;
vm_page_t p, next;
- VM_OBJECT_ASSERT_WLOCKED(object);
+ VM_OBJECT_ASSERT_LOCKED(object);
KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
- ("vm_object_page_cache: illegal object %p", object));
+ ("vm_object_page_noreuse: illegal object %p", object));
if (object->resident_page_count == 0)
return;
p = vm_page_find_least(object, start);
@@ -1993,18 +2071,8 @@
mtx = NULL;
for (; p != NULL && (p->pindex < end || end == 0); p = next) {
next = TAILQ_NEXT(p, listq);
-
- /*
- * Avoid releasing and reacquiring the same page lock.
- */
- new_mtx = vm_page_lockptr(p);
- if (mtx != new_mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- mtx = new_mtx;
- mtx_lock(mtx);
- }
- vm_page_try_to_cache(p);
+ vm_page_change_lock(p, &mtx);
+ vm_page_deactivate_noreuse(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
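
vm_object_page_noreuse() is the object-level form of "I am done with this data, reclaim it first", the sort of request userspace can express for file data through posix_fadvise(2); the exact plumbing from that advice to this routine is outside this diff, so the snippet below is only a usage illustration with a hypothetical file path:

    #include <fcntl.h>
    #include <err.h>
    #include <unistd.h>

    int
    main(void)
    {
        char buf[64 * 1024];
        off_t done = 0;
        ssize_t n;
        int error, fd;

        fd = open("/var/tmp/bigfile.dat", O_RDONLY);    /* hypothetical path */
        if (fd == -1)
            err(1, "open");
        while ((n = read(fd, buf, sizeof(buf))) > 0) {
            /* ... consume buf ... */
            done += n;
            /* Ask the kernel to reclaim the already-read pages early. */
            error = posix_fadvise(fd, 0, done, POSIX_FADV_DONTNEED);
            if (error != 0)
                errc(1, error, "posix_fadvise");
        }
        if (n == -1)
            err(1, "read");
        close(fd);
        return (0);
    }
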
@@ -2023,7 +2091,7 @@
boolean_t
vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
- vm_page_t m, ma[1];
+ vm_page_t m;
vm_pindex_t pindex;
int rv;
@@ -2031,11 +2099,7 @@
for (pindex = start; pindex < end; pindex++) {
m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
if (m->valid != VM_PAGE_BITS_ALL) {
- ma[0] = m;
- rv = vm_pager_get_pages(object, ma, 1, 0);
- m = vm_page_lookup(object, pindex);
- if (m == NULL)
- break;
+ rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
if (rv != VM_PAGER_OK) {
vm_page_lock(m);
vm_page_free(m);
@@ -2090,7 +2154,7 @@
VM_OBJECT_WLOCK(prev_object);
if ((prev_object->type != OBJT_DEFAULT &&
prev_object->type != OBJT_SWAP) ||
- (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
+ (prev_object->flags & OBJ_NOSPLIT) != 0) {
VM_OBJECT_WUNLOCK(prev_object);
return (FALSE);
}
@@ -2127,7 +2191,7 @@
/*
* If prev_object was charged, then this mapping,
- * althought not charged now, may become writable
+ * although not charged now, may become writable
* later. Non-NULL cred in the object would prevent
* swap reservation during enabling of the write
* access, so reserve swap now. Failed reservation
@@ -2205,7 +2269,7 @@
vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
uint8_t queue)
{
- vm_object_t tobject;
+ vm_object_t tobject, t1object;
vm_page_t m, tm;
vm_pindex_t end_pindex, pindex, tpindex;
int depth, locked_depth;
@@ -2219,6 +2283,7 @@
return;
pindex = OFF_TO_IDX(offset);
end_pindex = pindex + atop(length);
+again:
locked_depth = 1;
VM_OBJECT_RLOCK(object);
m = vm_page_find_least(object, pindex);
@@ -2252,6 +2317,16 @@
m = TAILQ_NEXT(m, listq);
}
vm_page_lock(tm);
+ if (vm_page_xbusied(tm)) {
+ for (tobject = object; locked_depth >= 1;
+ locked_depth--) {
+ t1object = tobject->backing_object;
+ VM_OBJECT_RUNLOCK(tobject);
+ tobject = t1object;
+ }
+ vm_page_busy_sleep(tm, "unwbo", true);
+ goto again;
+ }
vm_page_unwire(tm, queue);
vm_page_unlock(tm);
next_page:
@@ -2258,10 +2333,10 @@
pindex++;
}
/* Release the accumulated object locks. */
- for (depth = 0; depth < locked_depth; depth++) {
- tobject = object->backing_object;
- VM_OBJECT_RUNLOCK(object);
- object = tobject;
+ for (tobject = object; locked_depth >= 1; locked_depth--) {
+ t1object = tobject->backing_object;
+ VM_OBJECT_RUNLOCK(tobject);
+ tobject = t1object;
}
}
@@ -2340,9 +2415,9 @@
* sysctl is only meant to give an
* approximation of the system anyway.
*/
- if (m->queue == PQ_ACTIVE)
+ if (vm_page_active(m))
kvo->kvo_active++;
- else if (m->queue == PQ_INACTIVE)
+ else if (vm_page_inactive(m))
kvo->kvo_inactive++;
}
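
Several loops in this file now manage page locks through vm_page_change_lock(), added in this commit's vm_page.c changes, which caches the currently held lock and only unlocks/relocks when the next page hashes to a different lock. A userland analogue of that idiom, with pthread mutexes standing in for the padded page locks and lock_for() as a hypothetical stand-in for vm_page_lockptr(), is:

    #include <pthread.h>
    #include <stddef.h>

    /* Hypothetical stand-in for vm_page_lockptr(): hash an item to a lock. */
    static pthread_mutex_t *
    lock_for(pthread_mutex_t *locks, size_t nlocks, size_t item)
    {
        return (&locks[item % nlocks]);
    }

    /* Analogue of vm_page_change_lock(): switch locks only when needed. */
    static void
    change_lock(pthread_mutex_t **held, pthread_mutex_t *want)
    {
        if (*held == want)
            return;
        if (*held != NULL)
            pthread_mutex_unlock(*held);
        *held = want;
        pthread_mutex_lock(want);
    }

    /* Walk a run of items, keeping one lock held across same-lock neighbours. */
    void
    process_items(pthread_mutex_t *locks, size_t nlocks, const size_t *items,
        size_t n)
    {
        pthread_mutex_t *held = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
            change_lock(&held, lock_for(locks, nlocks, items[i]));
            /* ... operate on items[i] under its lock ... */
        }
        if (held != NULL)
            pthread_mutex_unlock(held);
    }
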
Modified: trunk/sys/vm/vm_object.h
===================================================================
--- trunk/sys/vm/vm_object.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_object.h 331722 2018-03-29 02:50:57Z eadler $
*/
/*
@@ -71,6 +71,7 @@
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <sys/_pctrie.h>
#include <sys/_rwlock.h>
#include <vm/_vm_radix.h>
@@ -80,17 +81,6 @@
*
* vm_object_t Virtual memory object.
*
- * The root of cached pages pool is protected by both the per-object lock
- * and the free pages queue mutex.
- * On insert in the cache radix trie, the per-object lock is expected
- * to be already held and the free pages queue mutex will be
- * acquired during the operation too.
- * On remove and lookup from the cache radix trie, only the free
- * pages queue mutex is expected to be locked.
- * These rules allow for reliably checking for the presence of cached
- * pages with only the per-object lock held, thereby reducing contention
- * for the free pages queue mutex.
- *
* List of locks
* (c) const until freed
* (o) per-object lock
@@ -98,12 +88,17 @@
*
*/
+#ifndef VM_PAGE_HAVE_PGLIST
+TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
+
struct vm_object {
struct rwlock lock;
TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
- TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+ struct pglist memq; /* list of resident pages */
struct vm_radix rtree; /* root of the resident page radix trie*/
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
@@ -119,7 +114,6 @@
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
LIST_HEAD(, vm_reserv) rvq; /* list of reservations */
- struct vm_radix cache; /* (o + f) root of the cache page radix trie */
void *handle;
union {
/*
@@ -164,17 +158,17 @@
* the handle changed and hash-chain
* invalid.
*
- * swp_bcount - number of swap 'swblock' metablocks, each
- * contains up to 16 swapblk assignments.
- * see vm/swap_pager.h
+ * swp_blks - pc-trie of the allocated swap blocks.
+ *
*/
struct {
void *swp_tmpfs;
- int swp_bcount;
+ struct pctrie swp_blks;
} swp;
} un_pager;
struct ucred *cred;
vm_ooffset_t charge;
+ void *umtx_data;
};
/*
@@ -182,10 +176,13 @@
*/
#define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */
#define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */
-#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
+#define OBJ_POPULATE 0x0004 /* pager implements populate() */
+#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
#define OBJ_NOSPLIT 0x0010 /* dont split this object */
-#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
-#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */
+#define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */
+#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
+#define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */
+#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */
#define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */
#define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */
#define OBJ_COLORED 0x1000 /* pg_color is defined */
@@ -193,14 +190,29 @@
#define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */
#define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */
+/*
+ * Helpers to perform conversion between vm_object page indexes and offsets.
+ * IDX_TO_OFF() converts an index into an offset.
+ * OFF_TO_IDX() converts an offset into an index. Since offsets are signed
+ * by default, the sign propagation in OFF_TO_IDX(), when applied to
+ * negative offsets, is intentional and returns a vm_object page index
+ * that cannot be created by a userspace mapping.
+ * UOFF_TO_IDX() treats the offset as an unsigned value and converts it
+ * into an index accordingly. Use it only when the full range of offset
+ * values are allowed. Currently, this only applies to device mappings.
+ * OBJ_MAX_SIZE specifies the maximum page index corresponding to the
+ * maximum unsigned offset.
+ */
#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
#define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
+#define UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT)
+#define OBJ_MAX_SIZE (UOFF_TO_IDX(UINT64_MAX) + 1)
#ifdef _KERNEL
#define OBJPC_SYNC 0x1 /* sync I/O */
#define OBJPC_INVAL 0x2 /* invalidate */
-#define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */
+#define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */
/*
* The following options are supported by vm_object_page_remove().
@@ -243,6 +255,8 @@
rw_try_upgrade(&(object)->lock)
#define VM_OBJECT_WLOCK(object) \
rw_wlock(&(object)->lock)
+#define VM_OBJECT_WOWNED(object) \
+ rw_wowned(&(object)->lock)
#define VM_OBJECT_WUNLOCK(object) \
rw_wunlock(&(object)->lock)
@@ -256,6 +270,30 @@
object->flags |= bits;
}
+/*
+ * Conditionally set the object's color, which (1) enables the allocation
+ * of physical memory reservations for anonymous objects and larger-than-
+ * superpage-sized named objects and (2) determines the first page offset
+ * within the object at which a reservation may be allocated. In other
+ * words, the color determines the alignment of the object with respect
+ * to the largest superpage boundary. When mapping named objects, like
+ * files or POSIX shared memory objects, the color should be set to zero
+ * before a virtual address is selected for the mapping. In contrast,
+ * for anonymous objects, the color may be set after the virtual address
+ * is selected.
+ *
+ * The object must be locked.
+ */
+static __inline void
+vm_object_color(vm_object_t object, u_short color)
+{
+
+ if ((object->flags & OBJ_COLORED) == 0) {
+ object->pg_color = color;
+ object->flags |= OBJ_COLORED;
+ }
+}
+
void vm_object_clear_flag(vm_object_t object, u_short bits);
void vm_object_pip_add(vm_object_t object, short i);
void vm_object_pip_subtract(vm_object_t object, short i);
@@ -263,13 +301,10 @@
void vm_object_pip_wakeupn(vm_object_t object, short i);
void vm_object_pip_wait(vm_object_t object, char *waitid);
-static __inline boolean_t
-vm_object_cache_is_empty(vm_object_t object)
-{
+void umtx_shm_object_init(vm_object_t object);
+void umtx_shm_object_terminated(vm_object_t object);
+extern int umtx_shm_vnobj_persistent;
- return (vm_radix_is_empty(&object->cache));
-}
-
vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
boolean_t);
@@ -280,10 +315,10 @@
void vm_object_set_writeable_dirty (vm_object_t);
void vm_object_init (void);
void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
- vm_pindex_t end);
boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
vm_ooffset_t end, int flags);
+void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+ vm_pindex_t end);
void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
vm_pindex_t end, int options);
boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
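
The new comment above IDX_TO_OFF()/OFF_TO_IDX()/UOFF_TO_IDX() explains the sign-propagation distinction; a small self-contained check of the same arithmetic, using local stand-in macros with an assumed 4 KB page size rather than the kernel's typedefs, makes it concrete:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local stand-ins for the kernel macros, assuming PAGE_SHIFT == 12. */
    #define PAGE_SHIFT       12
    #define IDX_TO_OFF(idx)  (((int64_t)(idx)) << PAGE_SHIFT)
    #define OFF_TO_IDX(off)  ((uint64_t)(((int64_t)(off)) >> PAGE_SHIFT))
    #define UOFF_TO_IDX(off) (((uint64_t)(off)) >> PAGE_SHIFT)

    int
    main(void)
    {
        int64_t off = -((int64_t)1 << 20);      /* a negative offset */

        /* Ordinary conversions round-trip as expected. */
        assert(IDX_TO_OFF(256) == (int64_t)1 << 20);
        assert(OFF_TO_IDX((int64_t)1 << 20) == 256);

        /*
         * OFF_TO_IDX() sign-extends (arithmetic shift assumed, as the
         * kernel does), yielding a huge page index that no userspace
         * mapping can create ...
         */
        printf("OFF_TO_IDX:  %#jx\n", (uintmax_t)OFF_TO_IDX(off));

        /* ... while UOFF_TO_IDX() treats the same bits as unsigned. */
        printf("UOFF_TO_IDX: %#jx\n", (uintmax_t)UOFF_TO_IDX(off));
        return (0);
    }
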
Modified: trunk/sys/vm/vm_page.c
===================================================================
--- trunk/sys/vm/vm_page.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -83,7 +83,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 342797 2019-01-06 00:38:28Z kib $");
#include "opt_vm.h"
@@ -92,6 +92,7 @@
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/limits.h>
+#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
@@ -98,6 +99,8 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
@@ -125,9 +128,9 @@
*/
struct vm_domain vm_dom[MAXMEMDOM];
-struct mtx_padalign vm_page_queue_free_mtx;
+struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
-struct mtx_padalign pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
vm_page_t vm_page_array;
long vm_page_array_size;
@@ -135,25 +138,37 @@
int vm_page_zero_count;
static int boot_pages = UMA_BOOT_PAGES;
-TUNABLE_INT("vm.boot_pages", &boot_pages);
-SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
- "number of pages allocated for bootstrapping the VM system");
+SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+ &boot_pages, 0,
+ "number of pages allocated for bootstrapping the VM system");
static int pa_tryrelock_restart;
SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
&pa_tryrelock_restart, 0, "Number of tryrelock restarts");
+static TAILQ_HEAD(, vm_page) blacklist_head;
+static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
+
+/* Is the page daemon waiting for free pages? */
+static int vm_pageout_pages_needed;
+
static uma_zone_t fakepg_zone;
-static struct vnode *vm_page_alloc_init(vm_page_t m);
-static void vm_page_cache_turn_free(vm_page_t m);
+static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(int queue, vm_page_t m);
+static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_free_phys(vm_page_t m);
+static void vm_page_free_wakeup(void);
static void vm_page_init_fakepg(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
vm_page_t mpred);
+static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+ vm_paddr_t high);
+static int vm_page_alloc_fail(vm_object_t object, int req);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
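
For administrators, the blacklist machinery registered above (and implemented in the parsing and loading routines that follow) is fed either by the vm.blacklist loader tunable or by a preloaded plain-text "ram_blacklist" module, and the pages actually withheld can be listed through the new vm.page_blacklist sysctl. A hypothetical loader.conf fragment, with made-up addresses in the space/comma/newline-separated format the parser accepts:

    # /boot/loader.conf (addresses are illustrative only)
    vm.blacklist="0x12345000,0x2345a000"

After boot, "sysctl vm.page_blacklist" prints the comma-separated physical addresses that were actually pulled from the free lists.
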
@@ -162,7 +177,7 @@
{
fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
}
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
@@ -210,43 +225,171 @@
void
vm_set_page_size(void)
{
- if (cnt.v_page_size == 0)
- cnt.v_page_size = PAGE_SIZE;
- if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
+ if (vm_cnt.v_page_size == 0)
+ vm_cnt.v_page_size = PAGE_SIZE;
+ if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
panic("vm_set_page_size: page size not a power of two");
}
/*
- * vm_page_blacklist_lookup:
+ * vm_page_blacklist_next:
*
- * See if a physical address in this page has been listed
- * in the blacklist tunable. Entries in the tunable are
- * separated by spaces or commas. If an invalid integer is
- * encountered then the rest of the string is skipped.
+ * Find the next entry in the provided string of blacklist
+ * addresses. Entries are separated by space, comma, or newline.
+ * If an invalid integer is encountered then the rest of the
+ * string is skipped. Updates the list pointer to the next
+ * character, or NULL if the string is exhausted or invalid.
*/
-static int
-vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
+static vm_paddr_t
+vm_page_blacklist_next(char **list, char *end)
{
vm_paddr_t bad;
char *cp, *pos;
- for (pos = list; *pos != '\0'; pos = cp) {
+ if (list == NULL || *list == NULL)
+ return (0);
+ if (**list =='\0') {
+ *list = NULL;
+ return (0);
+ }
+
+ /*
+ * If there's no end pointer then the buffer is coming from
+ * the kenv and we know it's null-terminated.
+ */
+ if (end == NULL)
+ end = *list + strlen(*list);
+
+ /* Ensure that strtoq() won't walk off the end */
+ if (*end != '\0') {
+ if (*end == '\n' || *end == ' ' || *end == ',')
+ *end = '\0';
+ else {
+ printf("Blacklist not terminated, skipping\n");
+ *list = NULL;
+ return (0);
+ }
+ }
+
+ for (pos = *list; *pos != '\0'; pos = cp) {
bad = strtoq(pos, &cp, 0);
- if (*cp != '\0') {
- if (*cp == ' ' || *cp == ',') {
- cp++;
- if (cp == pos)
+ if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
+ if (bad == 0) {
+ if (++cp < end)
continue;
- } else
- break;
- }
- if (pa == trunc_page(bad))
- return (1);
+ else
+ break;
+ }
+ } else
+ break;
+ if (*cp == '\0' || ++cp >= end)
+ *list = NULL;
+ else
+ *list = cp;
+ return (trunc_page(bad));
}
+ printf("Garbage in RAM blacklist, skipping\n");
+ *list = NULL;
return (0);
}
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+ vm_page_t m;
+ int ret;
+
+ m = vm_phys_paddr_to_vm_page(pa);
+ if (m == NULL)
+ return (true); /* page does not exist, no failure */
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ ret = vm_phys_unfree_page(m);
+ if (ret != 0)
+ vm_phys_freecnt_adj(m, -1);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (ret != 0) {
+ TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+ if (verbose)
+ printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+ }
+ return (ret);
+}
+
+/*
+ * vm_page_blacklist_check:
+ *
+ * Iterate through the provided string of blacklist addresses, pulling
+ * each entry out of the physical allocator free list and putting it
+ * onto a list for reporting via the vm.page_blacklist sysctl.
+ */
static void
+vm_page_blacklist_check(char *list, char *end)
+{
+ vm_paddr_t pa;
+ char *next;
+
+ next = list;
+ while (next != NULL) {
+ if ((pa = vm_page_blacklist_next(&next, end)) == 0)
+ continue;
+ vm_page_blacklist_add(pa, bootverbose);
+ }
+}
+
+/*
+ * vm_page_blacklist_load:
+ *
+ * Search for a special module named "ram_blacklist". It'll be a
+ * plain text file provided by the user via the loader directive
+ * of the same name.
+ */
+static void
+vm_page_blacklist_load(char **list, char **end)
+{
+ void *mod;
+ u_char *ptr;
+ u_int len;
+
+ mod = NULL;
+ ptr = NULL;
+
+ mod = preload_search_by_type("ram_blacklist");
+ if (mod != NULL) {
+ ptr = preload_fetch_addr(mod);
+ len = preload_fetch_size(mod);
+ }
+ *list = ptr;
+ if (ptr != NULL)
+ *end = ptr + len;
+ else
+ *end = NULL;
+ return;
+}
+
+static int
+sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
+{
+ vm_page_t m;
+ struct sbuf sbuf;
+ int error, first;
+
+ first = 1;
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ TAILQ_FOREACH(m, &blacklist_head, listq) {
+ sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
+ (uintmax_t)m->phys_addr);
+ first = 0;
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+static void
vm_page_domain_init(struct vm_domain *vmd)
{
struct vm_pagequeue *pq;
@@ -255,16 +398,19 @@
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
"vm inactive pagequeue";
*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
- &cnt.v_inactive_count;
+ &vm_cnt.v_inactive_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
"vm active pagequeue";
*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
- &cnt.v_active_count;
+ &vm_cnt.v_active_count;
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
+ "vm laundry pagequeue";
+ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
+ &vm_cnt.v_laundry_count;
vmd->vmd_page_count = 0;
vmd->vmd_free_count = 0;
vmd->vmd_segs = 0;
vmd->vmd_oom = FALSE;
- vmd->vmd_pass = 0;
for (i = 0; i < PQ_COUNT; i++) {
pq = &vmd->vmd_pagequeues[i];
TAILQ_INIT(&pq->pq_pl);
@@ -274,6 +420,29 @@
}
/*
+ * Initialize a physical page in preparation for adding it to the free
+ * lists.
+ */
+static void
+vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
+{
+
+ m->object = NULL;
+ m->wire_count = 0;
+ m->busy_lock = VPB_UNBUSIED;
+ m->hold_count = 0;
+ m->flags = 0;
+ m->phys_addr = pa;
+ m->queue = PQ_NONE;
+ m->psind = 0;
+ m->segind = segind;
+ m->order = VM_NFREEORDER;
+ m->pool = VM_FREEPOOL_DEFAULT;
+ m->valid = m->dirty = 0;
+ pmap_page_init(m);
+}
+
+/*
* vm_page_startup:
*
* Initializes the resident memory module. Allocates physical memory for
@@ -284,19 +453,16 @@
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
+ struct vm_domain *vmd;
+ struct vm_phys_seg *seg;
+ vm_page_t m;
+ char *list, *listend;
vm_offset_t mapped;
- vm_paddr_t high_avail, low_avail, page_range, size;
- vm_paddr_t new_end;
- int i;
- vm_paddr_t pa;
- vm_paddr_t last_pa;
- char *list;
+ vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
+ vm_paddr_t biggestsize, last_pa, pa;
+ u_long pagecount;
+ int biggestone, i, pages_per_zone, segind;
- /* the biggest memory array is the second group of pages */
- vm_paddr_t end;
- vm_paddr_t biggestsize;
- int biggestone;
-
biggestsize = 0;
biggestone = 0;
vaddr = round_page(vaddr);
@@ -305,15 +471,6 @@
phys_avail[i] = round_page(phys_avail[i]);
phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
}
-
-#ifdef XEN
- /*
- * There is no obvious reason why i386 PV Xen needs vm_page structs
- * created for these pseudo-physical addresses. XXX
- */
- vm_phys_add_seg(0, phys_avail[0]);
-#endif
-
for (i = 0; phys_avail[i + 1]; i += 2) {
size = phys_avail[i + 1] - phys_avail[i];
if (size > biggestsize) {
@@ -334,9 +491,27 @@
vm_page_domain_init(&vm_dom[i]);
/*
+ * Almost all of the pages needed for bootstrapping UMA are used
+ * for zone structures, so if the number of CPUs results in those
+ * structures taking more than one page each, we set aside more pages
+ * in proportion to the zone structure size.
+ */
+ pages_per_zone = howmany(sizeof(struct uma_zone) +
+ sizeof(struct uma_cache) * (mp_maxid + 1) +
+ roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
+ if (pages_per_zone > 1) {
+ /* Reserve more pages so that we don't run out. */
+ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
+ }
+
+ /*
* Allocate memory for use when boot strapping the kernel memory
* allocator.
+ *
+	 * CTLFLAG_RDTUN doesn't work during the early boot process, so we must
+ * manually fetch the value.
*/
+ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
new_end = end - (boot_pages * UMA_SLAB_SIZE);
new_end = trunc_page(new_end);
mapped = pmap_map(&vaddr, new_end, end,
@@ -344,8 +519,8 @@
bzero((void *)mapped, end - new_end);
uma_startup((void *)mapped, boot_pages);
-#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
- defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+ defined(__i386__) || defined(__mips__)
/*
* Allocate a bitmap to indicate that a random physical page
* needs to be included in a minidump.
@@ -367,8 +542,10 @@
vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
bzero((void *)vm_page_dump, vm_page_dump_size);
+#else
+ (void)last_pa;
#endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
/*
* Include the UMA bootstrap pages and vm_page_dump in a crash dump.
* When pmap_map() uses the direct map, they are not automatically
@@ -471,7 +648,9 @@
new_end = trunc_page(end - page_range * sizeof(struct vm_page));
mapped = pmap_map(&vaddr, new_end, end,
VM_PROT_READ | VM_PROT_WRITE);
- vm_page_array = (vm_page_t) mapped;
+ vm_page_array = (vm_page_t)mapped;
+ vm_page_array_size = page_range;
+
#if VM_NRESERVLEVEL > 0
/*
* Allocate physical memory for the reservation management system's
@@ -481,13 +660,13 @@
high_avail = new_end;
new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
#endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
/*
* Include vm_page_array and vm_reserv_array in a crash dump.
*/
for (pa = new_end; pa < end; pa += PAGE_SIZE)
dump_add_page(pa);
-#endif
+#endif
phys_avail[biggestone + 1] = new_end;
/*
@@ -498,38 +677,60 @@
vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
/*
- * Clear all of the page structures
- */
- bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
- for (i = 0; i < page_range; i++)
- vm_page_array[i].order = VM_NFREEORDER;
- vm_page_array_size = page_range;
-
- /*
* Initialize the physical memory allocator.
*/
vm_phys_init();
/*
- * Add every available physical page that is not blacklisted to
- * the free lists.
+ * Initialize the page structures and add every available page to the
+ * physical memory allocator's free lists.
*/
- cnt.v_page_count = 0;
- cnt.v_free_count = 0;
- list = getenv("vm.blacklist");
- for (i = 0; phys_avail[i + 1] != 0; i += 2) {
- pa = phys_avail[i];
- last_pa = phys_avail[i + 1];
- while (pa < last_pa) {
- if (list != NULL &&
- vm_page_blacklist_lookup(list, pa))
- printf("Skipping page with pa 0x%jx\n",
- (uintmax_t)pa);
- else
- vm_phys_add_page(pa);
- pa += PAGE_SIZE;
+ vm_cnt.v_page_count = 0;
+ vm_cnt.v_free_count = 0;
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ for (m = seg->first_page, pa = seg->start; pa < seg->end;
+ m++, pa += PAGE_SIZE)
+ vm_page_init_page(m, pa, segind);
+
+ /*
+ * Add the segment to the free lists only if it is covered by
+ * one of the ranges in phys_avail. Because we've added the
+ * ranges to the vm_phys_segs array, we can assume that each
+ * segment is either entirely contained in one of the ranges,
+ * or doesn't overlap any of them.
+ */
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ if (seg->start < phys_avail[i] ||
+ seg->end > phys_avail[i + 1])
+ continue;
+
+ m = seg->first_page;
+ pagecount = (u_long)atop(seg->end - seg->start);
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_phys_free_contig(m, pagecount);
+ vm_phys_freecnt_adj(m, (int)pagecount);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vm_cnt.v_page_count += (u_int)pagecount;
+
+ vmd = &vm_dom[seg->domain];
+ vmd->vmd_page_count += (u_int)pagecount;
+ vmd->vmd_segs |= 1UL << m->segind;
+ break;
}
}
+
+ /*
+ * Remove blacklisted pages from the physical memory allocator.
+ */
+ TAILQ_INIT(&blacklist_head);
+ vm_page_blacklist_load(&list, &listend);
+ vm_page_blacklist_check(list, listend);
+
+ list = kern_getenv("vm.blacklist");
+ vm_page_blacklist_check(list, NULL);
+
freeenv(list);
#if VM_NRESERVLEVEL > 0
/*
@@ -603,6 +804,7 @@
{
u_int x;
+ vm_page_lock_assert(m, MA_NOTOWNED);
vm_page_assert_sbusied(m);
for (;;) {
@@ -683,6 +885,41 @@
}
}
+static void
+vm_page_xunbusy_locked(vm_page_t m)
+{
+
+ vm_page_assert_xbusied(m);
+ vm_page_assert_locked(m);
+
+ atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+ /* There is a waiter, do wakeup() instead of vm_page_flash(). */
+ wakeup(m);
+}
+
+void
+vm_page_xunbusy_maybelocked(vm_page_t m)
+{
+ bool lockacq;
+
+ vm_page_assert_xbusied(m);
+
+ /*
+ * Fast path for unbusy. If it succeeds, we know that there
+ * are no waiters, so we do not need a wakeup.
+ */
+ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
+ VPB_UNBUSIED))
+ return;
+
+ lockacq = !mtx_owned(vm_page_lockptr(m));
+ if (lockacq)
+ vm_page_lock(m);
+ vm_page_xunbusy_locked(m);
+ if (lockacq)
+ vm_page_unlock(m);
+}
+
/*
* vm_page_xunbusy_hard:
*
@@ -696,8 +933,7 @@
vm_page_assert_xbusied(m);
vm_page_lock(m);
- atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
- wakeup(m);
+ vm_page_xunbusy_locked(m);
vm_page_unlock(m);
}
@@ -728,6 +964,23 @@
}
/*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+void
+vm_page_change_lock(vm_page_t m, struct mtx **mtx)
+{
+ struct mtx *mtx1;
+
+ mtx1 = vm_page_lockptr(m);
+ if (*mtx == mtx1)
+ return;
+ if (*mtx != NULL)
+ mtx_unlock(*mtx);
+ *mtx = mtx1;
+ mtx_lock(mtx1);
+}
+
+/*
* Keep page from being freed by the page daemon
* much of the same effect as wiring, except much lower
* overhead and should be used only for *very* temporary
@@ -756,24 +1009,15 @@
* vm_page_unhold_pages:
*
* Unhold each of the pages that is referenced by the given array.
- */
+ */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
- struct mtx *mtx, *new_mtx;
+ struct mtx *mtx;
mtx = NULL;
for (; count != 0; count--) {
- /*
- * Avoid releasing and reacquiring the same page lock.
- */
- new_mtx = vm_page_lockptr(*ma);
- if (mtx != new_mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- mtx = new_mtx;
- mtx_lock(mtx);
- }
+ vm_page_change_lock(*ma, &mtx);
vm_page_unhold(*ma);
ma++;
}
@@ -905,39 +1149,29 @@
}
/*
- * Unbusy and handle the page queueing for a page from the VOP_GETPAGES()
- * array which is not the request page.
+ * Unbusy and handle the page queueing for a page from a getpages request that
+ * was optionally read ahead or behind.
*/
void
vm_page_readahead_finish(vm_page_t m)
{
- if (m->valid != 0) {
- /*
- * Since the page is not the requested page, whether
- * it should be activated or deactivated is not
- * obvious. Empirical results have shown that
- * deactivating the page is usually the best choice,
- * unless the page is wanted by another thread.
- */
- vm_page_lock(m);
- if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
- vm_page_activate(m);
- else
- vm_page_deactivate(m);
- vm_page_unlock(m);
- vm_page_xunbusy(m);
- } else {
- /*
- * Free the completely invalid page. Such page state
- * occurs due to the short read operation which did
- * not covered our page at all, or in case when a read
- * error happens.
- */
- vm_page_lock(m);
- vm_page_free(m);
- vm_page_unlock(m);
- }
+ /* We shouldn't put invalid pages on queues. */
+ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
+
+ /*
+ * Since the page is not the actually needed one, whether it should
+ * be activated or deactivated is not obvious. Empirical results
+ * have shown that deactivating the page is usually the best choice,
+ * unless the page is wanted by another thread.
+ */
+ vm_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
}
/*
@@ -991,11 +1225,7 @@
vm_page_dirty_KBI(vm_page_t m)
{
- /* These assertions refer to this operation by its public name. */
- KASSERT((m->flags & PG_CACHED) == 0,
- ("vm_page_dirty: page in cache!"));
- KASSERT(!VM_PAGE_IS_FREE(m),
- ("vm_page_dirty: page is free!"));
+ /* Refer to this operation by its public name. */
KASSERT(m->valid == VM_PAGE_BITS_ALL,
("vm_page_dirty: page is invalid!"));
m->dirty = VM_PAGE_BITS_ALL;
@@ -1119,9 +1349,8 @@
/*
* vm_page_remove:
*
- * Removes the given mem entry from the object/offset-page
- * table and the object page list, but do not invalidate/terminate
- * the backing store.
+ * Removes the specified page from its containing object, but does not
+ * invalidate any backing storage.
*
* The object must be locked. The page must be locked if it is managed.
*/
@@ -1129,30 +1358,21 @@
vm_page_remove(vm_page_t m)
{
vm_object_t object;
- boolean_t lockacq;
+ vm_page_t mrem;
if ((m->oflags & VPO_UNMANAGED) == 0)
- vm_page_lock_assert(m, MA_OWNED);
+ vm_page_assert_locked(m);
if ((object = m->object) == NULL)
return;
VM_OBJECT_ASSERT_WLOCKED(object);
- if (vm_page_xbusied(m)) {
- lockacq = FALSE;
- if ((m->oflags & VPO_UNMANAGED) != 0 &&
- !mtx_owned(vm_page_lockptr(m))) {
- lockacq = TRUE;
- vm_page_lock(m);
- }
- vm_page_flash(m);
- atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
- if (lockacq)
- vm_page_unlock(m);
- }
+ if (vm_page_xbusied(m))
+ vm_page_xunbusy_maybelocked(m);
+ mrem = vm_radix_remove(&object->rtree, m->pindex);
+ KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
/*
* Now remove from the object's list of backed pages.
*/
- vm_radix_remove(&object->rtree, m->pindex);
TAILQ_REMOVE(&object->memq, m, listq);
/*
@@ -1215,7 +1435,7 @@
{
vm_page_t next;
- VM_OBJECT_ASSERT_WLOCKED(m->object);
+ VM_OBJECT_ASSERT_LOCKED(m->object);
if ((next = TAILQ_NEXT(m, listq)) != NULL) {
MPASS(next->object == m->object);
if (next->pindex != m->pindex + 1)
@@ -1235,7 +1455,7 @@
{
vm_page_t prev;
- VM_OBJECT_ASSERT_WLOCKED(m->object);
+ VM_OBJECT_ASSERT_LOCKED(m->object);
if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
MPASS(prev->object == m->object);
if (prev->pindex != m->pindex - 1)
@@ -1253,9 +1473,13 @@
vm_page_t
vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t mold, mpred;
+ vm_page_t mold;
VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(mnew->object == NULL,
+ ("vm_page_replace: page %p already in object", mnew));
+ KASSERT(mnew->queue == PQ_NONE,
+ ("vm_page_replace: new page %p is on a paging queue", mnew));
/*
* This function mostly follows vm_page_insert() and
@@ -1262,31 +1486,24 @@
* vm_page_remove() without the radix, object count and vnode
* dance. Double check such functions for more comments.
*/
- mpred = vm_radix_lookup(&object->rtree, pindex);
- KASSERT(mpred != NULL,
- ("vm_page_replace: replacing page not present with pindex"));
- mpred = TAILQ_PREV(mpred, respgs, listq);
- if (mpred != NULL)
- KASSERT(mpred->pindex < pindex,
- ("vm_page_insert_after: mpred doesn't precede pindex"));
mnew->object = object;
mnew->pindex = pindex;
mold = vm_radix_replace(&object->rtree, mnew);
KASSERT(mold->queue == PQ_NONE,
- ("vm_page_replace: mold is on a paging queue"));
+ ("vm_page_replace: old page %p is on a paging queue", mold));
- /* Detach the old page from the resident tailq. */
+ /* Keep the resident page list in sorted order. */
+ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
TAILQ_REMOVE(&object->memq, mold, listq);
mold->object = NULL;
- vm_page_xunbusy(mold);
+ vm_page_xunbusy_maybelocked(mold);
- /* Insert the new page in the resident tailq. */
- if (mpred != NULL)
- TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
- else
- TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+ /*
+ * The object's resident_page_count does not change because we have
+ * swapped one page for another, but OBJ_MIGHTBEDIRTY.
+ */
if (pmap_page_is_write_mapped(mnew))
vm_object_set_writeable_dirty(object);
return (mold);
@@ -1306,9 +1523,7 @@
*
* Note: we *always* dirty the page. It is necessary both for the
* fact that we moved it, and because we may be invalidating
- * swap. If the page is on the cache, we have to deactivate it
- * or vm_page_dirty() will panic. Dirty pages are not allowed
- * on the cache.
+ * swap.
*
* The objects must be locked.
*/
@@ -1354,142 +1569,6 @@
}
/*
- * Convert all of the given object's cached pages that have a
- * pindex within the given range into free pages. If the value
- * zero is given for "end", then the range's upper bound is
- * infinity. If the given object is backed by a vnode and it
- * transitions from having one or more cached pages to none, the
- * vnode's hold count is reduced.
- */
-void
-vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
-{
- vm_page_t m;
- boolean_t empty;
-
- mtx_lock(&vm_page_queue_free_mtx);
- if (__predict_false(vm_radix_is_empty(&object->cache))) {
- mtx_unlock(&vm_page_queue_free_mtx);
- return;
- }
- while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
- if (end != 0 && m->pindex >= end)
- break;
- vm_radix_remove(&object->cache, m->pindex);
- vm_page_cache_turn_free(m);
- }
- empty = vm_radix_is_empty(&object->cache);
- mtx_unlock(&vm_page_queue_free_mtx);
- if (object->type == OBJT_VNODE && empty)
- vdrop(object->handle);
-}
-
-/*
- * Returns the cached page that is associated with the given
- * object and offset. If, however, none exists, returns NULL.
- *
- * The free page queue must be locked.
- */
-static inline vm_page_t
-vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
-{
-
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- return (vm_radix_lookup(&object->cache, pindex));
-}
-
-/*
- * Remove the given cached page from its containing object's
- * collection of cached pages.
- *
- * The free page queue must be locked.
- */
-static void
-vm_page_cache_remove(vm_page_t m)
-{
-
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- KASSERT((m->flags & PG_CACHED) != 0,
- ("vm_page_cache_remove: page %p is not cached", m));
- vm_radix_remove(&m->object->cache, m->pindex);
- m->object = NULL;
- cnt.v_cache_count--;
-}
-
-/*
- * Transfer all of the cached pages with offset greater than or
- * equal to 'offidxstart' from the original object's cache to the
- * new object's cache. However, any cached pages with offset
- * greater than or equal to the new object's size are kept in the
- * original object. Initially, the new object's cache must be
- * empty. Offset 'offidxstart' in the original object must
- * correspond to offset zero in the new object.
- *
- * The new object must be locked.
- */
-void
-vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
- vm_object_t new_object)
-{
- vm_page_t m;
-
- /*
- * Insertion into an object's collection of cached pages
- * requires the object to be locked. In contrast, removal does
- * not.
- */
- VM_OBJECT_ASSERT_WLOCKED(new_object);
- KASSERT(vm_radix_is_empty(&new_object->cache),
- ("vm_page_cache_transfer: object %p has cached pages",
- new_object));
- mtx_lock(&vm_page_queue_free_mtx);
- while ((m = vm_radix_lookup_ge(&orig_object->cache,
- offidxstart)) != NULL) {
- /*
- * Transfer all of the pages with offset greater than or
- * equal to 'offidxstart' from the original object's
- * cache to the new object's cache.
- */
- if ((m->pindex - offidxstart) >= new_object->size)
- break;
- vm_radix_remove(&orig_object->cache, m->pindex);
- /* Update the page's object and offset. */
- m->object = new_object;
- m->pindex -= offidxstart;
- if (vm_radix_insert(&new_object->cache, m))
- vm_page_cache_turn_free(m);
- }
- mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
- * Returns TRUE if a cached page is associated with the given object and
- * offset, and FALSE otherwise.
- *
- * The object must be locked.
- */
-boolean_t
-vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
-{
- vm_page_t m;
-
- /*
- * Insertion into an object's collection of cached pages requires the
- * object to be locked. Therefore, if the object is locked and the
- * object's collection is empty, there is no need to acquire the free
- * page queues lock in order to prove that the specified page doesn't
- * exist.
- */
- VM_OBJECT_ASSERT_WLOCKED(object);
- if (__predict_true(vm_object_cache_is_empty(object)))
- return (FALSE);
- mtx_lock(&vm_page_queue_free_mtx);
- m = vm_page_cache_lookup(object, pindex);
- mtx_unlock(&vm_page_queue_free_mtx);
- return (m != NULL);
-}
-
-/*
* vm_page_alloc:
*
* Allocate and return a page that is associated with the specified
@@ -1505,13 +1584,10 @@
* optional allocation flags:
* VM_ALLOC_COUNT(number) the number of additional pages that the caller
* intends to allocate
- * VM_ALLOC_IFCACHED return page only if it is cached
- * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
- * is cached
* VM_ALLOC_NOBUSY do not exclusive busy the page
* VM_ALLOC_NODUMP do not include the page in a kernel core dump
* VM_ALLOC_NOOBJ page is not associated with an object and
- * should not be exclusive busy
+ * should not be exclusive busy
* VM_ALLOC_SBUSY shared busy the allocated page
* VM_ALLOC_WIRED wire the allocated page
* VM_ALLOC_ZERO prefer a zeroed page
@@ -1521,21 +1597,41 @@
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
- struct vnode *vp = NULL;
- vm_object_t m_object;
- vm_page_t m, mpred;
+
+ return (vm_page_alloc_after(object, pindex, req, object != NULL ?
+ vm_radix_lookup_le(&object->rtree, pindex) : NULL));
+}
+
+/*
+ * Allocate a page in the specified object with the given page index. To
+ * optimize insertion of the page into the object, the caller must also specify
+ * the resident page in the object with largest index smaller than the given
+ * page index, or NULL if no such page exists.
+ */
+vm_page_t
+vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req,
+ vm_page_t mpred)
+{
+ vm_page_t m;
int flags, req_class;
+ u_int free_count;
- mpred = 0; /* XXX: pacify gcc */
KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
(object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
- ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
- req));
+ ("inconsistent object(%p)/req(%x)", object, req));
+ KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+ ("Can't sleep and retry object insertion."));
+ KASSERT(mpred == NULL || mpred->pindex < pindex,
+ ("mpred %p doesn't precede pindex 0x%jx", mpred,
+ (uintmax_t)pindex));
if (object != NULL)
VM_OBJECT_ASSERT_WLOCKED(object);
+ if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
+ return (NULL);
+
req_class = req & VM_ALLOC_CLASS_MASK;
/*
@@ -1544,52 +1640,29 @@
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
req_class = VM_ALLOC_SYSTEM;
- if (object != NULL) {
- mpred = vm_radix_lookup_le(&object->rtree, pindex);
- KASSERT(mpred == NULL || mpred->pindex != pindex,
- ("vm_page_alloc: pindex already allocated"));
- }
-
/*
- * The page allocation request can came from consumers which already
- * hold the free page queue mutex, like vm_page_insert() in
- * vm_page_cache().
+ * Allocate a page if the number of free pages exceeds the minimum
+ * for the request class.
*/
- mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
- if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
(req_class == VM_ALLOC_SYSTEM &&
- cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+ vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
(req_class == VM_ALLOC_INTERRUPT &&
- cnt.v_free_count + cnt.v_cache_count > 0)) {
+ vm_cnt.v_free_count > 0)) {
/*
- * Allocate from the free queue if the number of free pages
- * exceeds the minimum for the request class.
+ * Can we allocate the page from a reservation?
*/
- if (object != NULL &&
- (m = vm_page_cache_lookup(object, pindex)) != NULL) {
- if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
- mtx_unlock(&vm_page_queue_free_mtx);
- return (NULL);
- }
- if (vm_phys_unfree_page(m))
- vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
#if VM_NRESERVLEVEL > 0
- else if (!vm_reserv_reactivate_page(m))
-#else
- else
-#endif
- panic("vm_page_alloc: cache page %p is missing"
- " from the free queue", m);
- } else if ((req & VM_ALLOC_IFCACHED) != 0) {
- mtx_unlock(&vm_page_queue_free_mtx);
- return (NULL);
-#if VM_NRESERVLEVEL > 0
- } else if (object == NULL || (object->flags & (OBJ_COLORED |
+ if (object == NULL || (object->flags & (OBJ_COLORED |
OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
- vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
-#else
- } else {
+ vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
#endif
+ {
+ /*
+ * If not, allocate it from the free page queues.
+ */
m = vm_phys_alloc_pages(object != NULL ?
VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
#if VM_NRESERVLEVEL > 0
@@ -1604,10 +1677,8 @@
/*
* Not allocatable, give up.
*/
- mtx_unlock(&vm_page_queue_free_mtx);
- atomic_add_int(&vm_pageout_deficit,
- max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
- pagedaemon_wakeup();
+ if (vm_page_alloc_fail(object, req))
+ goto again;
return (NULL);
}
@@ -1614,52 +1685,23 @@
/*
* At this point we had better have found a good page.
*/
- KASSERT(m != NULL, ("vm_page_alloc: missing page"));
- KASSERT(m->queue == PQ_NONE,
- ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
- KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
- KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
- KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m));
- KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
- KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
- ("vm_page_alloc: page %p has unexpected memattr %d", m,
- pmap_page_get_memattr(m)));
- if ((m->flags & PG_CACHED) != 0) {
- KASSERT((m->flags & PG_ZERO) == 0,
- ("vm_page_alloc: cached page %p is PG_ZERO", m));
- KASSERT(m->valid != 0,
- ("vm_page_alloc: cached page %p is invalid", m));
- if (m->object == object && m->pindex == pindex)
- cnt.v_reactivated++;
- else
- m->valid = 0;
- m_object = m->object;
- vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE &&
- vm_object_cache_is_empty(m_object))
- vp = m_object->handle;
- } else {
- KASSERT(VM_PAGE_IS_FREE(m),
- ("vm_page_alloc: page %p is not free", m));
- KASSERT(m->valid == 0,
- ("vm_page_alloc: free page %p is valid", m));
- vm_phys_freecnt_adj(m, -1);
- }
+ KASSERT(m != NULL, ("missing page"));
+ free_count = vm_phys_freecnt_adj(m, -1);
+ if ((m->flags & PG_ZERO) != 0)
+ vm_page_zero_count--;
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vm_page_alloc_check(m);
/*
- * Only the PG_ZERO flag is inherited. The PG_CACHED or PG_FREE flag
- * must be cleared before the free page queues lock is released.
+ * Initialize the page. Only the PG_ZERO flag is inherited.
*/
flags = 0;
- if (m->flags & PG_ZERO) {
- vm_page_zero_count--;
- if (req & VM_ALLOC_ZERO)
- flags = PG_ZERO;
- }
- if (req & VM_ALLOC_NODUMP)
+ if ((req & VM_ALLOC_ZERO) != 0)
+ flags = PG_ZERO;
+ flags &= m->flags;
+ if ((req & VM_ALLOC_NODUMP) != 0)
flags |= PG_NODUMP;
m->flags = flags;
- mtx_unlock(&vm_page_queue_free_mtx);
m->aflags = 0;
m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
VPO_UNMANAGED : 0;
@@ -1673,7 +1715,7 @@
* The page lock is not required for wiring a page until that
* page is inserted into the object.
*/
- atomic_add_int(&cnt.v_wire_count, 1);
+ atomic_add_int(&vm_cnt.v_wire_count, 1);
m->wire_count = 1;
}
m->act_count = 0;
@@ -1680,18 +1722,21 @@
if (object != NULL) {
if (vm_page_insert_after(m, object, pindex, mpred)) {
- /* See the comment below about hold count. */
- if (vp != NULL)
- vdrop(vp);
pagedaemon_wakeup();
if (req & VM_ALLOC_WIRED) {
- atomic_subtract_int(&cnt.v_wire_count, 1);
+ atomic_subtract_int(&vm_cnt.v_wire_count, 1);
m->wire_count = 0;
}
- m->object = NULL;
+ KASSERT(m->object == NULL, ("page %p has object", m));
m->oflags = VPO_UNMANAGED;
m->busy_lock = VPB_UNBUSIED;
- vm_page_free(m);
+ /* Don't change PG_ZERO. */
+ vm_page_free_toq(m);
+ if (req & VM_ALLOC_WAITFAIL) {
+ VM_OBJECT_WUNLOCK(object);
+ vm_radix_wait();
+ VM_OBJECT_WLOCK(object);
+ }
return (NULL);
}
@@ -1703,34 +1748,15 @@
m->pindex = pindex;
/*
- * The following call to vdrop() must come after the above call
- * to vm_page_insert() in case both affect the same object and
- * vnode. Otherwise, the affected vnode's hold count could
- * temporarily become zero.
- */
- if (vp != NULL)
- vdrop(vp);
-
- /*
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
- if (vm_paging_needed())
+ if (vm_paging_needed(free_count))
pagedaemon_wakeup();
return (m);
}
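
To illustrate the new entry point, here is a minimal sketch (not part of the diff; the function name fill_range, the flag choice, and the assumption that the target range is not already resident are all invented for illustration) of how a caller populating a run of indexes might use vm_page_alloc_after() so that only one radix lookup is paid for the whole run:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

/*
 * Illustrative sketch only: fill [pindex, pindex + count) of a
 * write-locked object, feeding each new page back in as the
 * predecessor hint for the next vm_page_alloc_after() call.
 */
static int
fill_range(vm_object_t object, vm_pindex_t pindex, int count)
{
	vm_page_t m, mpred;
	int i;

	VM_OBJECT_ASSERT_WLOCKED(object);
	/* One radix walk to find the predecessor of the whole run. */
	mpred = vm_radix_lookup_le(&object->rtree, pindex);
	for (i = 0; i < count; i++) {
		/*
		 * No VM_ALLOC_WAITOK/WAITFAIL, so the object lock is
		 * never dropped and "mpred" cannot go stale.
		 */
		m = vm_page_alloc_after(object, pindex + i,
		    VM_ALLOC_NORMAL, mpred);
		if (m == NULL)
			break;		/* Caller decides whether to retry. */
		vm_page_xunbusy(m);	/* Pages come back exclusive-busied. */
		mpred = m;
	}
	return (i);
}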
-static void
-vm_page_alloc_contig_vdrop(struct spglist *lst)
-{
-
- while (!SLIST_EMPTY(lst)) {
- vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv);
- SLIST_REMOVE_HEAD(lst, plinks.s.ss);
- }
-}
-
/*
* vm_page_alloc_contig:
*
@@ -1752,6 +1778,8 @@
* memory attribute setting for the physical pages cannot be configured
* to VM_MEMATTR_DEFAULT.
*
+ * The specified object may not contain fictitious pages.
+ *
* The caller must always specify an allocation class.
*
* allocation classes:
@@ -1763,7 +1791,7 @@
* VM_ALLOC_NOBUSY do not exclusive busy the page
* VM_ALLOC_NODUMP do not include the page in a kernel core dump
* VM_ALLOC_NOOBJ page is not associated with an object and
- * should not be exclusive busy
+ * should not be exclusive busy
* VM_ALLOC_SBUSY shared busy the allocated page
* VM_ALLOC_WIRED wire the allocated page
* VM_ALLOC_ZERO prefer a zeroed page
@@ -1775,22 +1803,23 @@
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr)
{
- struct vnode *drop;
- struct spglist deferred_vdrop_list;
- vm_page_t m, m_tmp, m_ret;
- u_int flags, oflags;
+ vm_page_t m, m_ret, mpred;
+ u_int busy_lock, flags, oflags;
int req_class;
+ mpred = NULL; /* XXX: pacify gcc */
KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
(object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
- ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+ ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
req));
+ KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+ ("Can't sleep and retry object insertion."));
if (object != NULL) {
VM_OBJECT_ASSERT_WLOCKED(object);
- KASSERT(object->type == OBJT_PHYS,
- ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
+ KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
+ ("vm_page_alloc_contig: object %p has fictitious pages",
object));
}
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
@@ -1802,40 +1831,48 @@
if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
req_class = VM_ALLOC_SYSTEM;
- SLIST_INIT(&deferred_vdrop_list);
+ if (object != NULL) {
+ mpred = vm_radix_lookup_le(&object->rtree, pindex);
+ KASSERT(mpred == NULL || mpred->pindex != pindex,
+ ("vm_page_alloc_contig: pindex already allocated"));
+ }
+
+ /*
+ * Can we allocate the pages without the number of free pages falling
+ * below the lower bound for the allocation class?
+ */
+again:
mtx_lock(&vm_page_queue_free_mtx);
- if (cnt.v_free_count + cnt.v_cache_count >= npages +
- cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
- cnt.v_free_count + cnt.v_cache_count >= npages +
- cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
- cnt.v_free_count + cnt.v_cache_count >= npages)) {
+ if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
+ (req_class == VM_ALLOC_SYSTEM &&
+ vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
+ (req_class == VM_ALLOC_INTERRUPT &&
+ vm_cnt.v_free_count >= npages)) {
+ /*
+ * Can we allocate the pages from a reservation?
+ */
#if VM_NRESERVLEVEL > 0
retry:
if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
(m_ret = vm_reserv_alloc_contig(object, pindex, npages,
- low, high, alignment, boundary)) == NULL)
+ low, high, alignment, boundary, mpred)) == NULL)
#endif
+ /*
+ * If not, allocate them from the free page queues.
+ */
m_ret = vm_phys_alloc_contig(npages, low, high,
alignment, boundary);
} else {
- mtx_unlock(&vm_page_queue_free_mtx);
- atomic_add_int(&vm_pageout_deficit, npages);
- pagedaemon_wakeup();
+ if (vm_page_alloc_fail(object, req))
+ goto again;
return (NULL);
}
- if (m_ret != NULL)
- for (m = m_ret; m < &m_ret[npages]; m++) {
- drop = vm_page_alloc_init(m);
- if (drop != NULL) {
- /*
- * Enqueue the vnode for deferred vdrop().
- */
- m->plinks.s.pv = drop;
- SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
- plinks.s.ss);
- }
- }
- else {
+ if (m_ret != NULL) {
+ vm_phys_freecnt_adj(m_ret, -npages);
+ for (m = m_ret; m < &m_ret[npages]; m++)
+ if ((m->flags & PG_ZERO) != 0)
+ vm_page_zero_count--;
+ } else {
#if VM_NRESERVLEVEL > 0
if (vm_reserv_reclaim_contig(npages, low, high, alignment,
boundary))
@@ -1845,6 +1882,8 @@
mtx_unlock(&vm_page_queue_free_mtx);
if (m_ret == NULL)
return (NULL);
+ for (m = m_ret; m < &m_ret[npages]; m++)
+ vm_page_alloc_check(m);
/*
* Initialize the pages. Only the PG_ZERO flag is inherited.
@@ -1854,9 +1893,15 @@
flags = PG_ZERO;
if ((req & VM_ALLOC_NODUMP) != 0)
flags |= PG_NODUMP;
+ oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
+ VPO_UNMANAGED : 0;
+ busy_lock = VPB_UNBUSIED;
+ if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
+ busy_lock = VPB_SINGLE_EXCLUSIVER;
+ if ((req & VM_ALLOC_SBUSY) != 0)
+ busy_lock = VPB_SHARERS_WORD(1);
if ((req & VM_ALLOC_WIRED) != 0)
- atomic_add_int(&cnt.v_wire_count, npages);
- oflags = VPO_UNMANAGED;
+ atomic_add_int(&vm_cnt.v_wire_count, npages);
if (object != NULL) {
if (object->memattr != VM_MEMATTR_DEFAULT &&
memattr == VM_MEMATTR_DEFAULT)
@@ -1865,39 +1910,37 @@
for (m = m_ret; m < &m_ret[npages]; m++) {
m->aflags = 0;
m->flags = (m->flags | PG_NODUMP) & flags;
- m->busy_lock = VPB_UNBUSIED;
- if (object != NULL) {
- if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
- m->busy_lock = VPB_SINGLE_EXCLUSIVER;
- if ((req & VM_ALLOC_SBUSY) != 0)
- m->busy_lock = VPB_SHARERS_WORD(1);
- }
+ m->busy_lock = busy_lock;
if ((req & VM_ALLOC_WIRED) != 0)
m->wire_count = 1;
- /* Unmanaged pages don't use "act_count". */
+ m->act_count = 0;
m->oflags = oflags;
if (object != NULL) {
- if (vm_page_insert(m, object, pindex)) {
- vm_page_alloc_contig_vdrop(
- &deferred_vdrop_list);
- if (vm_paging_needed())
- pagedaemon_wakeup();
+ if (vm_page_insert_after(m, object, pindex, mpred)) {
+ pagedaemon_wakeup();
if ((req & VM_ALLOC_WIRED) != 0)
- atomic_subtract_int(&cnt.v_wire_count,
- npages);
- for (m_tmp = m, m = m_ret;
- m < &m_ret[npages]; m++) {
- if ((req & VM_ALLOC_WIRED) != 0)
+ atomic_subtract_int(
+ &vm_cnt.v_wire_count, npages);
+ KASSERT(m->object == NULL,
+ ("page %p has object", m));
+ mpred = m;
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ if (m <= mpred &&
+ (req & VM_ALLOC_WIRED) != 0)
m->wire_count = 0;
- if (m >= m_tmp) {
- m->object = NULL;
- m->oflags |= VPO_UNMANAGED;
- }
+ m->oflags = VPO_UNMANAGED;
m->busy_lock = VPB_UNBUSIED;
- vm_page_free(m);
+ /* Don't change PG_ZERO. */
+ vm_page_free_toq(m);
}
+ if (req & VM_ALLOC_WAITFAIL) {
+ VM_OBJECT_WUNLOCK(object);
+ vm_radix_wait();
+ VM_OBJECT_WLOCK(object);
+ }
return (NULL);
}
+ mpred = m;
} else
m->pindex = pindex;
if (memattr != VM_MEMATTR_DEFAULT)
@@ -1904,63 +1947,29 @@
pmap_page_set_memattr(m, memattr);
pindex++;
}
- vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
- if (vm_paging_needed())
+ if (vm_paging_needed(vm_cnt.v_free_count))
pagedaemon_wakeup();
return (m_ret);
}
/*
- * Initialize a page that has been freshly dequeued from a freelist.
- * The caller has to drop the vnode returned, if it is not NULL.
- *
- * This function may only be used to initialize unmanaged pages.
- *
- * To be called with vm_page_queue_free_mtx held.
+ * Check a page that has been freshly dequeued from a freelist.
*/
-static struct vnode *
-vm_page_alloc_init(vm_page_t m)
+static void
+vm_page_alloc_check(vm_page_t m)
{
- struct vnode *drop;
- vm_object_t m_object;
+ KASSERT(m->object == NULL, ("page %p has object", m));
KASSERT(m->queue == PQ_NONE,
- ("vm_page_alloc_init: page %p has unexpected queue %d",
- m, m->queue));
- KASSERT(m->wire_count == 0,
- ("vm_page_alloc_init: page %p is wired", m));
- KASSERT(m->hold_count == 0,
- ("vm_page_alloc_init: page %p is held", m));
- KASSERT(!vm_page_busied(m),
- ("vm_page_alloc_init: page %p is busy", m));
- KASSERT(m->dirty == 0,
- ("vm_page_alloc_init: page %p is dirty", m));
+ ("page %p has unexpected queue %d", m, m->queue));
+ KASSERT(m->wire_count == 0, ("page %p is wired", m));
+ KASSERT(m->hold_count == 0, ("page %p is held", m));
+ KASSERT(!vm_page_busied(m), ("page %p is busy", m));
+ KASSERT(m->dirty == 0, ("page %p is dirty", m));
KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
- ("vm_page_alloc_init: page %p has unexpected memattr %d",
+ ("page %p has unexpected memattr %d",
m, pmap_page_get_memattr(m)));
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- drop = NULL;
- if ((m->flags & PG_CACHED) != 0) {
- KASSERT((m->flags & PG_ZERO) == 0,
- ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
- m->valid = 0;
- m_object = m->object;
- vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE &&
- vm_object_cache_is_empty(m_object))
- drop = m_object->handle;
- } else {
- KASSERT(VM_PAGE_IS_FREE(m),
- ("vm_page_alloc_init: page %p is not free", m));
- KASSERT(m->valid == 0,
- ("vm_page_alloc_init: free page %p is valid", m));
- vm_phys_freecnt_adj(m, -1);
- if ((m->flags & PG_ZERO) != 0)
- vm_page_zero_count--;
- }
- /* Don't clear the PG_ZERO flag; we'll need it later. */
- m->flags &= PG_ZERO;
- return (drop);
+ KASSERT(m->valid == 0, ("free page %p is valid", m));
}
/*
@@ -1986,9 +1995,8 @@
vm_page_t
vm_page_alloc_freelist(int flind, int req)
{
- struct vnode *drop;
vm_page_t m;
- u_int flags;
+ u_int flags, free_count;
int req_class;
req_class = req & VM_ALLOC_CLASS_MASK;
@@ -2002,18 +2010,17 @@
/*
* Do not allocate reserved pages unless the req has asked for it.
*/
- mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
- if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
(req_class == VM_ALLOC_SYSTEM &&
- cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+ vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
(req_class == VM_ALLOC_INTERRUPT &&
- cnt.v_free_count + cnt.v_cache_count > 0))
+ vm_cnt.v_free_count > 0)) {
m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
- else {
- mtx_unlock(&vm_page_queue_free_mtx);
- atomic_add_int(&vm_pageout_deficit,
- max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
- pagedaemon_wakeup();
+ } else {
+ if (vm_page_alloc_fail(NULL, req))
+ goto again;
return (NULL);
}
if (m == NULL) {
@@ -2020,8 +2027,11 @@
mtx_unlock(&vm_page_queue_free_mtx);
return (NULL);
}
- drop = vm_page_alloc_init(m);
+ free_count = vm_phys_freecnt_adj(m, -1);
+ if ((m->flags & PG_ZERO) != 0)
+ vm_page_zero_count--;
mtx_unlock(&vm_page_queue_free_mtx);
+ vm_page_alloc_check(m);
/*
* Initialize the page. Only the PG_ZERO flag is inherited.
@@ -2036,44 +2046,602 @@
* The page lock is not required for wiring a page that does
* not belong to an object.
*/
- atomic_add_int(&cnt.v_wire_count, 1);
+ atomic_add_int(&vm_cnt.v_wire_count, 1);
m->wire_count = 1;
}
/* Unmanaged pages don't use "act_count". */
m->oflags = VPO_UNMANAGED;
- if (drop != NULL)
- vdrop(drop);
- if (vm_paging_needed())
+ if (vm_paging_needed(free_count))
pagedaemon_wakeup();
return (m);
}
+#define VPSC_ANY 0 /* No restrictions. */
+#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
+#define VPSC_NOSUPER 2 /* Skip superpages. */
+
/*
+ * vm_page_scan_contig:
+ *
+ * Scan vm_page_array[] between the specified entries "m_start" and
+ * "m_end" for a run of contiguous physical pages that satisfy the
+ * specified conditions, and return the lowest page in the run. The
+ * specified "alignment" determines the alignment of the lowest physical
+ * page in the run. If the specified "boundary" is non-zero, then the
+ * run of physical pages cannot span a physical address that is a
+ * multiple of "boundary".
+ *
+ * "m_end" is never dereferenced, so it need not point to a vm_page
+ * structure within vm_page_array[].
+ *
+ * "npages" must be greater than zero. "m_start" and "m_end" must not
+ * span a hole (or discontiguity) in the physical address space. Both
+ * "alignment" and "boundary" must be a power of two.
+ */
+vm_page_t
+vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
+ u_long alignment, vm_paddr_t boundary, int options)
+{
+ struct mtx *m_mtx;
+ vm_object_t object;
+ vm_paddr_t pa;
+ vm_page_t m, m_run;
+#if VM_NRESERVLEVEL > 0
+ int level;
+#endif
+ int m_inc, order, run_ext, run_len;
+
+ KASSERT(npages > 0, ("npages is 0"));
+ KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+ KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+ m_run = NULL;
+ run_len = 0;
+ m_mtx = NULL;
+ for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
+ KASSERT((m->flags & PG_MARKER) == 0,
+ ("page %p is PG_MARKER", m));
+ KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
+ ("fictitious page %p has invalid wire count", m));
+
+ /*
+ * If the current page would be the start of a run, check its
+ * physical address against the end, alignment, and boundary
+ * conditions. If it doesn't satisfy these conditions, either
+ * terminate the scan or advance to the next page that
+ * satisfies the failed condition.
+ */
+ if (run_len == 0) {
+ KASSERT(m_run == NULL, ("m_run != NULL"));
+ if (m + npages > m_end)
+ break;
+ pa = VM_PAGE_TO_PHYS(m);
+ if ((pa & (alignment - 1)) != 0) {
+ m_inc = atop(roundup2(pa, alignment) - pa);
+ continue;
+ }
+ if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
+ boundary) != 0) {
+ m_inc = atop(roundup2(pa, boundary) - pa);
+ continue;
+ }
+ } else
+ KASSERT(m_run != NULL, ("m_run == NULL"));
+
+ vm_page_change_lock(m, &m_mtx);
+ m_inc = 1;
+retry:
+ if (m->wire_count != 0 || m->hold_count != 0)
+ run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+ else if ((level = vm_reserv_level(m)) >= 0 &&
+ (options & VPSC_NORESERV) != 0) {
+ run_ext = 0;
+ /* Advance to the end of the reservation. */
+ pa = VM_PAGE_TO_PHYS(m);
+ m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
+ pa);
+ }
+#endif
+ else if ((object = m->object) != NULL) {
+ /*
+ * The page is considered eligible for relocation if
+ * and only if it could be laundered or reclaimed by
+ * the page daemon.
+ */
+ if (!VM_OBJECT_TRYRLOCK(object)) {
+ mtx_unlock(m_mtx);
+ VM_OBJECT_RLOCK(object);
+ mtx_lock(m_mtx);
+ if (m->object != object) {
+ /*
+ * The page may have been freed.
+ */
+ VM_OBJECT_RUNLOCK(object);
+ goto retry;
+ } else if (m->wire_count != 0 ||
+ m->hold_count != 0) {
+ run_ext = 0;
+ goto unlock;
+ }
+ }
+ KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+ ("page %p is PG_UNHOLDFREE", m));
+ /* Don't care: PG_NODUMP, PG_ZERO. */
+ if (object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP &&
+ object->type != OBJT_VNODE) {
+ run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+ } else if ((options & VPSC_NOSUPER) != 0 &&
+ (level = vm_reserv_level_iffullpop(m)) >= 0) {
+ run_ext = 0;
+ /* Advance to the end of the superpage. */
+ pa = VM_PAGE_TO_PHYS(m);
+ m_inc = atop(roundup2(pa + 1,
+ vm_reserv_size(level)) - pa);
+#endif
+ } else if (object->memattr == VM_MEMATTR_DEFAULT &&
+ m->queue != PQ_NONE && !vm_page_busied(m)) {
+ /*
+ * The page is allocated but eligible for
+ * relocation. Extend the current run by one
+ * page.
+ */
+ KASSERT(pmap_page_get_memattr(m) ==
+ VM_MEMATTR_DEFAULT,
+ ("page %p has an unexpected memattr", m));
+ KASSERT((m->oflags & (VPO_SWAPINPROG |
+ VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+ ("page %p has unexpected oflags", m));
+ /* Don't care: VPO_NOSYNC. */
+ run_ext = 1;
+ } else
+ run_ext = 0;
+unlock:
+ VM_OBJECT_RUNLOCK(object);
+#if VM_NRESERVLEVEL > 0
+ } else if (level >= 0) {
+ /*
+ * The page is reserved but not yet allocated. In
+ * other words, it is still free. Extend the current
+ * run by one page.
+ */
+ run_ext = 1;
+#endif
+ } else if ((order = m->order) < VM_NFREEORDER) {
+ /*
+ * The page is enqueued in the physical memory
+ * allocator's free page queues. Moreover, it is the
+ * first page in a power-of-two-sized run of
+ * contiguous free pages. Add these pages to the end
+ * of the current run, and jump ahead.
+ */
+ run_ext = 1 << order;
+ m_inc = 1 << order;
+ } else {
+ /*
+ * Skip the page for one of the following reasons: (1)
+ * It is enqueued in the physical memory allocator's
+ * free page queues. However, it is not the first
+ * page in a run of contiguous free pages. (This case
+ * rarely occurs because the scan is performed in
+ * ascending order.) (2) It is not reserved, and it is
+ * transitioning from free to allocated. (Conversely,
+ * the transition from allocated to free for managed
+ * pages is blocked by the page lock.) (3) It is
+ * allocated but not contained by an object and not
+ * wired, e.g., allocated by Xen's balloon driver.
+ */
+ run_ext = 0;
+ }
+
+ /*
+ * Extend or reset the current run of pages.
+ */
+ if (run_ext > 0) {
+ if (run_len == 0)
+ m_run = m;
+ run_len += run_ext;
+ } else {
+ if (run_len > 0) {
+ m_run = NULL;
+ run_len = 0;
+ }
+ }
+ }
+ if (m_mtx != NULL)
+ mtx_unlock(m_mtx);
+ if (run_len >= npages)
+ return (m_run);
+ return (NULL);
+}
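
A short worked example may help with the boundary pre-check above (the numbers are invented; a 4 KB page size and a 2 MB boundary are assumed). rounddown2() masks off the bits below the boundary, so the expression is non-zero exactly when the first and last byte of the candidate run fall in different boundary-aligned windows:

#include <sys/param.h>
#include <sys/systm.h>

/*
 * Illustration only, not part of the diff: would a 4-page run starting
 * at physical address 0x1ff000 cross a 2 MB boundary?
 *
 *   pa ^ (pa + ptoa(4) - 1) = 0x1ff000 ^ 0x202fff = 0x3fdfff
 *   rounddown2(0x3fdfff, 0x200000) = 0x200000 != 0
 *
 * so the run crosses the 2 MB line at 0x200000, and the scan advances
 * to the next boundary multiple instead of starting a run here.
 */
static bool
run_crosses_boundary(vm_paddr_t pa, u_long npages, vm_paddr_t boundary)
{

	return (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0);
}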
+
+/*
+ * vm_page_reclaim_run:
+ *
+ * Try to relocate each of the allocated virtual pages within the
+ * specified run of physical pages to a new physical address. Free the
+ * physical pages underlying the relocated virtual pages. A virtual page
+ * is relocatable if and only if it could be laundered or reclaimed by
+ * the page daemon. Whenever possible, a virtual page is relocated to a
+ * physical address above "high".
+ *
+ * Returns 0 if every physical page within the run was already free or
+ * just freed by a successful relocation. Otherwise, returns a non-zero
+ * value indicating why the last attempt to relocate a virtual page was
+ * unsuccessful.
+ *
+ * "req_class" must be an allocation class.
+ */
+static int
+vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+ vm_paddr_t high)
+{
+ struct mtx *m_mtx;
+ struct spglist free;
+ vm_object_t object;
+ vm_paddr_t pa;
+ vm_page_t m, m_end, m_new;
+ int error, order, req;
+
+ KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
+ ("req_class is not an allocation class"));
+ SLIST_INIT(&free);
+ error = 0;
+ m = m_run;
+ m_end = m_run + npages;
+ m_mtx = NULL;
+ for (; error == 0 && m < m_end; m++) {
+ KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
+ ("page %p is PG_FICTITIOUS or PG_MARKER", m));
+
+ /*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+ vm_page_change_lock(m, &m_mtx);
+retry:
+ if (m->wire_count != 0 || m->hold_count != 0)
+ error = EBUSY;
+ else if ((object = m->object) != NULL) {
+ /*
+ * The page is relocated if and only if it could be
+ * laundered or reclaimed by the page daemon.
+ */
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ mtx_unlock(m_mtx);
+ VM_OBJECT_WLOCK(object);
+ mtx_lock(m_mtx);
+ if (m->object != object) {
+ /*
+ * The page may have been freed.
+ */
+ VM_OBJECT_WUNLOCK(object);
+ goto retry;
+ } else if (m->wire_count != 0 ||
+ m->hold_count != 0) {
+ error = EBUSY;
+ goto unlock;
+ }
+ }
+ KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+ ("page %p is PG_UNHOLDFREE", m));
+ /* Don't care: PG_NODUMP, PG_ZERO. */
+ if (object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP &&
+ object->type != OBJT_VNODE)
+ error = EINVAL;
+ else if (object->memattr != VM_MEMATTR_DEFAULT)
+ error = EINVAL;
+ else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
+ KASSERT(pmap_page_get_memattr(m) ==
+ VM_MEMATTR_DEFAULT,
+ ("page %p has an unexpected memattr", m));
+ KASSERT((m->oflags & (VPO_SWAPINPROG |
+ VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+ ("page %p has unexpected oflags", m));
+ /* Don't care: VPO_NOSYNC. */
+ if (m->valid != 0) {
+ /*
+ * First, try to allocate a new page
+ * that is above "high". Failing
+ * that, try to allocate a new page
+ * that is below "m_run". Allocate
+ * the new page between the end of
+ * "m_run" and "high" only as a last
+ * resort.
+ */
+ req = req_class | VM_ALLOC_NOOBJ;
+ if ((m->flags & PG_NODUMP) != 0)
+ req |= VM_ALLOC_NODUMP;
+ if (trunc_page(high) !=
+ ~(vm_paddr_t)PAGE_MASK) {
+ m_new = vm_page_alloc_contig(
+ NULL, 0, req, 1,
+ round_page(high),
+ ~(vm_paddr_t)0,
+ PAGE_SIZE, 0,
+ VM_MEMATTR_DEFAULT);
+ } else
+ m_new = NULL;
+ if (m_new == NULL) {
+ pa = VM_PAGE_TO_PHYS(m_run);
+ m_new = vm_page_alloc_contig(
+ NULL, 0, req, 1,
+ 0, pa - 1, PAGE_SIZE, 0,
+ VM_MEMATTR_DEFAULT);
+ }
+ if (m_new == NULL) {
+ pa += ptoa(npages);
+ m_new = vm_page_alloc_contig(
+ NULL, 0, req, 1,
+ pa, high, PAGE_SIZE, 0,
+ VM_MEMATTR_DEFAULT);
+ }
+ if (m_new == NULL) {
+ error = ENOMEM;
+ goto unlock;
+ }
+ KASSERT(m_new->wire_count == 0,
+ ("page %p is wired", m_new));
+
+ /*
+ * Replace "m" with the new page. For
+ * vm_page_replace(), "m" must be busy
+ * and dequeued. Finally, change "m"
+ * as if vm_page_free() was called.
+ */
+ if (object->ref_count != 0)
+ pmap_remove_all(m);
+ m_new->aflags = m->aflags;
+ KASSERT(m_new->oflags == VPO_UNMANAGED,
+ ("page %p is managed", m_new));
+ m_new->oflags = m->oflags & VPO_NOSYNC;
+ pmap_copy_page(m, m_new);
+ m_new->valid = m->valid;
+ m_new->dirty = m->dirty;
+ m->flags &= ~PG_ZERO;
+ vm_page_xbusy(m);
+ vm_page_remque(m);
+ vm_page_replace_checked(m_new, object,
+ m->pindex, m);
+ m->valid = 0;
+ vm_page_undirty(m);
+
+ /*
+ * The new page must be deactivated
+ * before the object is unlocked.
+ */
+ vm_page_change_lock(m_new, &m_mtx);
+ vm_page_deactivate(m_new);
+ } else {
+ m->flags &= ~PG_ZERO;
+ vm_page_remque(m);
+ vm_page_remove(m);
+ KASSERT(m->dirty == 0,
+ ("page %p is dirty", m));
+ }
+ SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
+ } else
+ error = EBUSY;
+unlock:
+ VM_OBJECT_WUNLOCK(object);
+ } else {
+ mtx_lock(&vm_page_queue_free_mtx);
+ order = m->order;
+ if (order < VM_NFREEORDER) {
+ /*
+ * The page is enqueued in the physical memory
+ * allocator's free page queues. Moreover, it
+ * is the first page in a power-of-two-sized
+ * run of contiguous free pages. Jump ahead
+ * to the last page within that run, and
+ * continue from there.
+ */
+ m += (1 << order) - 1;
+ }
+#if VM_NRESERVLEVEL > 0
+ else if (vm_reserv_is_page_free(m))
+ order = 0;
+#endif
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (order == VM_NFREEORDER)
+ error = EINVAL;
+ }
+ }
+ if (m_mtx != NULL)
+ mtx_unlock(m_mtx);
+ if ((m = SLIST_FIRST(&free)) != NULL) {
+ mtx_lock(&vm_page_queue_free_mtx);
+ do {
+ SLIST_REMOVE_HEAD(&free, plinks.s.ss);
+ vm_page_free_phys(m);
+ } while ((m = SLIST_FIRST(&free)) != NULL);
+ vm_page_zero_idle_wakeup();
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
+ }
+ return (error);
+}
+
+#define NRUNS 16
+
+CTASSERT(powerof2(NRUNS));
+
+#define RUN_INDEX(count) ((count) & (NRUNS - 1))
+
+#define MIN_RECLAIM 8
+
+/*
+ * vm_page_reclaim_contig:
+ *
+ * Reclaim allocated, contiguous physical memory satisfying the specified
+ * conditions by relocating the virtual pages using that physical memory.
+ * Returns true if reclamation is successful and false otherwise. Since
+ * relocation requires the allocation of physical pages, reclamation may
+ * fail due to a shortage of free pages. When reclamation fails, callers
+ * are expected to perform VM_WAIT before retrying a failed allocation
+ * operation, e.g., vm_page_alloc_contig().
+ *
+ * The caller must always specify an allocation class through "req".
+ *
+ * allocation classes:
+ * VM_ALLOC_NORMAL normal process request
+ * VM_ALLOC_SYSTEM system *really* needs a page
+ * VM_ALLOC_INTERRUPT interrupt time request
+ *
+ * The optional allocation flags are ignored.
+ *
+ * "npages" must be greater than zero. Both "alignment" and "boundary"
+ * must be a power of two.
+ */
+bool
+vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary)
+{
+ vm_paddr_t curr_low;
+ vm_page_t m_run, m_runs[NRUNS];
+ u_long count, reclaimed;
+ int error, i, options, req_class;
+
+ KASSERT(npages > 0, ("npages is 0"));
+ KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+ KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+ req_class = req & VM_ALLOC_CLASS_MASK;
+
+ /*
+ * The page daemon is allowed to dig deeper into the free page list.
+ */
+ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+ req_class = VM_ALLOC_SYSTEM;
+
+ /*
+ * Return if the number of free pages cannot satisfy the requested
+ * allocation.
+ */
+ count = vm_cnt.v_free_count;
+ if (count < npages + vm_cnt.v_free_reserved || (count < npages +
+ vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
+ (count < npages && req_class == VM_ALLOC_INTERRUPT))
+ return (false);
+
+ /*
+ * Scan up to three times, relaxing the restrictions ("options") on
+ * the reclamation of reservations and superpages each time.
+ */
+ for (options = VPSC_NORESERV;;) {
+ /*
+ * Find the highest runs that satisfy the given constraints
+ * and restrictions, and record them in "m_runs".
+ */
+ curr_low = low;
+ count = 0;
+ for (;;) {
+ m_run = vm_phys_scan_contig(npages, curr_low, high,
+ alignment, boundary, options);
+ if (m_run == NULL)
+ break;
+ curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
+ m_runs[RUN_INDEX(count)] = m_run;
+ count++;
+ }
+
+ /*
+ * Reclaim the highest runs in LIFO (descending) order until
+ * the number of reclaimed pages, "reclaimed", is at least
+ * MIN_RECLAIM. Reset "reclaimed" each time because each
+ * reclamation is idempotent, and runs will (likely) recur
+ * from one scan to the next as restrictions are relaxed.
+ */
+ reclaimed = 0;
+ for (i = 0; count > 0 && i < NRUNS; i++) {
+ count--;
+ m_run = m_runs[RUN_INDEX(count)];
+ error = vm_page_reclaim_run(req_class, npages, m_run,
+ high);
+ if (error == 0) {
+ reclaimed += npages;
+ if (reclaimed >= MIN_RECLAIM)
+ return (true);
+ }
+ }
+
+ /*
+ * Either relax the restrictions on the next scan or return if
+ * the last scan had no restrictions.
+ */
+ if (options == VPSC_NORESERV)
+ options = VPSC_NOSUPER;
+ else if (options == VPSC_NOSUPER)
+ options = VPSC_ANY;
+ else if (options == VPSC_ANY)
+ return (reclaimed != 0);
+ }
+}
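
The comment above expects callers to perform VM_WAIT before retrying a failed allocation; the following sketch (illustrative only: the wrapper name, the 4 GB ceiling, and the retry limit are invented) shows that call pattern around vm_page_alloc_contig():

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>	/* VM_WAIT */

/*
 * Illustrative sketch: allocate "npages" wired, unmanaged, physically
 * contiguous pages below 4 GB, reclaiming and waiting as needed.
 */
static vm_page_t
alloc_contig_retry(u_long npages)
{
	vm_page_t m;
	int req, tries;

	req = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
	tries = 0;
retry:
	m = vm_page_alloc_contig(NULL, 0, req, npages, 0,
	    (vm_paddr_t)0xffffffff, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
	if (m == NULL && tries < 3) {
		if (!vm_page_reclaim_contig(req, npages, 0,
		    (vm_paddr_t)0xffffffff, PAGE_SIZE, 0))
			VM_WAIT;	/* Nothing reclaimed; wait for frees. */
		tries++;
		goto retry;
	}
	return (m);
}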
+
+/*
* vm_wait: (also see VM_WAIT macro)
*
* Sleep until free pages are available for allocation.
* - Called in various places before memory allocations.
*/
-void
-vm_wait(void)
+static void
+_vm_wait(void)
{
- mtx_lock(&vm_page_queue_free_mtx);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
if (curproc == pageproc) {
vm_pageout_pages_needed = 1;
msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
PDROP | PSWP, "VMWait", 0);
} else {
- if (!vm_pages_needed) {
- vm_pages_needed = 1;
- wakeup(&vm_pages_needed);
- }
- msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
- "vmwait", 0);
+ if (pageproc == NULL)
+ panic("vm_wait in early boot");
+ pagedaemon_wait(PVM, "vmwait");
}
}
+void
+vm_wait(void)
+{
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ _vm_wait();
+}
+
/*
+ * vm_page_alloc_fail:
+ *
+ * Called when a page allocation function fails. Informs the
+ * pagedaemon and performs the requested wait. Requires the
+ * page_queue_free and object lock on entry. Returns with the
+ * object lock held and free lock released. Returns an error when
+ * retry is necessary.
+ *
+ */
+static int
+vm_page_alloc_fail(vm_object_t object, int req)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ atomic_add_int(&vm_pageout_deficit,
+ max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+ if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
+ if (object != NULL)
+ VM_OBJECT_WUNLOCK(object);
+ _vm_wait();
+ if (object != NULL)
+ VM_OBJECT_WLOCK(object);
+ if (req & VM_ALLOC_WAITOK)
+ return (EAGAIN);
+ } else {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ pagedaemon_wakeup();
+ }
+ return (0);
+}
+
+/*
* vm_waitpfault: (also see VM_WAITPFAULT macro)
*
* Sleep until free pages are available for allocation.
@@ -2088,12 +2656,7 @@
{
mtx_lock(&vm_page_queue_free_mtx);
- if (!vm_pages_needed) {
- vm_pages_needed = 1;
- wakeup(&vm_pages_needed);
- }
- msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
- "pfault", 0);
+ pagedaemon_wait(PUSER, "pfault");
}
struct vm_pagequeue *
@@ -2100,7 +2663,10 @@
vm_page_pagequeue(vm_page_t m)
{
- return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+ if (vm_page_in_laundry(m))
+ return (&vm_dom[0].vmd_pagequeues[m->queue]);
+ else
+ return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
}
/*
@@ -2115,9 +2681,9 @@
{
struct vm_pagequeue *pq;
- vm_page_lock_assert(m, MA_OWNED);
- KASSERT(m->queue != PQ_NONE,
- ("vm_page_dequeue: page %p is not queued", m));
+ vm_page_assert_locked(m);
+ KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
+ m));
pq = vm_page_pagequeue(m);
vm_pagequeue_lock(pq);
m->queue = PQ_NONE;
@@ -2154,12 +2720,18 @@
* The page must be locked.
*/
static void
-vm_page_enqueue(int queue, vm_page_t m)
+vm_page_enqueue(uint8_t queue, vm_page_t m)
{
struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
- pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+ KASSERT(queue < PQ_COUNT,
+ ("vm_page_enqueue: invalid queue %u request for page %p",
+ queue, m));
+ if (queue == PQ_LAUNDRY)
+ pq = &vm_dom[0].vmd_pagequeues[queue];
+ else
+ pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
@@ -2243,13 +2815,12 @@
/*
* vm_page_free_wakeup:
*
- * Helper routine for vm_page_free_toq() and vm_page_cache(). This
- * routine is called when a page has been added to the cache or free
- * queues.
+ * Helper routine for vm_page_free_toq(). This routine is called
+ * when a page is added to the free queues.
*
* The page queues must be locked.
*/
-static inline void
+static void
vm_page_free_wakeup(void)
{
@@ -2259,7 +2830,7 @@
* some free.
*/
if (vm_pageout_pages_needed &&
- cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
+ vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
wakeup(&vm_pageout_pages_needed);
vm_pageout_pages_needed = 0;
}
@@ -2269,45 +2840,36 @@
* lots of memory. this process will swapin processes.
*/
if (vm_pages_needed && !vm_page_count_min()) {
- vm_pages_needed = 0;
- wakeup(&cnt.v_free_count);
+ vm_pages_needed = false;
+ wakeup(&vm_cnt.v_free_count);
}
}
/*
- * Turn a cached page into a free page, by changing its attributes.
- * Keep the statistics up-to-date.
+ * vm_page_free_prep:
*
- * The free page queue must be locked.
- */
-static void
-vm_page_cache_turn_free(vm_page_t m)
-{
-
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-
- m->object = NULL;
- m->valid = 0;
- /* Clear PG_CACHED and set PG_FREE. */
- m->flags ^= PG_CACHED | PG_FREE;
- KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
- ("vm_page_cache_free: page %p has inconsistent flags", m));
- cnt.v_cache_count--;
- vm_phys_freecnt_adj(m, 1);
-}
-
-/*
- * vm_page_free_toq:
+ * Prepares the given page to be put on the free list,
+ * disassociating it from any VM object. The caller may return
+ * the page to the free list only if this function returns true.
*
- * Returns the given page to the free list,
- * disassociating it with any VM object.
- *
- * The object must be locked. The page must be locked if it is managed.
+ * The object must be locked. The page must be locked if it is
+ * managed. For a queued managed page, the pagequeue_locked
+ * argument specifies whether the page queue is already locked.
*/
-void
-vm_page_free_toq(vm_page_t m)
+bool
+vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
{
+#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
+ if ((m->flags & PG_ZERO) != 0) {
+ uint64_t *p;
+ int i;
+ p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+ for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
+ KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
+ m, i, (uintmax_t)*p));
+ }
+#endif
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
@@ -2317,9 +2879,7 @@
("vm_page_free_toq: unmanaged page %p is queued", m));
PCPU_INC(cnt.v_tfree);
- if (VM_PAGE_IS_FREE(m))
- panic("vm_page_free: freeing free page %p", m);
- else if (vm_page_sbusied(m))
+ if (vm_page_sbusied(m))
panic("vm_page_free: freeing busy page %p", m);
/*
@@ -2328,7 +2888,12 @@
* callback routine until after we've put the page on the
* appropriate free queue.
*/
- vm_page_remque(m);
+ if (m->queue != PQ_NONE) {
+ if (pagequeue_locked)
+ vm_page_dequeue_locked(m);
+ else
+ vm_page_dequeue(m);
+ }
vm_page_remove(m);
/*
@@ -2335,9 +2900,8 @@
* If fictitious remove object association and
* return, otherwise delay object association removal.
*/
- if ((m->flags & PG_FICTITIOUS) != 0) {
- return;
- }
+ if ((m->flags & PG_FICTITIOUS) != 0)
+ return (false);
m->valid = 0;
vm_page_undirty(m);
@@ -2349,36 +2913,75 @@
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
- } else {
- /*
- * Restore the default memory attribute to the page.
- */
- if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
- pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+ return (false);
+ }
- /*
- * Insert the page into the physical memory allocator's
- * cache/free page queues.
- */
- mtx_lock(&vm_page_queue_free_mtx);
- m->flags |= PG_FREE;
- vm_phys_freecnt_adj(m, 1);
+ /*
+ * Restore the default memory attribute to the page.
+ */
+ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
+ pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+
+ return (true);
+}
+
+/*
+ * Insert the page into the physical memory allocator's free page
+ * queues. This is the last step to free a page.
+ */
+static void
+vm_page_free_phys(vm_page_t m)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ vm_phys_freecnt_adj(m, 1);
#if VM_NRESERVLEVEL > 0
- if (!vm_reserv_free_page(m))
-#else
- if (TRUE)
+ if (!vm_reserv_free_page(m))
#endif
vm_phys_free_pages(m, 0);
- if ((m->flags & PG_ZERO) != 0)
- ++vm_page_zero_count;
- else
- vm_page_zero_idle_wakeup();
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
- }
+ if ((m->flags & PG_ZERO) != 0)
+ ++vm_page_zero_count;
+ else
+ vm_page_zero_idle_wakeup();
}
+void
+vm_page_free_phys_pglist(struct pglist *tq)
+{
+ vm_page_t m;
+
+ if (TAILQ_EMPTY(tq))
+ return;
+ mtx_lock(&vm_page_queue_free_mtx);
+ TAILQ_FOREACH(m, tq, listq)
+ vm_page_free_phys(m);
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
+}
+
/*
+ * vm_page_free_toq:
+ *
+ * Returns the given page to the free list, disassociating it
+ * from any VM object.
+ *
+ * The object must be locked. The page must be locked if it is
+ * managed.
+ */
+void
+vm_page_free_toq(vm_page_t m)
+{
+
+ if (!vm_page_free_prep(m, false))
+ return;
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_page_free_phys(m);
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
+}
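
With the free path split into vm_page_free_prep() and vm_page_free_phys(), a batch of pages can be freed with a single acquisition of the free page queue mutex. A minimal sketch, assuming the caller already holds the object and page locks that vm_page_free_toq() would require (the helper name and the page array are invented):

#include <sys/param.h>
#include <sys/queue.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

/*
 * Illustrative sketch: free a batch of pages, taking the free page
 * queue mutex only once via vm_page_free_phys_pglist().
 */
static void
free_page_batch(vm_page_t *pages, int count)
{
	struct pglist pgl;
	vm_page_t m;
	int i;

	TAILQ_INIT(&pgl);
	for (i = 0; i < count; i++) {
		m = pages[i];
		/* Only pages that pass the prep step may be queued. */
		if (vm_page_free_prep(m, false))
			TAILQ_INSERT_TAIL(&pgl, m, listq);
	}
	vm_page_free_phys_pglist(&pgl);
}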
+
+/*
* vm_page_wire:
*
* Mark this page as wired down by yet
@@ -2410,7 +3013,7 @@
m->queue == PQ_NONE,
("vm_page_wire: unmanaged page %p is queued", m));
vm_page_remque(m);
- atomic_add_int(&cnt.v_wire_count, 1);
+ atomic_add_int(&vm_cnt.v_wire_count, 1);
}
m->wire_count++;
KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
@@ -2419,41 +3022,43 @@
/*
* vm_page_unwire:
*
- * Release one wiring of the specified page, potentially enabling it to be
- * paged again. If paging is enabled, then the value of the parameter
- * "activate" determines to which queue the page is added. If "activate" is
- * non-zero, then the page is added to the active queue. Otherwise, it is
- * added to the inactive queue.
+ * Release one wiring of the specified page, potentially allowing it to be
+ * paged out. Returns TRUE if the number of wirings transitions to zero and
+ * FALSE otherwise.
*
- * However, unless the page belongs to an object, it is not enqueued because
- * it cannot be paged out.
+ * Only managed pages belonging to an object can be paged out. If the number
+ * of wirings transitions to zero and the page is eligible for page out, then
+ * the page is added to the specified paging queue (unless PQ_NONE is
+ * specified).
*
* If a page is fictitious, then its wire count must always be one.
*
* A managed page must be locked.
*/
-void
-vm_page_unwire(vm_page_t m, int activate)
+boolean_t
+vm_page_unwire(vm_page_t m, uint8_t queue)
{
+ KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
+ ("vm_page_unwire: invalid queue %u request for page %p",
+ queue, m));
if ((m->oflags & VPO_UNMANAGED) == 0)
- vm_page_lock_assert(m, MA_OWNED);
+ vm_page_assert_locked(m);
if ((m->flags & PG_FICTITIOUS) != 0) {
KASSERT(m->wire_count == 1,
("vm_page_unwire: fictitious page %p's wire count isn't one", m));
- return;
+ return (FALSE);
}
if (m->wire_count > 0) {
m->wire_count--;
if (m->wire_count == 0) {
- atomic_subtract_int(&cnt.v_wire_count, 1);
- if ((m->oflags & VPO_UNMANAGED) != 0 ||
- m->object == NULL)
- return;
- if (!activate)
- m->flags &= ~PG_WINATCFLS;
- vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
- }
+ atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+ if ((m->oflags & VPO_UNMANAGED) == 0 &&
+ m->object != NULL && queue != PQ_NONE)
+ vm_page_enqueue(queue, m);
+ return (TRUE);
+ } else
+ return (FALSE);
} else
panic("vm_page_unwire: page %p's wire count is zero", m);
}
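
Since vm_page_unwire() now reports whether the last wiring was released, a caller can pair it with an immediate free once the page has no containing object. A hedged sketch of that idiom (the helper name is invented, and the caller's wider locking context is abbreviated):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <vm/vm.h>
#include <vm/vm_page.h>

/*
 * Illustrative sketch: drop one wiring; if that was the last wiring and
 * the page is no longer inserted in an object, free it outright.
 * PQ_NONE keeps vm_page_unwire() from requeuing the page.
 */
static void
unwire_and_maybe_free(vm_page_t m)
{

	vm_page_lock(m);
	if (vm_page_unwire(m, PQ_NONE) && m->object == NULL)
		vm_page_free(m);
	vm_page_unlock(m);
}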
@@ -2461,25 +3066,16 @@
/*
* Move the specified page to the inactive queue.
*
- * Many pages placed on the inactive queue should actually go
- * into the cache, but it is difficult to figure out which. What
- * we do instead, if the inactive target is well met, is to put
- * clean pages at the head of the inactive queue instead of the tail.
- * This will cause them to be moved to the cache more quickly and
- * if not actively re-referenced, reclaimed more quickly. If we just
- * stick these pages at the end of the inactive queue, heavy filesystem
- * meta-data accesses can cause an unnecessary paging load on memory bound
- * processes. This optimization causes one-time-use metadata to be
- * reused more quickly.
+ * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
+ * queue. However, setting "noreuse" to TRUE will accelerate the specified
+ * page's reclamation, but it will not unmap the page from any address space.
+ * This is implemented by inserting the page near the head of the inactive
+ * queue, using a marker page to guide FIFO insertion ordering.
*
- * Normally athead is 0 resulting in LRU operation. athead is set
- * to 1 if we want this page to be 'as if it were placed in the cache',
- * except without unmapping it from the process address space.
- *
* The page must be locked.
*/
static inline void
-_vm_page_deactivate(vm_page_t m, int athead)
+_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
{
struct vm_pagequeue *pq;
int queue;
@@ -2490,7 +3086,7 @@
* Ignore if the page is already inactive, unless it is unlikely to be
* reactivated.
*/
- if ((queue = m->queue) == PQ_INACTIVE && !athead)
+ if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
return;
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
@@ -2501,12 +3097,12 @@
} else {
if (queue != PQ_NONE)
vm_page_dequeue(m);
- m->flags &= ~PG_WINATCFLS;
vm_pagequeue_lock(pq);
}
m->queue = PQ_INACTIVE;
- if (athead)
- TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
+ if (noreuse)
+ TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
+ m, plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
@@ -2523,165 +3119,73 @@
vm_page_deactivate(vm_page_t m)
{
- _vm_page_deactivate(m, 0);
+ _vm_page_deactivate(m, FALSE);
}
/*
- * vm_page_try_to_cache:
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
*
- * Returns 0 on failure, 1 on success
+ * The page must be locked.
*/
-int
-vm_page_try_to_cache(vm_page_t m)
+void
+vm_page_deactivate_noreuse(vm_page_t m)
{
- vm_page_lock_assert(m, MA_OWNED);
- VM_OBJECT_ASSERT_WLOCKED(m->object);
- if (m->dirty || m->hold_count || m->wire_count ||
- (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
- return (0);
- pmap_remove_all(m);
- if (m->dirty)
- return (0);
- vm_page_cache(m);
- return (1);
+ _vm_page_deactivate(m, TRUE);
}
/*
- * vm_page_try_to_free()
+ * vm_page_launder
*
- * Attempt to free the page. If we cannot free it, we do nothing.
- * 1 is returned on success, 0 on failure.
+ * Put a page in the laundry.
*/
-int
-vm_page_try_to_free(vm_page_t m)
+void
+vm_page_launder(vm_page_t m)
{
+ int queue;
- vm_page_lock_assert(m, MA_OWNED);
- if (m->object != NULL)
- VM_OBJECT_ASSERT_WLOCKED(m->object);
- if (m->dirty || m->hold_count || m->wire_count ||
- (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
- return (0);
- pmap_remove_all(m);
- if (m->dirty)
- return (0);
- vm_page_free(m);
- return (1);
+ vm_page_assert_locked(m);
+ if ((queue = m->queue) != PQ_LAUNDRY) {
+ if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
+ if (queue != PQ_NONE)
+ vm_page_dequeue(m);
+ vm_page_enqueue(PQ_LAUNDRY, m);
+ } else
+ KASSERT(queue == PQ_NONE,
+ ("wired page %p is queued", m));
+ }
}
/*
- * vm_page_cache
+ * vm_page_try_to_free()
*
- * Put the specified page onto the page cache queue (if appropriate).
- *
- * The object and page must be locked.
+ * Attempt to free the page. If we cannot free it, we do nothing.
+ * true is returned on success, false on failure.
*/
-void
-vm_page_cache(vm_page_t m)
+bool
+vm_page_try_to_free(vm_page_t m)
{
- vm_object_t object;
- boolean_t cache_was_empty;
- vm_page_lock_assert(m, MA_OWNED);
- object = m->object;
- VM_OBJECT_ASSERT_WLOCKED(object);
- if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
- m->hold_count || m->wire_count)
- panic("vm_page_cache: attempting to cache busy page");
- KASSERT(!pmap_page_is_mapped(m),
- ("vm_page_cache: page %p is mapped", m));
- KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
- if (m->valid == 0 || object->type == OBJT_DEFAULT ||
- (object->type == OBJT_SWAP &&
- !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
- /*
- * Hypothesis: A cache-elgible page belonging to a
- * default object or swap object but without a backing
- * store must be zero filled.
- */
- vm_page_free(m);
- return;
+ vm_page_assert_locked(m);
+ if (m->object != NULL)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
+ (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
+ return (false);
+ if (m->object != NULL && m->object->ref_count != 0) {
+ pmap_remove_all(m);
+ if (m->dirty != 0)
+ return (false);
}
- KASSERT((m->flags & PG_CACHED) == 0,
- ("vm_page_cache: page %p is already cached", m));
-
- /*
- * Remove the page from the paging queues.
- */
- vm_page_remque(m);
-
- /*
- * Remove the page from the object's collection of resident
- * pages.
- */
- vm_radix_remove(&object->rtree, m->pindex);
- TAILQ_REMOVE(&object->memq, m, listq);
- object->resident_page_count--;
-
- /*
- * Restore the default memory attribute to the page.
- */
- if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
- pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
-
- /*
- * Insert the page into the object's collection of cached pages
- * and the physical memory allocator's cache/free page queues.
- */
- m->flags &= ~PG_ZERO;
- mtx_lock(&vm_page_queue_free_mtx);
- cache_was_empty = vm_radix_is_empty(&object->cache);
- if (vm_radix_insert(&object->cache, m)) {
- mtx_unlock(&vm_page_queue_free_mtx);
- if (object->type == OBJT_VNODE &&
- object->resident_page_count == 0)
- vdrop(object->handle);
- m->object = NULL;
- vm_page_free(m);
- return;
- }
-
- /*
- * The above call to vm_radix_insert() could reclaim the one pre-
- * existing cached page from this object, resulting in a call to
- * vdrop().
- */
- if (!cache_was_empty)
- cache_was_empty = vm_radix_is_singleton(&object->cache);
-
- m->flags |= PG_CACHED;
- cnt.v_cache_count++;
- PCPU_INC(cnt.v_tcached);
-#if VM_NRESERVLEVEL > 0
- if (!vm_reserv_free_page(m)) {
-#else
- if (TRUE) {
-#endif
- vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
- vm_phys_free_pages(m, 0);
- }
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
-
- /*
- * Increment the vnode's hold count if this is the object's only
- * cached page. Decrement the vnode's hold count if this was
- * the object's only resident page.
- */
- if (object->type == OBJT_VNODE) {
- if (cache_was_empty && object->resident_page_count != 0)
- vhold(object->handle);
- else if (!cache_was_empty && object->resident_page_count == 0)
- vdrop(object->handle);
- }
+ vm_page_free(m);
+ return (true);
}
/*
* vm_page_advise
*
- * Deactivate or do nothing, as appropriate. This routine is used
- * by madvise() and vop_stdadvise().
+ * Apply the specified advice to the given page.
*
* The object and page must be locked.
*/
@@ -2694,20 +3198,16 @@
if (advice == MADV_FREE)
/*
* Mark the page clean. This will allow the page to be freed
- * up by the system. However, such pages are often reused
- * quickly by malloc() so we do not do anything that would
- * cause a page fault if we can help it.
- *
- * Specifically, we do not try to actually free the page now
- * nor do we try to put it in the cache (which would cause a
- * page fault on reuse).
- *
- * But we do make the page as freeable as we can without
- * actually taking the step of unmapping it.
+ * without first paging it out. MADV_FREE pages are often
+ * quickly reused by malloc(3), so we do not do anything that
+ * would result in a page fault on a later access.
*/
vm_page_undirty(m);
- else if (advice != MADV_DONTNEED)
+ else if (advice != MADV_DONTNEED) {
+ if (advice == MADV_WILLNEED)
+ vm_page_activate(m);
return;
+ }
/*
* Clear any references to the page. Otherwise, the page daemon will
@@ -2719,11 +3219,15 @@
vm_page_dirty(m);
/*
- * Place clean pages at the head of the inactive queue rather than the
- * tail, thus defeating the queue's LRU operation and ensuring that the
- * page will be reused quickly.
+ * Place clean pages near the head of the inactive queue rather than
+ * the tail, thus defeating the queue's LRU operation and ensuring that
+ * the page will be reused quickly. Dirty pages not already in the
+ * laundry are moved there.
*/
- _vm_page_deactivate(m, m->dirty == 0);
+ if (m->dirty == 0)
+ vm_page_deactivate_noreuse(m);
+ else
+ vm_page_launder(m);
}
/*
@@ -2742,16 +3246,23 @@
{
vm_page_t m;
int sleep;
+ int pflags;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
(allocflags & VM_ALLOC_IGN_SBUSY) != 0,
("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
+ pflags = allocflags &
+ ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+ if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+ pflags |= VM_ALLOC_WAITFAIL;
retrylookup:
if ((m = vm_page_lookup(object, pindex)) != NULL) {
sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
vm_page_xbusied(m) : vm_page_busied(m);
if (sleep) {
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ return (NULL);
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
@@ -2778,14 +3289,12 @@
return (m);
}
}
- m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
+ m = vm_page_alloc(object, pindex, pflags);
if (m == NULL) {
- VM_OBJECT_WUNLOCK(object);
- VM_WAIT;
- VM_OBJECT_WLOCK(object);
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ return (NULL);
goto retrylookup;
- } else if (m->valid != 0)
- return (m);
+ }
if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
return (m);
@@ -2792,6 +3301,114 @@
}
/*
+ * Return the specified range of pages from the given object. For each
+ * page offset within the range, if a page already exists within the object
+ * at that offset and it is busy, then wait for it to change state. If,
+ * instead, the page doesn't exist, then allocate it.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ * VM_ALLOC_NORMAL normal process request
+ * VM_ALLOC_SYSTEM system *really* needs the pages
+ *
+ * The caller must always specify that the pages are to be busied and/or
+ * wired.
+ *
+ * optional allocation flags:
+ * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages
+ * VM_ALLOC_NOBUSY do not exclusive busy the page
+ * VM_ALLOC_NOWAIT do not sleep
+ * VM_ALLOC_SBUSY set page to sbusy state
+ * VM_ALLOC_WIRED wire the pages
+ * VM_ALLOC_ZERO zero and validate any invalid pages
+ *
+ * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it
+ * may return a partial prefix of the requested range.
+ */
+int
+vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+ vm_page_t *ma, int count)
+{
+ vm_page_t m, mpred;
+ int pflags;
+ int i;
+ bool sleep;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
+ ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed"));
+ KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
+ (allocflags & VM_ALLOC_WIRED) != 0,
+ ("vm_page_grab_pages: the pages must be busied or wired"));
+ KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+ (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+ ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
+ if (count == 0)
+ return (0);
+ pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
+ VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY);
+ if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+ pflags |= VM_ALLOC_WAITFAIL;
+ i = 0;
+retrylookup:
+ m = vm_radix_lookup_le(&object->rtree, pindex + i);
+ if (m == NULL || m->pindex != pindex + i) {
+ mpred = m;
+ m = NULL;
+ } else
+ mpred = TAILQ_PREV(m, pglist, listq);
+ for (; i < count; i++) {
+ if (m != NULL) {
+ sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+ vm_page_xbusied(m) : vm_page_busied(m);
+ if (sleep) {
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ break;
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(m, "grbmaw", (allocflags &
+ VM_ALLOC_IGN_SBUSY) != 0);
+ VM_OBJECT_WLOCK(object);
+ goto retrylookup;
+ }
+ if ((allocflags & VM_ALLOC_WIRED) != 0) {
+ vm_page_lock(m);
+ vm_page_wire(m);
+ vm_page_unlock(m);
+ }
+ if ((allocflags & (VM_ALLOC_NOBUSY |
+ VM_ALLOC_SBUSY)) == 0)
+ vm_page_xbusy(m);
+ if ((allocflags & VM_ALLOC_SBUSY) != 0)
+ vm_page_sbusy(m);
+ } else {
+ m = vm_page_alloc_after(object, pindex + i,
+ pflags | VM_ALLOC_COUNT(count - i), mpred);
+ if (m == NULL) {
+ if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+ break;
+ goto retrylookup;
+ }
+ }
+ if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
+ if ((m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ m->valid = VM_PAGE_BITS_ALL;
+ }
+ ma[i] = mpred = m;
+ m = vm_page_next(m);
+ }
+ return (i);
+}
+
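
The new vm_page_grab_pages() returns a partial prefix of the requested run only when VM_ALLOC_NOWAIT is given; otherwise it sleeps until the whole range is available. A minimal caller sketch under that assumption; the function name and flag mix below are illustrative, not taken from this commit:

	/*
	 * Illustrative caller: grab, wire, and zero-validate a run of
	 * exclusive-busied pages for I/O.  Because VM_ALLOC_NOWAIT is
	 * passed, the returned count may be short and the caller handles
	 * the shortfall.
	 */
	static int
	example_grab_run(vm_object_t object, vm_pindex_t start, vm_page_t *ma,
	    int npages)
	{
		int got;

		VM_OBJECT_WLOCK(object);
		got = vm_page_grab_pages(object, start, VM_ALLOC_NORMAL |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NOWAIT,
		    ma, npages);
		VM_OBJECT_WUNLOCK(object);
		return (got);		/* may be less than npages */
	}
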
+/*
* Mapping function for valid or dirty bits in a page.
*
* Inputs are required to range within a page.
@@ -2841,17 +3458,17 @@
* bit is clear, we have to zero out a portion of the
* first block.
*/
- if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+ if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
(m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
pmap_zero_page_area(m, frag, base - frag);
/*
- * If the ending offset is not DEV_BSIZE aligned and the
+ * If the ending offset is not DEV_BSIZE aligned and the
* valid bit is clear, we have to zero out a portion of
* the last block.
*/
endoff = base + size;
- if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+ if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
(m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
pmap_zero_page_area(m, endoff,
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
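
The rounddown2() conversions in this hunk are cosmetic; for a power-of-two block size the macro from sys/param.h expands to the same mask the old code wrote by hand. A small self-contained check, assuming the usual DEV_BSIZE of 512:

	#include <stdio.h>

	#define	DEV_BSIZE	512			/* the usual value */
	#define	rounddown2(x, y) ((x) & ~((y) - 1))	/* as in sys/param.h */

	int
	main(void)
	{
		int base = 700;

		/* Both expressions truncate to the containing 512-byte block. */
		printf("%d %d\n", rounddown2(base, DEV_BSIZE),
		    base & ~(DEV_BSIZE - 1));		/* prints "512 512" */
		return (0);
	}
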
@@ -2858,7 +3475,7 @@
/*
* Assert that no previously invalid block that is now being validated
- * is already dirty.
+ * is already dirty.
*/
KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
("vm_page_set_valid_range: page %p is dirty", m));
@@ -2948,17 +3565,17 @@
* bit is clear, we have to zero out a portion of the
* first block.
*/
- if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+ if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
(m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
pmap_zero_page_area(m, frag, base - frag);
/*
- * If the ending offset is not DEV_BSIZE aligned and the
+ * If the ending offset is not DEV_BSIZE aligned and the
* valid bit is clear, we have to zero out a portion of
* the last block.
*/
endoff = base + size;
- if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+ if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
(m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
pmap_zero_page_area(m, endoff,
DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
@@ -3050,12 +3667,12 @@
/*
* vm_page_zero_invalid()
*
- * The kernel assumes that the invalid portions of a page contain
+ * The kernel assumes that the invalid portions of a page contain
* garbage, but such pages can be mapped into memory by user code.
* When this occurs, we must zero out the non-valid portions of the
* page so user code sees what it expects.
*
- * Pages are most often semi-valid when the end of a file is mapped
+ * Pages are most often semi-valid when the end of a file is mapped
* into memory and the file's size is not page aligned.
*/
void
@@ -3072,10 +3689,10 @@
* vm_page_set_validclean().
*/
for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
- if (i == (PAGE_SIZE / DEV_BSIZE) ||
+ if (i == (PAGE_SIZE / DEV_BSIZE) ||
(m->valid & ((vm_page_bits_t)1 << i))) {
if (i > b) {
- pmap_zero_page_area(m,
+ pmap_zero_page_area(m,
b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
}
b = i + 1;
@@ -3109,16 +3726,19 @@
}
/*
- * vm_page_ps_is_valid:
- *
- * Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ * Returns true if all of the specified predicates are true for the entire
+ * (super)page and false otherwise.
*/
-boolean_t
-vm_page_ps_is_valid(vm_page_t m)
+bool
+vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
{
+ vm_object_t object;
int i, npages;
- VM_OBJECT_ASSERT_LOCKED(m->object);
+ object = m->object;
+ if (skip_m != NULL && skip_m->object != object)
+ return (false);
+ VM_OBJECT_ASSERT_LOCKED(object);
npages = atop(pagesizes[m->psind]);
/*
@@ -3127,10 +3747,28 @@
* occupy adjacent entries in vm_page_array[].
*/
for (i = 0; i < npages; i++) {
- if (m[i].valid != VM_PAGE_BITS_ALL)
- return (FALSE);
+ /* Always test object consistency, including "skip_m". */
+ if (m[i].object != object)
+ return (false);
+ if (&m[i] == skip_m)
+ continue;
+ if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
+ return (false);
+ if ((flags & PS_ALL_DIRTY) != 0) {
+ /*
+ * Calling vm_page_test_dirty() or pmap_is_modified()
+ * might stop this case from spuriously returning
+ * "false". However, that would require a write lock
+ * on the object containing "m[i]".
+ */
+ if (m[i].dirty != VM_PAGE_BITS_ALL)
+ return (false);
+ }
+ if ((flags & PS_ALL_VALID) != 0 &&
+ m[i].valid != VM_PAGE_BITS_ALL)
+ return (false);
}
- return (TRUE);
+ return (true);
}
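
vm_page_ps_test() generalizes the old all-valid check into a set of predicates evaluated over every base page of the superpage, with an optional page to skip. A sketch of how a caller might combine the predicates; the helper below is illustrative, not from this commit:

	/*
	 * Illustrative helper: report whether the superpage-sized run
	 * starting at "m" is fully valid and has no busy base pages,
	 * e.g. before operating on it as a single unit.
	 */
	static bool
	example_superpage_ready(vm_object_t object, vm_page_t m)
	{
		bool ok;

		VM_OBJECT_RLOCK(object);
		ok = m->psind > 0 &&
		    vm_page_ps_test(m, PS_ALL_VALID | PS_NONE_BUSY, NULL);
		VM_OBJECT_RUNLOCK(object);
		return (ok);
	}
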
/*
@@ -3224,16 +3862,16 @@
DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
- db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
- db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
- db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
- db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
- db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
- db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
- db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
- db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
- db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
- db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
+
+ db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
+ db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
+ db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
+ db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
+ db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
+ db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
+ db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
+ db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
+ db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
}
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
@@ -3240,17 +3878,16 @@
{
int dom;
- db_printf("pq_free %d pq_cache %d\n",
- cnt.v_free_count, cnt.v_cache_count);
+ db_printf("pq_free %d\n", vm_cnt.v_free_count);
for (dom = 0; dom < vm_ndomains; dom++) {
db_printf(
- "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+ "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
dom,
vm_dom[dom].vmd_page_count,
vm_dom[dom].vmd_free_count,
vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
- vm_dom[dom].vmd_pass);
+ vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
}
}
@@ -3257,7 +3894,7 @@
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
{
vm_page_t m;
- boolean_t phys;
+ boolean_t phys, virt;
if (!have_addr) {
db_printf("show pginfo addr\n");
@@ -3265,7 +3902,10 @@
}
phys = strchr(modif, 'p') != NULL;
- if (phys)
+ virt = strchr(modif, 'v') != NULL;
+ if (virt)
+ m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
+ else if (phys)
m = PHYS_TO_VM_PAGE(addr);
else
m = (vm_page_t)addr;
Modified: trunk/sys/vm/vm_page.h
===================================================================
--- trunk/sys/vm/vm_page.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_page.h 332505 2018-04-14 17:41:54Z kib $
*/
/*
@@ -142,7 +142,7 @@
vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
vm_paddr_t phys_addr; /* physical address of page */
- struct md_page md; /* machine dependant stuff */
+ struct md_page md; /* machine dependent stuff */
u_int wire_count; /* wired down maps refs (P) */
volatile u_int busy_lock; /* busy owners lock */
uint16_t hold_count; /* page hold count (P) */
@@ -150,6 +150,7 @@
uint8_t aflags; /* access is atomic */
uint8_t oflags; /* page VPO_* flags (O) */
uint8_t queue; /* page queue index (P,Q) */
+ int8_t psind; /* pagesizes[] index (O) */
int8_t segind;
uint8_t order; /* index of the buddy queue */
uint8_t pool;
@@ -158,7 +159,6 @@
/* so, on normal X86 kernels, they must be at least 8 bits wide */
vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */
vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */
- int8_t psind; /* pagesizes[] index (O) */
};
/*
@@ -207,9 +207,13 @@
#define PQ_NONE 255
#define PQ_INACTIVE 0
#define PQ_ACTIVE 1
-#define PQ_COUNT 2
+#define PQ_LAUNDRY 2
+#define PQ_COUNT 3
+#ifndef VM_PAGE_HAVE_PGLIST
TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
SLIST_HEAD(spglist, vm_page);
struct vm_pagequeue {
@@ -227,10 +231,11 @@
u_int vmd_free_count;
long vmd_segs; /* bitmask of the segments */
boolean_t vmd_oom;
- int vmd_pass; /* local pagedaemon pass */
int vmd_oom_seq;
int vmd_last_active_scan;
+ struct vm_page vmd_laundry_marker;
struct vm_page vmd_marker; /* marker for pagedaemon private use */
+ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
};
extern struct vm_domain vm_dom[MAXMEMDOM];
@@ -237,6 +242,7 @@
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
+#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#ifdef _KERNEL
@@ -324,12 +330,9 @@
* Page flags. If changed at any other time than page allocation or
* freeing, the modification must be protected by the vm_page lock.
*/
-#define PG_CACHED 0x0001 /* page is cached */
-#define PG_FREE 0x0002 /* page is free */
#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */
#define PG_ZERO 0x0008 /* page is zeroed */
#define PG_MARKER 0x0010 /* special queue marker page */
-#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */
#define PG_NODUMP 0x0080 /* don't include this page in a dump */
#define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */
@@ -353,19 +356,16 @@
* free
* Available for allocation now.
*
- * cache
- * Almost available for allocation. Still associated with
- * an object, but clean and immediately freeable.
- *
- * The following lists are LRU sorted:
- *
* inactive
* Low activity, candidates for reclamation.
+ * This list is approximately LRU ordered.
+ *
+ * laundry
* This is the list of pages that should be
* paged out next.
*
* active
- * Pages that are "active" i.e. they have been
+ * Pages that are "active", i.e., they have been
* recently referenced.
*
*/
@@ -376,28 +376,51 @@
extern long vm_page_array_size; /* number of vm_page_t's */
extern long first_page; /* first physical page number */
-#define VM_PAGE_IS_FREE(m) (((m)->flags & PG_FREE) != 0)
-
#define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr)
+/*
+ * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory
+ * page to which the given physical address belongs. The correct vm_page_t
+ * object is returned for addresses that are not page-aligned.
+ */
vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
-/* page allocation classes: */
+/*
+ * Page allocation parameters for vm_page for the functions
+ * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and
+ * vm_page_alloc_freelist(). Some functions support only a subset
+ * of the flags, and ignore others, see the flags legend.
+ *
+ * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*()
+ * and the vm_page_grab*() functions. See these functions for details.
+ *
+ * Bits 0 - 1 define class.
+ * Bits 2 - 15 dedicated for flags.
+ * Legend:
+ * (a) - vm_page_alloc() supports the flag.
+ * (c) - vm_page_alloc_contig() supports the flag.
+ * (f) - vm_page_alloc_freelist() supports the flag.
+ * (g) - vm_page_grab() supports the flag.
+ * (p) - vm_page_grab_pages() supports the flag.
+ * Bits above 15 define the count of additional pages that the caller
+ * intends to allocate.
+ */
#define VM_ALLOC_NORMAL 0
#define VM_ALLOC_INTERRUPT 1
#define VM_ALLOC_SYSTEM 2
#define VM_ALLOC_CLASS_MASK 3
-/* page allocation flags: */
-#define VM_ALLOC_WIRED 0x0020 /* non pageable */
-#define VM_ALLOC_ZERO 0x0040 /* Try to obtain a zeroed page */
-#define VM_ALLOC_NOOBJ 0x0100 /* No associated object */
-#define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */
-#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */
-#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */
-#define VM_ALLOC_IGN_SBUSY 0x1000 /* vm_page_grab() only */
-#define VM_ALLOC_NODUMP 0x2000 /* don't include in dump */
-#define VM_ALLOC_SBUSY 0x4000 /* Shared busy the page */
-
+#define VM_ALLOC_WAITOK 0x0008 /* (acf) Sleep and retry */
+#define VM_ALLOC_WAITFAIL 0x0010 /* (acf) Sleep and return error */
+#define VM_ALLOC_WIRED 0x0020 /* (acfgp) Allocate a wired page */
+#define VM_ALLOC_ZERO 0x0040 /* (acfgp) Allocate a prezeroed page */
+#define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */
+#define VM_ALLOC_NOBUSY 0x0200 /* (acgp) Do not excl busy the page */
+#define VM_ALLOC_IFCACHED 0x0400
+#define VM_ALLOC_IFNOTCACHED 0x0800
+#define VM_ALLOC_IGN_SBUSY 0x1000 /* (gp) Ignore shared busy flag */
+#define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */
+#define VM_ALLOC_SBUSY 0x4000 /* (acgp) Shared busy the page */
+#define VM_ALLOC_NOWAIT 0x8000 /* (acfgp) Do not sleep */
#define VM_ALLOC_COUNT_SHIFT 16
#define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT)
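
Per the legend above, a request word is the OR of one class, any flags the target function supports, and an optional lookahead count. An illustrative composition for a caller that must not sleep; the helper name is hypothetical:

	/*
	 * Illustrative request: one class in the low two bits, flags in
	 * bits 2-15, and a count above bit 15 hinting that more
	 * allocations will follow.  Caller holds the object write lock.
	 */
	static int
	example_alloc(vm_object_t object, vm_pindex_t pindex, vm_page_t *mp)
	{
		vm_page_t m;

		m = vm_page_alloc(object, pindex, VM_ALLOC_SYSTEM |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NOWAIT |
		    VM_ALLOC_COUNT(8));
		if (m == NULL)
			return (ENOMEM);	/* NOWAIT: no sleep, back off */
		if ((m->flags & PG_ZERO) == 0)
			pmap_zero_page(m);	/* ZERO is only a preference here */
		*mp = m;
		return (0);
	}
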
@@ -416,10 +439,26 @@
pflags |= VM_ALLOC_ZERO;
if ((malloc_flags & M_NODUMP) != 0)
pflags |= VM_ALLOC_NODUMP;
+ if ((malloc_flags & M_NOWAIT))
+ pflags |= VM_ALLOC_NOWAIT;
+ if ((malloc_flags & M_WAITOK))
+ pflags |= VM_ALLOC_WAITOK;
return (pflags);
}
#endif
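
For reference, the translation that malloc2vm_flags() now performs on the malloc(9) wait flags, stated as a comment-only illustration:

	/*
	 * malloc(9) flag		resulting page-allocation flag
	 *
	 *	M_NOWAIT	->	VM_ALLOC_NOWAIT
	 *	M_WAITOK	->	VM_ALLOC_WAITOK
	 *	M_ZERO		->	VM_ALLOC_ZERO
	 *	M_NODUMP	->	VM_ALLOC_NODUMP
	 */
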
+/*
+ * Predicates supported by vm_page_ps_test():
+ *
+ * PS_ALL_DIRTY is true only if the entire (super)page is dirty.
+ * However, it can be spuriously false when the (super)page has become
+ * dirty in the pmap but that information has not been propagated to the
+ * machine-independent layer.
+ */
+#define PS_ALL_DIRTY 0x1
+#define PS_ALL_VALID 0x2
+#define PS_NONE_BUSY 0x4
+
void vm_page_busy_downgrade(vm_page_t m);
void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
void vm_page_flash(vm_page_t m);
@@ -430,33 +469,38 @@
void vm_page_activate (vm_page_t);
void vm_page_advise(vm_page_t m, int advice);
-vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t);
vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
+void vm_page_change_lock(vm_page_t m, struct mtx **mtx);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
-void vm_page_cache(vm_page_t);
-void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
-int vm_page_try_to_cache (vm_page_t);
-int vm_page_try_to_free (vm_page_t);
+int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+ vm_page_t *ma, int count);
void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
+void vm_page_free_phys_pglist(struct pglist *tq);
+bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
-boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
+void vm_page_launder(vm_page_t m);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
vm_page_t vm_page_prev(vm_page_t m);
-boolean_t vm_page_ps_is_valid(vm_page_t m);
+bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
+bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
+ vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
void vm_page_reference(vm_page_t m);
void vm_page_remove (vm_page_t);
int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
@@ -465,16 +509,20 @@
void vm_page_requeue(vm_page_t m);
void vm_page_requeue_locked(vm_page_t m);
int vm_page_sbusied(vm_page_t m);
+vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
+ vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
void vm_page_set_valid_range(vm_page_t m, int base, int size);
int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
void vm_page_sunbusy(vm_page_t m);
+bool vm_page_try_to_free(vm_page_t m);
int vm_page_trysbusy(vm_page_t m);
void vm_page_unhold_pages(vm_page_t *ma, int count);
-void vm_page_unwire (vm_page_t, int);
+boolean_t vm_page_unwire(vm_page_t m, uint8_t queue);
void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_wire (vm_page_t);
void vm_page_xunbusy_hard(vm_page_t m);
+void vm_page_xunbusy_maybelocked(vm_page_t m);
void vm_page_set_validclean (vm_page_t, int, int);
void vm_page_clear_dirty (vm_page_t, int, int);
void vm_page_set_invalid (vm_page_t, int, int);
@@ -497,17 +545,17 @@
#define vm_page_assert_sbusied(m) \
KASSERT(vm_page_sbusied(m), \
("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
- (void *)m, __FILE__, __LINE__));
+ (m), __FILE__, __LINE__))
#define vm_page_assert_unbusied(m) \
KASSERT(!vm_page_busied(m), \
("vm_page_assert_unbusied: page %p busy @ %s:%d", \
- (void *)m, __FILE__, __LINE__));
+ (m), __FILE__, __LINE__))
#define vm_page_assert_xbusied(m) \
KASSERT(vm_page_xbusied(m), \
("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
- (void *)m, __FILE__, __LINE__));
+ (m), __FILE__, __LINE__))
#define vm_page_busied(m) \
((m)->busy_lock != VPB_UNBUSIED)
@@ -514,22 +562,24 @@
#define vm_page_sbusy(m) do { \
if (!vm_page_trysbusy(m)) \
- panic("%s: page %p failed shared busing", __func__, m); \
+ panic("%s: page %p failed shared busying", __func__, \
+ (m)); \
} while (0)
#define vm_page_tryxbusy(m) \
- (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, \
+ (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \
VPB_SINGLE_EXCLUSIVER))
#define vm_page_xbusied(m) \
- ((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
+ (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
#define vm_page_xbusy(m) do { \
if (!vm_page_tryxbusy(m)) \
- panic("%s: page %p failed exclusive busing", __func__, \
- m); \
+ panic("%s: page %p failed exclusive busying", __func__, \
+ (m)); \
} while (0)
+/* Note: page m's lock must not be owned by the caller. */
#define vm_page_xunbusy(m) do { \
if (!atomic_cmpset_rel_int(&(m)->busy_lock, \
VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \
@@ -660,5 +710,41 @@
m->dirty = 0;
}
+static inline void
+vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
+ vm_page_t mold)
+{
+ vm_page_t mret;
+
+ mret = vm_page_replace(mnew, object, pindex);
+ KASSERT(mret == mold,
+ ("invalid page replacement, mold=%p, mret=%p", mold, mret));
+
+ /* Unused if !INVARIANTS. */
+ (void)mold;
+ (void)mret;
+}
+
+static inline bool
+vm_page_active(vm_page_t m)
+{
+
+ return (m->queue == PQ_ACTIVE);
+}
+
+static inline bool
+vm_page_inactive(vm_page_t m)
+{
+
+ return (m->queue == PQ_INACTIVE);
+}
+
+static inline bool
+vm_page_in_laundry(vm_page_t m)
+{
+
+ return (m->queue == PQ_LAUNDRY);
+}
+
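
The new inline predicates let callers test queue membership without comparing m->queue against PQ_* constants directly. A minimal sketch; the helper is hypothetical, and the page lock is held because it protects the queue field:

	/*
	 * Illustrative helper: pull a page that is sitting in the inactive
	 * or laundry queue back onto the active queue.
	 */
	static void
	example_mark_reused(vm_page_t m)
	{

		vm_page_lock(m);
		if (vm_page_inactive(m) || vm_page_in_laundry(m))
			vm_page_activate(m);
		vm_page_unlock(m);
	}
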
#endif /* _KERNEL */
#endif /* !_VM_PAGE_ */
Modified: trunk/sys/vm/vm_pageout.c
===================================================================
--- trunk/sys/vm/vm_pageout.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -74,10 +74,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pageout.c 331722 2018-03-29 02:50:57Z eadler $");
#include "opt_vm.h"
-#include "opt_kdtrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -120,8 +120,9 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static void vm_pageout_init(void);
-static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static int vm_pageout_clean(vm_page_t m, int *numpagedout);
+static int vm_pageout_cluster(vm_page_t m);
+static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
int starting_page_shortage);
@@ -139,82 +140,49 @@
&page_kp);
SDT_PROVIDER_DEFINE(vm);
-SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
-#if !defined(NO_SWAPPING)
-/* the kernel process "vm_daemon"*/
-static void vm_daemon(void);
-static struct proc *vmproc;
+/* Pagedaemon activity rates, in subdivisions of one second. */
+#define VM_LAUNDER_RATE 10
+#define VM_INACT_SCAN_RATE 2
-static struct kproc_desc vm_kp = {
- "vmdaemon",
- vm_daemon,
- &vmproc
-};
-SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
-#endif
-
-
-int vm_pages_needed; /* Event on which pageout daemon sleeps */
int vm_pageout_deficit; /* Estimated number of pages deficit */
-int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */
-int vm_pageout_wakeup_thresh;
+u_int vm_pageout_wakeup_thresh;
static int vm_pageout_oom_seq = 12;
+bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */
+bool vm_pages_needed; /* Are threads waiting for free pages? */
-#if !defined(NO_SWAPPING)
-static int vm_pageout_req_swapout; /* XXX */
-static int vm_daemon_needed;
-static struct mtx vm_daemon_mtx;
-/* Allow for use by vm_pageout before vm_daemon is initialized. */
-MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
-#endif
-static int vm_max_launder = 32;
+/* Pending request for dirty page laundering. */
+static enum {
+ VM_LAUNDRY_IDLE,
+ VM_LAUNDRY_BACKGROUND,
+ VM_LAUNDRY_SHORTFALL
+} vm_laundry_request = VM_LAUNDRY_IDLE;
+
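
vm_laundry_request is the handshake between the page daemon and the new laundry thread: the daemon posts a request and wakes the thread, which sleeps on &vm_laundry_request under the laundry queue lock (see vm_pageout_laundry_worker() below). A condensed sketch of the posting side, modeled on the scan code rather than copied from this commit:

	/*
	 * Sketch only: record what kind of laundering is wanted and wake
	 * the laundry worker.  A shortfall request may preempt a pending
	 * background request; a background request never downgrades one.
	 */
	static void
	example_kick_laundry(struct vm_domain *vmd, bool shortfall)
	{
		struct vm_pagequeue *pq;

		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
		vm_pagequeue_lock(pq);
		if (shortfall)
			vm_laundry_request = VM_LAUNDRY_SHORTFALL;
		else if (vm_laundry_request == VM_LAUNDRY_IDLE)
			vm_laundry_request = VM_LAUNDRY_BACKGROUND;
		wakeup(&vm_laundry_request);
		vm_pagequeue_unlock(pq);
	}
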
static int vm_pageout_update_period;
-static int defer_swap_pageouts;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static time_t lowmem_uptime;
-#if defined(NO_SWAPPING)
-static int vm_swap_enabled = 0;
-static int vm_swap_idle_enabled = 0;
-#else
-static int vm_swap_enabled = 1;
-static int vm_swap_idle_enabled = 0;
-#endif
+static int vm_panic_on_oom = 0;
+SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
+ CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
+ "panic on out of memory instead of killing the largest process");
+
SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
- CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+ CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
"free page threshold for waking up the pageout daemon");
-SYSCTL_INT(_vm, OID_AUTO, max_launder,
- CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
-
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
- CTLFLAG_RW, &vm_pageout_update_period, 0,
+ CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
"Maximum active LRU update period");
-SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
"Low memory callback period");
-#if defined(NO_SWAPPING)
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
- CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
- CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#else
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
- CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
- CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#endif
-
-SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
- CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
-
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
- CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
+ CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
@@ -221,24 +189,39 @@
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
- CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+ CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
"back-to-back calls to oom detector to start OOM");
-#define VM_PAGEOUT_PAGE_COUNT 16
-int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
+static int act_scan_laundry_weight = 3;
+SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
+ &act_scan_laundry_weight, 0,
+ "weight given to clean vs. dirty pages in active queue scans");
+static u_int vm_background_launder_target;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
+ &vm_background_launder_target, 0,
+ "background laundering target, in pages");
+
+static u_int vm_background_launder_rate = 4096;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
+ &vm_background_launder_rate, 0,
+ "background laundering rate, in kilobytes per second");
+
+static u_int vm_background_launder_max = 20 * 1024;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
+ &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
+
+int vm_pageout_page_count = 32;
+
int vm_page_max_wired; /* XXX max # of wired pages system-wide */
SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
+static u_int isqrt(u_int num);
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
- vm_paddr_t);
-#if !defined(NO_SWAPPING)
-static void vm_pageout_map_deactivate_pages(vm_map_t, long);
-static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(int req);
-#endif
+static int vm_pageout_launder(struct vm_domain *vmd, int launder,
+ bool in_shortfall);
+static void vm_pageout_laundry_worker(void *arg);
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
/*
@@ -352,41 +335,30 @@
}
/*
- * vm_pageout_clean:
- *
- * Clean the page and remove it from the laundry.
- *
- * We set the busy bit to cause potential page faults on this page to
- * block. Note the careful timing, however, the busy bit isn't set till
- * late and we cannot do anything that will mess with the page.
+ * Scan for pages at adjacent offsets within the given page's object that are
+ * eligible for laundering, form a cluster of these pages and the given page,
+ * and launder that cluster.
*/
static int
-vm_pageout_clean(vm_page_t m)
+vm_pageout_cluster(vm_page_t m)
{
vm_object_t object;
- vm_page_t mc[2*vm_pageout_page_count], pb, ps;
- int pageout_count;
- int ib, is, page_base;
- vm_pindex_t pindex = m->pindex;
+ vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
+ vm_pindex_t pindex;
+ int ib, is, page_base, pageout_count;
- vm_page_lock_assert(m, MA_OWNED);
+ vm_page_assert_locked(m);
object = m->object;
VM_OBJECT_ASSERT_WLOCKED(object);
+ pindex = m->pindex;
/*
- * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
- * with the new swapper, but we could have serious problems paging
- * out other object types if there is insufficient memory.
- *
- * Unfortunately, checking free memory here is far too late, so the
- * check has been moved up a procedural level.
+ * We can't clean the page if it is busy or held.
*/
+ vm_page_assert_unbusied(m);
+ KASSERT(m->hold_count == 0, ("page %p is held", m));
- /*
- * Can't clean the page if it's busy or held.
- */
- vm_page_assert_unbusied(m);
- KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
+ pmap_remove_write(m);
vm_page_unlock(m);
mc[vm_pageout_page_count] = pb = ps = m;
@@ -396,33 +368,23 @@
is = 1;
/*
- * Scan object for clusterable pages.
+ * We can cluster only if the page is not clean, busy, or held, and
+ * the page is in the laundry queue.
*
- * We can cluster ONLY if: ->> the page is NOT
- * clean, wired, busy, held, or mapped into a
- * buffer, and one of the following:
- * 1) The page is inactive, or a seldom used
- * active page.
- * -or-
- * 2) we force the issue.
- *
* During heavy mmap/modification loads the pageout
* daemon can really fragment the underlying file
- * due to flushing pages out of order and not trying
- * align the clusters (which leave sporatic out-of-order
+ * due to flushing pages out of order and not trying to
+ * align the clusters (which leaves sporadic out-of-order
* holes). To solve this problem we do the reverse scan
* first and attempt to align our cluster, then do a
* forward scan if room remains.
*/
more:
- while (ib && pageout_count < vm_pageout_page_count) {
- vm_page_t p;
-
+ while (ib != 0 && pageout_count < vm_pageout_page_count) {
if (ib > pindex) {
ib = 0;
break;
}
-
if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
ib = 0;
break;
@@ -433,28 +395,27 @@
break;
}
vm_page_lock(p);
- if (p->queue != PQ_INACTIVE ||
+ if (!vm_page_in_laundry(p) ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
ib = 0;
break;
}
+ pmap_remove_write(p);
vm_page_unlock(p);
mc[--page_base] = pb = p;
++pageout_count;
++ib;
+
/*
- * alignment boundry, stop here and switch directions. Do
- * not clear ib.
+ * We are at an alignment boundary. Stop here, and switch
+ * directions. Do not clear ib.
*/
if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
break;
}
-
while (pageout_count < vm_pageout_page_count &&
pindex + is < object->size) {
- vm_page_t p;
-
if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
vm_page_test_dirty(p);
@@ -461,11 +422,12 @@
if (p->dirty == 0)
break;
vm_page_lock(p);
- if (p->queue != PQ_INACTIVE ||
+ if (!vm_page_in_laundry(p) ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
break;
}
+ pmap_remove_write(p);
vm_page_unlock(p);
mc[page_base + pageout_count] = ps = p;
++pageout_count;
@@ -474,17 +436,14 @@
/*
* If we exhausted our forward scan, continue with the reverse scan
- * when possible, even past a page boundry. This catches boundry
- * conditions.
+ * when possible, even past an alignment boundary. This catches
+ * boundary conditions.
*/
- if (ib && pageout_count < vm_pageout_page_count)
+ if (ib != 0 && pageout_count < vm_pageout_page_count)
goto more;
- /*
- * we allow reads during pageouts...
- */
- return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
- NULL));
+ return (vm_pageout_flush(&mc[page_base], pageout_count,
+ VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
}
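
A worked illustration of the alignment rule in the reverse scan above, with numbers chosen only for the example:

	/*
	 * With vm_pageout_page_count = 32 and the starting page at pindex
	 * 70, the reverse scan takes 69, 68, ..., 63 and then stops,
	 * because (pindex - (ib - 1)) has reached 64, a multiple of 32.
	 * The forward scan then continues from 71, so flushed clusters
	 * tend to line up with 32-page runs instead of straddling them.
	 */
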
/*
@@ -513,8 +472,8 @@
VM_OBJECT_ASSERT_WLOCKED(object);
/*
- * Initiate I/O. Bump the vm_page_t->busy counter and
- * mark the pages read-only.
+ * Initiate I/O. Mark the pages busy and verify that they're valid
+ * and read-only.
*
* We do not have to fixup the clean/dirty bits here... we can
* allow the pager to do it after the I/O completes.
@@ -526,8 +485,9 @@
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
("vm_pageout_flush: partially invalid page %p index %d/%d",
mc[i], i, count));
+ KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
+ ("vm_pageout_flush: writeable page %p", mc[i]));
vm_page_sbusy(mc[i]);
- pmap_remove_write(mc[i]);
}
vm_object_pip_add(object, count);
@@ -544,23 +504,33 @@
("vm_pageout_flush: page %p is not write protected", mt));
switch (pageout_status[i]) {
case VM_PAGER_OK:
+ vm_page_lock(mt);
+ if (vm_page_in_laundry(mt))
+ vm_page_deactivate_noreuse(mt);
+ vm_page_unlock(mt);
+ /* FALLTHROUGH */
case VM_PAGER_PEND:
numpagedout++;
break;
case VM_PAGER_BAD:
/*
- * Page outside of range of object. Right now we
- * essentially lose the changes by pretending it
- * worked.
+ * The page is outside the object's range. We pretend
+ * that the page out worked and clean the page, so the
+ * changes will be lost if the page is reclaimed by
+ * the page daemon.
*/
vm_page_undirty(mt);
+ vm_page_lock(mt);
+ if (vm_page_in_laundry(mt))
+ vm_page_deactivate_noreuse(mt);
+ vm_page_unlock(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
/*
- * If page couldn't be paged out, then reactivate the
- * page so it doesn't clog the inactive list. (We
- * will try paging out it again later).
+ * If the page couldn't be paged out, then reactivate
+ * it so that it doesn't clog the laundry and inactive
+ * queues. (We will try paging it out again later).
*/
vm_page_lock(mt);
vm_page_activate(mt);
@@ -583,11 +553,6 @@
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_sunbusy(mt);
- if (vm_page_count_severe()) {
- vm_page_lock(mt);
- vm_page_try_to_cache(mt);
- vm_page_unlock(mt);
- }
}
}
if (prunlen != NULL)
@@ -595,24 +560,172 @@
return (numpagedout);
}
-static boolean_t
-vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
- vm_paddr_t high)
+/*
+ * Attempt to acquire all of the necessary locks to launder a page and
+ * then call through the clustering layer to PUTPAGES. Wait a short
+ * time for a vnode lock.
+ *
+ * Requires the page and object lock on entry, releases both before return.
+ * Returns 0 on success and an errno otherwise.
+ */
+static int
+vm_pageout_clean(vm_page_t m, int *numpagedout)
{
+ struct vnode *vp;
struct mount *mp;
- struct vnode *vp;
vm_object_t object;
- vm_paddr_t pa;
- vm_page_t m, m_tmp, next;
- int lockmode;
+ vm_pindex_t pindex;
+ int error, lockmode;
+ vm_page_assert_locked(m);
+ object = m->object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ error = 0;
+ vp = NULL;
+ mp = NULL;
+
+ /*
+ * The object is already known NOT to be dead. It
+ * is possible for the vget() to block the whole
+ * pageout daemon, but the new low-memory handling
+ * code should prevent it.
+ *
+ * We can't wait forever for the vnode lock, we might
+ * deadlock due to a vn_read() getting stuck in
+ * vm_wait while holding this vnode. We skip the
+ * vnode if we can't get it in a reasonable amount
+ * of time.
+ */
+ if (object->type == OBJT_VNODE) {
+ vm_page_unlock(m);
+ vp = object->handle;
+ if (vp->v_type == VREG &&
+ vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ mp = NULL;
+ error = EDEADLK;
+ goto unlock_all;
+ }
+ KASSERT(mp != NULL,
+ ("vp %p with NULL v_mount", vp));
+ vm_object_reference_locked(object);
+ pindex = m->pindex;
+ VM_OBJECT_WUNLOCK(object);
+ lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+ LK_SHARED : LK_EXCLUSIVE;
+ if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
+ vp = NULL;
+ error = EDEADLK;
+ goto unlock_mp;
+ }
+ VM_OBJECT_WLOCK(object);
+
+ /*
+ * Ensure that the object and vnode were not disassociated
+ * while locks were dropped.
+ */
+ if (vp->v_object != object) {
+ error = ENOENT;
+ goto unlock_all;
+ }
+ vm_page_lock(m);
+
+ /*
+ * While the object and page were unlocked, the page
+ * may have been:
+ * (1) moved to a different queue,
+ * (2) reallocated to a different object,
+ * (3) reallocated to a different offset, or
+ * (4) cleaned.
+ */
+ if (!vm_page_in_laundry(m) || m->object != object ||
+ m->pindex != pindex || m->dirty == 0) {
+ vm_page_unlock(m);
+ error = ENXIO;
+ goto unlock_all;
+ }
+
+ /*
+ * The page may have been busied or held while the object
+ * and page locks were released.
+ */
+ if (vm_page_busied(m) || m->hold_count != 0) {
+ vm_page_unlock(m);
+ error = EBUSY;
+ goto unlock_all;
+ }
+ }
+
+ /*
+ * If a page is dirty, then it is either being washed
+ * (but not yet cleaned) or it is still in the
+ * laundry. If it is still in the laundry, then we
+ * start the cleaning operation.
+ */
+ if ((*numpagedout = vm_pageout_cluster(m)) == 0)
+ error = EIO;
+
+unlock_all:
+ VM_OBJECT_WUNLOCK(object);
+
+unlock_mp:
+ vm_page_lock_assert(m, MA_NOTOWNED);
+ if (mp != NULL) {
+ if (vp != NULL)
+ vput(vp);
+ vm_object_deallocate(object);
+ vn_finished_write(mp);
+ }
+
+ return (error);
+}
+
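
For quick reference, the errno values the new vm_pageout_clean() can hand back, and how the laundering loop that calls it accounts for each:

	/*
	 *	0	cluster formed and handed to the pager
	 *	EDEADLK	vn_start_write() or the vnode lock could not be
	 *		obtained quickly; counted by the caller as a
	 *		pageout lock miss and a skipped vnode
	 *	ENOENT	the vnode and object were disassociated while the
	 *		locks were dropped
	 *	ENXIO	the page was requeued, reused, or cleaned meanwhile
	 *	EBUSY	the page became busy or held meanwhile
	 *	EIO	clustering produced nothing to page out
	 */
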
+/*
+ * Attempt to launder the specified number of pages.
+ *
+ * Returns the number of pages successfully laundered.
+ */
+static int
+vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
+{
+ struct vm_pagequeue *pq;
+ vm_object_t object;
+ vm_page_t m, next;
+ int act_delta, error, maxscan, numpagedout, starting_target;
+ int vnodes_skipped;
+ bool pageout_ok, queue_locked;
+
+ starting_target = launder;
+ vnodes_skipped = 0;
+
+ /*
+ * Scan the laundry queue for pages eligible to be laundered. We stop
+ * once the target number of dirty pages have been laundered, or once
+ * we've reached the end of the queue. A single iteration of this loop
+ * may cause more than one page to be laundered because of clustering.
+ *
+ * maxscan ensures that we don't re-examine requeued pages. Any
+ * additional pages written as part of a cluster are subtracted from
+ * maxscan since they must be taken from the laundry queue.
+ */
+ pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ maxscan = pq->pq_cnt;
+
vm_pagequeue_lock(pq);
- TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
+ queue_locked = true;
+ for (m = TAILQ_FIRST(&pq->pq_pl);
+ m != NULL && maxscan-- > 0 && launder > 0;
+ m = next) {
+ vm_pagequeue_assert_locked(pq);
+ KASSERT(queue_locked, ("unlocked laundry queue"));
+ KASSERT(vm_page_in_laundry(m),
+ ("page %p has an inconsistent queue", m));
+ next = TAILQ_NEXT(m, plinks.q);
if ((m->flags & PG_MARKER) != 0)
continue;
- pa = VM_PAGE_TO_PHYS(m);
- if (pa < low || pa + PAGE_SIZE > high)
- continue;
+ KASSERT((m->flags & PG_FICTITIOUS) == 0,
+ ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
vm_page_unlock(m);
continue;
@@ -621,326 +734,341 @@
if ((!VM_OBJECT_TRYWLOCK(object) &&
(!vm_pageout_fallback_object_lock(m, &next) ||
m->hold_count != 0)) || vm_page_busied(m)) {
+ VM_OBJECT_WUNLOCK(object);
vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
continue;
}
- vm_page_test_dirty(m);
- if (m->dirty == 0 && object->ref_count != 0)
- pmap_remove_all(m);
- if (m->dirty != 0) {
- vm_page_unlock(m);
- if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
- VM_OBJECT_WUNLOCK(object);
- continue;
- }
- if (object->type == OBJT_VNODE) {
- vm_pagequeue_unlock(pq);
- vp = object->handle;
- vm_object_reference_locked(object);
- VM_OBJECT_WUNLOCK(object);
- (void)vn_start_write(vp, &mp, V_WAIT);
- lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
- LK_SHARED : LK_EXCLUSIVE;
- vn_lock(vp, lockmode | LK_RETRY);
- VM_OBJECT_WLOCK(object);
- vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
- VM_OBJECT_WUNLOCK(object);
- VOP_UNLOCK(vp, 0);
- vm_object_deallocate(object);
- vn_finished_write(mp);
- return (TRUE);
- } else if (object->type == OBJT_SWAP ||
- object->type == OBJT_DEFAULT) {
- vm_pagequeue_unlock(pq);
- m_tmp = m;
- vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
- 0, NULL, NULL);
- VM_OBJECT_WUNLOCK(object);
- return (TRUE);
- }
- } else {
- /*
- * Dequeue here to prevent lock recursion in
- * vm_page_cache().
- */
- vm_page_dequeue_locked(m);
- vm_page_cache(m);
- vm_page_unlock(m);
+
+ /*
+ * Unlock the laundry queue, invalidating the 'next' pointer.
+ * Use a marker to remember our place in the laundry queue.
+ */
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
+ plinks.q);
+ vm_pagequeue_unlock(pq);
+ queue_locked = false;
+
+ /*
+ * Invalid pages can be easily freed. They cannot be
+ * mapped; vm_page_free() asserts this.
+ */
+ if (m->valid == 0)
+ goto free_page;
+
+ /*
+ * If the page has been referenced and the object is not dead,
+ * reactivate or requeue the page depending on whether the
+ * object is mapped.
+ */
+ if ((m->aflags & PGA_REFERENCED) != 0) {
+ vm_page_aflag_clear(m, PGA_REFERENCED);
+ act_delta = 1;
+ } else
+ act_delta = 0;
+ if (object->ref_count != 0)
+ act_delta += pmap_ts_referenced(m);
+ else {
+ KASSERT(!pmap_page_is_mapped(m),
+ ("page %p is mapped", m));
}
- VM_OBJECT_WUNLOCK(object);
- }
- vm_pagequeue_unlock(pq);
- return (FALSE);
-}
+ if (act_delta != 0) {
+ if (object->ref_count != 0) {
+ PCPU_INC(cnt.v_reactivated);
+ vm_page_activate(m);
-/*
- * Increase the number of cached pages. The specified value, "tries",
- * determines which categories of pages are cached:
- *
- * 0: All clean, inactive pages within the specified physical address range
- * are cached. Will not sleep.
- * 1: The vm_lowmem handlers are called. All inactive pages within
- * the specified physical address range are cached. May sleep.
- * 2: The vm_lowmem handlers are called. All inactive and active pages
- * within the specified physical address range are cached. May sleep.
- */
-void
-vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
-{
- int actl, actmax, inactl, inactmax, dom, initial_dom;
- static int start_dom = 0;
+ /*
+ * Increase the activation count if the page
+ * was referenced while in the laundry queue.
+ * This makes it less likely that the page will
+ * be returned prematurely to the inactive
+ * queue.
+ */
+ m->act_count += act_delta + ACT_ADVANCE;
- if (tries > 0) {
+ /*
+ * If this was a background laundering, count
+ * activated pages towards our target. The
+ * purpose of background laundering is to ensure
+ * that pages are eventually cycled through the
+ * laundry queue, and an activation is a valid
+ * way out.
+ */
+ if (!in_shortfall)
+ launder--;
+ goto drop_page;
+ } else if ((object->flags & OBJ_DEAD) == 0)
+ goto requeue_page;
+ }
+
/*
- * Decrease registered cache sizes. The vm_lowmem handlers
- * may acquire locks and/or sleep, so they can only be invoked
- * when "tries" is greater than zero.
+ * If the page appears to be clean at the machine-independent
+ * layer, then remove all of its mappings from the pmap in
+ * anticipation of freeing it. If, however, any of the page's
+ * mappings allow write access, then the page may still be
+ * modified until the last of those mappings are removed.
*/
- SDT_PROBE0(vm, , , vm__lowmem_cache);
- EVENTHANDLER_INVOKE(vm_lowmem, 0);
+ if (object->ref_count != 0) {
+ vm_page_test_dirty(m);
+ if (m->dirty == 0)
+ pmap_remove_all(m);
+ }
/*
- * We do this explicitly after the caches have been drained
- * above.
+ * Clean pages are freed, and dirty pages are paged out unless
+ * they belong to a dead object. Requeueing dirty pages from
+ * dead objects is pointless, as they are being paged out and
+ * freed by the thread that destroyed the object.
*/
- uma_reclaim();
+ if (m->dirty == 0) {
+free_page:
+ vm_page_free(m);
+ PCPU_INC(cnt.v_dfree);
+ } else if ((object->flags & OBJ_DEAD) == 0) {
+ if (object->type != OBJT_SWAP &&
+ object->type != OBJT_DEFAULT)
+ pageout_ok = true;
+ else if (disable_swap_pageouts)
+ pageout_ok = false;
+ else
+ pageout_ok = true;
+ if (!pageout_ok) {
+requeue_page:
+ vm_pagequeue_lock(pq);
+ queue_locked = true;
+ vm_page_requeue_locked(m);
+ goto drop_page;
+ }
+
+ /*
+ * Form a cluster with adjacent, dirty pages from the
+ * same object, and page out that entire cluster.
+ *
+ * The adjacent, dirty pages must also be in the
+ * laundry. However, their mappings are not checked
+ * for new references. Consequently, a recently
+ * referenced page may be paged out. However, that
+ * page will not be prematurely reclaimed. After page
+ * out, the page will be placed in the inactive queue,
+ * where any new references will be detected and the
+ * page reactivated.
+ */
+ error = vm_pageout_clean(m, &numpagedout);
+ if (error == 0) {
+ launder -= numpagedout;
+ maxscan -= numpagedout - 1;
+ } else if (error == EDEADLK) {
+ pageout_lock_miss++;
+ vnodes_skipped++;
+ }
+ goto relock_queue;
+ }
+drop_page:
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(object);
+relock_queue:
+ if (!queue_locked) {
+ vm_pagequeue_lock(pq);
+ queue_locked = true;
+ }
+ next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
}
+ vm_pagequeue_unlock(pq);
/*
- * Make the next scan start on the next domain.
+ * Wakeup the sync daemon if we skipped a vnode in a writeable object
+ * and we didn't launder enough pages.
*/
- initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+ if (vnodes_skipped > 0 && launder > 0)
+ (void)speedup_syncer();
- inactl = 0;
- inactmax = cnt.v_inactive_count;
- actl = 0;
- actmax = tries < 2 ? 0 : cnt.v_active_count;
- dom = initial_dom;
-
- /*
- * Scan domains in round-robin order, first inactive queues,
- * then active. Since domain usually owns large physically
- * contiguous chunk of memory, it makes sense to completely
- * exhaust one domain before switching to next, while growing
- * the pool of contiguous physical pages.
- *
- * Do not even start launder a domain which cannot contain
- * the specified address range, as indicated by segments
- * constituting the domain.
- */
-again_inact:
- if (inactl < inactmax) {
- if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
- low, high) &&
- vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
- tries, low, high)) {
- inactl++;
- goto again_inact;
- }
- if (++dom == vm_ndomains)
- dom = 0;
- if (dom != initial_dom)
- goto again_inact;
- }
-again_act:
- if (actl < actmax) {
- if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
- low, high) &&
- vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
- tries, low, high)) {
- actl++;
- goto again_act;
- }
- if (++dom == vm_ndomains)
- dom = 0;
- if (dom != initial_dom)
- goto again_act;
- }
+ return (starting_target - launder);
}
-#if !defined(NO_SWAPPING)
/*
- * vm_pageout_object_deactivate_pages
- *
- * Deactivate enough pages to satisfy the inactive target
- * requirements.
- *
- * The object and map must be locked.
+ * Compute the integer square root.
*/
-static void
-vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
- long desired)
+static u_int
+isqrt(u_int num)
{
- vm_object_t backing_object, object;
- vm_page_t p;
- int act_delta, remove_mode;
+ u_int bit, root, tmp;
- VM_OBJECT_ASSERT_LOCKED(first_object);
- if ((first_object->flags & OBJ_FICTITIOUS) != 0)
- return;
- for (object = first_object;; object = backing_object) {
- if (pmap_resident_count(pmap) <= desired)
- goto unlock_return;
- VM_OBJECT_ASSERT_LOCKED(object);
- if ((object->flags & OBJ_UNMANAGED) != 0 ||
- object->paging_in_progress != 0)
- goto unlock_return;
-
- remove_mode = 0;
- if (object->shadow_count > 1)
- remove_mode = 1;
- /*
- * Scan the object's entire memory queue.
- */
- TAILQ_FOREACH(p, &object->memq, listq) {
- if (pmap_resident_count(pmap) <= desired)
- goto unlock_return;
- if (vm_page_busied(p))
- continue;
- PCPU_INC(cnt.v_pdpages);
- vm_page_lock(p);
- if (p->wire_count != 0 || p->hold_count != 0 ||
- !pmap_page_exists_quick(pmap, p)) {
- vm_page_unlock(p);
- continue;
- }
- act_delta = pmap_ts_referenced(p);
- if ((p->aflags & PGA_REFERENCED) != 0) {
- if (act_delta == 0)
- act_delta = 1;
- vm_page_aflag_clear(p, PGA_REFERENCED);
- }
- if (p->queue != PQ_ACTIVE && act_delta != 0) {
- vm_page_activate(p);
- p->act_count += act_delta;
- } else if (p->queue == PQ_ACTIVE) {
- if (act_delta == 0) {
- p->act_count -= min(p->act_count,
- ACT_DECLINE);
- if (!remove_mode && p->act_count == 0) {
- pmap_remove_all(p);
- vm_page_deactivate(p);
- } else
- vm_page_requeue(p);
- } else {
- vm_page_activate(p);
- if (p->act_count < ACT_MAX -
- ACT_ADVANCE)
- p->act_count += ACT_ADVANCE;
- vm_page_requeue(p);
- }
- } else if (p->queue == PQ_INACTIVE)
- pmap_remove_all(p);
- vm_page_unlock(p);
+ bit = 1u << ((NBBY * sizeof(u_int)) - 2);
+ while (bit > num)
+ bit >>= 2;
+ root = 0;
+ while (bit != 0) {
+ tmp = root + bit;
+ root >>= 1;
+ if (num >= tmp) {
+ num -= tmp;
+ root += bit;
}
- if ((backing_object = object->backing_object) == NULL)
- goto unlock_return;
- VM_OBJECT_RLOCK(backing_object);
- if (object != first_object)
- VM_OBJECT_RUNLOCK(object);
+ bit >>= 2;
}
-unlock_return:
- if (object != first_object)
- VM_OBJECT_RUNLOCK(object);
+ return (root);
}
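
isqrt() feeds the background-laundering trigger in vm_pageout_laundry_worker() below; the square root makes the required dirty/clean ratio fall off gradually as page daemon wakeups accumulate. A worked illustration with example numbers only:

	/*
	 * The ratio test in the trigger is
	 *
	 *	ndirty * isqrt(wakeups - last_launder) >= nclean
	 *
	 * With nclean = 1000 inactive + free pages:
	 *
	 *	1 wakeup since the last laundering:  isqrt(1) = 1,
	 *	    laundering starts once ndirty >= 1000
	 *	9 wakeups:   isqrt(9) = 3,    starts once ndirty >= 334
	 *	100 wakeups: isqrt(100) = 10, starts once ndirty >= 100
	 */
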
/*
- * deactivate some number of pages in a map, try to do it fairly, but
- * that is really hard to do.
+ * Perform the work of the laundry thread: periodically wake up and determine
+ * whether any pages need to be laundered. If so, determine the number of pages
+ * that need to be laundered, and launder them.
*/
static void
-vm_pageout_map_deactivate_pages(map, desired)
- vm_map_t map;
- long desired;
+vm_pageout_laundry_worker(void *arg)
{
- vm_map_entry_t tmpe;
- vm_object_t obj, bigobj;
- int nothingwired;
+ struct vm_domain *domain;
+ struct vm_pagequeue *pq;
+ uint64_t nclean, ndirty;
+ u_int last_launder, wakeups;
+ int domidx, last_target, launder, shortfall, shortfall_cycle, target;
+ bool in_shortfall;
- if (!vm_map_trylock(map))
- return;
+ domidx = (uintptr_t)arg;
+ domain = &vm_dom[domidx];
+ pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
+ KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+ vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
- bigobj = NULL;
- nothingwired = TRUE;
+ shortfall = 0;
+ in_shortfall = false;
+ shortfall_cycle = 0;
+ target = 0;
+ last_launder = 0;
/*
- * first, search out the biggest object, and try to free pages from
- * that.
+ * The pageout laundry worker is never done, so loop forever.
*/
- tmpe = map->header.next;
- while (tmpe != &map->header) {
- if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
- obj = tmpe->object.vm_object;
- if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
- if (obj->shadow_count <= 1 &&
- (bigobj == NULL ||
- bigobj->resident_page_count < obj->resident_page_count)) {
- if (bigobj != NULL)
- VM_OBJECT_RUNLOCK(bigobj);
- bigobj = obj;
- } else
- VM_OBJECT_RUNLOCK(obj);
- }
+ for (;;) {
+ KASSERT(target >= 0, ("negative target %d", target));
+ KASSERT(shortfall_cycle >= 0,
+ ("negative cycle %d", shortfall_cycle));
+ launder = 0;
+ wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
+
+ /*
+ * First determine whether we need to launder pages to meet a
+ * shortage of free pages.
+ */
+ if (shortfall > 0) {
+ in_shortfall = true;
+ shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
+ target = shortfall;
+ } else if (!in_shortfall)
+ goto trybackground;
+ else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
+ /*
+ * We recently entered shortfall and began laundering
+ * pages. If we have completed that laundering run
+ * (and we are no longer in shortfall) or we have met
+ * our laundry target through other activity, then we
+ * can stop laundering pages.
+ */
+ in_shortfall = false;
+ target = 0;
+ goto trybackground;
}
- if (tmpe->wired_count > 0)
- nothingwired = FALSE;
- tmpe = tmpe->next;
- }
+ last_launder = wakeups;
+ launder = target / shortfall_cycle--;
+ goto dolaundry;
- if (bigobj != NULL) {
- vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
- VM_OBJECT_RUNLOCK(bigobj);
- }
- /*
- * Next, hunt around for other pages to deactivate. We actually
- * do this search sort of wrong -- .text first is not the best idea.
- */
- tmpe = map->header.next;
- while (tmpe != &map->header) {
- if (pmap_resident_count(vm_map_pmap(map)) <= desired)
- break;
- if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
- obj = tmpe->object.vm_object;
- if (obj != NULL) {
- VM_OBJECT_RLOCK(obj);
- vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
- VM_OBJECT_RUNLOCK(obj);
+ /*
+ * There's no immediate need to launder any pages; see if we
+ * meet the conditions to perform background laundering:
+ *
+ * 1. The ratio of dirty to clean inactive pages exceeds the
+ * background laundering threshold and the pagedaemon has
+ * been woken up to reclaim pages since our last
+ * laundering, or
+ * 2. we haven't yet reached the target of the current
+ * background laundering run.
+ *
+ * The background laundering threshold is not a constant.
+ * Instead, it is a slowly growing function of the number of
+ * page daemon wakeups since the last laundering. Thus, as the
+ * ratio of dirty to clean inactive pages grows, the amount of
+ * memory pressure required to trigger laundering decreases.
+ */
+trybackground:
+ nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
+ ndirty = vm_cnt.v_laundry_count;
+ if (target == 0 && wakeups != last_launder &&
+ ndirty * isqrt(wakeups - last_launder) >= nclean) {
+ target = vm_background_launder_target;
+ }
+
+ /*
+ * We have a non-zero background laundering target. If we've
+ * laundered up to our maximum without observing a page daemon
+ * wakeup, just stop. This is a safety belt that ensures we
+ * don't launder an excessive amount if memory pressure is low
+ * and the ratio of dirty to clean pages is large. Otherwise,
+ * proceed at the background laundering rate.
+ */
+ if (target > 0) {
+ if (wakeups != last_launder) {
+ last_launder = wakeups;
+ last_target = target;
+ } else if (last_target - target >=
+ vm_background_launder_max * PAGE_SIZE / 1024) {
+ target = 0;
}
+ launder = vm_background_launder_rate * PAGE_SIZE / 1024;
+ launder /= VM_LAUNDER_RATE;
+ if (launder > target)
+ launder = target;
}
- tmpe = tmpe->next;
- }
-#ifdef __ia64__
- /*
- * Remove all non-wired, managed mappings if a process is swapped out.
- * This will free page table pages.
- */
- if (desired == 0)
- pmap_remove_pages(map->pmap);
-#else
- /*
- * Remove all mappings if a process is swapped out, this will free page
- * table pages.
- */
- if (desired == 0 && nothingwired) {
- pmap_remove(vm_map_pmap(map), vm_map_min(map),
- vm_map_max(map));
+dolaundry:
+ if (launder > 0) {
+ /*
+ * Because of I/O clustering, the number of laundered
+ * pages could exceed "target" by the maximum size of
+ * a cluster minus one.
+ */
+ target -= min(vm_pageout_launder(domain, launder,
+ in_shortfall), target);
+ pause("laundp", hz / VM_LAUNDER_RATE);
+ }
+
+ /*
+ * If we're not currently laundering pages and the page daemon
+ * hasn't posted a new request, sleep until the page daemon
+ * kicks us.
+ */
+ vm_pagequeue_lock(pq);
+ if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
+ (void)mtx_sleep(&vm_laundry_request,
+ vm_pagequeue_lockptr(pq), PVM, "launds", 0);
+
+ /*
+ * If the pagedaemon has indicated that it's in shortfall, start
+ * a shortfall laundering unless we're already in the middle of
+ * one. This may preempt a background laundering.
+ */
+ if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
+ (!in_shortfall || shortfall_cycle == 0)) {
+ shortfall = vm_laundry_target() + vm_pageout_deficit;
+ target = 0;
+ } else
+ shortfall = 0;
+
+ if (target == 0)
+ vm_laundry_request = VM_LAUNDRY_IDLE;
+ vm_pagequeue_unlock(pq);
}
-#endif
-
- vm_map_unlock(map);
}
-#endif /* !defined(NO_SWAPPING) */
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
- * pass 0 - Update active LRU/deactivate pages
- * pass 1 - Move inactive to cache or free
- * pass 2 - Launder dirty pages
+ * pass == 0: Update active LRU/deactivate pages
+ * pass >= 1: Free inactive pages
+ *
+ * Returns true if pass was zero or enough pages were freed by the inactive
+ * queue scan to meet the target.
*/
-static void
+static bool
vm_pageout_scan(struct vm_domain *vmd, int pass)
{
vm_page_t m, next;
@@ -947,10 +1075,8 @@
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
- int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
- int vnodes_skipped = 0;
- int maxlaunder, scan_tick, scanned, starting_page_shortage;
- int lockmode;
+ int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
+ int page_shortage, scan_tick, scanned, starting_page_shortage;
boolean_t queue_locked;
/*
@@ -981,8 +1107,9 @@
addl_page_shortage = 0;
/*
- * Calculate the number of pages we want to either free or move
- * to the cache.
+ * Calculate the number of pages that we want to free. This number
+ * can be negative if many pages are freed between the wakeup call to
+ * the page daemon and this calculation.
*/
if (pass > 0) {
deficit = atomic_readandclear_int(&vm_pageout_deficit);
@@ -992,27 +1119,11 @@
starting_page_shortage = page_shortage;
/*
- * maxlaunder limits the number of dirty pages we flush per scan.
- * For most systems a smaller value (16 or 32) is more robust under
- * extreme memory and disk pressure because any unnecessary writes
- * to disk can result in extreme performance degredation. However,
- * systems with excessive dirty pages (especially when MAP_NOSYNC is
- * used) will die horribly with limited laundering. If the pageout
- * daemon cannot clean enough pages in the first pass, we let it go
- * all out in succeeding passes.
+ * Start scanning the inactive queue for pages that we can free. The
+ * scan will stop when we reach the target or we have scanned the
+ * entire queue. (Note that m->act_count is not used to make
+ * decisions for the inactive queue, only for the active queue.)
*/
- if ((maxlaunder = vm_max_launder) <= 1)
- maxlaunder = 1;
- if (pass > 1)
- maxlaunder = 10000;
-
- /*
- * Start scanning the inactive queue for pages we can move to the
- * cache or free. The scan will stop when the target is reached or
- * we have scanned the entire inactive queue. Note that m->act_count
- * is not used to form decisions for the inactive queue, only for the
- * active queue.
- */
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
maxscan = pq->pq_cnt;
vm_pagequeue_lock(pq);
@@ -1022,7 +1133,7 @@
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked inactive queue"));
- KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
+ KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
PCPU_INC(cnt.v_pdpages);
next = TAILQ_NEXT(m, plinks.q);
@@ -1044,55 +1155,76 @@
* different position within the queue. In either
* case, addl_page_shortage should not be incremented.
*/
- if (!vm_pageout_page_lock(m, &next)) {
- vm_page_unlock(m);
- continue;
+ if (!vm_pageout_page_lock(m, &next))
+ goto unlock_page;
+ else if (m->hold_count != 0) {
+ /*
+ * Held pages are essentially stuck in the
+ * queue. So, they ought to be discounted
+ * from the inactive count. See the
+ * calculation of inactq_shortage before the
+ * loop over the active queue below.
+ */
+ addl_page_shortage++;
+ goto unlock_page;
}
object = m->object;
- if (!VM_OBJECT_TRYWLOCK(object) &&
- !vm_pageout_fallback_object_lock(m, &next)) {
- vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
- continue;
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ if (!vm_pageout_fallback_object_lock(m, &next))
+ goto unlock_object;
+ else if (m->hold_count != 0) {
+ addl_page_shortage++;
+ goto unlock_object;
+ }
}
-
- /*
- * Don't mess with busy pages, keep them at at the
- * front of the queue, most likely they are being
- * paged out. Increment addl_page_shortage for busy
- * pages, because they may leave the inactive queue
- * shortly after page scan is finished.
- */
if (vm_page_busied(m)) {
+ /*
+ * Don't mess with busy pages. Leave them at
+ * the front of the queue. Most likely, they
+ * are being paged out and will leave the
+ * queue shortly after the scan finishes. So,
+ * they ought to be discounted from the
+ * inactive count.
+ */
+ addl_page_shortage++;
+unlock_object:
+ VM_OBJECT_WUNLOCK(object);
+unlock_page:
vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
- addl_page_shortage++;
continue;
}
+ KASSERT(m->hold_count == 0, ("Held page %p", m));
/*
- * We unlock the inactive page queue, invalidating the
- * 'next' pointer. Use our marker to remember our
- * place.
+ * Dequeue the inactive page and unlock the inactive page
+ * queue, invalidating the 'next' pointer. Dequeueing the
+ * page here avoids a later reacquisition (and release) of
+ * the inactive page queue lock when vm_page_activate(),
+ * vm_page_free(), or vm_page_launder() is called. Use a
+ * marker to remember our place in the inactive queue.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+ vm_page_dequeue_locked(m);
vm_pagequeue_unlock(pq);
queue_locked = FALSE;
/*
- * We bump the activation count if the page has been
- * referenced while in the inactive queue. This makes
- * it less likely that the page will be added back to the
- * inactive queue prematurely again. Here we check the
- * page tables (or emulated bits, if any), given the upper
- * level VM system not knowing anything about existing
- * references.
+ * Invalid pages can be easily freed. They cannot be
+ * mapped, vm_page_free() asserts this.
*/
- act_delta = 0;
+ if (m->valid == 0)
+ goto free_page;
+
+ /*
+ * If the page has been referenced and the object is not dead,
+ * reactivate or requeue the page depending on whether the
+ * object is mapped.
+ */
if ((m->aflags & PGA_REFERENCED) != 0) {
vm_page_aflag_clear(m, PGA_REFERENCED);
act_delta = 1;
- }
+ } else
+ act_delta = 0;
if (object->ref_count != 0) {
act_delta += pmap_ts_referenced(m);
} else {
@@ -1099,47 +1231,36 @@
KASSERT(!pmap_page_is_mapped(m),
("vm_pageout_scan: page %p is mapped", m));
}
-
- /*
- * If the upper level VM system knows about any page
- * references, we reactivate the page or requeue it.
- */
if (act_delta != 0) {
- if (object->ref_count) {
+ if (object->ref_count != 0) {
+ PCPU_INC(cnt.v_reactivated);
vm_page_activate(m);
+
+ /*
+ * Increase the activation count if the page
+ * was referenced while in the inactive queue.
+ * This makes it less likely that the page will
+ * be returned prematurely to the inactive
+ * queue.
+ */
m->act_count += act_delta + ACT_ADVANCE;
- } else {
+ goto drop_page;
+ } else if ((object->flags & OBJ_DEAD) == 0) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
- vm_page_requeue_locked(m);
+ m->queue = PQ_INACTIVE;
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ goto drop_page;
}
- VM_OBJECT_WUNLOCK(object);
- vm_page_unlock(m);
- goto relock_queue;
}
- if (m->hold_count != 0) {
- vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
-
- /*
- * Held pages are essentially stuck in the
- * queue. So, they ought to be discounted
- * from the inactive count. See the
- * calculation of the page_shortage for the
- * loop over the active queue below.
- */
- addl_page_shortage++;
- goto relock_queue;
- }
-
/*
* If the page appears to be clean at the machine-independent
* layer, then remove all of its mappings from the pmap in
- * anticipation of placing it onto the cache queue. If,
- * however, any of the page's mappings allow write access,
- * then the page may still be modified until the last of those
- * mappings are removed.
+ * anticipation of freeing it. If, however, any of the page's
+ * mappings allow write access, then the page may still be
+ * modified until the last of those mappings are removed.
*/
if (object->ref_count != 0) {
vm_page_test_dirty(m);
@@ -1147,199 +1268,23 @@
pmap_remove_all(m);
}
- if (m->valid == 0) {
- /*
- * Invalid pages can be easily freed
- */
+ /*
+ * Clean pages can be freed, but dirty pages must be sent back
+ * to the laundry, unless they belong to a dead object.
+ * Requeueing dirty pages from dead objects is pointless, as
+ * they are being paged out and freed by the thread that
+ * destroyed the object.
+ */
+ if (m->dirty == 0) {
+free_page:
vm_page_free(m);
PCPU_INC(cnt.v_dfree);
--page_shortage;
- } else if (m->dirty == 0) {
- /*
- * Clean pages can be placed onto the cache queue.
- * This effectively frees them.
- */
- vm_page_cache(m);
- --page_shortage;
- } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
- /*
- * Dirty pages need to be paged out, but flushing
- * a page is extremely expensive verses freeing
- * a clean page. Rather then artificially limiting
- * the number of pages we can flush, we instead give
- * dirty pages extra priority on the inactive queue
- * by forcing them to be cycled through the queue
- * twice before being flushed, after which the
- * (now clean) page will cycle through once more
- * before being freed. This significantly extends
- * the thrash point for a heavily loaded machine.
- */
- m->flags |= PG_WINATCFLS;
- vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- vm_page_requeue_locked(m);
- } else if (maxlaunder > 0) {
- /*
- * We always want to try to flush some dirty pages if
- * we encounter them, to keep the system stable.
- * Normally this number is small, but under extreme
- * pressure where there are insufficient clean pages
- * on the inactive queue, we may have to go all out.
- */
- int swap_pageouts_ok;
- struct vnode *vp = NULL;
- struct mount *mp = NULL;
-
- if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
- swap_pageouts_ok = 1;
- } else {
- swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
- swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
- vm_page_count_min());
-
- }
-
- /*
- * We don't bother paging objects that are "dead".
- * Those objects are in a "rundown" state.
- */
- if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
- vm_pagequeue_lock(pq);
- vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
- queue_locked = TRUE;
- vm_page_requeue_locked(m);
- goto relock_queue;
- }
-
- /*
- * The object is already known NOT to be dead. It
- * is possible for the vget() to block the whole
- * pageout daemon, but the new low-memory handling
- * code should prevent it.
- *
- * The previous code skipped locked vnodes and, worse,
- * reordered pages in the queue. This results in
- * completely non-deterministic operation and, on a
- * busy system, can lead to extremely non-optimal
- * pageouts. For example, it can cause clean pages
- * to be freed and dirty pages to be moved to the end
- * of the queue. Since dirty pages are also moved to
- * the end of the queue once-cleaned, this gives
- * way too large a weighting to defering the freeing
- * of dirty pages.
- *
- * We can't wait forever for the vnode lock, we might
- * deadlock due to a vn_read() getting stuck in
- * vm_wait while holding this vnode. We skip the
- * vnode if we can't get it in a reasonable amount
- * of time.
- */
- if (object->type == OBJT_VNODE) {
- vm_page_unlock(m);
- vp = object->handle;
- if (vp->v_type == VREG &&
- vn_start_write(vp, &mp, V_NOWAIT) != 0) {
- mp = NULL;
- ++pageout_lock_miss;
- if (object->flags & OBJ_MIGHTBEDIRTY)
- vnodes_skipped++;
- goto unlock_and_continue;
- }
- KASSERT(mp != NULL,
- ("vp %p with NULL v_mount", vp));
- vm_object_reference_locked(object);
- VM_OBJECT_WUNLOCK(object);
- lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
- LK_SHARED : LK_EXCLUSIVE;
- if (vget(vp, lockmode | LK_TIMELOCK,
- curthread)) {
- VM_OBJECT_WLOCK(object);
- ++pageout_lock_miss;
- if (object->flags & OBJ_MIGHTBEDIRTY)
- vnodes_skipped++;
- vp = NULL;
- goto unlock_and_continue;
- }
- VM_OBJECT_WLOCK(object);
- vm_page_lock(m);
- vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- /*
- * The page might have been moved to another
- * queue during potential blocking in vget()
- * above. The page might have been freed and
- * reused for another vnode.
- */
- if (m->queue != PQ_INACTIVE ||
- m->object != object ||
- TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
- vm_page_unlock(m);
- if (object->flags & OBJ_MIGHTBEDIRTY)
- vnodes_skipped++;
- goto unlock_and_continue;
- }
-
- /*
- * The page may have been busied during the
- * blocking in vget(). We don't move the
- * page back onto the end of the queue so that
- * statistics are more correct if we don't.
- */
- if (vm_page_busied(m)) {
- vm_page_unlock(m);
- addl_page_shortage++;
- goto unlock_and_continue;
- }
-
- /*
- * If the page has become held it might
- * be undergoing I/O, so skip it
- */
- if (m->hold_count != 0) {
- vm_page_unlock(m);
- addl_page_shortage++;
- if (object->flags & OBJ_MIGHTBEDIRTY)
- vnodes_skipped++;
- goto unlock_and_continue;
- }
- vm_pagequeue_unlock(pq);
- queue_locked = FALSE;
- }
-
- /*
- * If a page is dirty, then it is either being washed
- * (but not yet cleaned) or it is still in the
- * laundry. If it is still in the laundry, then we
- * start the cleaning operation.
- *
- * decrement page_shortage on success to account for
- * the (future) cleaned page. Otherwise we could wind
- * up laundering or cleaning too many pages.
- */
- if (vm_pageout_clean(m) != 0) {
- --page_shortage;
- --maxlaunder;
- }
-unlock_and_continue:
- vm_page_lock_assert(m, MA_NOTOWNED);
- VM_OBJECT_WUNLOCK(object);
- if (mp != NULL) {
- if (queue_locked) {
- vm_pagequeue_unlock(pq);
- queue_locked = FALSE;
- }
- if (vp != NULL)
- vput(vp);
- vm_object_deallocate(object);
- vn_finished_write(mp);
- }
- vm_page_lock_assert(m, MA_NOTOWNED);
- goto relock_queue;
- }
+ } else if ((object->flags & OBJ_DEAD) == 0)
+ vm_page_launder(m);
+drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
-relock_queue:
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
@@ -1349,22 +1294,30 @@
}
vm_pagequeue_unlock(pq);
-#if !defined(NO_SWAPPING)
/*
- * Wakeup the swapout daemon if we didn't cache or free the targeted
- * number of pages.
+ * Wake up the laundry thread so that it can perform any needed
+ * laundering. If we didn't meet our target, we're in shortfall and
+ * need to launder more aggressively.
*/
- if (vm_swap_enabled && page_shortage > 0)
- vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
+ if (vm_laundry_request == VM_LAUNDRY_IDLE &&
+ starting_page_shortage > 0) {
+ pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
+ vm_pagequeue_lock(pq);
+ if (page_shortage > 0) {
+ vm_laundry_request = VM_LAUNDRY_SHORTFALL;
+ PCPU_INC(cnt.v_pdshortfalls);
+ } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
+ vm_laundry_request = VM_LAUNDRY_BACKGROUND;
+ wakeup(&vm_laundry_request);
+ vm_pagequeue_unlock(pq);
+ }
/*
- * Wakeup the sync daemon if we skipped a vnode in a writeable object
- * and we didn't cache or free enough pages.
+ * Wakeup the swapout daemon if we didn't free the targeted number of
+ * pages.
*/
- if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
- cnt.v_free_min)
- (void)speedup_syncer();
+ if (page_shortage > 0)
+ vm_swapout_run();
/*
* If the inactive queue scan fails repeatedly to meet its
@@ -1374,10 +1327,20 @@
/*
* Compute the number of pages we want to try to move from the
- * active queue to the inactive queue.
+ * active queue to either the inactive or laundry queue.
+ *
+ * When scanning active pages, we make clean pages count more heavily
+ * towards the page shortage than dirty pages. This is because dirty
+ * pages must be laundered before they can be reused and thus have less
+ * utility when attempting to quickly alleviate a shortage. However,
+ * this weighting also causes the scan to deactivate dirty pages more
+ * aggressively, improving the effectiveness of clustering and
+ * ensuring that they can eventually be reused.
*/
- page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
+ inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
+ vm_cnt.v_laundry_count / act_scan_laundry_weight) +
vm_paging_target() + deficit + addl_page_shortage;
+ inactq_shortage *= act_scan_laundry_weight;
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
@@ -1394,7 +1357,7 @@
min_scan /= hz * vm_pageout_update_period;
} else
min_scan = 0;
- if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
+ if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
vmd->vmd_last_active_scan = scan_tick;
/*
@@ -1403,7 +1366,7 @@
* candidates. Held pages may be deactivated.
*/
for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
- min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
+ min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
scanned++) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_scan: page %p isn't active", m));
@@ -1428,11 +1391,12 @@
/*
* Check to see "how much" the page has been used.
*/
- act_delta = 0;
- if (m->aflags & PGA_REFERENCED) {
+ if ((m->aflags & PGA_REFERENCED) != 0) {
vm_page_aflag_clear(m, PGA_REFERENCED);
- act_delta += 1;
- }
+ act_delta = 1;
+ } else
+ act_delta = 0;
+
/*
* Perform an unsynchronized object ref count check. While
* the page lock ensures that the page is not reallocated to
@@ -1452,41 +1416,60 @@
/*
* Advance or decay the act_count based on recent usage.
*/
- if (act_delta) {
+ if (act_delta != 0) {
m->act_count += ACT_ADVANCE + act_delta;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
- } else {
+ } else
m->act_count -= min(m->act_count, ACT_DECLINE);
- act_delta = m->act_count;
- }
/*
- * Move this page to the tail of the active or inactive
+ * Move this page to the tail of the active, inactive or laundry
* queue depending on usage.
*/
- if (act_delta == 0) {
+ if (m->act_count == 0) {
/* Dequeue to avoid later lock recursion. */
vm_page_dequeue_locked(m);
- vm_page_deactivate(m);
- page_shortage--;
+
+ /*
+ * When not short for inactive pages, let dirty pages go
+ * through the inactive queue before moving to the
+ * laundry queues. This gives them some extra time to
+ * be reactivated, potentially avoiding an expensive
+ * pageout. During a page shortage, the inactive queue
+ * is necessarily small, so we may move dirty pages
+ * directly to the laundry queue.
+ */
+ if (inactq_shortage <= 0)
+ vm_page_deactivate(m);
+ else {
+ /*
+ * Calling vm_page_test_dirty() here would
+ * require acquisition of the object's write
+ * lock. However, during a page shortage,
+ * directing dirty pages into the laundry
+ * queue is only an optimization and not a
+ * requirement. Therefore, we simply rely on
+ * the opportunistic updates to the page's
+ * dirty field by the pmap.
+ */
+ if (m->dirty == 0) {
+ vm_page_deactivate(m);
+ inactq_shortage -=
+ act_scan_laundry_weight;
+ } else {
+ vm_page_launder(m);
+ inactq_shortage--;
+ }
+ }
} else
vm_page_requeue_locked(m);
vm_page_unlock(m);
}
vm_pagequeue_unlock(pq);
-#if !defined(NO_SWAPPING)
- /*
- * Idle process swapout -- run once per second.
- */
- if (vm_swap_idle_enabled) {
- static long lsec;
- if (time_second != lsec) {
- vm_req_vmdaemon(VM_SWAP_IDLE);
- lsec = time_second;
- }
- }
-#endif
+ if (pass > 0)
+ vm_swapout_run_idle();
+ return (page_shortage <= 0);
}
static int vm_pageout_oom_vote;
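The active queue scan above weights clean pages more heavily than dirty ones
when computing inactq_shortage, and scales the result by
act_scan_laundry_weight so that deactivating a clean page makes
proportionally more progress than sending a dirty page to the laundry. A
standalone sketch of that arithmetic follows; the weight and the counter
values are invented for illustration and are not the kernel's tunables.

/*
 * Standalone model of the inactq_shortage arithmetic used by the active
 * queue scan.  Names and the weight value are illustrative only.
 */
#include <stdio.h>

#define ACT_SCAN_LAUNDRY_WEIGHT 3   /* assumed weight, see text above */

static int
inactq_shortage(int inactive_target, int inactive_count, int laundry_count,
    int paging_target, int deficit, int addl_shortage)
{
    int shortage;

    /* Dirty (laundry) pages count for only 1/weight of a clean page. */
    shortage = inactive_target -
        (inactive_count + laundry_count / ACT_SCAN_LAUNDRY_WEIGHT) +
        paging_target + deficit + addl_shortage;

    /* Scale so that deactivating one clean page subtracts the weight. */
    return (shortage * ACT_SCAN_LAUNDRY_WEIGHT);
}

int
main(void)
{
    int shortage;

    shortage = inactq_shortage(1000, 600, 300, 50, 0, 10);
    printf("initial weighted shortage: %d\n", shortage);

    /* Deactivating a clean page makes more progress than laundering. */
    shortage -= ACT_SCAN_LAUNDRY_WEIGHT;   /* clean page deactivated */
    shortage -= 1;                         /* dirty page sent to laundry */
    printf("after one clean and one dirty page: %d\n", shortage);
    return (0);
}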
@@ -1668,19 +1651,21 @@
PROC_UNLOCK(p);
continue;
}
- _PHOLD(p);
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
if (!vm_map_trylock_read(&vm->vm_map)) {
- _PRELE(p);
- PROC_UNLOCK(p);
vmspace_free(vm);
+ sx_slock(&allproc_lock);
+ PRELE(p);
continue;
}
- PROC_UNLOCK(p);
size = vmspace_swap_count(vm);
if (shortage == VM_OOM_MEM)
size += vm_pageout_oom_pagecount(vm);
vm_map_unlock_read(&vm->vm_map);
vmspace_free(vm);
+ sx_slock(&allproc_lock);
/*
* If this process is bigger than the biggest one,
@@ -1697,12 +1682,14 @@
}
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
+ if (vm_panic_on_oom != 0)
+ panic("out of swap space");
PROC_LOCK(bigproc);
killproc(bigproc, "out of swap space");
sched_nice(bigproc, PRIO_MIN);
_PRELE(bigproc);
PROC_UNLOCK(bigproc);
- wakeup(&cnt.v_free_count);
+ wakeup(&vm_cnt.v_free_count);
}
}
@@ -1710,10 +1697,13 @@
vm_pageout_worker(void *arg)
{
struct vm_domain *domain;
- int domidx;
+ int domidx, pass;
+ bool target_met;
domidx = (uintptr_t)arg;
domain = &vm_dom[domidx];
+ pass = 0;
+ target_met = true;
/*
* XXXKIB It could be useful to bind pageout daemon threads to
@@ -1724,54 +1714,80 @@
KASSERT(domain->vmd_segs != 0, ("domain without segments"));
domain->vmd_last_active_scan = ticks;
vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+ vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
+ TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
+ &domain->vmd_inacthead, plinks.q);
/*
* The pageout daemon worker is never done, so loop forever.
*/
while (TRUE) {
+ mtx_lock(&vm_page_queue_free_mtx);
+
/*
- * If we have enough free memory, wakeup waiters. Do
- * not clear vm_pages_needed until we reach our target,
- * otherwise we may be woken up over and over again and
- * waste a lot of cpu.
+ * Generally, after a level >= 1 scan, if there are enough
+ * free pages to wakeup the waiters, then they are already
+ * awake. A call to vm_page_free() during the scan awakened
+ * them. However, in the following case, this wakeup serves
+ * to bound the amount of time that a thread might wait.
+ * Suppose a thread's call to vm_page_alloc() fails, but
+ * before that thread calls VM_WAIT, enough pages are freed by
+ * other threads to alleviate the free page shortage. The
+ * thread will, nonetheless, wait until another page is freed
+ * or this wakeup is performed.
*/
- mtx_lock(&vm_page_queue_free_mtx);
if (vm_pages_needed && !vm_page_count_min()) {
- if (!vm_paging_needed())
- vm_pages_needed = 0;
- wakeup(&cnt.v_free_count);
+ vm_pages_needed = false;
+ wakeup(&vm_cnt.v_free_count);
}
- if (vm_pages_needed) {
+
+ /*
+ * Do not clear vm_pageout_wanted until we reach our free page
+ * target. Otherwise, we may be awakened over and over again,
+ * wasting CPU time.
+ */
+ if (vm_pageout_wanted && target_met)
+ vm_pageout_wanted = false;
+
+ /*
+ * Might the page daemon receive a wakeup call?
+ */
+ if (vm_pageout_wanted) {
/*
- * We're still not done. Either vm_pages_needed was
- * set by another thread during the previous scan
- * (typically, this happens during a level 0 scan) or
- * vm_pages_needed was already set and the scan failed
- * to free enough pages. If we haven't yet performed
- * a level >= 2 scan (unlimited dirty cleaning), then
- * upgrade the level and scan again now. Otherwise,
- * sleep a bit and try again later. While sleeping,
- * vm_pages_needed can be cleared.
+ * No. Either vm_pageout_wanted was set by another
+ * thread during the previous scan, which must have
+ * been a level 0 scan, or vm_pageout_wanted was
+ * already set and the scan failed to free enough
+ * pages. If we haven't yet performed a level >= 1
+ * (page reclamation) scan, then increase the level
+ * and scan again now. Otherwise, sleep a bit and
+ * try again later.
*/
- if (domain->vmd_pass > 1)
- msleep(&vm_pages_needed,
- &vm_page_queue_free_mtx, PVM, "psleep",
- hz / 2);
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (pass >= 1)
+ pause("pwait", hz / VM_INACT_SCAN_RATE);
+ pass++;
} else {
/*
- * Good enough, sleep until required to refresh
- * stats.
+ * Yes. If threads are still sleeping in VM_WAIT
+ * then we immediately start a new scan. Otherwise,
+ * sleep until the next wakeup or until pages need to
+ * have their reference stats updated.
*/
- msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
- PVM, "psleep", hz);
+ if (vm_pages_needed) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (pass == 0)
+ pass++;
+ } else if (mtx_sleep(&vm_pageout_wanted,
+ &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
+ hz) == 0) {
+ PCPU_INC(cnt.v_pdwakeups);
+ pass = 1;
+ } else
+ pass = 0;
}
- if (vm_pages_needed) {
- cnt.v_pdwakeups++;
- domain->vmd_pass++;
- } else
- domain->vmd_pass = 0;
- mtx_unlock(&vm_page_queue_free_mtx);
- vm_pageout_scan(domain, domain->vmd_pass);
+
+ target_met = vm_pageout_scan(domain, pass);
}
}
@@ -1784,8 +1800,8 @@
/*
* Initialize some paging parameters.
*/
- cnt.v_interrupt_free_min = 2;
- if (cnt.v_page_count < 2000)
+ vm_cnt.v_interrupt_free_min = 2;
+ if (vm_cnt.v_page_count < 2000)
vm_pageout_page_count = 8;
/*
@@ -1793,27 +1809,27 @@
* swap pager structures plus enough for any pv_entry structs
* when paging.
*/
- if (cnt.v_page_count > 1024)
- cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
+ if (vm_cnt.v_page_count > 1024)
+ vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
else
- cnt.v_free_min = 4;
- cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
- cnt.v_interrupt_free_min;
- cnt.v_free_reserved = vm_pageout_page_count +
- cnt.v_pageout_free_min + (cnt.v_page_count / 768);
- cnt.v_free_severe = cnt.v_free_min / 2;
- cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
- cnt.v_free_min += cnt.v_free_reserved;
- cnt.v_free_severe += cnt.v_free_reserved;
- cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
- if (cnt.v_inactive_target > cnt.v_free_count / 3)
- cnt.v_inactive_target = cnt.v_free_count / 3;
+ vm_cnt.v_free_min = 4;
+ vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
+ vm_cnt.v_interrupt_free_min;
+ vm_cnt.v_free_reserved = vm_pageout_page_count +
+ vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
+ vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
+ vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
+ vm_cnt.v_free_min += vm_cnt.v_free_reserved;
+ vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
+ vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
+ if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
+ vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
/*
* Set the default wakeup threshold to be 10% above the minimum
* page limit. This keeps the steady state out of shortfall.
*/
- vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11;
+ vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
/*
* Set interval in seconds for active scan. We want to visit each
@@ -1825,7 +1841,15 @@
/* XXX does not really belong here */
if (vm_page_max_wired == 0)
- vm_page_max_wired = cnt.v_free_count / 3;
+ vm_page_max_wired = vm_cnt.v_free_count / 3;
+
+ /*
+ * Target amount of memory to move out of the laundry queue during a
+ * background laundering. This is proportional to the amount of system
+ * memory.
+ */
+ vm_background_launder_target = (vm_cnt.v_free_target -
+ vm_cnt.v_free_min) / 10;
}
/*
@@ -1835,12 +1859,17 @@
vm_pageout(void)
{
int error;
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
int i;
#endif
swap_pager_swap_init();
-#if MAXMEMDOM > 1
+ snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0");
+ error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
+ 0, 0, "laundry: dom0");
+ if (error != 0)
+ panic("starting laundry for domain 0, error %d", error);
+#ifdef VM_NUMA_ALLOC
for (i = 1; i < vm_ndomains; i++) {
error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
curproc, NULL, 0, 0, "dom%d", i);
@@ -1858,175 +1887,42 @@
}
/*
- * Unless the free page queue lock is held by the caller, this function
- * should be regarded as advisory. Specifically, the caller should
- * not msleep() on &cnt.v_free_count following this function unless
- * the free page queue lock is held until the msleep() is performed.
+ * Perform an advisory wakeup of the page daemon.
*/
void
pagedaemon_wakeup(void)
{
- if (!vm_pages_needed && curthread->td_proc != pageproc) {
- vm_pages_needed = 1;
- wakeup(&vm_pages_needed);
- }
-}
+ mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED);
-#if !defined(NO_SWAPPING)
-static void
-vm_req_vmdaemon(int req)
-{
- static int lastrun = 0;
-
- mtx_lock(&vm_daemon_mtx);
- vm_pageout_req_swapout |= req;
- if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
- wakeup(&vm_daemon_needed);
- lastrun = ticks;
+ if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
+ vm_pageout_wanted = true;
+ wakeup(&vm_pageout_wanted);
}
- mtx_unlock(&vm_daemon_mtx);
}
-static void
-vm_daemon(void)
+/*
+ * Wake up the page daemon and wait for it to reclaim free pages.
+ *
+ * This function returns with the free queues mutex unlocked.
+ */
+void
+pagedaemon_wait(int pri, const char *wmesg)
{
- struct rlimit rsslim;
- struct proc *p;
- struct thread *td;
- struct vmspace *vm;
- int breakout, swapout_flags, tryagain, attempts;
-#ifdef RACCT
- uint64_t rsize, ravailable;
-#endif
- while (TRUE) {
- mtx_lock(&vm_daemon_mtx);
- msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
-#ifdef RACCT
- racct_enable ? hz : 0
-#else
- 0
-#endif
- );
- swapout_flags = vm_pageout_req_swapout;
- vm_pageout_req_swapout = 0;
- mtx_unlock(&vm_daemon_mtx);
- if (swapout_flags)
- swapout_procs(swapout_flags);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- /*
- * scan the processes for exceeding their rlimits or if
- * process is swapped out -- deactivate pages
- */
- tryagain = 0;
- attempts = 0;
-again:
- attempts++;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- vm_pindex_t limit, size;
-
- /*
- * if this is a system process or if we have already
- * looked at this process, skip it.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL ||
- p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * if the process is in a non-running type state,
- * don't touch it.
- */
- breakout = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td)) {
- thread_unlock(td);
- breakout = 1;
- break;
- }
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get a limit
- */
- lim_rlimit(p, RLIMIT_RSS, &rsslim);
- limit = OFF_TO_IDX(
- qmin(rsslim.rlim_cur, rsslim.rlim_max));
-
- /*
- * let processes that are swapped out really be
- * swapped out set the limit to nothing (will force a
- * swap-out.)
- */
- if ((p->p_flag & P_INMEM) == 0)
- limit = 0; /* XXX */
- vm = vmspace_acquire_ref(p);
- PROC_UNLOCK(p);
- if (vm == NULL)
- continue;
-
- size = vmspace_resident_count(vm);
- if (size >= limit) {
- vm_pageout_map_deactivate_pages(
- &vm->vm_map, limit);
- size = vmspace_resident_count(vm);
- }
-#ifdef RACCT
- if (racct_enable) {
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- ravailable = racct_get_available(p, RACCT_RSS);
- PROC_UNLOCK(p);
- if (rsize > ravailable) {
- /*
- * Don't be overly aggressive; this
- * might be an innocent process,
- * and the limit could've been exceeded
- * by some memory hog. Don't try
- * to deactivate more than 1/4th
- * of process' resident set size.
- */
- if (attempts <= 8) {
- if (ravailable < rsize -
- (rsize / 4)) {
- ravailable = rsize -
- (rsize / 4);
- }
- }
- vm_pageout_map_deactivate_pages(
- &vm->vm_map,
- OFF_TO_IDX(ravailable));
- /* Update RSS usage after paging out. */
- size = vmspace_resident_count(vm);
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- PROC_UNLOCK(p);
- if (rsize > ravailable)
- tryagain = 1;
- }
- }
-#endif
- vmspace_free(vm);
- }
- sx_sunlock(&allproc_lock);
- if (tryagain != 0 && attempts <= 10)
- goto again;
+ /*
+ * vm_pageout_wanted may have been set by an advisory wakeup, but if the
+ * page daemon is running on a CPU, the wakeup will have been lost.
+ * Thus, deliver a potentially spurious wakeup to ensure that the page
+ * daemon has been notified of the shortage.
+ */
+ if (!vm_pageout_wanted || !vm_pages_needed) {
+ vm_pageout_wanted = true;
+ wakeup(&vm_pageout_wanted);
}
+ vm_pages_needed = true;
+ msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
+ wmesg, 0);
}
-#endif /* !defined(NO_SWAPPING) */
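The laundry worker added near the start of this vm_pageout.c change paces
background laundering: it cleans a bounded chunk per iteration, sleeps
hz / VM_LAUNDER_RATE ticks between chunks, and backs off once it has
laundered the per-wakeup maximum without a new page daemon request. Below is
a simplified standalone model of that pacing; the rate is expressed directly
in pages and every value is invented, so it does not reflect the kernel's
KB-based tunables.

/*
 * Simplified model of how the laundry thread paces background laundering:
 * clean a small chunk each iteration, sleep between chunks, and stop early
 * once a per-wakeup maximum has been reached.
 */
#include <stdio.h>

#define LAUNDER_RATE         10     /* iterations per second (assumed) */
#define BG_PAGES_PER_SECOND  1000   /* background laundering rate (assumed) */
#define BG_MAX_PER_WAKEUP    2500   /* cap per page daemon wakeup (assumed) */

int
main(void)
{
    int chunk, laundered, target;

    target = 3000;      /* pages the background scan wants cleaned */
    laundered = 0;

    while (target > 0) {
        /* Stop early if the per-wakeup cap has been reached. */
        if (laundered >= BG_MAX_PER_WAKEUP)
            break;

        /* Launder at most one iteration's worth of the rate. */
        chunk = BG_PAGES_PER_SECOND / LAUNDER_RATE;
        if (chunk > target)
            chunk = target;

        laundered += chunk;
        target -= chunk;
        /* The kernel pauses hz / VM_LAUNDER_RATE ticks here. */
    }
    printf("laundered %d pages, %d still pending\n", laundered, target);
    return (0);
}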
Modified: trunk/sys/vm/vm_pageout.h
===================================================================
--- trunk/sys/vm/vm_pageout.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,12 +58,14 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $
+ * $FreeBSD: stable/11/sys/vm/vm_pageout.h 331722 2018-03-29 02:50:57Z eadler $
*/
#ifndef _VM_VM_PAGEOUT_H_
#define _VM_VM_PAGEOUT_H_
+#ifdef _KERNEL
+
/*
* Header file for pageout daemon.
*/
@@ -73,17 +75,11 @@
*/
extern int vm_page_max_wired;
-extern int vm_pages_needed; /* should be some "event" structure */
-extern int vm_pageout_pages_needed;
extern int vm_pageout_deficit;
extern int vm_pageout_page_count;
+extern bool vm_pageout_wanted;
+extern bool vm_pages_needed;
-/*
- * Swap out requests
- */
-#define VM_SWAP_NORMAL 1
-#define VM_SWAP_IDLE 2
-
#define VM_OOM_MEM 1
#define VM_OOM_SWAPZ 2
@@ -101,15 +97,17 @@
* Signal pageout-daemon and wait for it.
*/
-extern void pagedaemon_wakeup(void);
+void pagedaemon_wait(int pri, const char *wmesg);
+void pagedaemon_wakeup(void);
#define VM_WAIT vm_wait()
#define VM_WAITPFAULT vm_waitpfault()
-extern void vm_wait(void);
-extern void vm_waitpfault(void);
+void vm_wait(void);
+void vm_waitpfault(void);
-#ifdef _KERNEL
int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
-void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
void vm_pageout_oom(int shortage);
-#endif
+
+void vm_swapout_run(void);
+void vm_swapout_run_idle(void);
+#endif /* _KERNEL */
#endif /* _VM_VM_PAGEOUT_H_ */
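pagedaemon_wakeup() and pagedaemon_wait(), declared above and reworked in
vm_pageout.c earlier in this change, follow a common pattern: a boolean
"wanted" flag guarded by a lock, a wakeup delivered only on the flag's
transition (or deliberately redelivered when a waiter cannot tell whether the
daemon noticed), and waiters sleeping on a separate channel that the daemon
signals as memory is freed. The following is a minimal pthread-based sketch
of that pattern; it is not kernel code and all names are invented.

/*
 * Userland sketch of the "advisory wakeup" pattern: set a flag under a
 * lock, wake the daemon only when the flag transitions, and have waiters
 * sleep on a separate condition the daemon signals when it makes progress.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t daemon_cv = PTHREAD_COND_INITIALIZER;   /* daemon sleeps here */
static pthread_cond_t progress_cv = PTHREAD_COND_INITIALIZER; /* waiters sleep here */
static bool daemon_wanted;
static int free_pages = 0;

/* Advisory: poke the daemon if nobody has already done so. */
static void
daemon_wakeup(void)
{
    pthread_mutex_lock(&lock);
    if (!daemon_wanted) {
        daemon_wanted = true;
        pthread_cond_signal(&daemon_cv);
    }
    pthread_mutex_unlock(&lock);
}

/* Wake the daemon (possibly redundantly) and wait for it to free memory. */
static void
daemon_wait(void)
{
    pthread_mutex_lock(&lock);
    daemon_wanted = true;
    pthread_cond_signal(&daemon_cv);
    while (free_pages == 0)
        pthread_cond_wait(&progress_cv, &lock);
    pthread_mutex_unlock(&lock);
}

static void *
daemon_main(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!daemon_wanted)
        pthread_cond_wait(&daemon_cv, &lock);
    daemon_wanted = false;
    free_pages = 100;            /* pretend we reclaimed memory */
    pthread_cond_broadcast(&progress_cv);
    pthread_mutex_unlock(&lock);
    return (arg);
}

int
main(void)
{
    pthread_t td;

    pthread_create(&td, NULL, daemon_main, NULL);
    daemon_wakeup();    /* advisory poke */
    daemon_wait();      /* block until the daemon reports progress */
    pthread_join(td, NULL);
    printf("free pages after reclaim: %d\n", free_pages);
    return (0);
}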
Modified: trunk/sys/vm/vm_pager.c
===================================================================
--- trunk/sys/vm/vm_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -65,7 +65,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pager.c 331722 2018-03-29 02:50:57Z eadler $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -87,7 +87,9 @@
int cluster_pbuf_freecnt = -1; /* unlimited to begin with */
-static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int);
+struct buf *swbuf;
+
+static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
@@ -95,13 +97,11 @@
static void dead_pager_dealloc(vm_object_t);
static int
-dead_pager_getpages(obj, ma, count, req)
- vm_object_t obj;
- vm_page_t *ma;
- int count;
- int req;
+dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
{
- return VM_PAGER_FAIL;
+
+ return (VM_PAGER_FAIL);
}
static vm_object_t
@@ -158,8 +158,6 @@
&mgtdevicepagerops, /* OBJT_MGTDEVICE */
};
-static const int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
-
/*
* Kernel address space for mapping pages.
* Used by pagers where KVAs are needed for IO.
@@ -168,7 +166,7 @@
* cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
* (MAXPHYS == 64k) if you want to get the most efficiency.
*/
-struct mtx_padalign pbuf_mtx;
+struct mtx_padalign __exclusive_cache_line pbuf_mtx;
static TAILQ_HEAD(swqueue, buf) bswlist;
static int bswneeded;
vm_offset_t swapbkva; /* swap buffers kva */
@@ -182,7 +180,7 @@
/*
* Initialize known pagers
*/
- for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
+ for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++)
if ((*pgops)->pgo_init != NULL)
(*(*pgops)->pgo_init)();
}
@@ -208,6 +206,7 @@
cluster_pbuf_freecnt = nswbuf / 2;
vnode_pbuf_freecnt = nswbuf / 2 + 1;
+ vnode_async_pbuf_freecnt = nswbuf / 2;
}
/*
@@ -241,8 +240,80 @@
(*pagertab[object->type]->pgo_dealloc) (object);
}
+static void
+vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count)
+{
+#ifdef INVARIANTS
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(count > 0, ("%s: 0 count", __func__));
+ /*
+ * All pages must be busied, not mapped, not fully valid,
+ * not dirty and belong to the proper object.
+ */
+ for (int i = 0 ; i < count; i++) {
+ vm_page_assert_xbusied(m[i]);
+ KASSERT(!pmap_page_is_mapped(m[i]),
+ ("%s: page %p is mapped", __func__, m[i]));
+ KASSERT(m[i]->valid != VM_PAGE_BITS_ALL,
+ ("%s: request for a valid page %p", __func__, m[i]));
+ KASSERT(m[i]->dirty == 0,
+ ("%s: page %p is dirty", __func__, m[i]));
+ KASSERT(m[i]->object == object,
+ ("%s: wrong object %p/%p", __func__, object, m[i]->object));
+ }
+#endif
+}
+
/*
- * vm_pager_get_pages() - inline, see vm/vm_pager.h
+ * Page in the pages for the object using its associated pager.
+ * The requested page must be fully valid on successful return.
+ */
+int
+vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
+{
+#ifdef INVARIANTS
+ vm_pindex_t pindex = m[0]->pindex;
+#endif
+ int r;
+
+ vm_pager_assert_in(object, m, count);
+
+ r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind,
+ rahead);
+ if (r != VM_PAGER_OK)
+ return (r);
+
+ for (int i = 0; i < count; i++) {
+ /*
+ * If pager has replaced a page, assert that it had
+ * updated the array.
+ */
+ KASSERT(m[i] == vm_page_lookup(object, pindex++),
+ ("%s: mismatch page %p pindex %ju", __func__,
+ m[i], (uintmax_t )pindex - 1));
+ /*
+ * Zero out partially filled data.
+ */
+ if (m[i]->valid != VM_PAGE_BITS_ALL)
+ vm_page_zero_invalid(m[i], TRUE);
+ }
+ return (VM_PAGER_OK);
+}
+
+int
+vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+
+ vm_pager_assert_in(object, m, count);
+
+ return ((*pagertab[object->type]->pgo_getpages_async)(object, m,
+ count, rbehind, rahead, iodone, arg));
+}
+
+/*
* vm_pager_put_pages() - inline, see vm/vm_pager.h
* vm_pager_has_page() - inline, see vm/vm_pager.h
*/
@@ -289,12 +360,11 @@
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */
- bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
- bp->b_data = bp->b_saveaddr;
- bp->b_kvabase = bp->b_saveaddr;
+ bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
+ bp->b_data = bp->b_kvabase;
bp->b_kvasize = MAXPHYS;
+ bp->b_flags = 0;
bp->b_xflags = 0;
- bp->b_flags = 0;
bp->b_ioflags = 0;
bp->b_iodone = NULL;
bp->b_error = 0;
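The new vm_pager_get_pages() above centralizes the post-read fixups that used
to live in the inline wrapper: it asserts that the pager left the requested
pages in place in the object and zero-fills any page that came back only
partially valid before treating it as fully valid. A standalone illustration
of that zero-the-invalid-blocks step follows, using a toy page structure;
none of these names are the kernel's.

/*
 * Toy model of the "zero out partially filled data" step performed after a
 * pager read: any block the pager did not mark valid is cleared before the
 * page is treated as fully valid.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE   4096
#define TOY_BLOCK_SIZE  512
#define TOY_NBLOCKS     (TOY_PAGE_SIZE / TOY_BLOCK_SIZE)
#define TOY_ALL_VALID   ((uint8_t)0xff)   /* one bit per block */

struct toy_page {
    uint8_t valid;                  /* validity bitmap */
    char    data[TOY_PAGE_SIZE];
};

/* Zero every block whose valid bit is clear, then mark the page valid. */
static void
toy_zero_invalid(struct toy_page *p)
{
    int i;

    for (i = 0; i < TOY_NBLOCKS; i++)
        if ((p->valid & (1 << i)) == 0)
            memset(p->data + i * TOY_BLOCK_SIZE, 0, TOY_BLOCK_SIZE);
    p->valid = TOY_ALL_VALID;
}

int
main(void)
{
    struct toy_page p;

    memset(p.data, 'x', sizeof(p.data));
    p.valid = 0x0f;                 /* pager filled only the first half */
    toy_zero_invalid(&p);
    printf("byte 0: %d, byte %d: %d, valid: %#x\n",
        p.data[0], TOY_PAGE_SIZE - 1, p.data[TOY_PAGE_SIZE - 1], p.valid);
    return (0);
}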
Modified: trunk/sys/vm/vm_pager.h
===================================================================
--- trunk/sys/vm/vm_pager.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_pager.h 331722 2018-03-29 02:50:57Z eadler $
*/
/*
@@ -51,19 +51,26 @@
typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t,
struct ucred *);
typedef void pgo_dealloc_t(vm_object_t);
-typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int);
+typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *);
+typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int);
+typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *,
+ pgo_getpages_iodone_t, void *);
typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *);
typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *);
+typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t,
+ vm_pindex_t *, vm_pindex_t *);
typedef void pgo_pageunswapped_t(vm_page_t);
struct pagerops {
- pgo_init_t *pgo_init; /* Initialize pager. */
- pgo_alloc_t *pgo_alloc; /* Allocate pager. */
- pgo_dealloc_t *pgo_dealloc; /* Disassociate. */
- pgo_getpages_t *pgo_getpages; /* Get (read) page. */
- pgo_putpages_t *pgo_putpages; /* Put (write) page. */
- pgo_haspage_t *pgo_haspage; /* Does pager have page? */
- pgo_pageunswapped_t *pgo_pageunswapped;
+ pgo_init_t *pgo_init; /* Initialize pager. */
+ pgo_alloc_t *pgo_alloc; /* Allocate pager. */
+ pgo_dealloc_t *pgo_dealloc; /* Disassociate. */
+ pgo_getpages_t *pgo_getpages; /* Get (read) page. */
+ pgo_getpages_async_t *pgo_getpages_async; /* Get page asyncly. */
+ pgo_putpages_t *pgo_putpages; /* Put (write) page. */
+ pgo_haspage_t *pgo_haspage; /* Query page. */
+ pgo_populate_t *pgo_populate; /* Bulk spec pagein. */
+ pgo_pageunswapped_t *pgo_pageunswapped;
};
extern struct pagerops defaultpagerops;
@@ -92,6 +99,7 @@
#define VM_PAGER_PUT_SYNC 0x0001
#define VM_PAGER_PUT_INVAL 0x0002
+#define VM_PAGER_PUT_NOREUSE 0x0004
#define VM_PAGER_CLUSTER_OK 0x0008
#ifdef _KERNEL
@@ -103,34 +111,12 @@
vm_ooffset_t, struct ucred *);
void vm_pager_bufferinit(void);
void vm_pager_deallocate(vm_object_t);
-static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
+int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *);
+int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *,
+ pgo_getpages_iodone_t, void *);
void vm_pager_init(void);
vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
-/*
- * vm_page_get_pages:
- *
- * Retrieve pages from the VM system in order to map them into an object
- * ( or into VM space somewhere ). If the pagein was successful, we
- * must fully validate it.
- */
-static __inline int
-vm_pager_get_pages(
- vm_object_t object,
- vm_page_t *m,
- int count,
- int reqpage
-) {
- int r;
-
- VM_OBJECT_ASSERT_WLOCKED(object);
- r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
- if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
- vm_page_zero_invalid(m[reqpage], TRUE);
- }
- return (r);
-}
-
static __inline void
vm_pager_put_pages(
vm_object_t object,
@@ -170,6 +156,19 @@
return (ret);
}
+static __inline int
+vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+ vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+ MPASS((object->flags & OBJ_POPULATE) != 0);
+ MPASS(pidx < object->size);
+ MPASS(object->paging_in_progress > 0);
+ return ((*pagertab[object->type]->pgo_populate)(object, pidx,
+ fault_type, max_prot, first, last));
+}
+
+
/*
* vm_pager_page_unswapped
*
@@ -195,6 +194,9 @@
struct cdev_pager_ops {
int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset,
int prot, vm_page_t *mres);
+ int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+ vm_pindex_t *last);
int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color);
void (*cdev_pg_dtor)(void *handle);
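struct pagerops above is a classic C ops table: each VM object type selects an
entry in pagertab[], every call is routed through that entry's function
pointers, and optional methods such as pgo_populate or pgo_getpages_async may
simply be left NULL for pagers that do not provide them. A self-contained
sketch of that dispatch style, with invented names, is:

/*
 * Minimal sketch of the ops-table dispatch style used by struct pagerops:
 * one table per object type, optional methods left NULL, calls routed
 * through the table matching the object's type.
 */
#include <stddef.h>
#include <stdio.h>

struct obj;

struct toy_pagerops {
    int (*getpages)(struct obj *, int count);
    int (*populate)(struct obj *, int first, int last);   /* optional */
};

struct obj {
    int type;       /* index into the ops table */
};

static int
swap_getpages(struct obj *o, int count)
{
    (void)o;
    printf("swap pager reads %d page(s)\n", count);
    return (0);
}

static int
device_getpages(struct obj *o, int count)
{
    (void)o;
    printf("device pager reads %d page(s)\n", count);
    return (0);
}

static const struct toy_pagerops swap_ops = { swap_getpages, NULL };
static const struct toy_pagerops device_ops = { device_getpages, NULL };

/* Indexed by obj->type, mirroring pagertab[]. */
static const struct toy_pagerops *toy_pagertab[] = { &swap_ops, &device_ops };

static int
toy_get_pages(struct obj *o, int count)
{
    return (toy_pagertab[o->type]->getpages(o, count));
}

int
main(void)
{
    struct obj swap_obj = { 0 }, dev_obj = { 1 };

    toy_get_pages(&swap_obj, 4);
    toy_get_pages(&dev_obj, 1);
    return (0);
}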
Modified: trunk/sys/vm/vm_param.h
===================================================================
--- trunk/sys/vm/vm_param.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_param.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $
+ * $FreeBSD: stable/11/sys/vm/vm_param.h 331722 2018-03-29 02:50:57Z eadler $
*/
/*
@@ -76,16 +76,17 @@
#define VM_TOTAL 1 /* struct vmtotal */
#define VM_METER VM_TOTAL/* deprecated, use VM_TOTAL */
#define VM_LOADAVG 2 /* struct loadavg */
-#define VM_V_FREE_MIN 3 /* cnt.v_free_min */
-#define VM_V_FREE_TARGET 4 /* cnt.v_free_target */
-#define VM_V_FREE_RESERVED 5 /* cnt.v_free_reserved */
-#define VM_V_INACTIVE_TARGET 6 /* cnt.v_inactive_target */
-#define VM_V_CACHE_MIN 7 /* cnt.v_cache_min */
-#define VM_V_CACHE_MAX 8 /* cnt.v_cache_max */
-#define VM_V_PAGEOUT_FREE_MIN 9 /* cnt.v_pageout_free_min */
+#define VM_V_FREE_MIN 3 /* vm_cnt.v_free_min */
+#define VM_V_FREE_TARGET 4 /* vm_cnt.v_free_target */
+#define VM_V_FREE_RESERVED 5 /* vm_cnt.v_free_reserved */
+#define VM_V_INACTIVE_TARGET 6 /* vm_cnt.v_inactive_target */
+#define VM_OBSOLETE_7 7 /* unused, formerly v_cache_min */
+#define VM_OBSOLETE_8 8 /* unused, formerly v_cache_max */
+#define VM_V_PAGEOUT_FREE_MIN 9 /* vm_cnt.v_pageout_free_min */
#define VM_OBSOLETE_10 10 /* pageout algorithm */
#define VM_SWAPPING_ENABLED 11 /* swapping enabled */
-#define VM_MAXID 12 /* number of valid vm ids */
+#define VM_OVERCOMMIT 12 /* vm.overcommit */
+#define VM_MAXID 13 /* number of valid vm ids */
/*
* Structure for swap device statistics
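The VM_V_* identifiers renumbered above back the vm.v_* sysctl tree. From
userland the same values are most easily read by name; the short example
below assumes vm.v_free_min is exported as an unsigned int, as it is on
FreeBSD-derived systems.

/*
 * Userland example: read one of the VM_V_* values listed above by name.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    u_int free_min;
    size_t len = sizeof(free_min);

    if (sysctlbyname("vm.v_free_min", &free_min, &len, NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("vm.v_free_min: %u pages\n", free_min);
    return (0);
}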
Modified: trunk/sys/vm/vm_phys.c
===================================================================
--- trunk/sys/vm/vm_phys.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -38,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_phys.c 331614 2018-03-27 13:09:35Z kib $");
#include "opt_ddb.h"
#include "opt_vm.h"
@@ -49,13 +49,14 @@
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
-#if MAXMEMDOM > 1
#include <sys/proc.h>
-#endif
#include <sys/queue.h>
+#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
+#include <sys/tree.h>
#include <sys/vmmeter.h>
+#include <sys/seq.h>
#include <ddb/ddb.h>
@@ -66,10 +67,15 @@
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_domain.h>
+
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs.");
+#ifdef VM_NUMA_ALLOC
struct mem_affinity *mem_affinity;
+int *mem_locality;
+#endif
int vm_ndomains = 1;
@@ -76,13 +82,25 @@
struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
int vm_phys_nsegs;
-#define VM_PHYS_FICTITIOUS_NSEGS 8
-static struct vm_phys_fictitious_seg {
+struct vm_phys_fictitious_seg;
+static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
+ struct vm_phys_fictitious_seg *);
+
+RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
+ RB_INITIALIZER(_vm_phys_fictitious_tree);
+
+struct vm_phys_fictitious_seg {
+ RB_ENTRY(vm_phys_fictitious_seg) node;
+ /* Memory region data */
vm_paddr_t start;
vm_paddr_t end;
vm_page_t first_page;
-} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
-static struct mtx vm_phys_fictitious_reg_mtx;
+};
+
+RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
+ vm_phys_fictitious_cmp);
+
+static struct rwlock vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
static struct vm_freelist
@@ -127,21 +145,139 @@
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+#ifdef VM_NUMA_ALLOC
+static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
+#endif
+
SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
&vm_ndomains, 0, "Number of physical memory domains available.");
+/*
+ * Default to first-touch + round-robin.
+ */
+static struct mtx vm_default_policy_mtx;
+MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
+ MTX_DEF);
+#ifdef VM_NUMA_ALLOC
+static struct vm_domain_policy vm_default_policy =
+ VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+#else
+/* Use round-robin so the domain policy code will only try once per allocation */
+static struct vm_domain_policy vm_default_policy =
+ VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
+#endif
+
static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
int order);
+static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
+ u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+ vm_paddr_t boundary);
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
-static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order);
+static int
+sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
+{
+ char policy_name[32];
+ int error;
+
+ mtx_lock(&vm_default_policy_mtx);
+
+ /* Map policy to output string */
+ switch (vm_default_policy.p.policy) {
+ case VM_POLICY_FIRST_TOUCH:
+ strcpy(policy_name, "first-touch");
+ break;
+ case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+ strcpy(policy_name, "first-touch-rr");
+ break;
+ case VM_POLICY_ROUND_ROBIN:
+ default:
+ strcpy(policy_name, "rr");
+ break;
+ }
+ mtx_unlock(&vm_default_policy_mtx);
+
+ error = sysctl_handle_string(oidp, &policy_name[0],
+ sizeof(policy_name), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vm_default_policy_mtx);
+ /* Set: match on the subset of policies that make sense as a default */
+ if (strcmp("first-touch-rr", policy_name) == 0) {
+ vm_domain_policy_set(&vm_default_policy,
+ VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+ } else if (strcmp("first-touch", policy_name) == 0) {
+ vm_domain_policy_set(&vm_default_policy,
+ VM_POLICY_FIRST_TOUCH, 0);
+ } else if (strcmp("rr", policy_name) == 0) {
+ vm_domain_policy_set(&vm_default_policy,
+ VM_POLICY_ROUND_ROBIN, 0);
+ } else {
+ error = EINVAL;
+ goto finish;
+ }
+
+ error = 0;
+finish:
+ mtx_unlock(&vm_default_policy_mtx);
+ return (error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_vm_default_policy, "A",
+ "Default policy (rr, first-touch, first-touch-rr");
+
+/*
+ * Red-black tree helpers for vm fictitious range management.
+ */
+static inline int
+vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
+ struct vm_phys_fictitious_seg *range)
+{
+
+ KASSERT(range->start != 0 && range->end != 0,
+ ("Invalid range passed on search for vm_fictitious page"));
+ if (p->start >= range->end)
+ return (1);
+ if (p->start < range->start)
+ return (-1);
+
+ return (0);
+}
+
+static int
+vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
+ struct vm_phys_fictitious_seg *p2)
+{
+
+ /* Check if this is a search for a page */
+ if (p1->end == 0)
+ return (vm_phys_fictitious_in_range(p1, p2));
+
+ KASSERT(p2->end != 0,
+ ("Invalid range passed as second parameter to vm fictitious comparison"));
+
+ /* Searching to add a new range */
+ if (p1->end <= p2->start)
+ return (-1);
+ if (p1->start >= p2->end)
+ return (1);
+
+ panic("Trying to add overlapping vm fictitious ranges:\n"
+ "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
+ (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
+}
+
static __inline int
vm_rr_selectdomain(void)
{
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
struct thread *td;
td = curthread;
@@ -154,6 +290,53 @@
#endif
}
+/*
+ * Initialise a VM domain iterator.
+ *
+ * Check the thread policy, then the proc policy,
+ * then default to the system policy.
+ *
+ * Later on the various layers will have this logic
+ * plumbed into them and the phys code will be explicitly
+ * handed a VM domain policy to use.
+ */
+static void
+vm_policy_iterator_init(struct vm_domain_iterator *vi)
+{
+#ifdef VM_NUMA_ALLOC
+ struct vm_domain_policy lcl;
+#endif
+
+ vm_domain_iterator_init(vi);
+
+#ifdef VM_NUMA_ALLOC
+ /* Copy out the thread policy */
+ vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
+ if (lcl.p.policy != VM_POLICY_NONE) {
+ /* Thread policy is present; use it */
+ vm_domain_iterator_set_policy(vi, &lcl);
+ return;
+ }
+
+ vm_domain_policy_localcopy(&lcl,
+ &curthread->td_proc->p_vm_dom_policy);
+ if (lcl.p.policy != VM_POLICY_NONE) {
+ /* Process policy is present; use it */
+ vm_domain_iterator_set_policy(vi, &lcl);
+ return;
+ }
+#endif
+ /* Use system default policy */
+ vm_domain_iterator_set_policy(vi, &vm_default_policy);
+}
+
+static void
+vm_policy_iterator_finish(struct vm_domain_iterator *vi)
+{
+
+ vm_domain_iterator_cleanup(vi);
+}
+
boolean_t
vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
{
@@ -243,6 +426,54 @@
return (error);
}
+/*
+ * Return affinity, or -1 if there's no affinity information.
+ */
+int
+vm_phys_mem_affinity(int f, int t)
+{
+
+#ifdef VM_NUMA_ALLOC
+ if (mem_locality == NULL)
+ return (-1);
+ if (f >= vm_ndomains || t >= vm_ndomains)
+ return (-1);
+ return (mem_locality[f * vm_ndomains + t]);
+#else
+ return (-1);
+#endif
+}
+
+#ifdef VM_NUMA_ALLOC
+/*
+ * Outputs the VM locality table.
+ */
+static int
+sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ int error, i, j;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+
+ sbuf_printf(&sbuf, "\n");
+
+ for (i = 0; i < vm_ndomains; i++) {
+ sbuf_printf(&sbuf, "%d: ", i);
+ for (j = 0; j < vm_ndomains; j++) {
+ sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
+ }
+ sbuf_printf(&sbuf, "\n");
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+#endif
+
static void
vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{
@@ -289,6 +520,7 @@
static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
+#ifdef VM_NUMA_ALLOC
int i;
if (mem_affinity == NULL) {
@@ -313,6 +545,9 @@
mem_affinity[i].domain);
start = mem_affinity[i].end;
}
+#else
+ _vm_phys_create_seg(start, end, 0);
+#endif
}
/*
@@ -473,7 +708,8 @@
}
}
}
- mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
+
+ rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
/*
@@ -495,36 +731,6 @@
}
/*
- * Initialize a physical page and add it to the free lists.
- */
-void
-vm_phys_add_page(vm_paddr_t pa)
-{
- vm_page_t m;
- struct vm_domain *vmd;
-
- cnt.v_page_count++;
- m = vm_phys_paddr_to_vm_page(pa);
- m->busy_lock = VPB_UNBUSIED;
- m->phys_addr = pa;
- m->queue = PQ_NONE;
- m->segind = vm_phys_paddr_to_segind(pa);
- vmd = vm_phys_domain(m);
- vmd->vmd_page_count++;
- vmd->vmd_segs |= 1UL << m->segind;
- m->flags = PG_FREE;
- KASSERT(m->order == VM_NFREEORDER,
- ("vm_phys_add_page: page %p has unexpected order %d",
- m, m->order));
- m->pool = VM_FREEPOOL_DEFAULT;
- pmap_page_init(m);
- mtx_lock(&vm_page_queue_free_mtx);
- vm_phys_freecnt_adj(m, 1);
- vm_phys_free_pages(m, 0);
- mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
* Allocate a contiguous, power of two-sized set of physical pages
* from the free lists.
*
@@ -534,7 +740,8 @@
vm_phys_alloc_pages(int pool, int order)
{
vm_page_t m;
- int dom, domain, flind;
+ int domain, flind;
+ struct vm_domain_iterator vi;
KASSERT(pool < VM_NFREEPOOL,
("vm_phys_alloc_pages: pool %d is out of range", pool));
@@ -541,8 +748,9 @@
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_pages: order %d is out of range", order));
- for (dom = 0; dom < vm_ndomains; dom++) {
- domain = vm_rr_selectdomain();
+ vm_policy_iterator_init(&vi);
+
+ while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
for (flind = 0; flind < vm_nfreelists; flind++) {
m = vm_phys_alloc_domain_pages(domain, flind, pool,
order);
@@ -550,6 +758,8 @@
return (m);
}
}
+
+ vm_policy_iterator_finish(&vi);
return (NULL);
}
@@ -564,7 +774,8 @@
vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
{
vm_page_t m;
- int dom, domain;
+ struct vm_domain_iterator vi;
+ int domain;
KASSERT(freelist < VM_NFREELIST,
("vm_phys_alloc_freelist_pages: freelist %d is out of range",
@@ -573,13 +784,17 @@
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
- for (dom = 0; dom < vm_ndomains; dom++) {
- domain = vm_rr_selectdomain();
+
+ vm_policy_iterator_init(&vi);
+
+ while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
m = vm_phys_alloc_domain_pages(domain,
vm_freelist_to_flind[freelist], pool, order);
if (m != NULL)
return (m);
}
+
+ vm_policy_iterator_finish(&vi);
return (NULL);
}
@@ -643,23 +858,39 @@
vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
- struct vm_phys_fictitious_seg *seg;
+ struct vm_phys_fictitious_seg tmp, *seg;
vm_page_t m;
- int segind;
m = NULL;
- for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
- seg = &vm_phys_fictitious_segs[segind];
- if (pa >= seg->start && pa < seg->end) {
- m = &seg->first_page[atop(pa - seg->start)];
- KASSERT((m->flags & PG_FICTITIOUS) != 0,
- ("%p not fictitious", m));
- break;
- }
- }
+ tmp.start = pa;
+ tmp.end = 0;
+
+ rw_rlock(&vm_phys_fictitious_reg_lock);
+ seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+ rw_runlock(&vm_phys_fictitious_reg_lock);
+ if (seg == NULL)
+ return (NULL);
+
+ m = &seg->first_page[atop(pa - seg->start)];
+ KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
+
return (m);
}
+static inline void
+vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
+ long page_count, vm_memattr_t memattr)
+{
+ long i;
+
+ bzero(range, page_count * sizeof(*range));
+ for (i = 0; i < page_count; i++) {
+ vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
+ range[i].oflags &= ~VPO_UNMANAGED;
+ range[i].busy_lock = VPB_UNBUSIED;
+ }
+}
+
int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr)
@@ -666,104 +897,145 @@
{
struct vm_phys_fictitious_seg *seg;
vm_page_t fp;
- long i, page_count;
- int segind;
+ long page_count;
#ifdef VM_PHYSSEG_DENSE
- long pi;
- boolean_t malloced;
+ long pi, pe;
+ long dpage_count;
#endif
+ KASSERT(start < end,
+ ("Start of segment isn't less than end (start: %jx end: %jx)",
+ (uintmax_t)start, (uintmax_t)end));
+
page_count = (end - start) / PAGE_SIZE;
#ifdef VM_PHYSSEG_DENSE
pi = atop(start);
- if (pi >= first_page && pi < vm_page_array_size + first_page) {
- if (atop(end) >= vm_page_array_size + first_page)
- return (EINVAL);
+ pe = atop(end);
+ if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
fp = &vm_page_array[pi - first_page];
- malloced = FALSE;
- } else
+ if ((pe - first_page) > vm_page_array_size) {
+ /*
+ * We have a segment that starts inside
+ * of vm_page_array, but ends outside of it.
+ *
+ * Use vm_page_array pages for those that are
+ * inside of the vm_page_array range, and
+ * allocate the remaining ones.
+ */
+ dpage_count = vm_page_array_size - (pi - first_page);
+ vm_phys_fictitious_init_range(fp, start, dpage_count,
+ memattr);
+ page_count -= dpage_count;
+ start += ptoa(dpage_count);
+ goto alloc;
+ }
+ /*
+ * We can allocate the full range from vm_page_array,
+ * so there's no need to register the range in the tree.
+ */
+ vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+ return (0);
+ } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+ /*
+ * We have a segment that ends inside of vm_page_array,
+ * but starts outside of it.
+ */
+ fp = &vm_page_array[0];
+ dpage_count = pe - first_page;
+ vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
+ memattr);
+ end -= ptoa(dpage_count);
+ page_count -= dpage_count;
+ goto alloc;
+ } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+ /*
+ * Trying to register a fictitious range that expands before
+ * and after vm_page_array.
+ */
+ return (EINVAL);
+ } else {
+alloc:
#endif
- {
fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
- M_WAITOK | M_ZERO);
+ M_WAITOK);
#ifdef VM_PHYSSEG_DENSE
- malloced = TRUE;
-#endif
}
- for (i = 0; i < page_count; i++) {
- vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
- fp[i].oflags &= ~VPO_UNMANAGED;
- fp[i].busy_lock = VPB_UNBUSIED;
- }
- mtx_lock(&vm_phys_fictitious_reg_mtx);
- for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
- seg = &vm_phys_fictitious_segs[segind];
- if (seg->start == 0 && seg->end == 0) {
- seg->start = start;
- seg->end = end;
- seg->first_page = fp;
- mtx_unlock(&vm_phys_fictitious_reg_mtx);
- return (0);
- }
- }
- mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
- if (malloced)
#endif
- free(fp, M_FICT_PAGES);
- return (EBUSY);
+ vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+
+ seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
+ seg->start = start;
+ seg->end = end;
+ seg->first_page = fp;
+
+ rw_wlock(&vm_phys_fictitious_reg_lock);
+ RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
+ rw_wunlock(&vm_phys_fictitious_reg_lock);
+
+ return (0);
}
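vm_phys_fictitious_reg_range() and vm_phys_fictitious_to_vm_page() above move the fictitious segments from a small fixed array to an RB tree protected by an rwlock, with point lookups built from a temporary key (tmp.start = pa, tmp.end = 0). Below is a minimal userland sketch of that lookup style using the BSD <sys/tree.h> macros (available on FreeBSD, or via libbsd elsewhere); the comparator is hypothetical and written only for this sketch, since the fict_tree comparator the patch relies on is defined elsewhere in vm_phys.c.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/tree.h>

struct seg {
	RB_ENTRY(seg) entry;
	uint64_t start;
	uint64_t end;	/* 0 in a lookup key means "point query" */
};

/* Hypothetical comparator: ranges that overlap compare equal, and a key
 * with end == 0 behaves as the single address "start". */
static int
seg_cmp(struct seg *a, struct seg *b)
{
	uint64_t a_end = (a->end == 0) ? a->start + 1 : a->end;
	uint64_t b_end = (b->end == 0) ? b->start + 1 : b->end;

	if (a_end <= b->start)
		return (-1);
	if (b_end <= a->start)
		return (1);
	return (0);
}

RB_HEAD(segtree, seg);
RB_GENERATE_STATIC(segtree, seg, entry, seg_cmp);

int
main(void)
{
	struct segtree root = RB_INITIALIZER(&root);
	struct seg s1 = { .start = 0x100000, .end = 0x200000 };
	struct seg s2 = { .start = 0x400000, .end = 0x500000 };
	struct seg key, *found;

	RB_INSERT(segtree, &root, &s1);
	RB_INSERT(segtree, &root, &s2);

	/* Point lookup, in the style of vm_phys_fictitious_to_vm_page(). */
	key.start = 0x450000;
	key.end = 0;
	found = RB_FIND(segtree, &root, &key);
	assert(found == &s2);

	key.start = 0x300000;	/* not inside any registered segment */
	found = RB_FIND(segtree, &root, &key);
	assert(found == NULL);
	return (0);
}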
void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
- struct vm_phys_fictitious_seg *seg;
- vm_page_t fp;
- int segind;
+ struct vm_phys_fictitious_seg *seg, tmp;
#ifdef VM_PHYSSEG_DENSE
- long pi;
+ long pi, pe;
#endif
+ KASSERT(start < end,
+ ("Start of segment isn't less than end (start: %jx end: %jx)",
+ (uintmax_t)start, (uintmax_t)end));
+
#ifdef VM_PHYSSEG_DENSE
pi = atop(start);
-#endif
-
- mtx_lock(&vm_phys_fictitious_reg_mtx);
- for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
- seg = &vm_phys_fictitious_segs[segind];
- if (seg->start == start && seg->end == end) {
- seg->start = seg->end = 0;
- fp = seg->first_page;
- seg->first_page = NULL;
- mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
- if (pi < first_page || atop(end) >= vm_page_array_size)
-#endif
- free(fp, M_FICT_PAGES);
+ pe = atop(end);
+ if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
+ if ((pe - first_page) <= vm_page_array_size) {
+ /*
+ * This segment was allocated using vm_page_array
+ * only, there's nothing to do since those pages
+ * were never added to the tree.
+ */
return;
}
+ /*
+ * We have a segment that starts inside
+ * of vm_page_array, but ends outside of it.
+ *
+ * Calculate how many pages were added to the
+ * tree and free them.
+ */
+ start = ptoa(first_page + vm_page_array_size);
+ } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+ /*
+ * We have a segment that ends inside of vm_page_array,
+ * but starts outside of it.
+ */
+ end = ptoa(first_page);
+ } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+ /* Since it's not possible to register such a range, panic. */
+ panic(
+ "Unregistering not registered fictitious range [%#jx:%#jx]",
+ (uintmax_t)start, (uintmax_t)end);
}
- mtx_unlock(&vm_phys_fictitious_reg_mtx);
- KASSERT(0, ("Unregistering not registered fictitious range"));
-}
+#endif
+ tmp.start = start;
+ tmp.end = 0;
-/*
- * Find the segment containing the given physical address.
- */
-static int
-vm_phys_paddr_to_segind(vm_paddr_t pa)
-{
- struct vm_phys_seg *seg;
- int segind;
-
- for (segind = 0; segind < vm_phys_nsegs; segind++) {
- seg = &vm_phys_segs[segind];
- if (pa >= seg->start && pa < seg->end)
- return (segind);
+ rw_wlock(&vm_phys_fictitious_reg_lock);
+ seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+ if (seg->start != start || seg->end != end) {
+ rw_wunlock(&vm_phys_fictitious_reg_lock);
+ panic(
+ "Unregistering not registered fictitious range [%#jx:%#jx]",
+ (uintmax_t)start, (uintmax_t)end);
}
- panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
- (uintmax_t)pa);
+ RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
+ rw_wunlock(&vm_phys_fictitious_reg_lock);
+ free(seg->first_page, M_FICT_PAGES);
+ free(seg, M_FICT_PAGES);
}
/*
@@ -853,6 +1125,56 @@
}
/*
+ * Scan physical memory between the specified addresses "low" and "high" for a
+ * run of contiguous physical pages that satisfy the specified conditions, and
+ * return the lowest page in the run. The specified "alignment" determines
+ * the alignment of the lowest physical page in the run. If the specified
+ * "boundary" is non-zero, then the run of physical pages cannot span a
+ * physical address that is a multiple of "boundary".
+ *
+ * "npages" must be greater than zero. Both "alignment" and "boundary" must
+ * be a power of two.
+ */
+vm_page_t
+vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary, int options)
+{
+ vm_paddr_t pa_end;
+ vm_page_t m_end, m_run, m_start;
+ struct vm_phys_seg *seg;
+ int segind;
+
+ KASSERT(npages > 0, ("npages is 0"));
+ KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+ KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+ if (low >= high)
+ return (NULL);
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ if (seg->start >= high)
+ break;
+ if (low >= seg->end)
+ continue;
+ if (low <= seg->start)
+ m_start = seg->first_page;
+ else
+ m_start = &seg->first_page[atop(low - seg->start)];
+ if (high < seg->end)
+ pa_end = high;
+ else
+ pa_end = seg->end;
+ if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
+ continue;
+ m_end = &seg->first_page[atop(pa_end - seg->start)];
+ m_run = vm_page_scan_contig(npages, m_start, m_end,
+ alignment, boundary, options);
+ if (m_run != NULL)
+ return (m_run);
+ }
+ return (NULL);
+}
+
+/*
* Set the pool for a contiguous, power of two-sized set of physical pages.
*/
void
@@ -946,7 +1268,7 @@
for (;;) {
TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
- if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
+ if ((m_tmp->flags & PG_ZERO) == 0) {
vm_phys_unfree_page(m_tmp);
vm_phys_freecnt_adj(m, -1);
mtx_unlock(&vm_page_queue_free_mtx);
@@ -990,85 +1312,125 @@
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary)
{
+ vm_paddr_t pa_end, pa_start;
+ vm_page_t m_run;
+ struct vm_domain_iterator vi;
+ struct vm_phys_seg *seg;
+ int domain, segind;
+
+ KASSERT(npages > 0, ("npages is 0"));
+ KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+ KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ if (low >= high)
+ return (NULL);
+ vm_policy_iterator_init(&vi);
+restartdom:
+ if (vm_domain_iterator_run(&vi, &domain) != 0) {
+ vm_policy_iterator_finish(&vi);
+ return (NULL);
+ }
+ m_run = NULL;
+ for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
+ seg = &vm_phys_segs[segind];
+ if (seg->start >= high || seg->domain != domain)
+ continue;
+ if (low >= seg->end)
+ break;
+ if (low <= seg->start)
+ pa_start = seg->start;
+ else
+ pa_start = low;
+ if (high < seg->end)
+ pa_end = high;
+ else
+ pa_end = seg->end;
+ if (pa_end - pa_start < ptoa(npages))
+ continue;
+ m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
+ alignment, boundary);
+ if (m_run != NULL)
+ break;
+ }
+ if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
+ goto restartdom;
+ vm_policy_iterator_finish(&vi);
+ return (m_run);
+}
+
+/*
+ * Allocate a run of contiguous physical pages from the free list for the
+ * specified segment.
+ */
+static vm_page_t
+vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
+ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+{
struct vm_freelist *fl;
- struct vm_phys_seg *seg;
- vm_paddr_t pa, pa_last, size;
+ vm_paddr_t pa, pa_end, size;
vm_page_t m, m_ret;
u_long npages_end;
- int dom, domain, flind, oind, order, pind;
+ int oind, order, pind;
+ KASSERT(npages > 0, ("npages is 0"));
+ KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+ KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- size = npages << PAGE_SHIFT;
- KASSERT(size != 0,
- ("vm_phys_alloc_contig: size must not be 0"));
- KASSERT((alignment & (alignment - 1)) == 0,
- ("vm_phys_alloc_contig: alignment must be a power of 2"));
- KASSERT((boundary & (boundary - 1)) == 0,
- ("vm_phys_alloc_contig: boundary must be a power of 2"));
/* Compute the queue that is the best fit for npages. */
for (order = 0; (1 << order) < npages; order++);
- dom = 0;
-restartdom:
- domain = vm_rr_selectdomain();
- for (flind = 0; flind < vm_nfreelists; flind++) {
- for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = &vm_phys_free_queues[domain][flind][pind][0];
- TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+ /* Search for a run satisfying the specified conditions. */
+ size = npages << PAGE_SHIFT;
+ for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
+ oind++) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = (*seg->free_queues)[pind];
+ TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+ /*
+ * Is the size of this allocation request
+ * larger than the largest block size?
+ */
+ if (order >= VM_NFREEORDER) {
/*
- * A free list may contain physical pages
- * from one or more segments.
+ * Determine if a sufficient number of
+ * subsequent blocks to satisfy the
+ * allocation request are free.
*/
- seg = &vm_phys_segs[m_ret->segind];
- if (seg->start > high ||
- low >= seg->end)
+ pa = VM_PAGE_TO_PHYS(m_ret);
+ pa_end = pa + size;
+ if (pa_end < pa)
continue;
-
- /*
- * Is the size of this allocation request
- * larger than the largest block size?
- */
- if (order >= VM_NFREEORDER) {
- /*
- * Determine if a sufficient number
- * of subsequent blocks to satisfy
- * the allocation request are free.
- */
- pa = VM_PAGE_TO_PHYS(m_ret);
- pa_last = pa + size;
- for (;;) {
- pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
- if (pa >= pa_last)
- break;
- if (pa < seg->start ||
- pa >= seg->end)
- break;
- m = &seg->first_page[atop(pa - seg->start)];
- if (m->order != VM_NFREEORDER - 1)
- break;
- }
- /* If not, continue to the next block. */
- if (pa < pa_last)
- continue;
+ for (;;) {
+ pa += 1 << (PAGE_SHIFT +
+ VM_NFREEORDER - 1);
+ if (pa >= pa_end ||
+ pa < seg->start ||
+ pa >= seg->end)
+ break;
+ m = &seg->first_page[atop(pa -
+ seg->start)];
+ if (m->order != VM_NFREEORDER -
+ 1)
+ break;
}
+ /* If not, go to the next block. */
+ if (pa < pa_end)
+ continue;
+ }
- /*
- * Determine if the blocks are within the given range,
- * satisfy the given alignment, and do not cross the
- * given boundary.
- */
- pa = VM_PAGE_TO_PHYS(m_ret);
- if (pa >= low &&
- pa + size <= high &&
- (pa & (alignment - 1)) == 0 &&
- ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
- goto done;
- }
+ /*
+ * Determine if the blocks are within the
+ * given range, satisfy the given alignment,
+ * and do not cross the given boundary.
+ */
+ pa = VM_PAGE_TO_PHYS(m_ret);
+ pa_end = pa + size;
+ if (pa >= low && pa_end <= high &&
+ (pa & (alignment - 1)) == 0 &&
+ rounddown2(pa ^ (pa_end - 1), boundary) == 0)
+ goto done;
}
}
}
- if (++dom < vm_ndomains)
- goto restartdom;
return (NULL);
done:
for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
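Both vm_phys_scan_contig() and the rewritten vm_phys_alloc_seg_contig() above accept a candidate run only if it lies inside [low, high), starts on the requested power-of-two alignment, and does not cross a multiple of the power-of-two boundary. A standalone restatement of that predicate follows, with rounddown2() spelled out since the sketch does not include sys/param.h.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* rounddown2(x, y): round x down to a multiple of the power-of-two y. */
#define rounddown2(x, y)	((x) & ~((y) - 1))

/*
 * Does a run of "size" bytes at physical address "pa" fit the [low, high)
 * window, start on the power-of-two "alignment", and avoid crossing a
 * multiple of the power-of-two "boundary" (0 means no boundary rule)?
 */
static bool
run_fits(uint64_t pa, uint64_t size, uint64_t low, uint64_t high,
    uint64_t alignment, uint64_t boundary)
{
	uint64_t pa_end = pa + size;

	return (pa >= low && pa_end <= high &&
	    (pa & (alignment - 1)) == 0 &&
	    rounddown2(pa ^ (pa_end - 1), boundary) == 0);
}

int
main(void)
{
	/* A 64 KB run at 0x10000 is 64 KB-aligned and stays below 1 MB. */
	assert(run_fits(0x10000, 0x10000, 0, 1 << 20, 0x10000, 1 << 20));
	/* The same run fails a 128 KB alignment requirement. */
	assert(!run_fits(0x10000, 0x10000, 0, 1 << 20, 0x20000, 1 << 20));
	/* A run straddling 1 MB violates a 1 MB boundary. */
	assert(!run_fits(0xF8000, 0x10000, 0, 1 << 21, 0x8000, 1 << 20));
	return (0);
}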
Modified: trunk/sys/vm/vm_phys.h
===================================================================
--- trunk/sys/vm/vm_phys.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_phys.h 329381 2018-02-16 16:16:33Z mjg $
*/
/*
@@ -62,6 +62,7 @@
};
extern struct mem_affinity *mem_affinity;
+extern int *mem_locality;
extern int vm_ndomains;
extern struct vm_phys_seg vm_phys_segs[];
extern int vm_phys_nsegs;
@@ -69,7 +70,6 @@
/*
* The following functions are only to be used by the virtual memory system.
*/
-void vm_phys_add_page(vm_paddr_t pa);
void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary);
@@ -84,9 +84,12 @@
void vm_phys_free_pages(vm_page_t m, int order);
void vm_phys_init(void);
vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary, int options);
void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
+int vm_phys_mem_affinity(int f, int t);
/*
* vm_phys_domain:
@@ -96,7 +99,7 @@
static inline struct vm_domain *
vm_phys_domain(vm_page_t m)
{
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
int domn, segind;
/* XXXKIB try to assert that the page is managed */
@@ -110,13 +113,13 @@
#endif
}
-static inline void
+static inline u_int
vm_phys_freecnt_adj(vm_page_t m, int adj)
{
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- cnt.v_free_count += adj;
vm_phys_domain(m)->vmd_free_count += adj;
+ return (vm_cnt.v_free_count += adj);
}
#endif /* _KERNEL */
Modified: trunk/sys/vm/vm_radix.c
===================================================================
--- trunk/sys/vm/vm_radix.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -50,7 +50,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_radix.c 327785 2018-01-10 20:39:26Z markj $");
#include "opt_ddb.h"
@@ -299,21 +299,19 @@
* are needed to store them.
*/
if (!uma_zone_reserve_kva(vm_radix_node_zone,
- ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+ ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
sizeof(struct vm_radix_node))))
panic("%s: unable to reserve KVA", __func__);
}
-SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD,
vm_radix_reserve_kva, NULL);
#endif
/*
* Initialize the UMA slab zone.
- * Until vm_radix_prealloc() is called, the zone will be served by the
- * UMA boot-time pre-allocated pool of pages.
*/
void
-vm_radix_init(void)
+vm_radix_zinit(void)
{
vm_radix_node_zone = uma_zcreate("RADIX NODE",
@@ -342,8 +340,6 @@
index = page->pindex;
-restart:
-
/*
* The owner of record for root is not really important because it
* will never be used.
@@ -361,32 +357,10 @@
panic("%s: key %jx is already present",
__func__, (uintmax_t)index);
clev = vm_radix_keydiff(m->pindex, index);
-
- /*
- * During node allocation the trie that is being
- * walked can be modified because of recursing radix
- * trie operations.
- * If this is the case, the recursing functions signal
- * such situation and the insert operation must
- * start from scratch again.
- * The freed radix node will then be in the UMA
- * caches very likely to avoid the same situation
- * to happen.
- */
- rtree->rt_flags |= RT_INSERT_INPROG;
tmp = vm_radix_node_get(vm_radix_trimkey(index,
clev + 1), 2, clev);
- rtree->rt_flags &= ~RT_INSERT_INPROG;
- if (tmp == NULL) {
- rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ if (tmp == NULL)
return (ENOMEM);
- }
- if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
- rtree->rt_flags &= ~RT_TRIE_MODIFIED;
- tmp->rn_count = 0;
- vm_radix_node_put(tmp);
- goto restart;
- }
*parentp = tmp;
vm_radix_addpage(tmp, index, clev, page);
vm_radix_addpage(tmp, m->pindex, clev, m);
@@ -410,21 +384,9 @@
*/
newind = rnode->rn_owner;
clev = vm_radix_keydiff(newind, index);
-
- /* See the comments above. */
- rtree->rt_flags |= RT_INSERT_INPROG;
tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
- rtree->rt_flags &= ~RT_INSERT_INPROG;
- if (tmp == NULL) {
- rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ if (tmp == NULL)
return (ENOMEM);
- }
- if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
- rtree->rt_flags &= ~RT_TRIE_MODIFIED;
- tmp->rn_count = 0;
- vm_radix_node_put(tmp);
- goto restart;
- }
*parentp = tmp;
vm_radix_addpage(tmp, index, clev, page);
slot = vm_radix_slot(newind, clev);
@@ -699,10 +661,10 @@
}
/*
- * Remove the specified index from the tree.
- * Panics if the key is not present.
+ * Remove the specified index from the trie, and return the value stored at
+ * that index. If the index is not present, return NULL.
*/
-void
+vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
struct vm_radix_node *rnode, *parent;
@@ -709,41 +671,27 @@
vm_page_t m;
int i, slot;
- /*
- * Detect if a page is going to be removed from a trie which is
- * already undergoing another trie operation.
- * Right now this is only possible for vm_radix_remove() recursing
- * into vm_radix_insert().
- * If this is the case, the caller must be notified about this
- * situation. It will also takecare to update the RT_TRIE_MODIFIED
- * accordingly.
- * The RT_TRIE_MODIFIED bit is set here because the remove operation
- * will always succeed.
- */
- if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
- rtree->rt_flags |= RT_TRIE_MODIFIED;
-
rnode = vm_radix_getroot(rtree);
if (vm_radix_isleaf(rnode)) {
m = vm_radix_topage(rnode);
if (m->pindex != index)
- panic("%s: invalid key found", __func__);
+ return (NULL);
vm_radix_setroot(rtree, NULL);
- return;
+ return (m);
}
parent = NULL;
for (;;) {
if (rnode == NULL)
- panic("vm_radix_remove: impossible to locate the key");
+ return (NULL);
slot = vm_radix_slot(index, rnode->rn_clev);
if (vm_radix_isleaf(rnode->rn_child[slot])) {
m = vm_radix_topage(rnode->rn_child[slot]);
if (m->pindex != index)
- panic("%s: invalid key found", __func__);
+ return (NULL);
rnode->rn_child[slot] = NULL;
rnode->rn_count--;
if (rnode->rn_count > 1)
- break;
+ return (m);
for (i = 0; i < VM_RADIX_COUNT; i++)
if (rnode->rn_child[i] != NULL)
break;
@@ -760,7 +708,7 @@
rnode->rn_count--;
rnode->rn_child[i] = NULL;
vm_radix_node_put(rnode);
- break;
+ return (m);
}
parent = rnode;
rnode = rnode->rn_child[slot];
@@ -777,9 +725,6 @@
{
struct vm_radix_node *root;
- KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
- ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
-
root = vm_radix_getroot(rtree);
if (root == NULL)
return;
@@ -831,6 +776,12 @@
panic("%s: original replacing page not found", __func__);
}
+void
+vm_radix_wait(void)
+{
+ uma_zwait(vm_radix_node_zone);
+}
+
#ifdef DDB
/*
* Show details about the given radix node.
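The vm_radix_reserve_kva() hunk above sizes the node zone as (v_page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(struct vm_radix_node)), i.e. roughly one radix node per physical page after discounting the memory the nodes themselves would occupy. A small arithmetic sketch with hypothetical numbers follows; the real node size depends on the kernel configuration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL

int
main(void)
{
	/* Hypothetical inputs: 16 GB of RAM and a 144-byte radix node. */
	uint64_t v_page_count = (16ULL << 30) / PAGE_SIZE;
	uint64_t node_size = 144;

	/* Same expression as the uma_zone_reserve_kva() call above. */
	uint64_t nitems = (v_page_count * PAGE_SIZE) / (PAGE_SIZE + node_size);

	printf("radix node reservation: %ju items, ~%ju MB of KVA backing\n",
	    (uintmax_t)nitems, (uintmax_t)(nitems * node_size >> 20));
	return (0);
}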
Modified: trunk/sys/vm/vm_radix.h
===================================================================
--- trunk/sys/vm/vm_radix.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_radix.h 327785 2018-01-10 20:39:26Z markj $
*/
#ifndef _VM_RADIX_H_
@@ -36,15 +36,30 @@
#ifdef _KERNEL
-void vm_radix_init(void);
int vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+void vm_radix_wait(void);
boolean_t vm_radix_is_singleton(struct vm_radix *rtree);
vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
void vm_radix_reclaim_allnodes(struct vm_radix *rtree);
-void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
+void vm_radix_zinit(void);
+static __inline void
+vm_radix_init(struct vm_radix *rtree)
+{
+
+ rtree->rt_root = 0;
+}
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+ return (rtree->rt_root == 0);
+}
+
#endif /* _KERNEL */
#endif /* !_VM_RADIX_H_ */
Modified: trunk/sys/vm/vm_reserv.c
===================================================================
--- trunk/sys/vm/vm_reserv.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -1,7 +1,7 @@
/* $MidnightBSD$ */
/*-
* Copyright (c) 2002-2006 Rice University
- * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
+ * Copyright (c) 2007-2011 Alan L. Cox <alc at cs.rice.edu>
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Alan L. Cox,
@@ -38,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_reserv.c 351826 2019-09-04 19:31:37Z ray $");
#include "opt_vm.h"
@@ -52,6 +52,7 @@
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -63,7 +64,7 @@
/*
* The reservation system supports the speculative allocation of large physical
- * pages ("superpages"). Speculative allocation enables the fully-automatic
+ * pages ("superpages"). Speculative allocation enables the fully automatic
* utilization of superpages by the virtual memory system. In other words, no
* programmatic directives are required to use superpages.
*/
@@ -94,6 +95,61 @@
(((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
/*
+ * The size of a population map entry
+ */
+typedef u_long popmap_t;
+
+/*
+ * The number of bits in a population map entry
+ */
+#define NBPOPMAP (NBBY * sizeof(popmap_t))
+
+/*
+ * The number of population map entries in a reservation
+ */
+#define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
+
+/*
+ * Clear a bit in the population map.
+ */
+static __inline void
+popmap_clear(popmap_t popmap[], int i)
+{
+
+ popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
+}
+
+/*
+ * Set a bit in the population map.
+ */
+static __inline void
+popmap_set(popmap_t popmap[], int i)
+{
+
+ popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
+}
+
+/*
+ * Is a bit in the population map clear?
+ */
+static __inline boolean_t
+popmap_is_clear(popmap_t popmap[], int i)
+{
+
+ return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
+}
+
+/*
+ * Is a bit in the population map set?
+ */
+static __inline boolean_t
+popmap_is_set(popmap_t popmap[], int i)
+{
+
+ return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
+}
+
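The popmap helpers above implement a per-reservation bit vector spread across an array of u_long words: bit i lives in word i / NBPOPMAP at bit position i % NBPOPMAP. The same arithmetic in standalone form follows, with NBBY and howmany() written out and a hypothetical 512-page reservation size, plus a short usage check.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

typedef unsigned long popmap_t;

#define NBBY		8				/* bits per byte */
#define NBPOPMAP	(NBBY * sizeof(popmap_t))	/* bits per map word */
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define NPAGES		512				/* hypothetical size */
#define NPOPMAP		howmany(NPAGES, NBPOPMAP)

static void
popmap_set(popmap_t popmap[], int i)
{
	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
}

static void
popmap_clear(popmap_t popmap[], int i)
{
	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
}

static bool
popmap_is_set(popmap_t popmap[], int i)
{
	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
}

int
main(void)
{
	popmap_t popmap[NPOPMAP] = { 0 };

	popmap_set(popmap, 0);
	popmap_set(popmap, 70);		/* lands in the second word on LP64 */
	assert(popmap_is_set(popmap, 0) && popmap_is_set(popmap, 70));
	assert(!popmap_is_set(popmap, 69));
	popmap_clear(popmap, 70);
	assert(!popmap_is_set(popmap, 70));
	return (0);
}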
+/*
* The reservation structure
*
* A reservation structure is constructed whenever a large physical page is
@@ -101,11 +157,11 @@
* physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
* within that object. The reservation's "popcnt" tracks the number of these
* small physical pages that are in use at any given time. When and if the
- * reservation is not fully utilized, it appears in the queue of partially-
+ * reservation is not fully utilized, it appears in the queue of partially
* populated reservations. The reservation always appears on the containing
* object's list of reservations.
*
- * A partially-populated reservation can be broken and reclaimed at any time.
+ * A partially populated reservation can be broken and reclaimed at any time.
*/
struct vm_reserv {
TAILQ_ENTRY(vm_reserv) partpopq;
@@ -115,6 +171,7 @@
vm_page_t pages; /* first page of a superpage */
int popcnt; /* # of pages in use */
char inpartpopq;
+ popmap_t popmap[NPOPMAP]; /* bit vector of used pages */
};
/*
@@ -141,11 +198,11 @@
static vm_reserv_t vm_reserv_array;
/*
- * The partially-populated reservation queue
+ * The partially populated reservation queue
*
- * This queue enables the fast recovery of an unused cached or free small page
- * from a partially-populated reservation. The reservation at the head of
- * this queue is the least-recently-changed, partially-populated reservation.
+ * This queue enables the fast recovery of an unused free small page from a
+ * partially populated reservation. The reservation at the head of this queue
+ * is the least recently changed, partially populated reservation.
*
* Access to this queue is synchronized by the free page queue lock.
*/
@@ -162,26 +219,60 @@
SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
&vm_reserv_freed, 0, "Cumulative number of freed reservations");
+static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+ sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
+
static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
- sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
+ sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
static long vm_reserv_reclaimed;
SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
&vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
-static void vm_reserv_depopulate(vm_reserv_t rv);
+static void vm_reserv_break(vm_reserv_t rv);
+static void vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t vm_reserv_from_page(vm_page_t m);
static boolean_t vm_reserv_has_pindex(vm_reserv_t rv,
vm_pindex_t pindex);
-static void vm_reserv_populate(vm_reserv_t rv);
+static void vm_reserv_populate(vm_reserv_t rv, int index);
static void vm_reserv_reclaim(vm_reserv_t rv);
/*
- * Describes the current state of the partially-populated reservation queue.
+ * Returns the current number of full reservations.
+ *
+ * Since the number of full reservations is computed without acquiring the
+ * free page queue lock, the returned value may be inexact.
*/
static int
+sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
+{
+ vm_paddr_t paddr;
+ struct vm_phys_seg *seg;
+ vm_reserv_t rv;
+ int fullpop, segind;
+
+ fullpop = 0;
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+ while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+ VM_LEVEL_0_SIZE <= seg->end) {
+ rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
+ fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
+ paddr += VM_LEVEL_0_SIZE;
+ }
+ }
+ return (sysctl_handle_int(oidp, &fullpop, 0, req));
+}
+
+/*
+ * Describes the current state of the partially populated reservation queue.
+ */
+static int
sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
{
struct sbuf sbuf;
@@ -213,18 +304,21 @@
/*
* Reduces the given reservation's population count. If the population count
* becomes zero, the reservation is destroyed. Additionally, moves the
- * reservation to the tail of the partially-populated reservations queue if the
+ * reservation to the tail of the partially populated reservation queue if the
* population count is non-zero.
*
* The free page queue lock must be held.
*/
static void
-vm_reserv_depopulate(vm_reserv_t rv)
+vm_reserv_depopulate(vm_reserv_t rv, int index)
{
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT(rv->object != NULL,
("vm_reserv_depopulate: reserv %p is free", rv));
+ KASSERT(popmap_is_set(rv->popmap, index),
+ ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
+ index));
KASSERT(rv->popcnt > 0,
("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
if (rv->inpartpopq) {
@@ -236,6 +330,7 @@
rv));
rv->pages->psind = 0;
}
+ popmap_clear(rv->popmap, index);
rv->popcnt--;
if (rv->popcnt == 0) {
LIST_REMOVE(rv, objq);
@@ -271,17 +366,20 @@
/*
* Increases the given reservation's population count. Moves the reservation
- * to the tail of the partially-populated reservation queue.
+ * to the tail of the partially populated reservation queue.
*
* The free page queue must be locked.
*/
static void
-vm_reserv_populate(vm_reserv_t rv)
+vm_reserv_populate(vm_reserv_t rv, int index)
{
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT(rv->object != NULL,
("vm_reserv_populate: reserv %p is free", rv));
+ KASSERT(popmap_is_clear(rv->popmap, index),
+ ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
+ index));
KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
("vm_reserv_populate: reserv %p is already full", rv));
KASSERT(rv->pages->psind == 0,
@@ -290,6 +388,7 @@
TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
rv->inpartpopq = FALSE;
}
+ popmap_set(rv->popmap, index);
rv->popcnt++;
if (rv->popcnt < VM_LEVEL_0_NPAGES) {
rv->inpartpopq = TRUE;
@@ -308,14 +407,18 @@
* physical address boundary that is a multiple of that value. Both
* "alignment" and "boundary" must be a power of two.
*
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
* The object and free page queue must be locked.
*/
vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
- vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+ vm_page_t mpred)
{
vm_paddr_t pa, size;
- vm_page_t m, m_ret, mpred, msucc;
+ vm_page_t m, m_ret, msucc;
vm_pindex_t first, leftcap, rightcap;
vm_reserv_t rv;
u_long allocpages, maxpages, minpages;
@@ -352,10 +455,11 @@
/*
* Look for an existing reservation.
*/
- mpred = vm_radix_lookup_le(&object->rtree, pindex);
if (mpred != NULL) {
+ KASSERT(mpred->object == object,
+ ("vm_reserv_alloc_contig: object doesn't contain mpred"));
KASSERT(mpred->pindex < pindex,
- ("vm_reserv_alloc_contig: pindex already allocated"));
+ ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
rv = vm_reserv_from_page(mpred);
if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
goto found;
@@ -364,7 +468,7 @@
msucc = TAILQ_FIRST(&object->memq);
if (msucc != NULL) {
KASSERT(msucc->pindex > pindex,
- ("vm_reserv_alloc_contig: pindex already allocated"));
+ ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
rv = vm_reserv_from_page(msucc);
if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
goto found;
@@ -460,9 +564,13 @@
KASSERT(!rv->inpartpopq,
("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
rv));
+ for (i = 0; i < NPOPMAP; i++)
+ KASSERT(rv->popmap[i] == 0,
+ ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted",
+ rv));
n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
for (i = 0; i < n; i++)
- vm_reserv_populate(rv);
+ vm_reserv_populate(rv, index + i);
npages -= n;
if (m_ret == NULL) {
m_ret = &rv->pages[index];
@@ -489,15 +597,15 @@
return (NULL);
/* Handle vm_page_rename(m, new_object, ...). */
for (i = 0; i < npages; i++)
- if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
+ if (popmap_is_set(rv->popmap, index + i))
return (NULL);
for (i = 0; i < npages; i++)
- vm_reserv_populate(rv);
+ vm_reserv_populate(rv, index + i);
return (m);
}
/*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a page from an existing or newly created reservation.
*
* The page "mpred" must immediately precede the offset "pindex" within the
* specified object.
@@ -510,6 +618,7 @@
vm_page_t m, msucc;
vm_pindex_t first, leftcap, rightcap;
vm_reserv_t rv;
+ int i, index;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
VM_OBJECT_ASSERT_WLOCKED(object);
@@ -598,22 +707,93 @@
("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
KASSERT(!rv->inpartpopq,
("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
- vm_reserv_populate(rv);
- return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
+ for (i = 0; i < NPOPMAP; i++)
+ KASSERT(rv->popmap[i] == 0,
+ ("vm_reserv_alloc_page: reserv %p's popmap is corrupted",
+ rv));
+ index = VM_RESERV_INDEX(object, pindex);
+ vm_reserv_populate(rv, index);
+ return (&rv->pages[index]);
/*
* Found a matching reservation.
*/
found:
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
+ index = VM_RESERV_INDEX(object, pindex);
+ m = &rv->pages[index];
/* Handle vm_page_rename(m, new_object, ...). */
- if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+ if (popmap_is_set(rv->popmap, index))
return (NULL);
- vm_reserv_populate(rv);
+ vm_reserv_populate(rv, index);
return (m);
}
/*
+ * Breaks the given reservation. All free pages in the reservation
+ * are returned to the physical memory allocator. The reservation's
+ * population count and map are reset to their initial state.
+ *
+ * The given reservation must not be in the partially populated reservation
+ * queue. The free page queue lock must be held.
+ */
+static void
+vm_reserv_break(vm_reserv_t rv)
+{
+ int begin_zeroes, hi, i, lo;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ KASSERT(rv->object != NULL,
+ ("vm_reserv_break: reserv %p is free", rv));
+ KASSERT(!rv->inpartpopq,
+ ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
+ LIST_REMOVE(rv, objq);
+ rv->object = NULL;
+ rv->pages->psind = 0;
+ i = hi = 0;
+ do {
+ /* Find the next 0 bit. Any previous 0 bits are < "hi". */
+ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+ if (lo == 0) {
+ /* Redundantly clears bits < "hi". */
+ rv->popmap[i] = 0;
+ rv->popcnt -= NBPOPMAP - hi;
+ while (++i < NPOPMAP) {
+ lo = ffsl(~rv->popmap[i]);
+ if (lo == 0) {
+ rv->popmap[i] = 0;
+ rv->popcnt -= NBPOPMAP;
+ } else
+ break;
+ }
+ if (i == NPOPMAP)
+ break;
+ hi = 0;
+ }
+ KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
+ /* Convert from ffsl() to ordinary bit numbering. */
+ lo--;
+ if (lo > 0) {
+ /* Redundantly clears bits < "hi". */
+ rv->popmap[i] &= ~((1UL << lo) - 1);
+ rv->popcnt -= lo - hi;
+ }
+ begin_zeroes = NBPOPMAP * i + lo;
+ /* Find the next 1 bit. */
+ do
+ hi = ffsl(rv->popmap[i]);
+ while (hi == 0 && ++i < NPOPMAP);
+ if (i != NPOPMAP)
+ /* Convert from ffsl() to ordinary bit numbering. */
+ hi--;
+ vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
+ hi - begin_zeroes);
+ } while (i < NPOPMAP);
+ KASSERT(rv->popcnt == 0,
+ ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
+ vm_reserv_broken++;
+}
+
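vm_reserv_break() above walks the population map with ffsl(): the expression ffsl(~(((1UL << hi) - 1) | popmap[i])) masks off all bits below hi, so it yields the 1-based position of the lowest clear (free) bit at or above hi in that word. A standalone check of that trick on a sample word follows; ffsl() is declared in <strings.h>, and __builtin_ffsl() is an alternative on non-BSD toolchains.

#include <assert.h>
#include <strings.h>	/* ffsl() */

int
main(void)
{
	/* Bits 0-3 and 6 are "populated"; bits 4, 5 and 7+ are free. */
	unsigned long popmap = 0x4FUL;
	int hi, lo;

	/* Lowest free bit at or above bit 0 is bit 4. */
	hi = 0;
	lo = ffsl((long)~(((1UL << hi) - 1) | popmap));
	assert(lo - 1 == 4);		/* ffsl() is 1-based */

	/* Lowest free bit at or above bit 5 is bit 5 itself. */
	hi = 5;
	lo = ffsl((long)~(((1UL << hi) - 1) | popmap));
	assert(lo - 1 == 5);

	/* Lowest free bit at or above bit 7 is bit 7 (bit 6 is in use). */
	hi = 7;
	lo = ffsl((long)~(((1UL << hi) - 1) | popmap));
	assert(lo - 1 == 7);
	return (0);
}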
+/*
* Breaks all reservations belonging to the given object.
*/
void
@@ -620,7 +800,6 @@
vm_reserv_break_all(vm_object_t object)
{
vm_reserv_t rv;
- int i;
mtx_lock(&vm_page_queue_free_mtx);
while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
@@ -630,18 +809,7 @@
TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
rv->inpartpopq = FALSE;
}
- LIST_REMOVE(rv, objq);
- rv->object = NULL;
- for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
- if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
- vm_phys_free_pages(&rv->pages[i], 0);
- else
- rv->popcnt--;
- }
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
- rv));
- vm_reserv_broken++;
+ vm_reserv_break(rv);
}
mtx_unlock(&vm_page_queue_free_mtx);
}
@@ -661,10 +829,7 @@
rv = vm_reserv_from_page(m);
if (rv->object == NULL)
return (FALSE);
- if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE)
- vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages,
- VM_LEVEL_0_ORDER);
- vm_reserv_depopulate(rv);
+ vm_reserv_depopulate(rv, m - rv->pages);
return (TRUE);
}
@@ -678,15 +843,18 @@
vm_reserv_init(void)
{
vm_paddr_t paddr;
- int i;
+ struct vm_phys_seg *seg;
+ int segind;
/*
* Initialize the reservation array. Specifically, initialize the
* "pages" field for every element that has an underlying superpage.
*/
- for (i = 0; phys_avail[i + 1] != 0; i += 2) {
- paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
- while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+ while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+ VM_LEVEL_0_SIZE <= seg->end) {
vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
PHYS_TO_VM_PAGE(paddr);
paddr += VM_LEVEL_0_SIZE;
@@ -695,77 +863,50 @@
}
/*
- * Returns a reservation level if the given page belongs to a fully-populated
- * reservation and -1 otherwise.
+ * Returns true if the given page belongs to a reservation and that page is
+ * free. Otherwise, returns false.
*/
+bool
+vm_reserv_is_page_free(vm_page_t m)
+{
+ vm_reserv_t rv;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ rv = vm_reserv_from_page(m);
+ if (rv->object == NULL)
+ return (false);
+ return (popmap_is_clear(rv->popmap, m - rv->pages));
+}
+
+/*
+ * If the given page belongs to a reservation, returns the level of that
+ * reservation. Otherwise, returns -1.
+ */
int
-vm_reserv_level_iffullpop(vm_page_t m)
+vm_reserv_level(vm_page_t m)
{
vm_reserv_t rv;
rv = vm_reserv_from_page(m);
- return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
+ return (rv->object != NULL ? 0 : -1);
}
/*
- * Prepare for the reactivation of a cached page.
- *
- * First, suppose that the given page "m" was allocated individually, i.e., not
- * as part of a reservation, and cached. Then, suppose a reservation
- * containing "m" is allocated by the same object. Although "m" and the
- * reservation belong to the same object, "m"'s pindex may not match the
- * reservation's.
- *
- * The free page queue must be locked.
+ * Returns a reservation level if the given page belongs to a fully populated
+ * reservation and -1 otherwise.
*/
-boolean_t
-vm_reserv_reactivate_page(vm_page_t m)
+int
+vm_reserv_level_iffullpop(vm_page_t m)
{
vm_reserv_t rv;
- int i, m_index;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
rv = vm_reserv_from_page(m);
- if (rv->object == NULL)
- return (FALSE);
- KASSERT((m->flags & PG_CACHED) != 0,
- ("vm_reserv_uncache_page: page %p is not cached", m));
- if (m->object == rv->object &&
- m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
- vm_reserv_populate(rv);
- else {
- KASSERT(rv->inpartpopq,
- ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE",
- rv));
- TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
- rv->inpartpopq = FALSE;
- LIST_REMOVE(rv, objq);
- rv->object = NULL;
- /* Don't vm_phys_free_pages(m, 0). */
- m_index = m - rv->pages;
- for (i = 0; i < m_index; i++) {
- if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
- vm_phys_free_pages(&rv->pages[i], 0);
- else
- rv->popcnt--;
- }
- for (i++; i < VM_LEVEL_0_NPAGES; i++) {
- if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
- vm_phys_free_pages(&rv->pages[i], 0);
- else
- rv->popcnt--;
- }
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted",
- rv));
- vm_reserv_broken++;
- }
- return (TRUE);
+ return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
}
/*
- * Breaks the given partially-populated reservation, releasing its cached and
- * free pages to the physical memory allocator.
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
*
* The free page queue lock must be held.
*/
@@ -772,32 +913,20 @@
static void
vm_reserv_reclaim(vm_reserv_t rv)
{
- int i;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT(rv->inpartpopq,
- ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
+ ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
rv->inpartpopq = FALSE;
- KASSERT(rv->object != NULL,
- ("vm_reserv_reclaim: reserv %p is free", rv));
- LIST_REMOVE(rv, objq);
- rv->object = NULL;
- for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
- if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
- vm_phys_free_pages(&rv->pages[i], 0);
- else
- rv->popcnt--;
- }
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
+ vm_reserv_break(rv);
vm_reserv_reclaimed++;
}
/*
- * Breaks the reservation at the head of the partially-populated reservation
- * queue, releasing its cached and free pages to the physical memory
- * allocator. Returns TRUE if a reservation is broken and FALSE otherwise.
+ * Breaks the reservation at the head of the partially populated reservation
+ * queue, releasing its free pages to the physical memory allocator. Returns
+ * TRUE if a reservation is broken and FALSE otherwise.
*
* The free page queue lock must be held.
*/
@@ -815,11 +944,10 @@
}
/*
- * Searches the partially-populated reservation queue for the least recently
- * active reservation with unused pages, i.e., cached or free, that satisfy the
- * given request for contiguous physical memory. If a satisfactory reservation
- * is found, it is broken. Returns TRUE if a reservation is broken and FALSE
- * otherwise.
+ * Searches the partially populated reservation queue for the least recently
+ * changed reservation with free pages that satisfy the given request for
+ * contiguous physical memory. If a satisfactory reservation is found, it is
+ * broken. Returns TRUE if a reservation is broken and FALSE otherwise.
*
* The free page queue lock must be held.
*/
@@ -827,9 +955,9 @@
vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary)
{
- vm_paddr_t pa, pa_length, size;
+ vm_paddr_t pa, size;
vm_reserv_t rv;
- int i;
+ int hi, i, lo, low_index, next_free;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
if (npages > VM_LEVEL_0_NPAGES - 1)
@@ -838,30 +966,72 @@
TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
if (pa + PAGE_SIZE - size < low) {
- /* this entire reservation is too low; go to next */
+ /* This entire reservation is too low; go to next. */
continue;
}
- pa_length = 0;
- for (i = 0; i < VM_LEVEL_0_NPAGES; i++)
- if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
- pa_length += PAGE_SIZE;
- if (pa_length == PAGE_SIZE) {
- pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
- if (pa + size > high) {
- /* skip to next reservation */
- break;
- } else if (pa < low ||
- (pa & (alignment - 1)) != 0 ||
- ((pa ^ (pa + size - 1)) &
- ~(boundary - 1)) != 0)
- pa_length = 0;
+ pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
+ if (pa + size > high) {
+ /* This entire reservation is too high; go to next. */
+ continue;
+ }
+ if (pa < low) {
+ /* Start the search for free pages at "low". */
+ low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
+ i = low_index / NBPOPMAP;
+ hi = low_index % NBPOPMAP;
+ } else
+ i = hi = 0;
+ do {
+ /* Find the next free page. */
+ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+ while (lo == 0 && ++i < NPOPMAP)
+ lo = ffsl(~rv->popmap[i]);
+ if (i == NPOPMAP)
+ break;
+ /* Convert from ffsl() to ordinary bit numbering. */
+ lo--;
+ next_free = NBPOPMAP * i + lo;
+ pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
+ KASSERT(pa >= low,
+ ("vm_reserv_reclaim_contig: pa is too low"));
+ if (pa + size > high) {
+ /* The rest of this reservation is too high. */
+ break;
+ } else if ((pa & (alignment - 1)) != 0 ||
+ ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
+ /*
+ * The current page doesn't meet the alignment
+ * and/or boundary requirements. Continue
+ * searching this reservation until the rest
+ * of its free pages are either excluded or
+ * exhausted.
+ */
+ hi = lo + 1;
+ if (hi >= NBPOPMAP) {
+ hi = 0;
+ i++;
}
- if (pa_length >= size) {
+ continue;
+ }
+ /* Find the next used page. */
+ hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
+ while (hi == 0 && ++i < NPOPMAP) {
+ if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
+ size) {
vm_reserv_reclaim(rv);
return (TRUE);
}
- } else
- pa_length = 0;
+ hi = ffsl(rv->popmap[i]);
+ }
+ /* Convert from ffsl() to ordinary bit numbering. */
+ if (i != NPOPMAP)
+ hi--;
+ if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
+ size) {
+ vm_reserv_reclaim(rv);
+ return (TRUE);
+ }
+ } while (i < NPOPMAP);
}
return (FALSE);
}
@@ -892,6 +1062,23 @@
}
/*
+ * Returns the size (in bytes) of a reservation of the specified level.
+ */
+int
+vm_reserv_size(int level)
+{
+
+ switch (level) {
+ case 0:
+ return (VM_LEVEL_0_SIZE);
+ case -1:
+ return (PAGE_SIZE);
+ default:
+ return (0);
+ }
+}
+
+/*
* Allocates the virtual and physical memory required by the reservation
* management system's data structures, in particular, the reservation array.
*/
@@ -925,4 +1112,18 @@
return (new_end);
}
+/*
+ * Returns the superpage containing the given page.
+ */
+vm_page_t
+vm_reserv_to_superpage(vm_page_t m)
+{
+ vm_reserv_t rv;
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ rv = vm_reserv_from_page(m);
+ return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
+ rv->pages : NULL);
+}
+
#endif /* VM_NRESERVLEVEL > 0 */
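Both sysctl_vm_reserv_fullpop() and the reworked vm_reserv_init() above walk vm_phys_segs with the condition paddr + VM_LEVEL_0_SIZE > paddr && paddr + VM_LEVEL_0_SIZE <= seg->end; the first term stops the walk if advancing by one superpage would wrap past the top of the physical address space. A standalone illustration of that guard follows, using 32-bit addresses and a hypothetical 2 MB superpage size.

#include <assert.h>
#include <stdint.h>

#define LEVEL_0_SIZE	(1U << 21)	/* hypothetical 2 MB superpage */

/* Count whole superpages in [start, end), stepping the same way the new
 * segment-walking loops do; the "paddr + size > paddr" term ends the walk
 * instead of wrapping around the top of the address space. */
static unsigned
count_superpages(uint32_t start, uint32_t end)
{
	uint32_t paddr;
	unsigned n;

	n = 0;
	paddr = (start + LEVEL_0_SIZE - 1) & ~(LEVEL_0_SIZE - 1); /* roundup2 */
	while (paddr + LEVEL_0_SIZE > paddr && paddr + LEVEL_0_SIZE <= end) {
		n++;
		paddr += LEVEL_0_SIZE;
	}
	return (n);
}

int
main(void)
{
	/* An ordinary segment: [16 MB, 32 MB) holds 8 superpages. */
	assert(count_superpages(16U << 20, 32U << 20) == 8);
	/* A segment at the very top of a 32-bit space terminates cleanly
	 * instead of looping forever when the step would overflow. */
	assert(count_superpages(0xFFC00000U, 0xFFFFFFFFU) == 1);
	return (0);
}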
Modified: trunk/sys/vm/vm_reserv.h
===================================================================
--- trunk/sys/vm/vm_reserv.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_reserv.h 324399 2017-10-07 20:22:04Z alc $
*/
/*
@@ -48,21 +48,24 @@
*/
vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
u_long npages, vm_paddr_t low, vm_paddr_t high,
- u_long alignment, vm_paddr_t boundary);
+ u_long alignment, vm_paddr_t boundary, vm_page_t mpred);
vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
vm_page_t mpred);
void vm_reserv_break_all(vm_object_t object);
boolean_t vm_reserv_free_page(vm_page_t m);
void vm_reserv_init(void);
+bool vm_reserv_is_page_free(vm_page_t m);
+int vm_reserv_level(vm_page_t m);
int vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t vm_reserv_reactivate_page(vm_page_t m);
boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
boolean_t vm_reserv_reclaim_inactive(void);
void vm_reserv_rename(vm_page_t m, vm_object_t new_object,
vm_object_t old_object, vm_pindex_t old_object_offset);
+int vm_reserv_size(int level);
vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end,
vm_paddr_t high_water);
+vm_page_t vm_reserv_to_superpage(vm_page_t m);
#endif /* VM_NRESERVLEVEL > 0 */
#endif /* _KERNEL */
Added: trunk/sys/vm/vm_swapout.c
===================================================================
--- trunk/sys/vm/vm_swapout.c (rev 0)
+++ trunk/sys/vm/vm_swapout.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,955 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution at CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout.c 338335 2018-08-27 09:39:34Z kib $");
+
+#include "opt_kstack_pages.h"
+#include "opt_kstack_max_pages.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/_kstack_cache.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/mount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/swap_pager.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+/* the kernel process "vm_daemon" */
+static void vm_daemon(void);
+static struct proc *vmproc;
+
+static struct kproc_desc vm_kp = {
+ "vmdaemon",
+ vm_daemon,
+ &vmproc
+};
+SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
+
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
+
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
+ &vm_swap_enabled, 0,
+ "Enable entire process swapout");
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
+ &vm_swap_idle_enabled, 0,
+ "Allow swapout on idle criteria");
+
+/*
+ * Swap_idle_threshold1 is the guaranteed swapped in time for a process
+ */
+static int swap_idle_threshold1 = 2;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
+ &swap_idle_threshold1, 0,
+ "Guaranteed swapped in time for a process");
+
+/*
+ * Swap_idle_threshold2 is the time that a process can be idle before
+ * it will be swapped out, if idle swapping is enabled.
+ */
+static int swap_idle_threshold2 = 10;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
+ &swap_idle_threshold2, 0,
+ "Time before a process will be swapped out");
+
+static int vm_pageout_req_swapout; /* XXX */
+static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
+
+static int swapped_cnt;
+static int swap_inprogress; /* Pending swap-ins done outside swapper. */
+static int last_swapin;
+
+static void swapclear(struct proc *);
+static int swapout(struct proc *);
+static void vm_swapout_map_deactivate_pages(vm_map_t, long);
+static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
+static void swapout_procs(int action);
+static void vm_req_vmdaemon(int req);
+static void vm_thread_swapout(struct thread *td);
+
+/*
+ * vm_swapout_object_deactivate_pages
+ *
+ * Deactivate enough pages to satisfy the inactive target
+ * requirements.
+ *
+ * The object and map must be locked.
+ */
+static void
+vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
+ long desired)
+{
+ vm_object_t backing_object, object;
+ vm_page_t p;
+ int act_delta, remove_mode;
+
+ VM_OBJECT_ASSERT_LOCKED(first_object);
+ if ((first_object->flags & OBJ_FICTITIOUS) != 0)
+ return;
+ for (object = first_object;; object = backing_object) {
+ if (pmap_resident_count(pmap) <= desired)
+ goto unlock_return;
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if ((object->flags & OBJ_UNMANAGED) != 0 ||
+ object->paging_in_progress != 0)
+ goto unlock_return;
+
+ remove_mode = 0;
+ if (object->shadow_count > 1)
+ remove_mode = 1;
+ /*
+ * Scan the object's entire memory queue.
+ */
+ TAILQ_FOREACH(p, &object->memq, listq) {
+ if (pmap_resident_count(pmap) <= desired)
+ goto unlock_return;
+ if (should_yield())
+ goto unlock_return;
+ if (vm_page_busied(p))
+ continue;
+ PCPU_INC(cnt.v_pdpages);
+ vm_page_lock(p);
+ if (p->wire_count != 0 || p->hold_count != 0 ||
+ !pmap_page_exists_quick(pmap, p)) {
+ vm_page_unlock(p);
+ continue;
+ }
+ act_delta = pmap_ts_referenced(p);
+ if ((p->aflags & PGA_REFERENCED) != 0) {
+ if (act_delta == 0)
+ act_delta = 1;
+ vm_page_aflag_clear(p, PGA_REFERENCED);
+ }
+ if (!vm_page_active(p) && act_delta != 0) {
+ vm_page_activate(p);
+ p->act_count += act_delta;
+ } else if (vm_page_active(p)) {
+ if (act_delta == 0) {
+ p->act_count -= min(p->act_count,
+ ACT_DECLINE);
+ if (!remove_mode && p->act_count == 0) {
+ pmap_remove_all(p);
+ vm_page_deactivate(p);
+ } else
+ vm_page_requeue(p);
+ } else {
+ vm_page_activate(p);
+ if (p->act_count < ACT_MAX -
+ ACT_ADVANCE)
+ p->act_count += ACT_ADVANCE;
+ vm_page_requeue(p);
+ }
+ } else if (vm_page_inactive(p))
+ pmap_remove_all(p);
+ vm_page_unlock(p);
+ }
+ if ((backing_object = object->backing_object) == NULL)
+ goto unlock_return;
+ VM_OBJECT_RLOCK(backing_object);
+ if (object != first_object)
+ VM_OBJECT_RUNLOCK(object);
+ }
+unlock_return:
+ if (object != first_object)
+ VM_OBJECT_RUNLOCK(object);
+}
+
+/*
+ * deactivate some number of pages in a map, try to do it fairly, but
+ * that is really hard to do.
+ */
+static void
+vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
+{
+ vm_map_entry_t tmpe;
+ vm_object_t obj, bigobj;
+ int nothingwired;
+
+ if (!vm_map_trylock_read(map))
+ return;
+
+ bigobj = NULL;
+ nothingwired = TRUE;
+
+ /*
+ * first, search out the biggest object, and try to free pages from
+ * that.
+ */
+ tmpe = map->header.next;
+ while (tmpe != &map->header) {
+ if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+ obj = tmpe->object.vm_object;
+ if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
+ if (obj->shadow_count <= 1 &&
+ (bigobj == NULL ||
+ bigobj->resident_page_count <
+ obj->resident_page_count)) {
+ if (bigobj != NULL)
+ VM_OBJECT_RUNLOCK(bigobj);
+ bigobj = obj;
+ } else
+ VM_OBJECT_RUNLOCK(obj);
+ }
+ }
+ if (tmpe->wired_count > 0)
+ nothingwired = FALSE;
+ tmpe = tmpe->next;
+ }
+
+ if (bigobj != NULL) {
+ vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
+ VM_OBJECT_RUNLOCK(bigobj);
+ }
+ /*
+ * Next, hunt around for other pages to deactivate. We actually
+ * do this search sort of wrong -- .text first is not the best idea.
+ */
+ tmpe = map->header.next;
+ while (tmpe != &map->header) {
+ if (pmap_resident_count(vm_map_pmap(map)) <= desired)
+ break;
+ if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+ obj = tmpe->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ vm_swapout_object_deactivate_pages(map->pmap,
+ obj, desired);
+ VM_OBJECT_RUNLOCK(obj);
+ }
+ }
+ tmpe = tmpe->next;
+ }
+
+ /*
+ * Remove all mappings if a process is swapped out, this will free page
+ * table pages.
+ */
+ if (desired == 0 && nothingwired) {
+ pmap_remove(vm_map_pmap(map), vm_map_min(map),
+ vm_map_max(map));
+ }
+
+ vm_map_unlock_read(map);
+}
+
+/*
+ * Swap out requests
+ */
+#define VM_SWAP_NORMAL 1
+#define VM_SWAP_IDLE 2
+
+void
+vm_swapout_run(void)
+{
+
+ if (vm_swap_enabled)
+ vm_req_vmdaemon(VM_SWAP_NORMAL);
+}
+
+/*
+ * Idle process swapout -- run once per second when pagedaemons are
+ * reclaiming pages.
+ */
+void
+vm_swapout_run_idle(void)
+{
+ static long lsec;
+
+ if (!vm_swap_idle_enabled || time_second == lsec)
+ return;
+ vm_req_vmdaemon(VM_SWAP_IDLE);
+ lsec = time_second;
+}
+
+static void
+vm_req_vmdaemon(int req)
+{
+ static int lastrun = 0;
+
+ mtx_lock(&vm_daemon_mtx);
+ vm_pageout_req_swapout |= req;
+ if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
+ wakeup(&vm_daemon_needed);
+ lastrun = ticks;
+ }
+ mtx_unlock(&vm_daemon_mtx);
+}
+
+static void
+vm_daemon(void)
+{
+ struct rlimit rsslim;
+ struct proc *p;
+ struct thread *td;
+ struct vmspace *vm;
+ int breakout, swapout_flags, tryagain, attempts;
+#ifdef RACCT
+ uint64_t rsize, ravailable;
+#endif
+
+ while (TRUE) {
+ mtx_lock(&vm_daemon_mtx);
+ msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
+#ifdef RACCT
+ racct_enable ? hz : 0
+#else
+ 0
+#endif
+ );
+ swapout_flags = vm_pageout_req_swapout;
+ vm_pageout_req_swapout = 0;
+ mtx_unlock(&vm_daemon_mtx);
+ if (swapout_flags)
+ swapout_procs(swapout_flags);
+
+ /*
+		 * Scan processes for those exceeding their rlimits, or that
+		 * are swapped out, and deactivate their pages.
+ */
+ tryagain = 0;
+ attempts = 0;
+again:
+ attempts++;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ vm_pindex_t limit, size;
+
+ /*
+ * if this is a system process or if we have already
+ * looked at this process, skip it.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL ||
+ p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * if the process is in a non-running type state,
+ * don't touch it.
+ */
+ breakout = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td)) {
+ thread_unlock(td);
+ breakout = 1;
+ break;
+ }
+ thread_unlock(td);
+ }
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get a limit
+ */
+ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+ limit = OFF_TO_IDX(
+ qmin(rsslim.rlim_cur, rsslim.rlim_max));
+
+ /*
+			 * Let processes that are swapped out really be
+			 * swapped out: set the limit to nothing, which
+			 * will force a swap-out.
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ limit = 0; /* XXX */
+ vm = vmspace_acquire_ref(p);
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ if (vm == NULL) {
+ PRELE(p);
+ continue;
+ }
+ sx_sunlock(&allproc_lock);
+
+ size = vmspace_resident_count(vm);
+ if (size >= limit) {
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map, limit);
+ size = vmspace_resident_count(vm);
+ }
+#ifdef RACCT
+ if (racct_enable) {
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ ravailable = racct_get_available(p, RACCT_RSS);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable) {
+ /*
+ * Don't be overly aggressive; this
+ * might be an innocent process,
+ * and the limit could've been exceeded
+				 * by some memory hog.  Don't try to
+				 * deactivate more than a quarter of the
+				 * process' resident set size.
+ */
+ if (attempts <= 8) {
+ if (ravailable < rsize -
+ (rsize / 4)) {
+ ravailable = rsize -
+ (rsize / 4);
+ }
+ }
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map,
+ OFF_TO_IDX(ravailable));
+ /* Update RSS usage after paging out. */
+ size = vmspace_resident_count(vm);
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable)
+ tryagain = 1;
+ }
+ }
+#endif
+ vmspace_free(vm);
+ sx_slock(&allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&allproc_lock);
+ if (tryagain != 0 && attempts <= 10) {
+ maybe_yield();
+ goto again;
+ }
+ }
+}
+
+/*
+ * Allow a thread's kernel stack to be paged out.
+ */
+static void
+vm_thread_swapout(struct thread *td)
+{
+ vm_object_t ksobj;
+ vm_page_t m;
+ int i, pages;
+
+ cpu_thread_swapout(td);
+ pages = td->td_kstack_pages;
+ ksobj = td->td_kstack_obj;
+ pmap_qremove(td->td_kstack, pages);
+ VM_OBJECT_WLOCK(ksobj);
+ for (i = 0; i < pages; i++) {
+ m = vm_page_lookup(ksobj, i);
+ if (m == NULL)
+ panic("vm_thread_swapout: kstack already missing?");
+ vm_page_dirty(m);
+ vm_page_lock(m);
+ vm_page_unwire(m, PQ_INACTIVE);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(ksobj);
+}
+
+/*
+ * Bring the kernel stack for a specified thread back in.
+ */
+static void
+vm_thread_swapin(struct thread *td, int oom_alloc)
+{
+ vm_object_t ksobj;
+ vm_page_t ma[KSTACK_MAX_PAGES];
+ int a, count, i, j, pages, rv;
+
+ pages = td->td_kstack_pages;
+ ksobj = td->td_kstack_obj;
+ VM_OBJECT_WLOCK(ksobj);
+ (void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma,
+ pages);
+ for (i = 0; i < pages;) {
+ vm_page_assert_xbusied(ma[i]);
+ if (ma[i]->valid == VM_PAGE_BITS_ALL) {
+ vm_page_xunbusy(ma[i]);
+ i++;
+ continue;
+ }
+ vm_object_pip_add(ksobj, 1);
+ for (j = i + 1; j < pages; j++)
+ if (ma[j]->valid == VM_PAGE_BITS_ALL)
+ break;
+ rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
+ KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
+ count = min(a + 1, j - i);
+ rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
+ KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
+ __func__, td->td_proc->p_pid));
+ vm_object_pip_wakeup(ksobj);
+ for (j = i; j < i + count; j++)
+ vm_page_xunbusy(ma[j]);
+ i += count;
+ }
+ VM_OBJECT_WUNLOCK(ksobj);
+ pmap_qenter(td->td_kstack, ma, pages);
+ cpu_thread_swapin(td);
+}
+
+void
+faultin(struct proc *p)
+{
+ struct thread *td;
+ int oom_alloc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * If another process is swapping in this process,
+ * just wait until it finishes.
+ */
+ if (p->p_flag & P_SWAPPINGIN) {
+ while (p->p_flag & P_SWAPPINGIN)
+ msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+ return;
+ }
+
+ if ((p->p_flag & P_INMEM) == 0) {
+ oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
+ VM_ALLOC_NORMAL;
+
+ /*
+ * Don't let another thread swap process p out while we are
+ * busy swapping it in.
+ */
+ ++p->p_lock;
+ p->p_flag |= P_SWAPPINGIN;
+ PROC_UNLOCK(p);
+ sx_xlock(&allproc_lock);
+ MPASS(swapped_cnt > 0);
+ swapped_cnt--;
+ if (curthread != &thread0)
+ swap_inprogress++;
+ sx_xunlock(&allproc_lock);
+
+ /*
+		 * We hold no lock here because the list of threads
+		 * cannot change while all threads in the process are
+		 * swapped out.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ vm_thread_swapin(td, oom_alloc);
+
+ if (curthread != &thread0) {
+ sx_xlock(&allproc_lock);
+ MPASS(swap_inprogress > 0);
+ swap_inprogress--;
+ last_swapin = ticks;
+ sx_xunlock(&allproc_lock);
+ }
+ PROC_LOCK(p);
+ swapclear(p);
+ p->p_swtick = ticks;
+
+ /* Allow other threads to swap p out now. */
+ wakeup(&p->p_flag);
+ --p->p_lock;
+ }
+}
+
+/*
+ * This swapin algorithm attempts to swap in processes only if there
+ * is enough space for them. Of course, if a process waits for a long
+ * time, it will be swapped in anyway.
+ */
+
+static struct proc *
+swapper_selector(bool wkilled_only)
+{
+ struct proc *p, *res;
+ struct thread *td;
+ int ppri, pri, slptime, swtime;
+
+ sx_assert(&allproc_lock, SA_SLOCKED);
+ if (swapped_cnt == 0)
+ return (NULL);
+ res = NULL;
+ ppri = INT_MIN;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
+ P_SWAPPINGIN | P_INMEM)) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
+ /*
+			 * A swapped-out process might have mapped a
+			 * large portion of the system's pages as
+			 * anonymous memory.  There is no way to release
+			 * that memory other than to kill the process,
+			 * for which we first need to swap it in.
+ */
+ return (p);
+ }
+ if (wkilled_only) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ swtime = (ticks - p->p_swtick) / hz;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ /*
+ * An otherwise runnable thread of a process
+ * swapped out has only the TDI_SWAPPED bit set.
+ */
+ thread_lock(td);
+ if (td->td_inhibitors == TDI_SWAPPED) {
+ slptime = (ticks - td->td_slptick) / hz;
+ pri = swtime + slptime;
+ if ((td->td_flags & TDF_SWAPINREQ) == 0)
+ pri -= p->p_nice * 8;
+ /*
+ * if this thread is higher priority
+ * and there is enough space, then select
+ * this process instead of the previous
+ * selection.
+ */
+ if (pri > ppri) {
+ res = p;
+ ppri = pri;
+ }
+ }
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+
+ if (res != NULL)
+ PROC_LOCK(res);
+ return (res);
+}
+
+#define SWAPIN_INTERVAL (MAXSLP * hz / 2)
+
+/*
+ * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
+ * interval, assuming that there is:
+ * - no memory shortage;
+ * - no parallel swap-ins;
+ * - no other swap-ins in the current SWAPIN_INTERVAL.
+ */
+static bool
+swapper_wkilled_only(void)
+{
+
+ return (vm_page_count_min() || swap_inprogress > 0 ||
+ (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
+}
+
+void
+swapper(void)
+{
+ struct proc *p;
+
+ for (;;) {
+ sx_slock(&allproc_lock);
+ p = swapper_selector(swapper_wkilled_only());
+ sx_sunlock(&allproc_lock);
+
+ if (p == NULL) {
+ tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
+ } else {
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+			 * Another process may be bringing this process
+			 * in, or may have already brought it in, while
+			 * we traverse all of its threads.  Or, this
+			 * process may have exited or may even be in the
+			 * middle of being swapped out again.
+ */
+ if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
+ P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
+ faultin(p);
+ }
+ PROC_UNLOCK(p);
+ }
+ }
+}
+
+/*
+ * First, if any processes have been sleeping or stopped for at least
+ * "swap_idle_threshold1" seconds, they are swapped out. If, however,
+ * no such processes exist, then the longest-sleeping or stopped
+ * process is swapped out. Finally, and only as a last resort, if
+ * there are no sleeping or stopped processes, the longest-resident
+ * process is swapped out.
+ */
+static void
+swapout_procs(int action)
+{
+ struct proc *p;
+ struct thread *td;
+ int slptime;
+ bool didswap, doswap;
+
+ MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
+
+ didswap = false;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /*
+ * Filter out not yet fully constructed processes. Do
+		 * not swap out held processes.  Avoid processes that are
+		 * system processes, exiting, execing, traced, already
+		 * swapped out, or being swapped in or out right now.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
+ (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
+ P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
+ P_INMEM) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ /*
+ * Further consideration of this process for swap out
+ * requires iterating over its threads. We release
+ * allproc_lock here so that process creation and
+ * destruction are not blocked while we iterate.
+ *
+ * To later reacquire allproc_lock and resume
+ * iteration over the allproc list, we will first have
+ * to release the lock on the process. We place a
+ * hold on the process so that it remains in the
+ * allproc list while it is unlocked.
+ */
+ _PHOLD_LITE(p);
+ sx_sunlock(&allproc_lock);
+
+ /*
+		 * Do not swap out a realtime process.
+		 * Guarantee swap_idle_threshold1 seconds in memory.
+		 * If the system is under memory stress, or if we are
+		 * swapping out idle processes that have slept at least
+		 * swap_idle_threshold2 seconds, then swap the process out.
+ */
+ doswap = true;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ slptime = (ticks - td->td_slptick) / hz;
+ if (PRI_IS_REALTIME(td->td_pri_class) ||
+ slptime < swap_idle_threshold1 ||
+ !thread_safetoswapout(td) ||
+ ((action & VM_SWAP_NORMAL) == 0 &&
+ slptime < swap_idle_threshold2))
+ doswap = false;
+ thread_unlock(td);
+ if (!doswap)
+ break;
+ }
+ if (doswap && swapout(p) == 0)
+ didswap = true;
+
+ PROC_UNLOCK(p);
+ if (didswap) {
+ sx_xlock(&allproc_lock);
+ swapped_cnt++;
+ sx_downgrade(&allproc_lock);
+ } else
+ sx_slock(&allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&allproc_lock);
+
+ /*
+	 * If we swapped something out and another process needed memory,
+	 * then wake up the scheduler process.
+ */
+ if (didswap)
+ wakeup(&proc0);
+}
+
+static void
+swapclear(struct proc *p)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_flags |= TDF_INMEM;
+ td->td_flags &= ~TDF_SWAPINREQ;
+ TD_CLR_SWAPPED(td);
+ if (TD_CAN_RUN(td))
+ if (setrunnable(td)) {
+#ifdef INVARIANTS
+ /*
+ * XXX: We just cleared TDI_SWAPPED
+ * above and set TDF_INMEM, so this
+ * should never happen.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ thread_unlock(td);
+ }
+ p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
+ p->p_flag |= P_INMEM;
+}
+
+static int
+swapout(struct proc *p)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * The states of this process and its threads may have changed
+ * by now. Assuming that there is only one pageout daemon thread,
+ * this process should still be in memory.
+ */
+ KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
+ P_INMEM, ("swapout: lost a swapout race?"));
+
+ /*
+ * Remember the resident count.
+ */
+ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+
+ /*
+ * Check and mark all threads before we proceed.
+ */
+ p->p_flag &= ~P_INMEM;
+ p->p_flag |= P_SWAPPINGOUT;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!thread_safetoswapout(td)) {
+ thread_unlock(td);
+ swapclear(p);
+ return (EBUSY);
+ }
+ td->td_flags &= ~TDF_INMEM;
+ TD_SET_SWAPPED(td);
+ thread_unlock(td);
+ }
+ td = FIRST_THREAD_IN_PROC(p);
+ ++td->td_ru.ru_nswap;
+ PROC_UNLOCK(p);
+
+ /*
+ * This list is stable because all threads are now prevented from
+ * running. The list is only modified in the context of a running
+ * thread in this process.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ vm_thread_swapout(td);
+
+ PROC_LOCK(p);
+ p->p_flag &= ~P_SWAPPINGOUT;
+ p->p_swtick = ticks;
+ return (0);
+}
Property changes on: trunk/sys/vm/vm_swapout.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
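For readers tracing the new vm_swapout.c code above, the candidate selection in swapper_selector() reduces to a simple priority formula: seconds the process has been swapped out plus seconds its thread has slept, penalized by the nice value unless a swap-in was explicitly requested (TDF_SWAPINREQ). A standalone sketch of that heuristic follows; the structure and field names are invented stand-ins for the kernel's proc/thread state, not the real ones.

/*
 * Sketch of the swap-in priority heuristic used by swapper_selector().
 * Ordinary userland C; nothing here is kernel code.
 */
#include <limits.h>
#include <stddef.h>

struct swapped_proc {
	int	swapped_secs;		/* seconds since swap-out (swtime) */
	int	slept_secs;		/* seconds the thread has slept (slptime) */
	int	nice;			/* nice value */
	int	swapin_requested;	/* nonzero if a swap-in was requested */
};

/* Higher value means a better candidate to bring back into memory. */
static int
swapin_priority(const struct swapped_proc *sp)
{
	int pri;

	pri = sp->swapped_secs + sp->slept_secs;
	if (!sp->swapin_requested)
		pri -= sp->nice * 8;	/* nicer processes wait longer */
	return (pri);
}

/* Pick the best candidate, mirroring the FOREACH_PROC_IN_SYSTEM loop. */
static const struct swapped_proc *
pick_candidate(const struct swapped_proc *sp, size_t n)
{
	const struct swapped_proc *best = NULL;
	int bestpri = INT_MIN;

	for (size_t i = 0; i < n; i++) {
		int pri = swapin_priority(&sp[i]);

		if (pri > bestpri) {
			bestpri = pri;
			best = &sp[i];
		}
	}
	return (best);
}

In the kernel loop, a process marked P_WKILLED short-circuits this scoring entirely, since swapping it in so it can be killed is the only way to reclaim its anonymous memory.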
Added: trunk/sys/vm/vm_swapout_dummy.c
===================================================================
--- trunk/sys/vm/vm_swapout_dummy.c (rev 0)
+++ trunk/sys/vm/vm_swapout_dummy.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,123 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution at CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout_dummy.c 325647 2017-11-10 13:17:40Z kib $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+static int vm_swap_enabled = 0;
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD,
+ &vm_swap_enabled, 0,
+ "Enable entire process swapout");
+
+static int vm_swap_idle_enabled = 0;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD,
+ &vm_swap_idle_enabled, 0,
+ "Allow swapout on idle criteria");
+
+void
+vm_swapout_run(void)
+{
+}
+
+void
+vm_swapout_run_idle(void)
+{
+}
+
+void
+faultin(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((p->p_flag & P_INMEM) == 0)
+ panic("faultin: proc %p swapped out with NO_SWAPPING", p);
+}
+
+void
+swapper(void)
+{
+
+ for (;;)
+ tsleep(&proc0, PVM, "swapin", MAXSLP * hz);
+}
Property changes on: trunk/sys/vm/vm_swapout_dummy.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
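vm_swapout_dummy.c above provides no-op versions of the same entry points that vm_swapout.c implements (vm_swapout_run(), vm_swapout_run_idle(), faultin(), swapper()), so a kernel configured without process swapping links the stubs instead of the full implementation. A generic sketch of that build-time substitution pattern follows; the file and function names are invented and are not the kernel's actual build glue.

/* feature.h -- shared declarations both implementations satisfy. */
void	feature_run(void);
int	feature_enabled(void);

/* feature.c -- full implementation, built when the feature is configured. */
int
feature_enabled(void)
{
	return (1);
}

void
feature_run(void)
{
	/* ... real work would go here ... */
}

/* feature_dummy.c -- stubs, built when the feature is compiled out. */
int
feature_enabled(void)
{
	return (0);
}

void
feature_run(void)
{
	/* Deliberately empty: the feature is not configured. */
}

Exactly one of the two implementation files is compiled into a given kernel, and callers remain oblivious to which one they got.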
Modified: trunk/sys/vm/vm_unix.c
===================================================================
--- trunk/sys/vm/vm_unix.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_unix.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -44,7 +44,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_unix.c 341467 2018-12-04 15:04:48Z emaste $");
#include <sys/param.h>
#include <sys/lock.h>
@@ -72,9 +72,7 @@
*/
/* ARGSUSED */
int
-sys_obreak(td, uap)
- struct thread *td;
- struct obreak_args *uap;
+sys_obreak(struct thread *td, struct obreak_args *uap)
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_map_t map = &vm->vm_map;
@@ -84,11 +82,9 @@
int error = 0;
boolean_t do_map_wirefuture;
- PROC_LOCK(td->td_proc);
- datalim = lim_cur(td->td_proc, RLIMIT_DATA);
- lmemlim = lim_cur(td->td_proc, RLIMIT_MEMLOCK);
- vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM);
- PROC_UNLOCK(td->td_proc);
+ datalim = lim_cur(td, RLIMIT_DATA);
+ lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
+ vmemlim = lim_cur(td, RLIMIT_VMEM);
do_map_wirefuture = FALSE;
new = round_page((vm_offset_t)uap->nsize);
@@ -167,7 +163,7 @@
#endif
prot = VM_PROT_RW;
#ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
prot |= VM_PROT_EXECUTE;
#endif
@@ -248,9 +244,7 @@
*/
/* ARGSUSED */
int
-sys_ovadvise(td, uap)
- struct thread *td;
- struct ovadvise_args *uap;
+sys_ovadvise(struct thread *td, struct ovadvise_args *uap)
{
/* START_GIANT_OPTIONAL */
/* END_GIANT_OPTIONAL */
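The sys_obreak() and sys_ovadvise() hunks above are mechanical conversions from old-style (K&R) function definitions to ANSI C prototypes, with no change in behavior. A minimal illustration with an invented function name is shown below; a real source file would contain only one of the two forms.

/* Old-style (K&R) definition: parameter types follow the parameter list. */
static int
scale_kr(value, factor)
	int value;
	int factor;
{
	return (value * factor);
}

/* Equivalent ANSI C definition: types appear in the parameter list itself. */
static int
scale_ansi(int value, int factor)
{
	return (value * factor);
}

Besides being the modern idiom, ANSI definitions let the compiler type-check arguments at every call site, which old-style definitions do not.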
Modified: trunk/sys/vm/vm_zeroidle.c
===================================================================
--- trunk/sys/vm/vm_zeroidle.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_zeroidle.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_zeroidle.c 267992 2014-06-28 03:56:17Z hselasky $");
#include <opt_sched.h>
@@ -56,10 +56,9 @@
#include <vm/vm_phys.h>
static int idlezero_enable_default = 0;
-TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default);
/* Defer setting the enable flag until the kthread is running. */
static int idlezero_enable = 0;
-SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0,
+SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RWTUN, &idlezero_enable, 0,
"Allow the kernel to use idle cpu cycles to zero-out pages");
/*
* Implement the pre-zeroed page mechanism.
@@ -85,9 +84,9 @@
* fast sleeps. We also do not want to be continuously zeroing
* pages because doing so may flush our L1 and L2 caches too much.
*/
- if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count))
+ if (zero_state && vm_page_zero_count >= ZIDLE_LO(vm_cnt.v_free_count))
return (0);
- if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+ if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
return (0);
return (1);
}
@@ -99,7 +98,7 @@
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
zero_state = 0;
if (vm_phys_zero_pages_idle()) {
- if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+ if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
zero_state = 1;
}
}
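The vm_zeroidle.c hunk above collapses a separate loader tunable plus a run-time sysctl into a single declaration: CTLFLAG_RWTUN marks the oid as both writable via sysctl(8) and settable as a loader tunable, so the standalone TUNABLE_INT() line is no longer needed. A sketch of the before/after pattern follows, using an invented knob name; this is kernel-only code (it assumes the usual <sys/kernel.h> and <sys/sysctl.h> context), and only one of the two forms would appear in a real file.

static int example_knob = 0;

/* Before: two declarations for one knob. */
TUNABLE_INT("vm.example_knob", &example_knob);
SYSCTL_INT(_vm, OID_AUTO, example_knob, CTLFLAG_RW, &example_knob, 0,
    "Example knob (invented name)");

/* After: CTLFLAG_RWTUN covers both the loader tunable and the sysctl. */
SYSCTL_INT(_vm, OID_AUTO, example_knob, CTLFLAG_RWTUN, &example_knob, 0,
    "Example knob (invented name)");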
Modified: trunk/sys/vm/vnode_pager.c
===================================================================
--- trunk/sys/vm/vnode_pager.c 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.c 2020-02-08 19:35:48 UTC (rev 12314)
@@ -52,8 +52,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vnode_pager.c 331722 2018-03-29 02:50:57Z eadler $");
+#include "opt_vm.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
@@ -83,21 +85,27 @@
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
-static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
+static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+ int *, vop_getpages_iodone_t, void *);
static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *cred);
+static int vnode_pager_generic_getpages_done(struct buf *);
+static void vnode_pager_generic_getpages_done_async(struct buf *);
struct pagerops vnodepagerops = {
.pgo_alloc = vnode_pager_alloc,
.pgo_dealloc = vnode_pager_dealloc,
.pgo_getpages = vnode_pager_getpages,
+ .pgo_getpages_async = vnode_pager_getpages_async,
.pgo_putpages = vnode_pager_putpages,
.pgo_haspage = vnode_pager_haspage,
};
int vnode_pbuf_freecnt;
+int vnode_async_pbuf_freecnt;
/* Create the VM system backing object for this vnode */
int
@@ -157,14 +165,26 @@
return;
ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
VM_OBJECT_WLOCK(obj);
+ umtx_shm_object_terminated(obj);
if (obj->ref_count == 0) {
/*
* don't double-terminate the object
*/
- if ((obj->flags & OBJ_DEAD) == 0)
+ if ((obj->flags & OBJ_DEAD) == 0) {
vm_object_terminate(obj);
- else
+ } else {
+ /*
+ * Waiters were already handled during object
+ * termination. The exclusive vnode lock hopefully
+ * prevented new waiters from referencing the dying
+ * object.
+ */
+ KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
+ ("OBJ_DISCONNECTWNT set obj %p flags %x",
+ obj, obj->flags));
+ vp->v_object = NULL;
VM_OBJECT_WUNLOCK(obj);
+ }
} else {
/*
* Woe to the process that tries to page now :-).
@@ -172,7 +192,7 @@
vm_pager_deallocate(obj);
VM_OBJECT_WUNLOCK(obj);
}
- vp->v_object = NULL;
+ KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
}
@@ -241,9 +261,12 @@
VI_UNLOCK(vp);
} else {
object->ref_count++;
+#if VM_NRESERVLEVEL > 0
+ vm_object_color(object, 0);
+#endif
VM_OBJECT_WUNLOCK(object);
}
- vref(vp);
+ vrefact(vp);
return (object);
}
@@ -251,8 +274,7 @@
* The object must be locked.
*/
static void
-vnode_pager_dealloc(object)
- vm_object_t object;
+vnode_pager_dealloc(vm_object_t object)
{
struct vnode *vp;
int refs;
@@ -287,11 +309,8 @@
}
static boolean_t
-vnode_pager_haspage(object, pindex, before, after)
- vm_object_t object;
- vm_pindex_t pindex;
- int *before;
- int *after;
+vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
{
struct vnode *vp = object->handle;
daddr_t bn;
@@ -338,16 +357,21 @@
*before += poff;
}
if (after) {
- int numafter;
+ /*
+ * The BMAP vop can report a partial block in the
+ * 'after', but must not report blocks after EOF.
+ * Assert the latter, and truncate 'after' in case
+ * of the former.
+ */
+ KASSERT((reqblock + *after) * pagesperblock <
+ roundup2(object->size, pagesperblock),
+ ("%s: reqblock %jd after %d size %ju", __func__,
+ (intmax_t )reqblock, *after,
+ (uintmax_t )object->size));
*after *= pagesperblock;
- numafter = pagesperblock - (poff + 1);
- if (IDX_TO_OFF(pindex + numafter) >
- object->un_pager.vnp.vnp_size) {
- numafter =
- OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
- pindex;
- }
- *after += numafter;
+ *after += pagesperblock - (poff + 1);
+ if (pindex + *after >= object->size)
+ *after = object->size - 1 - pindex;
}
} else {
if (before) {
@@ -370,9 +394,7 @@
* operation (possibly at object termination time), so we must be careful.
*/
void
-vnode_pager_setsize(vp, nsize)
- struct vnode *vp;
- vm_ooffset_t nsize;
+vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
vm_object_t object;
vm_page_t m;
@@ -445,10 +467,6 @@
* replacement from working properly.
*/
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
- } else if ((nsize & PAGE_MASK) &&
- vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
- vm_page_cache_free(object, OFF_TO_IDX(nsize),
- nobjsize);
}
}
object->un_pager.vnp.vnp_size = nsize;
@@ -497,9 +515,7 @@
* small block filesystem vnode pager input
*/
static int
-vnode_pager_input_smlfs(object, m)
- vm_object_t object;
- vm_page_t m;
+vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
{
struct vnode *vp;
struct bufobj *bo;
@@ -591,9 +607,7 @@
* old style vnode pager input routine
*/
static int
-vnode_pager_input_old(object, m)
- vm_object_t object;
- vm_page_t m;
+vnode_pager_input_old(vm_object_t object, vm_page_t m)
{
struct uio auio;
struct iovec aiov;
@@ -666,19 +680,15 @@
* backing vp's VOP_GETPAGES.
*/
static int
-vnode_pager_getpages(object, m, count, reqpage)
- vm_object_t object;
- vm_page_t *m;
- int count;
- int reqpage;
+vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
{
+ struct vnode *vp;
int rtval;
- struct vnode *vp;
- int bytes = count * PAGE_SIZE;
vp = object->handle;
VM_OBJECT_WUNLOCK(object);
- rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
+ rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: FS getpages not implemented\n"));
VM_OBJECT_WLOCK(object);
@@ -685,261 +695,373 @@
return rtval;
}
+static int
+vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
+{
+ struct vnode *vp;
+ int rtval;
+
+ vp = object->handle;
+ VM_OBJECT_WUNLOCK(object);
+ rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
+ KASSERT(rtval != EOPNOTSUPP,
+ ("vnode_pager: FS getpages_async not implemented\n"));
+ VM_OBJECT_WLOCK(object);
+ return (rtval);
+}
+
/*
+ * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
+ * local filesystems, where partially valid pages can only occur at
+ * the end of file.
+ */
+int
+vnode_pager_local_getpages(struct vop_getpages_args *ap)
+{
+
+ return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_rbehind, ap->a_rahead, NULL, NULL));
+}
+
+int
+vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
+{
+
+ return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
+}
+
+/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_GETPAGES.
*/
int
-vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
- struct vnode *vp;
- vm_page_t *m;
- int bytecount;
- int reqpage;
+vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
+ int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
{
vm_object_t object;
struct bufobj *bo;
struct buf *bp;
- struct mount *mp;
- vm_offset_t kva;
- daddr_t firstaddr, reqblock;
- off_t foff, nextoff, tfoff, pib;
- int pbefore, pafter, i, size, bsize, first, last;
- int count, error, before, after, secmask;
+ off_t foff;
+ int bsize, pagesperblock, *freecnt;
+ int error, before, after, rbehind, rahead, poff, i;
+ int bytecount, secmask;
KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
- ("vnode_pager_generic_getpages does not support devices"));
+ ("%s does not support devices", __func__));
+
if (vp->v_iflag & VI_DOOMED)
return (VM_PAGER_BAD);
object = vp->v_object;
- count = bytecount / PAGE_SIZE;
+ foff = IDX_TO_OFF(m[0]->pindex);
bsize = vp->v_mount->mnt_stat.f_iosize;
+ pagesperblock = bsize / PAGE_SIZE;
- /* get the UNDERLYING device for the file with VOP_BMAP() */
+ KASSERT(foff < object->un_pager.vnp.vnp_size,
+ ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
+ KASSERT(count <= sizeof(bp->b_pages),
+ ("%s: requested %d pages", __func__, count));
/*
- * originally, we did not check for an error return value -- assuming
- * an fs always has a bmap entry point -- that assumption is wrong!!!
+ * The last page has valid blocks. Invalid part can only
+ * exist at the end of file, and the page is made fully valid
+ * by zeroing in vm_pager_get_pages().
*/
- foff = IDX_TO_OFF(m[reqpage]->pindex);
+ if (m[count - 1]->valid != 0 && --count == 0) {
+ if (iodone != NULL)
+ iodone(arg, m, 1, 0);
+ return (VM_PAGER_OK);
+ }
/*
- * if we can't bmap, use old VOP code
+ * Synchronous and asynchronous paging operations use different
+	 * free pbuf counters.  This is done to prevent asynchronous requests
+	 * from consuming all available pbufs.
+	 * Allocate the pbuf at the very beginning of the function, so that
+	 * if we are low on the needed kind of pbuf we don't even proceed to
+	 * BMAP, but sleep instead.
*/
- error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
- &reqblock, &after, &before);
+ freecnt = iodone != NULL ?
+ &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
+ bp = getpbuf(freecnt);
+
+ /*
+ * Get the underlying device blocks for the file with VOP_BMAP().
+ * If the file system doesn't support VOP_BMAP, use old way of
+ * getting pages via VOP_READ.
+ */
+ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
if (error == EOPNOTSUPP) {
+ relpbuf(bp, freecnt);
VM_OBJECT_WLOCK(object);
-
- for (i = 0; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- PCPU_INC(cnt.v_vnodein);
- PCPU_INC(cnt.v_vnodepgsin);
- error = vnode_pager_input_old(object, m[reqpage]);
+ for (i = 0; i < count; i++) {
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_INC(cnt.v_vnodepgsin);
+ error = vnode_pager_input_old(object, m[i]);
+ if (error)
+ break;
+ }
VM_OBJECT_WUNLOCK(object);
return (error);
} else if (error != 0) {
- VM_OBJECT_WLOCK(object);
- for (i = 0; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- VM_OBJECT_WUNLOCK(object);
+ relpbuf(bp, freecnt);
return (VM_PAGER_ERROR);
+ }
- /*
- * if the blocksize is smaller than a page size, then use
- * special small filesystem code. NFS sometimes has a small
- * blocksize, but it can handle large reads itself.
- */
- } else if ((PAGE_SIZE / bsize) > 1 &&
- (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
- VM_OBJECT_WLOCK(object);
- for (i = 0; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- VM_OBJECT_WUNLOCK(object);
- PCPU_INC(cnt.v_vnodein);
- PCPU_INC(cnt.v_vnodepgsin);
- return (vnode_pager_input_smlfs(object, m[reqpage]));
+ /*
+ * If the file system supports BMAP, but blocksize is smaller
+ * than a page size, then use special small filesystem code.
+ */
+ if (pagesperblock == 0) {
+ relpbuf(bp, freecnt);
+ for (i = 0; i < count; i++) {
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_INC(cnt.v_vnodepgsin);
+ error = vnode_pager_input_smlfs(object, m[i]);
+ if (error)
+ break;
+ }
+ return (error);
}
/*
- * If we have a completely valid page available to us, we can
- * clean up and return. Otherwise we have to re-read the
- * media.
+ * A sparse file can be encountered only for a single page request,
+	 * which may not be preceded by a call to vm_pager_haspage().
*/
- VM_OBJECT_WLOCK(object);
- if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
- for (i = 0; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
+ if (bp->b_blkno == -1) {
+ KASSERT(count == 1,
+ ("%s: array[%d] request to a sparse file %p", __func__,
+ count, vp));
+ relpbuf(bp, freecnt);
+ pmap_zero_page(m[0]);
+ KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
+ __func__, m[0]));
+ VM_OBJECT_WLOCK(object);
+ m[0]->valid = VM_PAGE_BITS_ALL;
VM_OBJECT_WUNLOCK(object);
- return VM_PAGER_OK;
- } else if (reqblock == -1) {
- pmap_zero_page(m[reqpage]);
- KASSERT(m[reqpage]->dirty == 0,
- ("vnode_pager_generic_getpages: page %p is dirty", m));
- m[reqpage]->valid = VM_PAGE_BITS_ALL;
- for (i = 0; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- VM_OBJECT_WUNLOCK(object);
return (VM_PAGER_OK);
}
- m[reqpage]->valid = 0;
- VM_OBJECT_WUNLOCK(object);
- pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
- pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
- pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
- first = reqpage < pbefore ? 0 : reqpage - pbefore;
- last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
- if (first > 0 || last + 1 < count) {
+ bp->b_blkno += (foff % bsize) / DEV_BSIZE;
+
+ /* Recalculate blocks available after/before to pages. */
+ poff = (foff % bsize) / PAGE_SIZE;
+ before *= pagesperblock;
+ before += poff;
+ after *= pagesperblock;
+ after += pagesperblock - (poff + 1);
+ if (m[0]->pindex + after >= object->size)
+ after = object->size - 1 - m[0]->pindex;
+ KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
+ __func__, count, after + 1));
+ after -= count - 1;
+
+ /* Trim requested rbehind/rahead to possible values. */
+ rbehind = a_rbehind ? *a_rbehind : 0;
+ rahead = a_rahead ? *a_rahead : 0;
+ rbehind = min(rbehind, before);
+ rbehind = min(rbehind, m[0]->pindex);
+ rahead = min(rahead, after);
+ rahead = min(rahead, object->size - m[count - 1]->pindex);
+ KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages),
+ ("%s: behind %d ahead %d count %d", __func__,
+ rbehind, rahead, count));
+
+ /*
+ * Fill in the bp->b_pages[] array with requested and optional
+ * read behind or read ahead pages. Read behind pages are looked
+	 * up in a backward direction, down to the first cached page.  The
+	 * same applies to read ahead pages, except that there is no need to
+	 * shift the array when a cached page is encountered.
+ */
+ i = bp->b_npages = 0;
+ if (rbehind) {
+ vm_pindex_t startpindex, tpindex;
+ vm_page_t p;
+
VM_OBJECT_WLOCK(object);
- for (i = 0; i < first; i++) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
+ startpindex = m[0]->pindex - rbehind;
+ if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
+ p->pindex >= startpindex)
+ startpindex = p->pindex + 1;
+
+ /* tpindex is unsigned; beware of numeric underflow. */
+ for (tpindex = m[0]->pindex - 1;
+ tpindex >= startpindex && tpindex < m[0]->pindex;
+ tpindex--, i++) {
+ p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+ if (p == NULL) {
+ /* Shift the array. */
+ for (int j = 0; j < i; j++)
+ bp->b_pages[j] = bp->b_pages[j +
+ tpindex + 1 - startpindex];
+ break;
+ }
+ bp->b_pages[tpindex - startpindex] = p;
}
- for (i = last + 1; i < count; i++) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
+
+ bp->b_pgbefore = i;
+ bp->b_npages += i;
+ bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
+ } else
+ bp->b_pgbefore = 0;
+
+ /* Requested pages. */
+ for (int j = 0; j < count; j++, i++)
+ bp->b_pages[i] = m[j];
+ bp->b_npages += count;
+
+ if (rahead) {
+ vm_pindex_t endpindex, tpindex;
+ vm_page_t p;
+
+ if (!VM_OBJECT_WOWNED(object))
+ VM_OBJECT_WLOCK(object);
+ endpindex = m[count - 1]->pindex + rahead + 1;
+ if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
+ p->pindex < endpindex)
+ endpindex = p->pindex;
+ if (endpindex > object->size)
+ endpindex = object->size;
+
+ for (tpindex = m[count - 1]->pindex + 1;
+ tpindex < endpindex; i++, tpindex++) {
+ p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+ if (p == NULL)
+ break;
+ bp->b_pages[i] = p;
}
- VM_OBJECT_WUNLOCK(object);
- }
- /*
- * here on direct device I/O
- */
- firstaddr = reqblock;
- firstaddr += pib / DEV_BSIZE;
- firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;
+ bp->b_pgafter = i - bp->b_npages;
+ bp->b_npages = i;
+ } else
+ bp->b_pgafter = 0;
- /*
- * The first and last page have been calculated now, move
- * input pages to be zero based, and adjust the count.
- */
- m += first;
- reqpage -= first;
- count = last - first + 1;
+ if (VM_OBJECT_WOWNED(object))
+ VM_OBJECT_WUNLOCK(object);
- /*
- * calculate the file virtual address for the transfer
- */
- foff = IDX_TO_OFF(m[0]->pindex);
+ /* Report back actual behind/ahead read. */
+ if (a_rbehind)
+ *a_rbehind = bp->b_pgbefore;
+ if (a_rahead)
+ *a_rahead = bp->b_pgafter;
- /*
- * calculate the size of the transfer
- */
- size = count * PAGE_SIZE;
- KASSERT(count > 0, ("zero count"));
- if ((foff + size) > object->un_pager.vnp.vnp_size)
- size = object->un_pager.vnp.vnp_size - foff;
- KASSERT(size > 0, ("zero size"));
+ KASSERT(bp->b_npages <= sizeof(bp->b_pages),
+ ("%s: buf %p overflowed", __func__, bp));
/*
- * round up physical size for real devices.
+ * Recalculate first offset and bytecount with regards to read behind.
+ * Truncate bytecount to vnode real size and round up physical size
+ * for real devices.
*/
+ foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+ bytecount = bp->b_npages << PAGE_SHIFT;
+ if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
+ bytecount = object->un_pager.vnp.vnp_size - foff;
secmask = bo->bo_bsize - 1;
KASSERT(secmask < PAGE_SIZE && secmask > 0,
- ("vnode_pager_generic_getpages: sector size %d too large",
- secmask + 1));
- size = (size + secmask) & ~secmask;
+ ("%s: sector size %d too large", __func__, secmask + 1));
+ bytecount = (bytecount + secmask) & ~secmask;
- bp = getpbuf(&vnode_pbuf_freecnt);
- kva = (vm_offset_t)bp->b_data;
-
/*
- * and map the pages to be read into the kva, if the filesystem
+ * And map the pages to be read into the kva, if the filesystem
* requires mapped buffers.
*/
- mp = vp->v_mount;
- if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
+ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
unmapped_buf_allowed) {
bp->b_data = unmapped_buf;
- bp->b_kvabase = unmapped_buf;
bp->b_offset = 0;
- bp->b_flags |= B_UNMAPPED;
- bp->b_npages = count;
- for (i = 0; i < count; i++)
- bp->b_pages[i] = m[i];
- } else
- pmap_qenter(kva, m, count);
+ } else {
+ bp->b_data = bp->b_kvabase;
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ }
- /* build a minimal buffer header */
+ /* Build a minimal buffer header. */
bp->b_iocmd = BIO_READ;
- bp->b_iodone = bdone;
KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
bp->b_rcred = crhold(curthread->td_ucred);
bp->b_wcred = crhold(curthread->td_ucred);
- bp->b_blkno = firstaddr;
pbgetbo(bo, bp);
bp->b_vp = vp;
- bp->b_bcount = size;
- bp->b_bufsize = size;
- bp->b_runningbufspace = bp->b_bufsize;
+ bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
+ bp->b_iooffset = dbtob(bp->b_blkno);
+
atomic_add_long(&runningbufspace, bp->b_runningbufspace);
-
PCPU_INC(cnt.v_vnodein);
- PCPU_ADD(cnt.v_vnodepgsin, count);
+ PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages);
- /* do the input */
- bp->b_iooffset = dbtob(bp->b_blkno);
- bstrategy(bp);
+ if (iodone != NULL) { /* async */
+ bp->b_pgiodone = iodone;
+ bp->b_caller1 = arg;
+ bp->b_iodone = vnode_pager_generic_getpages_done_async;
+ bp->b_flags |= B_ASYNC;
+ BUF_KERNPROC(bp);
+ bstrategy(bp);
+ return (VM_PAGER_OK);
+ } else {
+ bp->b_iodone = bdone;
+ bstrategy(bp);
+ bwait(bp, PVM, "vnread");
+ error = vnode_pager_generic_getpages_done(bp);
+ for (i = 0; i < bp->b_npages; i++)
+ bp->b_pages[i] = NULL;
+ bp->b_vp = NULL;
+ pbrelbo(bp);
+ relpbuf(bp, &vnode_pbuf_freecnt);
+ return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+ }
+}
- bwait(bp, PVM, "vnread");
+static void
+vnode_pager_generic_getpages_done_async(struct buf *bp)
+{
+ int error;
- if ((bp->b_ioflags & BIO_ERROR) != 0)
- error = EIO;
+ error = vnode_pager_generic_getpages_done(bp);
+ /* Run the iodone upon the requested range. */
+ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore,
+ bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
+ for (int i = 0; i < bp->b_npages; i++)
+ bp->b_pages[i] = NULL;
+ bp->b_vp = NULL;
+ pbrelbo(bp);
+ relpbuf(bp, &vnode_async_pbuf_freecnt);
+}
- if (error == 0 && size != count * PAGE_SIZE) {
- if ((bp->b_flags & B_UNMAPPED) != 0) {
- bp->b_flags &= ~B_UNMAPPED;
- pmap_qenter(kva, m, count);
+static int
+vnode_pager_generic_getpages_done(struct buf *bp)
+{
+ vm_object_t object;
+ off_t tfoff, nextoff;
+ int i, error;
+
+ error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
+ object = bp->b_vp->v_object;
+
+ if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
+ if (!buf_mapped(bp)) {
+ bp->b_data = bp->b_kvabase;
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
+ bp->b_npages);
}
- bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
+ bzero(bp->b_data + bp->b_bcount,
+ PAGE_SIZE * bp->b_npages - bp->b_bcount);
}
- if ((bp->b_flags & B_UNMAPPED) == 0)
- pmap_qremove(kva, count);
- if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
- bp->b_data = (caddr_t)kva;
- bp->b_kvabase = (caddr_t)kva;
- bp->b_flags &= ~B_UNMAPPED;
- for (i = 0; i < count; i++)
- bp->b_pages[i] = NULL;
+ if (buf_mapped(bp)) {
+ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+ bp->b_data = unmapped_buf;
}
- /*
- * free the buffer header back to the swap buffer pool
- */
- bp->b_vp = NULL;
- pbrelbo(bp);
- relpbuf(bp, &vnode_pbuf_freecnt);
-
VM_OBJECT_WLOCK(object);
- for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
+ for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+ i < bp->b_npages; i++, tfoff = nextoff) {
vm_page_t mt;
nextoff = tfoff + PAGE_SIZE;
- mt = m[i];
+ mt = bp->b_pages[i];
if (nextoff <= object->un_pager.vnp.vnp_size) {
/*
@@ -947,11 +1069,9 @@
*/
mt->valid = VM_PAGE_BITS_ALL;
KASSERT(mt->dirty == 0,
- ("vnode_pager_generic_getpages: page %p is dirty",
- mt));
+ ("%s: page %p is dirty", __func__, mt));
KASSERT(!pmap_page_is_mapped(mt),
- ("vnode_pager_generic_getpages: page %p is mapped",
- mt));
+ ("%s: page %p is mapped", __func__, mt));
} else {
/*
* Read did not fill up entire page.
@@ -964,18 +1084,17 @@
object->un_pager.vnp.vnp_size - tfoff);
KASSERT((mt->dirty & vm_page_bits(0,
object->un_pager.vnp.vnp_size - tfoff)) == 0,
- ("vnode_pager_generic_getpages: page %p is dirty",
- mt));
+ ("%s: page %p is dirty", __func__, mt));
}
-
- if (i != reqpage)
+
+ if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
vm_page_readahead_finish(mt);
}
VM_OBJECT_WUNLOCK(object);
- if (error) {
- printf("vnode_pager_getpages: I/O read error\n");
- }
- return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+ if (error != 0)
+ printf("%s: I/O read error %d\n", __func__, error);
+
+ return (error);
}
/*
@@ -1006,7 +1125,7 @@
* daemon up. This should be probably be addressed XXX.
*/
- if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min)
+ if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min)
flags |= VM_PAGER_PUT_SYNC;
/*
@@ -1014,19 +1133,36 @@
*/
vp = object->handle;
VM_OBJECT_WUNLOCK(object);
- rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0);
+ rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: stale FS putpages\n"));
VM_OBJECT_WLOCK(object);
}
+static int
+vn_off2bidx(vm_ooffset_t offset)
+{
+ return ((offset & PAGE_MASK) / DEV_BSIZE);
+}
+
+static bool
+vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
+{
+
+ KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
+ offset < IDX_TO_OFF(m->pindex + 1),
+ ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
+ (uintmax_t)offset));
+ return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
+}
+
/*
* This is now called from local media FS's to operate against their
* own vnodes if they fail to implement VOP_PUTPAGES.
*
* This is typically called indirectly via the pageout daemon and
- * clustering has already typically occured, so in general we ask the
+ * clustering has already typically occurred, so in general we ask the
* underlying filesystem to write the data out asynchronously rather
* then delayed.
*/
@@ -1034,18 +1170,14 @@
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
int flags, int *rtvals)
{
- int i;
vm_object_t object;
vm_page_t m;
- int count;
-
- int maxsize, ncount;
- vm_ooffset_t poffset;
+ vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
struct uio auio;
struct iovec aiov;
- int error;
- int ioflags;
- int ppscheck = 0;
+ off_t prev_resid, wrsz;
+ int count, error, i, maxsize, ncount, pgoff, ppscheck;
+ bool in_hole;
static struct timeval lastfail;
static int curfail;
@@ -1056,10 +1188,11 @@
rtvals[i] = VM_PAGER_ERROR;
if ((int64_t)ma[0]->pindex < 0) {
- printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
- (long)ma[0]->pindex, (u_long)ma[0]->dirty);
+ printf("vnode_pager_generic_putpages: "
+ "attempt to write meta-data 0x%jx(%lx)\n",
+ (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
rtvals[0] = VM_PAGER_BAD;
- return VM_PAGER_BAD;
+ return (VM_PAGER_BAD);
}
maxsize = count * PAGE_SIZE;
@@ -1069,7 +1202,7 @@
/*
* If the page-aligned write is larger then the actual file we
- * have to invalidate pages occuring beyond the file EOF. However,
+ * have to invalidate pages occurring beyond the file EOF. However,
* there is an edge case where a file may not be page-aligned where
* the last page is partially invalid. In this case the filesystem
* may not properly clear the dirty bits for the entire page (which
@@ -1079,14 +1212,20 @@
* We do not under any circumstances truncate the valid bits, as
* this will screw up bogus page replacement.
*/
- VM_OBJECT_WLOCK(object);
+ VM_OBJECT_RLOCK(object);
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
+ if (!VM_OBJECT_TRYUPGRADE(object)) {
+ VM_OBJECT_RUNLOCK(object);
+ VM_OBJECT_WLOCK(object);
+ if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
+ goto downgrade;
+ }
if (object->un_pager.vnp.vnp_size > poffset) {
- int pgoff;
-
maxsize = object->un_pager.vnp.vnp_size - poffset;
ncount = btoc(maxsize);
if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
+ pgoff = roundup2(pgoff, DEV_BSIZE);
+
/*
* If the object is locked and the following
* conditions hold, then the page's dirty
@@ -1097,6 +1236,7 @@
vm_page_assert_sbusied(m);
KASSERT(!pmap_page_is_write_mapped(m),
("vnode_pager_generic_putpages: page %p is not read-only", m));
+ MPASS(m->dirty != 0);
vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
pgoff);
}
@@ -1104,64 +1244,152 @@
maxsize = 0;
ncount = 0;
}
- if (ncount < count) {
- for (i = ncount; i < count; i++) {
- rtvals[i] = VM_PAGER_BAD;
+ for (i = ncount; i < count; i++)
+ rtvals[i] = VM_PAGER_BAD;
+downgrade:
+ VM_OBJECT_LOCK_DOWNGRADE(object);
+ }
+
+ auio.uio_iov = &aiov;
+ auio.uio_segflg = UIO_NOCOPY;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = NULL;
+ maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
+
+ for (prev_offset = poffset; prev_offset < maxblksz;) {
+ /* Skip clean blocks. */
+ for (in_hole = true; in_hole && prev_offset < maxblksz;) {
+ m = ma[OFF_TO_IDX(prev_offset - poffset)];
+ for (i = vn_off2bidx(prev_offset);
+ i < sizeof(vm_page_bits_t) * NBBY &&
+ prev_offset < maxblksz; i++) {
+ if (vn_dirty_blk(m, prev_offset)) {
+ in_hole = false;
+ break;
+ }
+ prev_offset += DEV_BSIZE;
}
}
+ if (in_hole)
+ goto write_done;
+
+ /* Find longest run of dirty blocks. */
+ for (next_offset = prev_offset; next_offset < maxblksz;) {
+ m = ma[OFF_TO_IDX(next_offset - poffset)];
+ for (i = vn_off2bidx(next_offset);
+ i < sizeof(vm_page_bits_t) * NBBY &&
+ next_offset < maxblksz; i++) {
+ if (!vn_dirty_blk(m, next_offset))
+ goto start_write;
+ next_offset += DEV_BSIZE;
+ }
+ }
+start_write:
+ if (next_offset > poffset + maxsize)
+ next_offset = poffset + maxsize;
+
+ /*
+ * Getting here requires finding a dirty block in the
+ * 'skip clean blocks' loop.
+ */
+ MPASS(prev_offset < next_offset);
+
+ VM_OBJECT_RUNLOCK(object);
+ aiov.iov_base = NULL;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = prev_offset;
+ prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
+ prev_offset;
+ error = VOP_WRITE(vp, &auio,
+ vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
+
+ wrsz = prev_resid - auio.uio_resid;
+ if (wrsz == 0) {
+ if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
+ vn_printf(vp, "vnode_pager_putpages: "
+ "zero-length write at %ju resid %zd\n",
+ auio.uio_offset, auio.uio_resid);
+ }
+ VM_OBJECT_RLOCK(object);
+ break;
+ }
+
+ /* Adjust the starting offset for next iteration. */
+ prev_offset += wrsz;
+ MPASS(auio.uio_offset == prev_offset);
+
+ ppscheck = 0;
+ if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
+ &curfail, 1)) != 0)
+ vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
+ error);
+ if (auio.uio_resid != 0 && (ppscheck != 0 ||
+ ppsratecheck(&lastfail, &curfail, 1) != 0))
+ vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
+ "at %ju\n", auio.uio_resid,
+ (uintmax_t)ma[0]->pindex);
+ VM_OBJECT_RLOCK(object);
+ if (error != 0 || auio.uio_resid != 0)
+ break;
}
- VM_OBJECT_WUNLOCK(object);
+write_done:
+ /* Mark completely processed pages. */
+ for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
+ rtvals[i] = VM_PAGER_OK;
+ /* Mark partial EOF page. */
+ if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
+ rtvals[i++] = VM_PAGER_OK;
+ /* Unwritten pages in range, free bonus if the page is clean. */
+ for (; i < ncount; i++)
+ rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
+ VM_OBJECT_RUNLOCK(object);
+ PCPU_ADD(cnt.v_vnodepgsout, i);
+ PCPU_INC(cnt.v_vnodeout);
+ return (rtvals[0]);
+}
+int
+vnode_pager_putpages_ioflags(int pager_flags)
+{
+ int ioflags;
+
/*
- * pageouts are already clustered, use IO_ASYNC to force a bawrite()
- * rather then a bdwrite() to prevent paging I/O from saturating
- * the buffer cache. Dummy-up the sequential heuristic to cause
- * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set,
- * the system decides how to cluster.
+ * Pageouts are already clustered, use IO_ASYNC to force a
+ * bawrite() rather then a bdwrite() to prevent paging I/O
+ * from saturating the buffer cache. Dummy-up the sequential
+ * heuristic to cause large ranges to cluster. If neither
+ * IO_SYNC or IO_ASYNC is set, the system decides how to
+ * cluster.
*/
ioflags = IO_VMIO;
- if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
+ if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
ioflags |= IO_SYNC;
- else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
+ else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
ioflags |= IO_ASYNC;
- ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
+ ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
+ ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
ioflags |= IO_SEQMAX << IO_SEQSHIFT;
-
- aiov.iov_base = (caddr_t) 0;
- aiov.iov_len = maxsize;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = poffset;
- auio.uio_segflg = UIO_NOCOPY;
- auio.uio_rw = UIO_WRITE;
- auio.uio_resid = maxsize;
- auio.uio_td = (struct thread *) 0;
- error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
- PCPU_INC(cnt.v_vnodeout);
- PCPU_ADD(cnt.v_vnodepgsout, ncount);
-
- if (error) {
- if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
- printf("vnode_pager_putpages: I/O error %d\n", error);
- }
- if (auio.uio_resid) {
- if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
- printf("vnode_pager_putpages: residual I/O %zd at %lu\n",
- auio.uio_resid, (u_long)ma[0]->pindex);
- }
- for (i = 0; i < ncount; i++) {
- rtvals[i] = VM_PAGER_OK;
- }
- return rtvals[0];
+ return (ioflags);
}
+/*
+ * vnode_pager_undirty_pages().
+ *
+ * A helper to mark pages as clean after a pageout that may have been
+ * completed with a short write.  The lpos argument specifies the page
+ * run length in bytes, and the written argument specifies how many
+ * bytes were actually written.  eof is the offset just past the last
+ * valid byte in the vnode, expressed relative to the absolute file
+ * position of the first byte in the run.
+ */
void
-vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written)
+vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
+ int lpos)
{
vm_object_t obj;
- int i, pos;
+ int i, pos, pos_devb;
- if (written == 0)
+ if (written == 0 && eof >= lpos)
return;
obj = ma[0]->object;
VM_OBJECT_WLOCK(obj);
@@ -1175,6 +1403,37 @@
vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
}
}
+ if (eof >= lpos) /* avoid truncation */
+ goto done;
+ for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
+ if (pos != trunc_page(pos)) {
+ /*
+ * The page contains the last valid byte in
+ * the vnode, mark the rest of the page as
+ * clean, potentially making the whole page
+ * clean.
+ */
+ pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
+ vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
+ pos_devb);
+
+ /*
+ * If the page was cleaned, report the pageout
+ * on it as successful. msync() no longer
+ * needs to write out the page, endlessly
+ * creating write requests and dirty buffers.
+ */
+ if (ma[i]->dirty == 0)
+ rtvals[i] = VM_PAGER_OK;
+
+ pos = round_page(pos);
+ } else {
+ /* vm_pageout_flush() clears dirty */
+ rtvals[i] = VM_PAGER_BAD;
+ pos += PAGE_SIZE;
+ }
+ }
+done:
VM_OBJECT_WUNLOCK(obj);
}
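The rewritten vnode_pager_generic_putpages() above no longer issues one large write covering the whole page run; it consults each page's dirty bitmap (one bit per DEV_BSIZE block, see vn_off2bidx() and vn_dirty_blk()) and writes each maximal run of consecutive dirty blocks with its own VOP_WRITE(). A standalone sketch of that run-finding scan over a simplified bitmap follows; the types and constants are invented, not the kernel's.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define	BLK_SIZE	512			/* stand-in for DEV_BSIZE */
#define	PAGE_SZ		4096
#define	BLKS_PER_PAGE	(PAGE_SZ / BLK_SIZE)	/* 8: one byte of bits per page */

/* One uint8_t of dirty bits per page, bit i covering block i of that page. */
static bool
blk_is_dirty(const uint8_t *dirty, size_t blkno)
{
	return ((dirty[blkno / BLKS_PER_PAGE] >>
	    (blkno % BLKS_PER_PAGE)) & 1) != 0;
}

/*
 * Find the next run of dirty blocks at or after *startp, storing its
 * bounds in *startp (inclusive) and *endp (exclusive).  Returns false
 * when no dirty blocks remain.
 */
static bool
next_dirty_run(const uint8_t *dirty, size_t nblks, size_t *startp,
    size_t *endp)
{
	size_t b = *startp;

	while (b < nblks && !blk_is_dirty(dirty, b))	/* skip clean blocks */
		b++;
	if (b == nblks)
		return (false);
	*startp = b;
	while (b < nblks && blk_is_dirty(dirty, b))	/* extend the run */
		b++;
	*endp = b;
	return (true);
}

A caller would invoke next_dirty_run() in a loop, advancing *startp to the returned *endp each time; every [start, end) run corresponds to one VOP_WRITE() in the kernel version.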
Modified: trunk/sys/vm/vnode_pager.h
===================================================================
--- trunk/sys/vm/vnode_pager.h 2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.h 2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vnode_pager.h 8.1 (Berkeley) 6/11/93
- * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $
+ * $FreeBSD: stable/11/sys/vm/vnode_pager.h 331722 2018-03-29 02:50:57Z eadler $
*/
#ifndef _VNODE_PAGER_
@@ -42,14 +42,17 @@
#ifdef _KERNEL
int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m,
- int count, int reqpage);
+ int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone,
+ void *arg);
int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m,
- int count, boolean_t sync,
- int *rtvals);
-
+ int count, int flags, int *rtvals);
+int vnode_pager_local_getpages(struct vop_getpages_args *ap);
+int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap);
+int vnode_pager_putpages_ioflags(int pager_flags);
void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end);
-void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written);
+void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written,
+ off_t eof, int lpos);
void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
vm_offset_t end);
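The new getpages signature above replaces the old reqpage argument with optional rbehind/rahead counts: the caller says how many extra pages it would like read behind and ahead of the request, the pager clamps those wishes against the contiguous extent reported by VOP_BMAP() and against the object bounds, and then reports back what it actually did. A simplified sketch of the clamping arithmetic follows; the parameter names are invented and the bounds are slightly simplified relative to the kernel code.

static int
imin2(int a, int b)
{
	return (a < b ? a : b);
}

/*
 * requested_rbehind/requested_rahead: what the caller asked for (0 if NULL).
 * bmap_before/bmap_after: contiguous pages available around the request,
 *     as derived from VOP_BMAP().
 * first/last: page indices of the first and last requested pages.
 * objsize: object size in pages.
 */
static void
clamp_readaround(int requested_rbehind, int requested_rahead,
    int bmap_before, int bmap_after, int first, int last, int objsize,
    int *rbehind, int *rahead)
{
	*rbehind = imin2(requested_rbehind, bmap_before);
	*rbehind = imin2(*rbehind, first);		/* don't go below page 0 */
	*rahead = imin2(requested_rahead, bmap_after);
	*rahead = imin2(*rahead, objsize - 1 - last);	/* don't go past EOF */
}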