[Midnightbsd-cvs] src [9896] trunk/sys: sync with freebsd 10-stable
laffer1 at midnightbsd.org
Thu May 24 18:27:41 EDT 2018
Revision: 9896
http://svnweb.midnightbsd.org/src/?rev=9896
Author: laffer1
Date: 2018-05-24 18:27:41 -0400 (Thu, 24 May 2018)
Log Message:
-----------
sync with freebsd 10-stable
Modified Paths:
--------------
trunk/sys/vm/default_pager.c
trunk/sys/vm/device_pager.c
trunk/sys/vm/memguard.c
trunk/sys/vm/memguard.h
trunk/sys/vm/phys_pager.c
trunk/sys/vm/pmap.h
trunk/sys/vm/redzone.c
trunk/sys/vm/redzone.h
trunk/sys/vm/sg_pager.c
trunk/sys/vm/swap_pager.c
trunk/sys/vm/swap_pager.h
trunk/sys/vm/uma.h
trunk/sys/vm/uma_core.c
trunk/sys/vm/uma_dbg.c
trunk/sys/vm/uma_dbg.h
trunk/sys/vm/uma_int.h
trunk/sys/vm/vm.h
trunk/sys/vm/vm_extern.h
trunk/sys/vm/vm_fault.c
trunk/sys/vm/vm_glue.c
trunk/sys/vm/vm_init.c
trunk/sys/vm/vm_kern.c
trunk/sys/vm/vm_kern.h
trunk/sys/vm/vm_map.c
trunk/sys/vm/vm_map.h
trunk/sys/vm/vm_meter.c
trunk/sys/vm/vm_mmap.c
trunk/sys/vm/vm_object.c
trunk/sys/vm/vm_object.h
trunk/sys/vm/vm_page.c
trunk/sys/vm/vm_page.h
trunk/sys/vm/vm_pageout.c
trunk/sys/vm/vm_pageout.h
trunk/sys/vm/vm_pager.c
trunk/sys/vm/vm_pager.h
trunk/sys/vm/vm_param.h
trunk/sys/vm/vm_phys.c
trunk/sys/vm/vm_phys.h
trunk/sys/vm/vm_reserv.c
trunk/sys/vm/vm_reserv.h
trunk/sys/vm/vm_unix.c
trunk/sys/vm/vm_zeroidle.c
trunk/sys/vm/vnode_pager.c
trunk/sys/vm/vnode_pager.h
Added Paths:
-----------
trunk/sys/vm/_vm_radix.h
trunk/sys/vm/vm_radix.c
trunk/sys/vm/vm_radix.h
trunk/sys/x86/acpica/acpi_wakeup.c
Added: trunk/sys/vm/_vm_radix.h
===================================================================
--- trunk/sys/vm/_vm_radix.h (rev 0)
+++ trunk/sys/vm/_vm_radix.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,56 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $
+ */
+
+#ifndef __VM_RADIX_H_
+#define __VM_RADIX_H_
+
+/*
+ * Radix tree root.
+ */
+struct vm_radix {
+ uintptr_t rt_root;
+ uint8_t rt_flags;
+};
+
+#define RT_INSERT_INPROG 0x01
+#define RT_TRIE_MODIFIED 0x02
+
+#ifdef _KERNEL
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+ return (rtree->rt_root == 0);
+}
+
+#endif /* _KERNEL */
+#endif /* !__VM_RADIX_H_ */
Property changes on: trunk/sys/vm/_vm_radix.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
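As an aside on the new header: the radix-tree root added here is just a word-sized pointer plus a flags byte, and "empty" simply means the root word is zero. A minimal userland sketch of that check, mirroring the structure above (illustrative only, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userland mirror of the root structure in _vm_radix.h (illustration). */
struct vm_radix {
	uintptr_t rt_root;	/* 0 while the trie holds no leaves */
	uint8_t   rt_flags;	/* RT_INSERT_INPROG, RT_TRIE_MODIFIED */
};

static bool
vm_radix_is_empty(const struct vm_radix *rtree)
{
	return (rtree->rt_root == 0);
}

int
main(void)
{
	struct vm_radix rtree = { 0, 0 };

	printf("empty before insert: %d\n", vm_radix_is_empty(&rtree));
	rtree.rt_root = (uintptr_t)&rtree;	/* stand-in for an inserted leaf */
	printf("empty after insert:  %d\n", vm_radix_is_empty(&rtree));
	return (0);
}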
Modified: trunk/sys/vm/default_pager.c
===================================================================
--- trunk/sys/vm/default_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/default_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1995, David Greenman
* All rights reserved.
@@ -38,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -45,7 +46,7 @@
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
-#include <sys/mutex.h>
+#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
@@ -63,6 +64,16 @@
int *);
/*
* pagerops for OBJT_DEFAULT - "default pager".
+ *
+ * This pager handles anonymous (no handle) swap-backed memory, just
+ * like the swap pager. It allows several optimizations based on the
+ * fact that no pages of a default object can be swapped out. The
+ * most important optimization is in vm_fault(), where the pager is
+ * never asked for a non-resident page. Instead, a freshly allocated
+ * zeroed page is used.
+ *
+ * On the first request to page out a page from a default object, the
+ * object is converted to swap pager type.
*/
struct pagerops defaultpagerops = {
.pgo_alloc = default_pager_alloc,
@@ -91,10 +102,10 @@
object = vm_object_allocate(OBJT_DEFAULT,
OFF_TO_IDX(round_page(offset + size)));
if (cred != NULL) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
object->cred = cred;
object->charge = size;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
return (object);
}
@@ -113,6 +124,7 @@
/*
* OBJT_DEFAULT objects have no special resources allocated to them.
*/
+ object->type = OBJT_DEAD;
}
/*
@@ -137,14 +149,11 @@
* cache to the free list.
*/
static void
-default_pager_putpages(object, m, c, sync, rtvals)
- vm_object_t object;
- vm_page_t *m;
- int c;
- boolean_t sync;
- int *rtvals;
+default_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+ int flags, int *rtvals)
{
- swappagerops.pgo_putpages(object, m, c, sync, rtvals);
+
+ swappagerops.pgo_putpages(object, m, count, flags, rtvals);
}
/*
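Worth noting in the default_pager.c hunk above: pgo_putpages loses its boolean_t sync argument in favor of an int flags word, which callers now decode bit by bit (the swap pager hunk further down tests VM_PAGER_PUT_SYNC the same way). A small sketch of that decode; the flag value here is a placeholder for the example, not the kernel's definition:

#include <stdbool.h>
#include <stdio.h>

#define VM_PAGER_PUT_SYNC	0x0001	/* placeholder value for illustration */

static void
putpages(int count, int flags)
{
	bool sync = (flags & VM_PAGER_PUT_SYNC) != 0;

	printf("writing %d page(s) %s\n", count,
	    sync ? "synchronously" : "asynchronously");
}

int
main(void)
{
	putpages(4, 0);			/* asynchronous pageout */
	putpages(1, VM_PAGER_PUT_SYNC);	/* caller waits for completion */
	return (0);
}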
Modified: trunk/sys/vm/device_pager.c
===================================================================
--- trunk/sys/vm/device_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/device_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -44,6 +45,7 @@
#include <sys/proc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
+#include <sys/rwlock.h>
#include <sys/sx.h>
#include <vm/vm.h>
@@ -51,6 +53,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/uma.h>
static void dev_pager_init(void);
@@ -58,10 +61,8 @@
vm_ooffset_t, struct ucred *);
static void dev_pager_dealloc(vm_object_t);
static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void dev_pager_putpages(vm_object_t, vm_page_t *, int,
- boolean_t, int *);
-static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *,
- int *);
+static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
+static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static void dev_pager_free_page(vm_object_t object, vm_page_t m);
/* list of device pager objects */
@@ -99,8 +100,9 @@
};
static void
-dev_pager_init()
+dev_pager_init(void)
{
+
TAILQ_INIT(&dev_pager_object_list);
mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF);
}
@@ -157,6 +159,7 @@
object1->pg_color = color;
object1->handle = handle;
object1->un_pager.devp.ops = ops;
+ object1->un_pager.devp.dev = handle;
TAILQ_INIT(&object1->un_pager.devp.devp_pglist);
mtx_lock(&dev_pager_mtx);
object = vm_pager_object_lookup(&dev_pager_object_list, handle);
@@ -204,7 +207,7 @@
cdev_pager_free_page(vm_object_t object, vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type == OBJT_MGTDEVICE) {
KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("unmanaged %p", m));
pmap_remove_all(m);
@@ -219,27 +222,26 @@
dev_pager_free_page(vm_object_t object, vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->type == OBJT_DEVICE &&
(m->oflags & VPO_UNMANAGED) != 0),
("Managed device or page obj %p m %p", object, m));
- TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq);
+ TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, plinks.q);
vm_page_putfake(m);
}
static void
-dev_pager_dealloc(object)
- vm_object_t object;
+dev_pager_dealloc(vm_object_t object)
{
vm_page_t m;
- VM_OBJECT_UNLOCK(object);
- object->un_pager.devp.ops->cdev_pg_dtor(object->handle);
+ VM_OBJECT_WUNLOCK(object);
+ object->un_pager.devp.ops->cdev_pg_dtor(object->un_pager.devp.dev);
mtx_lock(&dev_pager_mtx);
TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list);
mtx_unlock(&dev_pager_mtx);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->type == OBJT_DEVICE) {
/*
@@ -249,6 +251,8 @@
!= NULL)
dev_pager_free_page(object, m);
}
+ object->handle = NULL;
+ object->type = OBJT_DEAD;
}
static int
@@ -256,11 +260,11 @@
{
int error, i;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
error = object->un_pager.devp.ops->cdev_pg_fault(object,
IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]);
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
for (i = 0; i < count; i++) {
if (i != reqpage) {
@@ -278,7 +282,7 @@
("Wrong page type %p %p", ma[reqpage], object));
if (object->type == OBJT_DEVICE) {
TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
- ma[reqpage], pageq);
+ ma[reqpage], plinks.q);
}
}
@@ -289,7 +293,6 @@
old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
vm_page_t *mres)
{
- vm_pindex_t pidx;
vm_paddr_t paddr;
vm_page_t m_paddr, page;
struct cdev *dev;
@@ -296,18 +299,17 @@
struct cdevsw *csw;
struct file *fpop;
struct thread *td;
- vm_memattr_t memattr;
+ vm_memattr_t memattr, memattr1;
int ref, ret;
- pidx = OFF_TO_IDX(offset);
memattr = object->memattr;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
dev = object->handle;
csw = dev_refthread(dev, &ref);
if (csw == NULL) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
return (VM_PAGER_FAIL);
}
td = curthread;
@@ -319,16 +321,24 @@
if (ret != 0) {
printf(
"WARNING: dev_pager_getpage: map function returns error %d", ret);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
return (VM_PAGER_FAIL);
}
/* If "paddr" is a real page, perform a sanity check on "memattr". */
if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL &&
- pmap_page_get_memattr(m_paddr) != memattr) {
- memattr = pmap_page_get_memattr(m_paddr);
- printf(
- "WARNING: A device driver has set \"memattr\" inconsistently.\n");
+ (memattr1 = pmap_page_get_memattr(m_paddr)) != memattr) {
+ /*
+ * For the /dev/mem d_mmap routine to return the
+ * correct memattr, pmap_page_get_memattr() needs to
+ * be called, which we do there.
+ */
+ if ((csw->d_flags & D_MEM) == 0) {
+ printf("WARNING: Device driver %s has set "
+ "\"memattr\" inconsistently (drv %u pmap %u).\n",
+ csw->d_name, memattr, memattr1);
+ }
+ memattr = memattr1;
}
if (((*mres)->flags & PG_FICTITIOUS) != 0) {
/*
@@ -336,7 +346,7 @@
* the new physical address.
*/
page = *mres;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_page_updatefake(page, paddr, memattr);
} else {
/*
@@ -344,12 +354,13 @@
* free up the all of the original pages.
*/
page = vm_page_getfake(paddr, memattr);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
+ if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
+ panic("old_dev_pager_fault: invalid page replacement");
vm_page_lock(*mres);
vm_page_free(*mres);
vm_page_unlock(*mres);
*mres = page;
- vm_page_insert(page, object, pidx);
}
page->valid = VM_PAGE_BITS_ALL;
return (VM_PAGER_OK);
@@ -356,12 +367,8 @@
}
static void
-dev_pager_putpages(object, m, count, sync, rtvals)
- vm_object_t object;
- vm_page_t *m;
- int count;
- boolean_t sync;
- int *rtvals;
+dev_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags,
+ int *rtvals)
{
panic("dev_pager_putpage called");
@@ -368,12 +375,10 @@
}
static boolean_t
-dev_pager_haspage(object, pindex, before, after)
- vm_object_t object;
- vm_pindex_t pindex;
- int *before;
- int *after;
+dev_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
{
+
if (before != NULL)
*before = 0;
if (after != NULL)
@@ -408,6 +413,7 @@
* XXX assumes VM_PROT_* == PROT_*
*/
npages = OFF_TO_IDX(size);
+ paddr = 0; /* Make paddr initialized for the case of size == 0. */
for (off = foff; npages--; off += PAGE_SIZE) {
if (csw->d_mmap(dev, off, &paddr, (int)prot, &dummy) != 0) {
dev_relthread(dev, ref);
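The devp_pglist changes above only rename the page linkage field (pageq becomes plinks.q); the list handling itself is ordinary queue(3) TAILQ usage. A userland toy of the same insert/remove pattern, assuming a BSD-style sys/queue.h and an invented fake_page type:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the device pager's list of fake pages. */
struct fake_page {
	unsigned long paddr;
	TAILQ_ENTRY(fake_page) q;	/* plays the role of m->plinks.q */
};

TAILQ_HEAD(pglist, fake_page);

int
main(void)
{
	struct pglist head = TAILQ_HEAD_INITIALIZER(head);
	struct fake_page *p, *tmp;
	int i;

	for (i = 0; i < 3; i++) {
		p = malloc(sizeof(*p));
		p->paddr = 0x1000UL * i;
		TAILQ_INSERT_TAIL(&head, p, q);	/* as dev_pager_getpages does */
	}
	TAILQ_FOREACH_SAFE(p, &head, q, tmp) {
		printf("freeing fake page at 0x%lx\n", p->paddr);
		TAILQ_REMOVE(&head, p, q);	/* as dev_pager_free_page does */
		free(p);
	}
	return (0);
}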
Modified: trunk/sys/vm/memguard.c
===================================================================
--- trunk/sys/vm/memguard.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/memguard.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2005, Bosko Milekic <bmilekic at FreeBSD.org>.
* Copyright (c) 2010 Isilon Systems, Inc. (http://www.isilon.com/)
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $");
/*
* MemGuard is a simple replacement allocator for debugging only
@@ -48,6 +49,7 @@
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/uma.h>
@@ -55,7 +57,9 @@
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
+#include <vm/uma_int.h>
#include <vm/memguard.h>
static SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data");
@@ -86,9 +90,7 @@
return (error);
mtx_lock(&malloc_mtx);
- /*
- * If mtp is NULL, it will be initialized in memguard_cmp().
- */
+ /* If mtp is NULL, it will be initialized in memguard_cmp() */
vm_memguard_mtype = malloc_desc2type(desc);
strlcpy(vm_memguard_desc, desc, sizeof(vm_memguard_desc));
mtx_unlock(&malloc_mtx);
@@ -98,8 +100,8 @@
CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
memguard_sysctl_desc, "A", "Short description of memory type to monitor");
-static vm_map_t memguard_map = NULL;
static vm_offset_t memguard_cursor;
+static vm_offset_t memguard_base;
static vm_size_t memguard_mapsize;
static vm_size_t memguard_physlimit;
static u_long memguard_wasted;
@@ -111,7 +113,7 @@
SYSCTL_ULONG(_vm_memguard, OID_AUTO, cursor, CTLFLAG_RD,
&memguard_cursor, 0, "MemGuard cursor");
SYSCTL_ULONG(_vm_memguard, OID_AUTO, mapsize, CTLFLAG_RD,
- &memguard_mapsize, 0, "MemGuard private vm_map size");
+ &memguard_mapsize, 0, "MemGuard private arena size");
SYSCTL_ULONG(_vm_memguard, OID_AUTO, phys_limit, CTLFLAG_RD,
&memguard_physlimit, 0, "Limit on MemGuard memory consumption");
SYSCTL_ULONG(_vm_memguard, OID_AUTO, wasted, CTLFLAG_RD,
@@ -125,15 +127,17 @@
SYSCTL_ULONG(_vm_memguard, OID_AUTO, fail_pgs, CTLFLAG_RD,
&memguard_fail_pgs, 0, "MemGuard failures due to lack of pages");
-#define MG_GUARD 0x001
-#define MG_ALLLARGE 0x002
-static int memguard_options = MG_GUARD;
+#define MG_GUARD_AROUND 0x001
+#define MG_GUARD_ALLLARGE 0x002
+#define MG_GUARD_NOFREE 0x004
+static int memguard_options = MG_GUARD_AROUND;
TUNABLE_INT("vm.memguard.options", &memguard_options);
SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW,
&memguard_options, 0,
"MemGuard options:\n"
"\t0x001 - add guard pages around each allocation\n"
- "\t0x002 - always use MemGuard for allocations over a page");
+ "\t0x002 - always use MemGuard for allocations over a page\n"
+ "\t0x004 - guard uma(9) zones with UMA_ZONE_NOFREE flag");
static u_int memguard_minsize;
static u_long memguard_minsize_reject;
@@ -197,21 +201,18 @@
* out of a single VM map (contiguous chunk of address space).
*/
void
-memguard_init(vm_map_t parent_map)
+memguard_init(vmem_t *parent)
{
- vm_offset_t base, limit;
+ vm_offset_t base;
- memguard_map = kmem_suballoc(parent_map, &base, &limit,
- memguard_mapsize, FALSE);
- memguard_map->system_map = 1;
- KASSERT(memguard_mapsize == limit - base,
- ("Expected %lu, got %lu", (u_long)memguard_mapsize,
- (u_long)(limit - base)));
+ vmem_alloc(parent, memguard_mapsize, M_BESTFIT | M_WAITOK, &base);
+ vmem_init(memguard_arena, "memguard arena", base, memguard_mapsize,
+ PAGE_SIZE, 0, M_WAITOK);
memguard_cursor = base;
+ memguard_base = base;
printf("MEMGUARD DEBUGGING ALLOCATOR INITIALIZED:\n");
printf("\tMEMGUARD map base: 0x%lx\n", (u_long)base);
- printf("\tMEMGUARD map limit: 0x%lx\n", (u_long)limit);
printf("\tMEMGUARD map size: %jd KBytes\n",
(uintmax_t)memguard_mapsize >> 10);
}
@@ -226,12 +227,14 @@
parent = SYSCTL_STATIC_CHILDREN(_vm_memguard);
- SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "mapstart", CTLFLAG_RD,
- &memguard_map->min_offset, "MemGuard KVA base");
- SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "maplimit", CTLFLAG_RD,
- &memguard_map->max_offset, "MemGuard KVA end");
+ SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "mapstart", CTLFLAG_RD,
+ &memguard_base, "MemGuard KVA base");
+ SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "maplimit", CTLFLAG_RD,
+ &memguard_mapsize, "MemGuard KVA size");
+#if 0
SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "mapused", CTLFLAG_RD,
&memguard_map->size, "MemGuard KVA used");
+#endif
}
SYSINIT(memguard, SI_SUB_KLD, SI_ORDER_ANY, memguard_sysinit, NULL);
@@ -257,9 +260,24 @@
p = PHYS_TO_VM_PAGE(pa);
KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
- return ((u_long *)&p->pageq.tqe_next);
+ return (&p->plinks.memguard.p);
}
+static u_long *
+v2sizev(vm_offset_t va)
+{
+ vm_paddr_t pa;
+ struct vm_page *p;
+
+ pa = pmap_kextract(va);
+ if (pa == 0)
+ panic("MemGuard detected double-free of %p", (void *)va);
+ p = PHYS_TO_VM_PAGE(pa);
+ KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
+ ("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
+ return (&p->plinks.memguard.v);
+}
+
/*
* Allocate a single object of specified size with specified flags
* (either M_WAITOK or M_NOWAIT).
@@ -267,7 +285,7 @@
void *
memguard_alloc(unsigned long req_size, int flags)
{
- vm_offset_t addr;
+ vm_offset_t addr, origaddr;
u_long size_p, size_v;
int do_guard, rv;
@@ -282,11 +300,10 @@
* value.
*/
size_v = size_p;
- do_guard = (memguard_options & MG_GUARD) != 0;
+ do_guard = (memguard_options & MG_GUARD_AROUND) != 0;
if (do_guard)
size_v += 2 * PAGE_SIZE;
- vm_map_lock(memguard_map);
/*
* When we pass our memory limit, reject sub-page allocations.
* Page-size and larger allocations will use the same amount
@@ -293,7 +310,7 @@
* of physical memory whether we allocate or hand off to
* uma_large_alloc(), so keep those.
*/
- if (memguard_map->size >= memguard_physlimit &&
+ if (vmem_size(memguard_arena, VMEM_ALLOC) >= memguard_physlimit &&
req_size < PAGE_SIZE) {
addr = (vm_offset_t)NULL;
memguard_fail_pgs++;
@@ -310,9 +327,9 @@
* map, unless vm_map_findspace() is tweaked.
*/
for (;;) {
- rv = vm_map_findspace(memguard_map, memguard_cursor,
- size_v, &addr);
- if (rv == KERN_SUCCESS)
+ if (vmem_xalloc(memguard_arena, size_v, 0, 0, 0,
+ memguard_cursor, VMEM_ADDR_MAX,
+ M_BESTFIT | M_NOWAIT, &origaddr) == 0)
break;
/*
* The map has no space. This may be due to
@@ -319,24 +336,27 @@
* fragmentation, or because the cursor is near the
* end of the map.
*/
- if (memguard_cursor == vm_map_min(memguard_map)) {
+ if (memguard_cursor == memguard_base) {
memguard_fail_kva++;
addr = (vm_offset_t)NULL;
goto out;
}
memguard_wrap++;
- memguard_cursor = vm_map_min(memguard_map);
+ memguard_cursor = memguard_base;
}
+ addr = origaddr;
if (do_guard)
addr += PAGE_SIZE;
- rv = kmem_back(memguard_map, addr, size_p, flags);
+ rv = kmem_back(kmem_object, addr, size_p, flags);
if (rv != KERN_SUCCESS) {
+ vmem_xfree(memguard_arena, origaddr, size_v);
memguard_fail_pgs++;
addr = (vm_offset_t)NULL;
goto out;
}
- memguard_cursor = addr + size_p;
+ memguard_cursor = addr + size_v;
*v2sizep(trunc_page(addr)) = req_size;
+ *v2sizev(trunc_page(addr)) = size_v;
memguard_succ++;
if (req_size < PAGE_SIZE) {
memguard_wasted += (PAGE_SIZE - req_size);
@@ -351,7 +371,6 @@
}
}
out:
- vm_map_unlock(memguard_map);
return ((void *)addr);
}
@@ -360,7 +379,7 @@
{
vm_offset_t a = (vm_offset_t)(uintptr_t)addr;
- return (a >= memguard_map->min_offset && a < memguard_map->max_offset);
+ return (a >= memguard_base && a < memguard_base + memguard_mapsize);
}
/*
@@ -370,12 +389,13 @@
memguard_free(void *ptr)
{
vm_offset_t addr;
- u_long req_size, size;
+ u_long req_size, size, sizev;
char *temp;
int i;
addr = trunc_page((uintptr_t)ptr);
req_size = *v2sizep(addr);
+ sizev = *v2sizev(addr);
size = round_page(req_size);
/*
@@ -397,11 +417,12 @@
* vm_map lock to serialize updates to memguard_wasted, since
* we had the lock at increment.
*/
- vm_map_lock(memguard_map);
+ kmem_unback(kmem_object, addr, size);
+ if (sizev > size)
+ addr -= PAGE_SIZE;
+ vmem_xfree(memguard_arena, addr, sizev);
if (req_size < PAGE_SIZE)
memguard_wasted -= (PAGE_SIZE - req_size);
- (void)vm_map_delete(memguard_map, addr, addr + size);
- vm_map_unlock(memguard_map);
}
/*
@@ -429,8 +450,8 @@
return (newaddr);
}
-int
-memguard_cmp(struct malloc_type *mtp, unsigned long size)
+static int
+memguard_cmp(unsigned long size)
{
if (size < memguard_minsize) {
@@ -437,7 +458,7 @@
memguard_minsize_reject++;
return (0);
}
- if ((memguard_options & MG_ALLLARGE) != 0 && size >= PAGE_SIZE)
+ if ((memguard_options & MG_GUARD_ALLLARGE) != 0 && size >= PAGE_SIZE)
return (1);
if (memguard_frequency > 0 &&
(random() % 100000) < memguard_frequency) {
@@ -444,6 +465,17 @@
memguard_frequency_hits++;
return (1);
}
+
+ return (0);
+}
+
+int
+memguard_cmp_mtp(struct malloc_type *mtp, unsigned long size)
+{
+
+ if (memguard_cmp(size))
+ return(1);
+
#if 1
/*
* The safest way of comparsion is to always compare short description
@@ -467,3 +499,21 @@
return (0);
#endif
}
+
+int
+memguard_cmp_zone(uma_zone_t zone)
+{
+
+ if ((memguard_options & MG_GUARD_NOFREE) == 0 &&
+ zone->uz_flags & UMA_ZONE_NOFREE)
+ return (0);
+
+ if (memguard_cmp(zone->uz_size))
+ return (1);
+
+ /*
+ * The safest way of comparsion is to always compare zone name,
+ * but it is also the slowest way.
+ */
+ return (strcmp(zone->uz_name, vm_memguard_desc) == 0);
+}
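The new memguard_cmp()/memguard_cmp_zone() split above keeps the same guard/no-guard policy: reject undersized requests, always guard page-sized-and-larger requests when MG_GUARD_ALLLARGE is set, otherwise guard a random fraction controlled by vm.memguard.frequency. A userland sketch of just that decision, with the tunables hard-coded for the example:

#include <stdio.h>
#include <stdlib.h>

#ifndef PAGE_SIZE
#define PAGE_SIZE		4096
#endif
#define MG_GUARD_ALLLARGE	0x002	/* option bit, as in the diff */

static unsigned long memguard_minsize = 0;
static int memguard_options = MG_GUARD_ALLLARGE;
static int memguard_frequency = 10;	/* guard ~0.01% of small requests */

static int
memguard_cmp(unsigned long size)
{
	if (size < memguard_minsize)
		return (0);
	if ((memguard_options & MG_GUARD_ALLLARGE) != 0 && size >= PAGE_SIZE)
		return (1);
	if (memguard_frequency > 0 &&
	    (random() % 100000) < memguard_frequency)
		return (1);
	return (0);
}

int
main(void)
{
	printf("8192-byte request guarded: %d\n", memguard_cmp(8192));
	printf("64-byte request guarded:   %d\n", memguard_cmp(64));
	return (0);
}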
Modified: trunk/sys/vm/memguard.h
===================================================================
--- trunk/sys/vm/memguard.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/memguard.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2005,
* Bosko Milekic <bmilekic at FreeBSD.org>. All rights reserved.
@@ -23,7 +24,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
*/
#ifndef _VM_MEMGUARD_H_
@@ -33,14 +34,16 @@
struct malloc_type;
struct vm_map;
+struct vmem;
#ifdef DEBUG_MEMGUARD
unsigned long memguard_fudge(unsigned long, const struct vm_map *);
-void memguard_init(struct vm_map *);
+void memguard_init(struct vmem *);
void *memguard_alloc(unsigned long, int);
void *memguard_realloc(void *, unsigned long, struct malloc_type *, int);
void memguard_free(void *);
-int memguard_cmp(struct malloc_type *, unsigned long);
+int memguard_cmp_mtp(struct malloc_type *, unsigned long);
+int memguard_cmp_zone(uma_zone_t);
int is_memguard_addr(void *);
#else
#define memguard_fudge(size, xxx) (size)
@@ -48,7 +51,8 @@
#define memguard_alloc(size, flags) NULL
#define memguard_realloc(a, s, mtp, f) NULL
#define memguard_free(addr) do { } while (0)
-#define memguard_cmp(mtp, size) 0
+#define memguard_cmp_mtp(mtp, size) 0
+#define memguard_cmp_zone(zone) 0
#define is_memguard_addr(addr) 0
#endif
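The header keeps the usual on/off pattern: with DEBUG_MEMGUARD unset the hooks collapse to constant macros, so callers can test them unconditionally and the compiler removes the branches. A tiny illustration of that pattern (the malloc_wrapper() below is invented for the example):

#include <stdio.h>
#include <stdlib.h>

/* #define DEBUG_MEMGUARD */
#ifdef DEBUG_MEMGUARD
int	memguard_cmp_mtp(void *mtp, unsigned long size);
void	*memguard_alloc(unsigned long size, int flags);
#else
#define memguard_cmp_mtp(mtp, size)	0
#define memguard_alloc(size, flags)	NULL
#endif

static void *
malloc_wrapper(unsigned long size)
{
	if (memguard_cmp_mtp(NULL, size))	/* dead code when disabled */
		return (memguard_alloc(size, 0));
	return (malloc(size));
}

int
main(void)
{
	void *p = malloc_wrapper(128);

	printf("allocated %p\n", p);
	free(p);
	return (0);
}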
Modified: trunk/sys/vm/phys_pager.c
===================================================================
--- trunk/sys/vm/phys_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/phys_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2000 Peter Wemm
*
@@ -24,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -34,9 +35,11 @@
#include <sys/proc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
@@ -54,9 +57,6 @@
mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF);
}
-/*
- * MPSAFE
- */
static vm_object_t
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred)
@@ -99,8 +99,8 @@
object = object1;
object1 = NULL;
object->handle = handle;
- TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
- pager_object_list);
+ TAILQ_INSERT_TAIL(&phys_pager_object_list,
+ object, pager_object_list);
}
} else {
if (pindex > object->size)
@@ -115,20 +115,19 @@
return (object);
}
-/*
- * MPSAFE
- */
static void
phys_pager_dealloc(vm_object_t object)
{
if (object->handle != NULL) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
mtx_lock(&phys_pager_mtx);
TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list);
mtx_unlock(&phys_pager_mtx);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
+ object->handle = NULL;
+ object->type = OBJT_DEAD;
}
/*
@@ -139,7 +138,7 @@
{
int i;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
for (i = 0; i < count; i++) {
if (m[i]->valid == 0) {
if ((m[i]->flags & PG_ZERO) == 0)
@@ -151,10 +150,12 @@
KASSERT(m[i]->dirty == 0,
("phys_pager_getpages: dirty page %p", m[i]));
/* The requested page must remain busy, the others not. */
- if (i == reqpage)
+ if (i == reqpage) {
+ vm_page_lock(m[i]);
vm_page_flash(m[i]);
- else
- vm_page_wakeup(m[i]);
+ vm_page_unlock(m[i]);
+ } else
+ vm_page_xunbusy(m[i]);
}
return (VM_PAGER_OK);
}
@@ -161,7 +162,7 @@
static void
phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
- int *rtvals)
+ int *rtvals)
{
panic("phys_pager_putpage called");
@@ -179,7 +180,7 @@
#endif
static boolean_t
phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
- int *after)
+ int *after)
{
vm_pindex_t base, end;
Modified: trunk/sys/vm/pmap.h
===================================================================
--- trunk/sys/vm/pmap.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/pmap.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $
*/
/*
@@ -97,21 +98,25 @@
*/
extern vm_offset_t kernel_vm_end;
+/*
+ * Flags for pmap_enter(). The bits in the low-order byte are reserved
+ * for the protection code (vm_prot_t) that describes the fault type.
+ */
+#define PMAP_ENTER_NOSLEEP 0x0100
+#define PMAP_ENTER_WIRED 0x0200
+
void pmap_activate(struct thread *td);
+void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+ int advice);
void pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t);
-#if defined(__mips__)
-void pmap_align_tlb(vm_offset_t *);
-#endif
-void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
void pmap_clear_modify(vm_page_t m);
-void pmap_clear_reference(vm_page_t m);
void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
void pmap_copy_page(vm_page_t, vm_page_t);
void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset,
vm_page_t mb[], vm_offset_t b_offset, int xfersize);
-void pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t,
- vm_prot_t, boolean_t);
+int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot, u_int flags, int8_t psind);
void pmap_enter_object(pmap_t pmap, vm_offset_t start,
vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
@@ -144,6 +149,7 @@
void pmap_remove_write(vm_page_t m);
void pmap_sync_icache(pmap_t, vm_offset_t, vm_size_t);
boolean_t pmap_ts_referenced(vm_page_t m);
+void pmap_unwire(pmap_t pmap, vm_offset_t start, vm_offset_t end);
void pmap_zero_page(vm_page_t);
void pmap_zero_page_area(vm_page_t, int off, int size);
void pmap_zero_page_idle(vm_page_t);
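The pmap.h comment above spells out the new pmap_enter() contract: the low-order byte of the flags word carries the vm_prot_t describing the fault, and PMAP_ENTER_NOSLEEP/PMAP_ENTER_WIRED live above it. A sketch of that packing; the constants mirror the diff, the accessor is invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define VM_PROT_READ		0x01
#define VM_PROT_WRITE		0x02
#define PMAP_ENTER_NOSLEEP	0x0100	/* as added in the diff */
#define PMAP_ENTER_WIRED	0x0200	/* as added in the diff */

static uint8_t
pmap_flags_prot(unsigned int flags)
{
	return (flags & 0xff);		/* protection code of the fault */
}

int
main(void)
{
	unsigned int flags = VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_WIRED;

	printf("prot = 0x%x, wired = %d, nosleep = %d\n",
	    pmap_flags_prot(flags),
	    (flags & PMAP_ENTER_WIRED) != 0,
	    (flags & PMAP_ENTER_NOSLEEP) != 0);
	return (0);
}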
Modified: trunk/sys/vm/redzone.c
===================================================================
--- trunk/sys/vm/redzone.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/redzone.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $");
#include <sys/param.h>
#include <sys/systm.h>
Modified: trunk/sys/vm/redzone.h
===================================================================
--- trunk/sys/vm/redzone.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/redzone.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
*/
#ifndef _VM_REDZONE_H_
Modified: trunk/sys/vm/sg_pager.c
===================================================================
--- trunk/sys/vm/sg_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/sg_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2009 Advanced Computing Technologies LLC
+ * Copyright (c) 2009 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb at FreeBSD.org>
* All rights reserved.
*
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $");
/*
* This pager manages OBJT_SG objects. These objects are backed by
@@ -36,6 +37,7 @@
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/rwlock.h>
#include <sys/sglist.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -42,6 +44,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/uma.h>
static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
@@ -122,12 +125,14 @@
* Free up our fake pages.
*/
while ((m = TAILQ_FIRST(&object->un_pager.sgp.sgp_pglist)) != 0) {
- TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, pageq);
+ TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, plinks.q);
vm_page_putfake(m);
}
sg = object->handle;
sglist_free(sg);
+ object->handle = NULL;
+ object->type = OBJT_DEAD;
}
static int
@@ -141,10 +146,10 @@
size_t space;
int i;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
sg = object->handle;
memattr = object->memattr;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
offset = m[reqpage]->pindex;
/*
@@ -179,16 +184,18 @@
/* Construct a new fake page. */
page = vm_page_getfake(paddr, memattr);
- VM_OBJECT_LOCK(object);
- TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, pageq);
+ VM_OBJECT_WLOCK(object);
+ TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q);
/* Free the original pages and insert this fake page into the object. */
for (i = 0; i < count; i++) {
+ if (i == reqpage &&
+ vm_page_replace(page, object, offset) != m[i])
+ panic("sg_pager_getpages: invalid place replacement");
vm_page_lock(m[i]);
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- vm_page_insert(page, object, offset);
m[reqpage] = page;
page->valid = VM_PAGE_BITS_ALL;
Modified: trunk/sys/vm/swap_pager.c
===================================================================
--- trunk/sys/vm/swap_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/swap_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
@@ -50,7 +51,7 @@
*
* - on the fly reallocation of swap during putpages. The new system
* does not try to keep previously allocated swap blocks for dirty
- * pages.
+ * pages.
*
* - on the fly deallocation of swap
*
@@ -67,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $");
#include "opt_swap.h"
#include "opt_vm.h"
@@ -89,6 +90,7 @@
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/blist.h>
@@ -114,9 +116,8 @@
#include <geom/geom.h>
/*
- * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16
- * or 32 pages per allocation.
- * The 32-page limit is due to the radix code (kern/subr_blist.c).
+ * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
+ * The 64-page limit is due to the radix code (kern/subr_blist.c).
*/
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
@@ -133,7 +134,6 @@
* Unused disk addresses within a swap area are allocated and managed
* using a blist.
*/
-#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
#define SWAP_META_PAGES (SWB_NPAGES * 2)
#define SWAP_META_MASK (SWAP_META_PAGES - 1)
@@ -154,15 +154,21 @@
static int swdev_syscall_active = 0; /* serialize swap(on|off) */
static vm_ooffset_t swap_total;
-SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
+SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
"Total amount of available swap storage.");
static vm_ooffset_t swap_reserved;
-SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
+SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
+SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
+static unsigned long swzone;
+SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
+ "Actual size of swap metadata zone");
+static unsigned long swap_maxpages;
+SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
+ "Maximum amount of swap supported");
/* bits from overcommit */
#define SWAP_RESERVE_FORCE_ON (1 << 0)
@@ -184,7 +190,7 @@
static int curfail;
static struct timeval lastfail;
struct uidinfo *uip;
-
+
uip = cred->cr_ruidinfo;
if (incr & PAGE_MASK)
@@ -191,11 +197,13 @@
panic("swap_reserve: & PAGE_MASK");
#ifdef RACCT
- PROC_LOCK(curproc);
- error = racct_add(curproc, RACCT_SWAP, incr);
- PROC_UNLOCK(curproc);
- if (error != 0)
- return (0);
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ error = racct_add(curproc, RACCT_SWAP, incr);
+ PROC_UNLOCK(curproc);
+ if (error != 0)
+ return (0);
+ }
#endif
res = 0;
@@ -285,7 +293,7 @@
swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
{
struct uidinfo *uip;
-
+
uip = cred->cr_ruidinfo;
if (decr & PAGE_MASK)
@@ -328,7 +336,7 @@
SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
- CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
+ CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
/*
* "named" and "unnamed" anon region objects. Try to reduce the overhead
@@ -340,10 +348,9 @@
#define NOBJLIST(handle) \
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
-static struct mtx sw_alloc_mtx; /* protect list manipulation */
+static struct mtx sw_alloc_mtx; /* protect list manipulation */
static struct pagerlst swap_pager_object_list[NOBJLISTS];
static uma_zone_t swap_zone;
-static struct vm_object swap_zone_obj;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
@@ -373,18 +380,14 @@
};
/*
- * dmmax is in page-sized chunks with the new swap system. It was
- * dev-bsized chunks in the old. dmmax is always a power of 2.
- *
* swap_*() routines are externally accessible. swp_*() routines are
* internal.
*/
-static int dmmax;
static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
-SYSCTL_INT(_vm, OID_AUTO, dmmax,
- CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
+SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
+ "Maximum size of a swap block in pages");
static void swp_sizecheck(void);
static void swp_pager_async_iodone(struct buf *bp);
@@ -419,7 +422,7 @@
/*
* SWP_SIZECHECK() - update swap_pager_full indication
- *
+ *
* update the swap_pager_almost_full indication and warn when we are
* about to run out of swap space, using lowat/hiwat hysteresis.
*
@@ -474,7 +477,7 @@
/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
- * Expected to be started from system init. NOTE: This code is run
+ * Expected to be started from system init. NOTE: This code is run
* before much else so be careful what you depend on. Most of the VM
* system has yet to be initialized at this point.
*/
@@ -490,11 +493,7 @@
TAILQ_INIT(&swap_pager_object_list[i]);
mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
-
- /*
- * Device Stripe, in PAGE_SIZE'd blocks
- */
- dmmax = SWB_NPAGES * 2;
+ sx_init(&sw_alloc_sx, "swspsx");
}
/*
@@ -506,7 +505,7 @@
void
swap_pager_swap_init(void)
{
- int n, n2;
+ unsigned long n, n2;
/*
* Number of in-transit swap bp operations. Don't
@@ -519,7 +518,7 @@
* MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
* constrained by the swap device interleave stripe size.
*
- * Currently we hardwire nsw_wcount_async to 4. This limit is
+ * Currently we hardwire nsw_wcount_async to 4. This limit is
* designed to prevent other I/O from having high latencies due to
* our pageout I/O. The value 4 works well for one or two active swap
* devices but is probably a little low if you have more. Even so,
@@ -542,7 +541,7 @@
/*
* Initialize our zone. Right now I'm just guessing on the number
* we need based on the number of pages in the system. Each swblock
- * can hold 16 pages, so this is probably overkill. This reservation
+ * can hold 32 pages, so this is probably overkill. This reservation
* is typically limited to around 32MB by default.
*/
n = cnt.v_page_count / 2;
@@ -554,7 +553,7 @@
if (swap_zone == NULL)
panic("failed to create swap_zone.");
do {
- if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
+ if (uma_zone_reserve_kva(swap_zone, n))
break;
/*
* if the allocation failed, try a zone two thirds the
@@ -563,12 +562,14 @@
n -= ((n + 2) / 3);
} while (n > 0);
if (n2 != n)
- printf("Swap zone entries reduced from %d to %d.\n", n2, n);
+ printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
+ swap_maxpages = n * SWAP_META_PAGES;
+ swzone = n * sizeof(struct swblock);
n2 = n;
/*
* Initialize our meta-data hash table. The swapper does not need to
- * be quite as efficient as the VM system, so we do not use an
+ * be quite as efficient as the VM system, so we do not use an
* oversized hash table.
*
* n: size of hash table, must be power of 2
@@ -622,7 +623,7 @@
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
object->handle = handle;
if (cred != NULL) {
object->cred = cred;
@@ -629,7 +630,7 @@
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
sx_xunlock(&sw_alloc_sx);
mtx_unlock(&Giant);
@@ -640,13 +641,13 @@
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
return (object);
}
@@ -654,7 +655,7 @@
/*
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
*
- * The swap backing for the object is destroyed. The code is
+ * The swap backing for the object is destroyed. The code is
* designed such that we can reinstantiate it later, but this
* routine is typically called only when the entire object is
* about to be destroyed.
@@ -675,16 +676,18 @@
mtx_unlock(&sw_alloc_mtx);
}
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
vm_object_pip_wait(object, "swpdea");
/*
- * Free all remaining metadata. We only bother to free it from
+ * Free all remaining metadata. We only bother to free it from
* the swap meta data. We do not attempt to free swapblk's still
* associated with vm_page_t's for this object. We do not care
* if paging is still in progress on some objects.
*/
swp_pager_meta_free_all(object);
+ object->handle = NULL;
+ object->type = OBJT_DEAD;
}
/************************************************************************
@@ -748,7 +751,7 @@
return (blk >= sp->sw_first && blk < sp->sw_end);
}
-
+
static void
swp_pager_strategy(struct buf *bp)
{
@@ -758,6 +761,17 @@
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
mtx_unlock(&sw_dev_mtx);
+ if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
+ unmapped_buf_allowed) {
+ bp->b_kvaalloc = bp->b_data;
+ bp->b_data = unmapped_buf;
+ bp->b_kvabase = unmapped_buf;
+ bp->b_offset = 0;
+ bp->b_flags |= B_UNMAPPED;
+ } else {
+ pmap_qenter((vm_offset_t)bp->b_data,
+ &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
+ }
sp->sw_strategy(bp, sp);
return;
}
@@ -764,10 +778,10 @@
}
panic("Swapdev not found");
}
-
+
/*
- * SWP_PAGER_FREESWAPSPACE() - free raw swap space
+ * SWP_PAGER_FREESWAPSPACE() - free raw swap space
*
* This routine returns the specified swap blocks back to the bitmap.
*
@@ -785,7 +799,7 @@
/*
* If we are attempting to stop swapping on
* this device, we don't want to mark any
- * blocks free lest they be reused.
+ * blocks free lest they be reused.
*/
if ((sp->sw_flags & SW_CLOSING) == 0) {
blist_free(sp->sw_blist, blk - sp->sw_first,
@@ -808,15 +822,16 @@
*
* This routine removes swapblk assignments from swap metadata.
*
- * The external callers of this routine typically have already destroyed
- * or renamed vm_page_t's associated with this range in the object so
+ * The external callers of this routine typically have already destroyed
+ * or renamed vm_page_t's associated with this range in the object so
* we should be ok.
+ *
+ * The object must be locked.
*/
void
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
swp_pager_meta_free(object, start, size);
}
@@ -823,8 +838,8 @@
/*
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
*
- * Assigns swap blocks to the specified range within the object. The
- * swap blocks are not zerod. Any previous swap assignment is destroyed.
+ * Assigns swap blocks to the specified range within the object. The
+ * swap blocks are not zeroed. Any previous swap assignment is destroyed.
*
* Returns 0 on success, -1 on failure.
*/
@@ -835,7 +850,7 @@
daddr_t blk = SWAPBLK_NONE;
vm_pindex_t beg = start; /* save start index */
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
while (size) {
if (n == 0) {
n = BLIST_MAX_ALLOC;
@@ -843,7 +858,7 @@
n >>= 1;
if (n == 0) {
swp_pager_meta_free(object, beg, start - beg);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (-1);
}
}
@@ -855,7 +870,7 @@
--n;
}
swp_pager_meta_free(object, start, n);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (0);
}
@@ -869,7 +884,7 @@
*
* This routine is allowed to sleep. It may sleep allocating metadata
* indirectly through swp_pager_meta_build() or if paging is still in
- * progress on the source.
+ * progress on the source.
*
* The source object contains no vm_page_t's (which is just as well)
*
@@ -884,12 +899,12 @@
{
vm_pindex_t i;
- VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(srcobject);
+ VM_OBJECT_ASSERT_WLOCKED(dstobject);
/*
- * If destroysource is set, we remove the source object from the
- * swap_pager internal queue now.
+ * If destroysource is set, we remove the source object from the
+ * swap_pager internal queue now.
*/
if (destroysource) {
if (srcobject->handle != NULL) {
@@ -925,7 +940,7 @@
daddr_t srcaddr;
srcaddr = swp_pager_meta_ctl(
- srcobject,
+ srcobject,
i + offset,
SWM_POP
);
@@ -935,11 +950,11 @@
* swp_pager_meta_build() can sleep.
*/
vm_object_pip_add(srcobject, 1);
- VM_OBJECT_UNLOCK(srcobject);
+ VM_OBJECT_WUNLOCK(srcobject);
vm_object_pip_add(dstobject, 1);
swp_pager_meta_build(dstobject, i, srcaddr);
vm_object_pip_wakeup(dstobject);
- VM_OBJECT_LOCK(srcobject);
+ VM_OBJECT_WLOCK(srcobject);
vm_object_pip_wakeup(srcobject);
}
} else {
@@ -947,7 +962,7 @@
* Destination has valid swapblk or it is represented
* by a resident page. We destroy the sourceblock.
*/
-
+
swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
}
}
@@ -988,7 +1003,7 @@
{
daddr_t blk0;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(object);
/*
* do we have good backing store at the requested index ?
*/
@@ -1042,7 +1057,7 @@
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
*
* This removes any associated swap backing store, whether valid or
- * not, from the page.
+ * not, from the page.
*
* This routine is typically called when a page is made dirty, at
* which point any associated swap can be freed. MADV_FREE also
@@ -1054,12 +1069,13 @@
* depends on it.
*
* This routine may not sleep.
+ *
+ * The object containing the page must be locked.
*/
static void
swap_pager_unswapped(vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}
@@ -1071,7 +1087,7 @@
* a chunk surrounding m[reqpage] as is contiguous in swap and which
* belongs to the same object.
*
- * The code is designed for asynchronous operation and
+ * The code is designed for asynchronous operation and
* immediate-notification of 'reqpage' but tends not to be
* used that way. Please do not optimize-out this algorithmic
* feature, I intend to improve on it in the future.
@@ -1101,7 +1117,7 @@
* Calculate range to retrieve. The pages have already been assigned
* their swapblks. We require a *contiguous* range but we know it to
* not span devices. If we do not supply it, bad things
- * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
+ * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
* loops are set up such that the case(s) are handled implicitly.
*
* The swp_*() calls must be made with the object locked.
@@ -1139,7 +1155,7 @@
}
/*
- * Return VM_PAGER_FAIL if we have nothing to do. Return mreq
+ * Return VM_PAGER_FAIL if we have nothing to do. Return mreq
* still busy, but the others unbusied.
*/
if (blk == SWAPBLK_NONE)
@@ -1148,7 +1164,7 @@
/*
* Getpbuf() can sleep.
*/
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* Get a swap buffer header to perform the IO
*/
@@ -1155,11 +1171,6 @@
bp = getpbuf(&nsw_rcount);
bp->b_flags |= B_PAGING;
- /*
- * map our page(s) into kva for input
- */
- pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
-
bp->b_iocmd = BIO_READ;
bp->b_iodone = swp_pager_async_iodone;
bp->b_rcred = crhold(thread0.td_ucred);
@@ -1169,7 +1180,7 @@
bp->b_bufsize = PAGE_SIZE * (j - i);
bp->b_pager.pg_reqpage = reqpage - i;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
{
int k;
@@ -1188,7 +1199,7 @@
* does not remove it.
*/
vm_object_pip_add(object, bp->b_npages);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* perform the I/O. NOTE!!! bp cannot be considered valid after
@@ -1209,11 +1220,12 @@
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
* is set in the meta-data.
*/
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
- mreq->oflags |= VPO_WANTED;
+ mreq->oflags |= VPO_SWAPSLEEP;
PCPU_INC(cnt.v_intrans);
- if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
+ if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
+ "swread", hz * 20)) {
printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
@@ -1234,13 +1246,13 @@
/*
* A final note: in a low swap situation, we cannot deallocate swap
* and mark a page dirty here because the caller is likely to mark
- * the page clean when we return, causing the page to possibly revert
+ * the page clean when we return, causing the page to possibly revert
* to all-zero's later.
*/
}
/*
- * swap_pager_putpages:
+ * swap_pager_putpages:
*
* Assign swap (if necessary) and initiate I/O on the specified pages.
*
@@ -1247,8 +1259,8 @@
* We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
* are automatically converted to SWAP objects.
*
- * In a low memory situation we may block in VOP_STRATEGY(), but the new
- * vm_page reservation system coupled with properly written VFS devices
+ * In a low memory situation we may block in VOP_STRATEGY(), but the new
+ * vm_page reservation system coupled with properly written VFS devices
* should ensure that no low-memory deadlock occurs. This is an area
* which needs work.
*
@@ -1263,14 +1275,14 @@
*/
void
swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
- boolean_t sync, int *rtvals)
+ int flags, int *rtvals)
{
- int i;
- int n = 0;
+ int i, n;
+ boolean_t sync;
if (count && m[0]->object != object) {
- panic("swap_pager_putpages: object mismatch %p/%p",
- object,
+ panic("swap_pager_putpages: object mismatch %p/%p",
+ object,
m[0]->object
);
}
@@ -1284,15 +1296,18 @@
*/
if (object->type != OBJT_SWAP)
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
+ n = 0;
if (curproc != pageproc)
sync = TRUE;
+ else
+ sync = (flags & VM_PAGER_PUT_SYNC) != 0;
/*
* Step 2
*
- * Update nsw parameters from swap_async_max sysctl values.
+ * Update nsw parameters from swap_async_max sysctl values.
* Do not let the sysop crash the machine with bogus numbers.
*/
mtx_lock(&pbuf_mtx);
@@ -1371,8 +1386,6 @@
bp->b_flags |= B_PAGING;
bp->b_iocmd = BIO_WRITE;
- pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
-
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
bp->b_bcount = PAGE_SIZE * n;
@@ -1379,22 +1392,22 @@
bp->b_bufsize = PAGE_SIZE * n;
bp->b_blkno = blk;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
for (j = 0; j < n; ++j) {
vm_page_t mreq = m[i+j];
swp_pager_meta_build(
- mreq->object,
+ mreq->object,
mreq->pindex,
blk + j
);
- vm_page_dirty(mreq);
+ MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
rtvals[i+j] = VM_PAGER_OK;
mreq->oflags |= VPO_SWAPINPROG;
bp->b_pages[j] = mreq;
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
bp->b_npages = n;
/*
* Must set dirty range for NFS to work.
@@ -1444,7 +1457,7 @@
*/
swp_pager_async_iodone(bp);
}
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
/*
@@ -1453,12 +1466,6 @@
* Completion routine for asynchronous reads and writes from/to swap.
* Also called manually by synchronous code to finish up a bp.
*
- * For READ operations, the pages are VPO_BUSY'd. For WRITE operations,
- * the pages are vm_page_t->busy'd. For READ operations, we VPO_BUSY
- * unbusy all pages except the 'main' request page. For WRITE
- * operations, we vm_page_t->busy'd unbusy all pages ( we can do this
- * because we marked them all VM_PAGER_PEND on return from putpages ).
- *
* This routine may not sleep.
*/
static void
@@ -1475,7 +1482,7 @@
"swap_pager: I/O error - %s failed; blkno %ld,"
"size %ld, error %d\n",
((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
- (long)bp->b_blkno,
+ (long)bp->b_blkno,
(long)bp->b_bcount,
bp->b_error
);
@@ -1484,11 +1491,16 @@
/*
* remove the mapping for kernel virtual
*/
- pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ bp->b_data = bp->b_kvaalloc;
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_UNMAPPED;
+ } else
+ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
if (bp->b_npages) {
object = bp->b_pages[0]->object;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
/*
@@ -1495,7 +1507,7 @@
* cleanup pages. If an error occurs writing to swap, we are in
* very serious trouble. If it happens to be a disk error, though,
* we may be able to recover by reassigning the swap later on. So
- * in this case we remove the m->swapblk assignment for the page
+ * in this case we remove the m->swapblk assignment for the page
* but do not free it in the rlist. The errornous block(s) are thus
* never reallocated as swap. Redirty the page and continue.
*/
@@ -1503,12 +1515,16 @@
vm_page_t m = bp->b_pages[i];
m->oflags &= ~VPO_SWAPINPROG;
+ if (m->oflags & VPO_SWAPSLEEP) {
+ m->oflags &= ~VPO_SWAPSLEEP;
+ wakeup(&object->paging_in_progress);
+ }
if (bp->b_ioflags & BIO_ERROR) {
/*
* If an error occurs I'd love to throw the swapblk
* away without freeing it back to swapspace, so it
- * can never be used again. But I can't from an
+ * can never be used again. But I can't from an
* interrupt.
*/
if (bp->b_iocmd == BIO_READ) {
@@ -1517,7 +1533,7 @@
* locked for the parent, but all other
* pages can be freed. We still want to
* wakeup the parent waiting on the page,
- * though. ( also: pg_reqpage can be -1 and
+ * though. ( also: pg_reqpage can be -1 and
* not match anything ).
*
* We have to wake specifically requested pages
@@ -1531,10 +1547,13 @@
m->valid = 0;
if (i != bp->b_pager.pg_reqpage)
swp_pager_free_nrpage(m);
- else
+ else {
+ vm_page_lock(m);
vm_page_flash(m);
+ vm_page_unlock(m);
+ }
/*
- * If i == bp->b_pager.pg_reqpage, do not wake
+ * If i == bp->b_pager.pg_reqpage, do not wake
* the page up. The caller needs to.
*/
} else {
@@ -1547,11 +1566,11 @@
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
- vm_page_io_finish(m);
+ vm_page_sunbusy(m);
}
} else if (bp->b_iocmd == BIO_READ) {
/*
- * NOTE: for reads, m->dirty will probably be
+ * NOTE: for reads, m->dirty will probably be
* overridden by the original caller of getpages so
* we cannot set them in order to free the underlying
* swap in a low-swap situation. I don't think we'd
@@ -1563,8 +1582,8 @@
*
* Note that the requested page, reqpage, is left
* busied, but we still have to wake it up. The
- * other pages are released (unbusied) by
- * vm_page_wakeup().
+ * other pages are released (unbusied) by
+ * vm_page_xunbusy().
*/
KASSERT(!pmap_page_is_mapped(m),
("swp_pager_async_iodone: page %p is mapped", m));
@@ -1577,7 +1596,7 @@
* up too because we cleared VPO_SWAPINPROG and
* could be waiting for it in getpages. However,
* be sure to not unbusy getpages specifically
- * requested page - getpages expects it to be
+ * requested page - getpages expects it to be
* left busy.
*/
if (i != bp->b_pager.pg_reqpage) {
@@ -1584,13 +1603,16 @@
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
- vm_page_wakeup(m);
- } else
+ vm_page_xunbusy(m);
+ } else {
+ vm_page_lock(m);
vm_page_flash(m);
+ vm_page_unlock(m);
+ }
} else {
/*
* For write success, clear the dirty
- * status, then finish the I/O ( which decrements the
+ * status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiter's up ).
*/
KASSERT(!pmap_page_is_write_mapped(m),
@@ -1597,7 +1619,7 @@
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
- vm_page_io_finish(m);
+ vm_page_sunbusy(m);
if (vm_page_count_severe()) {
vm_page_lock(m);
vm_page_try_to_cache(m);
@@ -1612,11 +1634,11 @@
*/
if (object != NULL) {
vm_object_pip_wakeupn(object, bp->b_npages);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
- /*
- * swapdev_strategy() manually sets b_vp and b_bufobj before calling
+ /*
+ * swapdev_strategy() manually sets b_vp and b_bufobj before calling
* bstrategy(). Set them back to NULL now we're done with it, or we'll
* trigger a KASSERT in relpbuf().
*/
@@ -1628,10 +1650,10 @@
* release the physical I/O buffer
*/
relpbuf(
- bp,
- ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
- ((bp->b_flags & B_ASYNC) ?
- &nsw_wcount_async :
+ bp,
+ ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
+ ((bp->b_flags & B_ASYNC) ?
+ &nsw_wcount_async :
&nsw_wcount_sync
)
)
@@ -1653,7 +1675,7 @@
int bcount;
int i;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type != OBJT_SWAP)
return (0);
@@ -1695,7 +1717,7 @@
vm_page_t m;
vm_object_pip_add(object, 1);
- m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+ m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
if (m->valid == VM_PAGE_BITS_ALL) {
vm_object_pip_subtract(object, 1);
vm_page_dirty(m);
@@ -1702,7 +1724,7 @@
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
- vm_page_wakeup(m);
+ vm_page_xunbusy(m);
vm_pager_page_unswapped(m);
return;
}
@@ -1714,7 +1736,7 @@
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
- vm_page_wakeup(m);
+ vm_page_xunbusy(m);
vm_pager_page_unswapped(m);
}
@@ -1732,36 +1754,49 @@
swap_pager_swapoff(struct swdevt *sp)
{
struct swblock *swap;
+ vm_object_t locked_obj, object;
+ vm_pindex_t pindex;
int i, j, retries;
GIANT_REQUIRED;
retries = 0;
+ locked_obj = NULL;
full_rescan:
mtx_lock(&swhash_mtx);
for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
restart:
for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
- vm_object_t object = swap->swb_object;
- vm_pindex_t pindex = swap->swb_index;
- for (j = 0; j < SWAP_META_PAGES; ++j) {
- if (swp_pager_isondev(swap->swb_pages[j], sp)) {
- /* avoid deadlock */
- if (!VM_OBJECT_TRYLOCK(object)) {
- break;
- } else {
+ object = swap->swb_object;
+ pindex = swap->swb_index;
+ for (j = 0; j < SWAP_META_PAGES; ++j) {
+ if (!swp_pager_isondev(swap->swb_pages[j], sp))
+ continue;
+ if (locked_obj != object) {
+ if (locked_obj != NULL)
+ VM_OBJECT_WUNLOCK(locked_obj);
+ locked_obj = object;
+ if (!VM_OBJECT_TRYWLOCK(object)) {
mtx_unlock(&swhash_mtx);
- swp_pager_force_pagein(object,
- pindex + j);
- VM_OBJECT_UNLOCK(object);
+ /* Depends on type-stability. */
+ VM_OBJECT_WLOCK(object);
mtx_lock(&swhash_mtx);
goto restart;
}
}
- }
+ MPASS(locked_obj == object);
+ mtx_unlock(&swhash_mtx);
+ swp_pager_force_pagein(object, pindex + j);
+ mtx_lock(&swhash_mtx);
+ goto restart;
+ }
}
}
mtx_unlock(&swhash_mtx);
+ if (locked_obj != NULL) {
+ VM_OBJECT_WUNLOCK(locked_obj);
+ locked_obj = NULL;
+ }
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
@@ -1783,7 +1818,7 @@
* SWAP META DATA *
************************************************************************
*
- * These routines manipulate the swap metadata stored in the
+ * These routines manipulate the swap metadata stored in the
* OBJT_SWAP object.
*
* Swap metadata is implemented with a global hash and not directly
@@ -1809,7 +1844,7 @@
struct swblock **pswap;
int idx;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* Convert default object to swap object if necessary
*/
@@ -1821,13 +1856,13 @@
mtx_lock(&sw_alloc_mtx);
TAILQ_INSERT_TAIL(
NOBJLIST(object->handle),
- object,
+ object,
pager_object_list
);
mtx_unlock(&sw_alloc_mtx);
}
}
-
+
/*
* Locate hash entry. If not found create, but if we aren't adding
* anything just return. If we run out of space in the map we wait
@@ -1843,12 +1878,13 @@
if (swapblk == SWAPBLK_NONE)
goto done;
- swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
+ swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
+ (curproc == pageproc ? M_USE_RESERVE : 0));
if (swap == NULL) {
mtx_unlock(&swhash_mtx);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
if (uma_zone_exhausted(swap_zone)) {
- if (atomic_cmpset_rel_int(&exhausted, 0, 1))
+ if (atomic_cmpset_int(&exhausted, 0, 1))
printf("swap zone exhausted, "
"increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
@@ -1855,11 +1891,11 @@
pause("swzonex", 10);
} else
VM_WAIT;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
goto retry;
}
- if (atomic_cmpset_rel_int(&exhausted, 1, 0))
+ if (atomic_cmpset_int(&exhausted, 1, 0))
printf("swap zone ok\n");
swap->swb_hnext = NULL;
@@ -1896,10 +1932,10 @@
/*
* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
*
- * The requested range of blocks is freed, with any associated swap
+ * The requested range of blocks is freed, with any associated swap
* returned to the swap bitmap.
*
- * This routine will free swap metadata structures as they are cleaned
+ * This routine will free swap metadata structures as they are cleaned
* out. This routine does *NOT* operate on swap metadata associated
* with resident pages.
*/
@@ -1907,7 +1943,7 @@
swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(object);
if (object->type != OBJT_SWAP)
return;
@@ -1951,23 +1987,22 @@
static void
swp_pager_meta_free_all(vm_object_t object)
{
- daddr_t index = 0;
+ struct swblock **pswap, *swap;
+ vm_pindex_t index;
+ daddr_t v;
+ int i;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
if (object->type != OBJT_SWAP)
return;
- while (object->un_pager.swp.swp_bcount) {
- struct swblock **pswap;
- struct swblock *swap;
-
+ index = 0;
+ while (object->un_pager.swp.swp_bcount != 0) {
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, index);
if ((swap = *pswap) != NULL) {
- int i;
-
for (i = 0; i < SWAP_META_PAGES; ++i) {
- daddr_t v = swap->swb_pages[i];
+ v = swap->swb_pages[i];
if (v != SWAPBLK_NONE) {
--swap->swb_count;
swp_pager_freeswapspace(v, 1);
@@ -1974,7 +2009,8 @@
}
}
if (swap->swb_count != 0)
- panic("swap_pager_meta_free_all: swb_count != 0");
+ panic(
+ "swap_pager_meta_free_all: swb_count != 0");
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
@@ -1991,14 +2027,14 @@
* swapblk assignments in the swap meta data or in the vm_page_t.
* The routine typically returns the swapblk being looked-up, or popped,
* or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
- * was invalid. This routine will automatically free any invalid
+ * was invalid. This routine will automatically free any invalid
* meta-data swapblks.
*
* It is not possible to store invalid swapblks in the swap meta data
 * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
*
- * When acting on a busy resident page and paging is in progress, we
- * have to wait until paging is complete but otherwise can act on the
+ * When acting on a busy resident page and paging is in progress, we
+ * have to wait until paging is complete but otherwise can act on the
* busy page.
*
* SWM_FREE remove and free swap block from metadata
@@ -2012,9 +2048,9 @@
daddr_t r1;
int idx;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(object);
/*
- * The meta data only exists of the object is OBJT_SWAP
+ * The meta data only exists of the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
if (object->type != OBJT_SWAP)
@@ -2040,7 +2076,7 @@
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
- }
+ }
}
}
mtx_unlock(&swhash_mtx);
@@ -2058,7 +2094,7 @@
};
#endif
-/*
+/*
* MPSAFE
*/
/* ARGSUSED */
@@ -2144,7 +2180,8 @@
}
static void
-swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
+swaponsomething(struct vnode *vp, void *id, u_long nblks,
+ sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
{
struct swdevt *sp, *tsp;
swblk_t dvbase;
@@ -2153,7 +2190,7 @@
/*
* nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
* First chop nblks off to page-align it, then convert.
- *
+ *
* sw->sw_nblks is in page-sized chunks now too.
*/
nblks &= ~(ctodb(1) - 1);
@@ -2180,6 +2217,7 @@
sp->sw_used = 0;
sp->sw_strategy = strategy;
sp->sw_close = close;
+ sp->sw_flags = flags;
sp->sw_blist = blist_create(nblks, M_WAITOK);
/*
@@ -2204,7 +2242,7 @@
sp->sw_end = dvbase + nblks;
TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
nswapdev++;
- swap_pager_avail += nblks;
+ swap_pager_avail += nblks - 2;
swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
swapon_check_swzone(swap_total / PAGE_SIZE);
swp_sizecheck();
@@ -2276,7 +2314,7 @@
static int
swapoff_one(struct swdevt *sp, struct ucred *cred)
{
- u_long nblks, dvbase;
+ u_long nblks;
#ifdef MAC
int error;
#endif
@@ -2307,10 +2345,7 @@
*/
mtx_lock(&sw_dev_mtx);
sp->sw_flags |= SW_CLOSING;
- for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
- swap_pager_avail -= blist_fill(sp->sw_blist,
- dvbase, dmmax);
- }
+ swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
mtx_unlock(&sw_dev_mtx);
@@ -2320,8 +2355,8 @@
swap_pager_swapoff(sp);
sp->sw_close(curthread, sp);
+ mtx_lock(&sw_dev_mtx);
sp->sw_id = NULL;
- mtx_lock(&sw_dev_mtx);
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
if (nswapdev == 0) {
@@ -2342,12 +2377,12 @@
struct swdevt *sp, *spt;
const char *devname;
int error;
-
+
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
-
+
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
mtx_unlock(&sw_dev_mtx);
@@ -2365,7 +2400,7 @@
mtx_lock(&sw_dev_mtx);
}
mtx_unlock(&sw_dev_mtx);
-
+
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
@@ -2465,7 +2500,7 @@
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
(object = cur->object.vm_object) != NULL) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->type == OBJT_SWAP &&
object->un_pager.swp.swp_bcount != 0) {
n = (cur->end - cur->start) / PAGE_SIZE;
@@ -2472,7 +2507,7 @@
count += object->un_pager.swp.swp_bcount *
SWAP_META_PAGES * n / object->size + 1;
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
}
return (count);
@@ -2497,11 +2532,52 @@
static void
+swapgeom_close_ev(void *arg, int flags)
+{
+ struct g_consumer *cp;
+
+ cp = arg;
+ g_access(cp, -1, -1, 0);
+ g_detach(cp);
+ g_destroy_consumer(cp);
+}
+
+/*
+ * Add a reference to the g_consumer for an inflight transaction.
+ */
+static void
+swapgeom_acquire(struct g_consumer *cp)
+{
+
+ mtx_assert(&sw_dev_mtx, MA_OWNED);
+ cp->index++;
+}
+
+/*
+ * Remove a reference from the g_consumer. Post a close event if
+ * all references go away.
+ */
+static void
+swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
+{
+
+ mtx_assert(&sw_dev_mtx, MA_OWNED);
+ cp->index--;
+ if (cp->index == 0) {
+ if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
+ sp->sw_id = NULL;
+ }
+}
+
+static void
swapgeom_done(struct bio *bp2)
{
+ struct swdevt *sp;
struct buf *bp;
+ struct g_consumer *cp;
bp = bp2->bio_caller2;
+ cp = bp2->bio_from;
bp->b_ioflags = bp2->bio_flags;
if (bp2->bio_error)
bp->b_ioflags |= BIO_ERROR;
@@ -2508,6 +2584,10 @@
bp->b_resid = bp->b_bcount - bp2->bio_completed;
bp->b_error = bp2->bio_error;
bufdone(bp);
+ sp = bp2->bio_caller1;
+ mtx_lock(&sw_dev_mtx);
+ swapgeom_release(cp, sp);
+ mtx_unlock(&sw_dev_mtx);
g_destroy_bio(bp2);
}
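
The pattern introduced above keeps the GEOM consumer alive for as long as swap I/O is outstanding: swapgeom_strategy() calls swapgeom_acquire() under sw_dev_mtx before issuing each bio, and swapgeom_done() drops the reference again, with the final drop posting swapgeom_close_ev() rather than tearing the consumer down inline (presumably because the drop can happen from the I/O completion path). The same idiom in miniature, stripped of the GEOM specifics; struct res, res_acquire(), res_release() and res_teardown() are hypothetical names, not part of this commit:

struct res {
	struct mtx	r_lock;		/* protects r_refs and r_closing */
	int		r_refs;		/* in-flight I/Os, plus one while open */
	int		r_closing;	/* set when the backing device goes away */
};

static void	res_teardown(struct res *);	/* cf. swapgeom_close_ev() */

static void
res_acquire(struct res *r)
{

	mtx_lock(&r->r_lock);
	r->r_refs++;
	mtx_unlock(&r->r_lock);
}

static void
res_release(struct res *r)
{
	int last;

	mtx_lock(&r->r_lock);
	last = (--r->r_refs == 0);
	mtx_unlock(&r->r_lock);
	/* Only the final reference performs (or schedules) the teardown. */
	if (last && r->r_closing)
		res_teardown(r);
}
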
@@ -2517,18 +2597,25 @@
struct bio *bio;
struct g_consumer *cp;
+ mtx_lock(&sw_dev_mtx);
cp = sp->sw_id;
if (cp == NULL) {
+ mtx_unlock(&sw_dev_mtx);
bp->b_error = ENXIO;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
+ swapgeom_acquire(cp);
+ mtx_unlock(&sw_dev_mtx);
if (bp->b_iocmd == BIO_WRITE)
bio = g_new_bio();
else
bio = g_alloc_bio();
if (bio == NULL) {
+ mtx_lock(&sw_dev_mtx);
+ swapgeom_release(cp, sp);
+ mtx_unlock(&sw_dev_mtx);
bp->b_error = ENOMEM;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
@@ -2535,12 +2622,22 @@
return;
}
+ bio->bio_caller1 = sp;
bio->bio_caller2 = bp;
bio->bio_cmd = bp->b_iocmd;
- bio->bio_data = bp->b_data;
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
bio->bio_length = bp->b_bcount;
bio->bio_done = swapgeom_done;
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ bio->bio_ma = bp->b_pages;
+ bio->bio_data = unmapped_buf;
+ bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bio->bio_ma_n = bp->b_npages;
+ bio->bio_flags |= BIO_UNMAPPED;
+ } else {
+ bio->bio_data = bp->b_data;
+ bio->bio_ma = NULL;
+ }
g_io_request(bio, cp);
return;
}
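
The hunk above teaches swapgeom_strategy() to pass unmapped buffers straight through to GEOM: when the pbuf carries B_UNMAPPED, the provider gets the page list (bio_ma) and BIO_UNMAPPED instead of a mapped data pointer. Whether that path is usable at all is decided at swapon time, where swapongeom_ev() (further down) records SW_UNMAPPED only if the provider advertised G_PF_ACCEPT_UNMAPPED. Reduced to its essentials, with comments added, the decisive branch is:

	if ((bp->b_flags & B_UNMAPPED) != 0) {
		/* Hand the provider raw pages; it maps them only if it must. */
		bio->bio_ma = bp->b_pages;
		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
		bio->bio_ma_n = bp->b_npages;
		bio->bio_data = unmapped_buf;
		bio->bio_flags |= BIO_UNMAPPED;
	} else {
		/* Classic path: the pbuf is already mapped into KVA. */
		bio->bio_data = bp->b_data;
		bio->bio_ma = NULL;
	}
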
@@ -2549,31 +2646,41 @@
swapgeom_orphan(struct g_consumer *cp)
{
struct swdevt *sp;
+ int destroy;
mtx_lock(&sw_dev_mtx);
- TAILQ_FOREACH(sp, &swtailq, sw_list)
- if (sp->sw_id == cp)
+ TAILQ_FOREACH(sp, &swtailq, sw_list) {
+ if (sp->sw_id == cp) {
sp->sw_flags |= SW_CLOSING;
+ break;
+ }
+ }
+ /*
+ * Drop reference we were created with. Do directly since we're in a
+ * special context where we don't have to queue the call to
+ * swapgeom_close_ev().
+ */
+ cp->index--;
+ destroy = ((sp != NULL) && (cp->index == 0));
+ if (destroy)
+ sp->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
+ if (destroy)
+ swapgeom_close_ev(cp, 0);
}
static void
-swapgeom_close_ev(void *arg, int flags)
+swapgeom_close(struct thread *td, struct swdevt *sw)
{
struct g_consumer *cp;
- cp = arg;
- g_access(cp, -1, -1, 0);
- g_detach(cp);
- g_destroy_consumer(cp);
-}
-
-static void
-swapgeom_close(struct thread *td, struct swdevt *sw)
-{
-
+ mtx_lock(&sw_dev_mtx);
+ cp = sw->sw_id;
+ sw->sw_id = NULL;
+ mtx_unlock(&sw_dev_mtx);
/* XXX: direct call when Giant untangled */
- g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
+ if (cp != NULL)
+ g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
}
@@ -2614,6 +2721,8 @@
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap");
cp = g_new_consumer(gp);
+ cp->index = 1; /* Number of active I/Os, plus one for being active. */
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
g_attach(cp, pp);
/*
* XXX: Everytime you think you can improve the margin for
@@ -2630,9 +2739,9 @@
}
nblks = pp->mediasize / DEV_BSIZE;
swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
- swapgeom_close, dev2udev(swh->dev));
+ swapgeom_close, dev2udev(swh->dev),
+ (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
swh->error = 0;
- return;
}
static int
@@ -2709,7 +2818,7 @@
}
}
mtx_unlock(&sw_dev_mtx);
-
+
(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_system_check_swapon(td->td_ucred, vp);
@@ -2721,6 +2830,6 @@
return (error);
swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
- NODEV);
+ NODEV, 0);
return (0);
}
Modified: trunk/sys/vm/swap_pager.h
===================================================================
--- trunk/sys/vm/swap_pager.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/swap_pager.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991 The Regents of the University of California.
@@ -32,7 +33,7 @@
* SUCH DAMAGE.
*
* from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $
*/
#ifndef _VM_SWAP_PAGER_H_
@@ -68,6 +69,7 @@
sw_close_t *sw_close;
};
+#define SW_UNMAPPED 0x01
#define SW_CLOSING 0x04
#ifdef _KERNEL
Modified: trunk/sys/vm/uma.h
===================================================================
--- trunk/sys/vm/uma.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -24,7 +25,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $
*
*/
@@ -33,8 +34,8 @@
*
*/
-#ifndef VM_UMA_H
-#define VM_UMA_H
+#ifndef _VM_UMA_H_
+#define _VM_UMA_H_
#include <sys/param.h> /* For NULL */
#include <sys/malloc.h> /* For M_* */
@@ -50,7 +51,7 @@
void zone_drain(uma_zone_t);
-/*
+/*
* Item constructor
*
* Arguments:
@@ -58,7 +59,7 @@
* arg The arg field passed to uma_zalloc_arg
* size The size of the allocated item
* flags See zalloc flags
- *
+ *
* Returns:
* 0 on success
* errno on failure
@@ -76,7 +77,7 @@
* item A pointer to the memory which has been allocated.
* size The size of the item being destructed.
* arg Argument passed through uma_zfree_arg
- *
+ *
* Returns:
* Nothing
*
@@ -87,7 +88,7 @@
*/
typedef void (*uma_dtor)(void *mem, int size, void *arg);
-/*
+/*
* Item initializer
*
* Arguments:
@@ -94,13 +95,13 @@
* item A pointer to the memory which has been allocated.
* size The size of the item being initialized.
* flags See zalloc flags
- *
+ *
* Returns:
* 0 on success
* errno on failure
*
* Discussion:
- * The initializer is called when the memory is cached in the uma zone.
+ * The initializer is called when the memory is cached in the uma zone.
* The initializer and the destructor should leave the object in the same
* state.
*/
@@ -110,7 +111,7 @@
* Item discard function
*
* Arguments:
- * item A pointer to memory which has been 'freed' but has not left the
+ * item A pointer to memory which has been 'freed' but has not left the
* zone's cache.
* size The size of the item being discarded.
*
@@ -124,9 +125,19 @@
typedef void (*uma_fini)(void *mem, int size);
/*
+ * Import new memory into a cache zone.
+ */
+typedef int (*uma_import)(void *arg, void **store, int count, int flags);
+
+/*
+ * Free memory from a cache zone.
+ */
+typedef void (*uma_release)(void *arg, void **store, int count);
+
+/*
* What's the difference between initializing and constructing?
*
- * The item is initialized when it is cached, and this is the state that the
+ * The item is initialized when it is cached, and this is the state that the
* object should be in when returned to the allocator. The purpose of this is
* to remove some code which would otherwise be called on each allocation by
* utilizing a known, stable state. This differs from the constructor which
@@ -167,7 +178,7 @@
*/
uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor,
uma_dtor dtor, uma_init uminit, uma_fini fini,
- int align, u_int32_t flags);
+ int align, uint32_t flags);
/*
* Create a secondary uma zone
@@ -211,11 +222,24 @@
* the only supported.
*
* Returns:
- * Error on failure, 0 on success.
+ * Error on failure, 0 on success.
*/
int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
/*
+ * Create cache-only zones.
+ *
+ * This allows uma's per-cpu cache facilities to handle arbitrary
+ * pointers. Consumers must specify the import and release functions to
+ * fill and destroy caches. UMA does not allocate any memory for these
+ * zones. The 'arg' parameter is passed to import/release and is caller
+ * specific.
+ */
+uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_import zimport,
+ uma_release zrelease, void *arg, int flags);
+
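+
+A cache-only zone gives a consumer UMA's per-CPU buckets without letting UMA allocate any backing memory: the import callback fills store[] with up to count items and returns how many it actually produced, and release gets the same array back when buckets are drained. A minimal sketch under stated assumptions (struct mypool, struct myitem, mypool_import() and mypool_release() are hypothetical, not part of this commit):
+
+struct mypool {
+	struct mtx	 mp_lock;
+	void		**mp_free;	/* preallocated stack of free items */
+	int		 mp_nfree;
+};
+
+static int
+mypool_import(void *arg, void **store, int count, int flags)
+{
+	struct mypool *mp = arg;
+	int i;
+
+	mtx_lock(&mp->mp_lock);
+	for (i = 0; i < count && mp->mp_nfree > 0; i++)
+		store[i] = mp->mp_free[--mp->mp_nfree];
+	mtx_unlock(&mp->mp_lock);
+	return (i);			/* may be fewer than requested */
+}
+
+static void
+mypool_release(void *arg, void **store, int count)
+{
+	struct mypool *mp = arg;
+	int i;
+
+	mtx_lock(&mp->mp_lock);
+	for (i = 0; i < count; i++)
+		mp->mp_free[mp->mp_nfree++] = store[i];
+	mtx_unlock(&mp->mp_lock);
+}
+
+	/* At initialization time, e.g. from a SYSINIT: */
+	zone = uma_zcache_create("mypool", sizeof(struct myitem),
+	    NULL, NULL, NULL, NULL, mypool_import, mypool_release, mp, 0);
+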
+/*
* Definitions for uma_zcreate flags
*
* These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to
@@ -252,6 +276,10 @@
* Zone's pages will not be included in
* mini-dumps.
*/
+#define UMA_ZONE_PCPU 0x8000 /*
+ * Allocates mp_ncpus slabs sized to
+ * sizeof(struct pcpu).
+ */
/*
* These flags are shared between the keg and zone. In zones wishing to add
@@ -259,8 +287,8 @@
* physical parameters of the request and may not be provided by the consumer.
*/
#define UMA_ZONE_INHERIT \
- (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_HASH | \
- UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB)
+ (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \
+ UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -269,6 +297,7 @@
#define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */
#define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */
#define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */
+#define UMA_ALIGNOF(type) (_Alignof(type) - 1) /* Alignment fit for 'type' */
/*
* Destroys an empty uma zone. If the zone is not empty uma complains loudly.
@@ -355,7 +384,8 @@
* A pointer to the allocated memory or NULL on failure.
*/
-typedef void *(*uma_alloc)(uma_zone_t zone, int size, u_int8_t *pflag, int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
+ int wait);
/*
* Backend page free routines
@@ -368,7 +398,7 @@
* Returns:
* None
*/
-typedef void (*uma_free)(void *item, int size, u_int8_t pflag);
+typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
@@ -403,7 +433,7 @@
* Discussion:
* uma_startup2 is called by kmeminit() to enable us of uma for malloc.
*/
-
+
void uma_startup2(void);
/*
@@ -432,24 +462,29 @@
void uma_set_align(int align);
/*
- * Switches the backing object of a zone
+ * Set a reserved number of items to hold for M_USE_RESERVE allocations. All
+ * other requests must allocate new backing pages.
+ */
+void uma_zone_reserve(uma_zone_t zone, int nitems);
+
+/*
+ * Reserves the maximum KVA space required by the zone and configures the zone
+ * to use a VM_ALLOC_NOOBJ-based backend allocator.
*
* Arguments:
* zone The zone to update.
- * obj The VM object to use for future allocations.
- * size The size of the object to allocate.
+ * nitems The upper limit on the number of items that can be allocated.
*
* Returns:
- * 0 if kva space can not be allocated
+ * 0 if KVA space can not be allocated
* 1 if successful
*
* Discussion:
- * A NULL object can be used and uma will allocate one for you. Setting
- * the size will limit the amount of memory allocated to this zone.
- *
+ * When the machine supports a direct map and the zone's items are smaller
+ * than a page, the zone will use the direct map instead of allocating KVA
+ * space.
*/
-struct vm_object;
-int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
+int uma_zone_reserve_kva(uma_zone_t zone, int nitems);
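+
+uma_zone_reserve() sets aside a number of items that only M_USE_RESERVE requests may consume, which is the interface behind the swap metadata change earlier in this commit: swp_pager_meta_build() now adds M_USE_RESERVE when running as the pagedaemon so that paging out under memory pressure cannot starve itself of swblock entries. A hedged sketch of the pairing; the reserve size and creation flags here are illustrative, the commit itself only adds the M_USE_RESERVE bit:
+
+	/* Setup: keep a small emergency pool for the pagedaemon. */
+	swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_reserve(swap_zone, 16);
+
+	/* Allocation path: only the pagedaemon may dip into the reserve. */
+	swap = uma_zalloc(swap_zone, M_NOWAIT |
+	    (curproc == pageproc ? M_USE_RESERVE : 0));
+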
/*
* Sets a high limit on the number of items allowed in a zone
@@ -476,6 +511,18 @@
int uma_zone_get_max(uma_zone_t zone);
/*
+ * Sets a warning to be printed when limit is reached
+ *
+ * Arguments:
+ * zone The zone we will warn about
+ * warning Warning content
+ *
+ * Returns:
+ * Nothing
+ */
+void uma_zone_set_warning(uma_zone_t zone, const char *warning);
+
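+
+uma_zone_set_warning() attaches a message that UMA prints, rate-limited, whenever the zone fails an allocation because it has reached its configured limit (see zone_log_warning() in uma_core.c below). Typical consumer usage, with a hypothetical zone and tunable:
+
+	zone = uma_zcreate("example_objs", sizeof(struct example_obj),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(zone, max_example_objs);
+	uma_zone_set_warning(zone,
+	    "example_objs limit reached; consider raising the tunable");
+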
+/*
* Obtains the approximate current number of items allocated from a zone
*
* Arguments:
@@ -509,7 +556,7 @@
void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
/*
- * Replaces the standard page_alloc or obj_alloc functions for this zone
+ * Replaces the standard backend allocator for this zone.
*
* Arguments:
* zone The zone whose backend allocator is being changed.
@@ -571,13 +618,13 @@
* the underlying slab header.
*
* Arguments:
- * zone The UMA_ZONE_REFCNT zone to which the item belongs.
+ * zone The UMA_ZONE_REFCNT zone to which the item belongs.
* item The address of the item for which we want a refcnt.
*
* Returns:
- * A pointer to a u_int32_t reference counter.
+ * A pointer to a uint32_t reference counter.
*/
-u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
+uint32_t *uma_find_refcnt(uma_zone_t zone, void *item);
/*
* Used to determine if a fixed-size zone is exhausted.
@@ -586,12 +633,18 @@
* zone The zone to check
*
* Returns:
- * Non-zero if zone is exhausted.
+ * Non-zero if zone is exhausted.
*/
int uma_zone_exhausted(uma_zone_t zone);
int uma_zone_exhausted_nolock(uma_zone_t zone);
/*
+ * Common UMA_ZONE_PCPU zones.
+ */
+extern uma_zone_t pcpu_zone_64;
+extern uma_zone_t pcpu_zone_ptr;
+
+/*
* Exported statistics structures to be used by user space monitoring tools.
* Statistics stream consists of a uma_stream_header, followed by a series of
* alternative uma_type_header and uma_type_stat structures.
@@ -598,10 +651,10 @@
*/
#define UMA_STREAM_VERSION 0x00000001
struct uma_stream_header {
- u_int32_t ush_version; /* Stream format version. */
- u_int32_t ush_maxcpus; /* Value of MAXCPU for stream. */
- u_int32_t ush_count; /* Number of records. */
- u_int32_t _ush_pad; /* Pad/reserved field. */
+ uint32_t ush_version; /* Stream format version. */
+ uint32_t ush_maxcpus; /* Value of MAXCPU for stream. */
+ uint32_t ush_count; /* Number of records. */
+ uint32_t _ush_pad; /* Pad/reserved field. */
};
#define UTH_MAX_NAME 32
@@ -611,32 +664,35 @@
* Static per-zone data, some extracted from the supporting keg.
*/
char uth_name[UTH_MAX_NAME];
- u_int32_t uth_align; /* Keg: alignment. */
- u_int32_t uth_size; /* Keg: requested size of item. */
- u_int32_t uth_rsize; /* Keg: real size of item. */
- u_int32_t uth_maxpages; /* Keg: maximum number of pages. */
- u_int32_t uth_limit; /* Keg: max items to allocate. */
+ uint32_t uth_align; /* Keg: alignment. */
+ uint32_t uth_size; /* Keg: requested size of item. */
+ uint32_t uth_rsize; /* Keg: real size of item. */
+ uint32_t uth_maxpages; /* Keg: maximum number of pages. */
+ uint32_t uth_limit; /* Keg: max items to allocate. */
/*
* Current dynamic zone/keg-derived statistics.
*/
- u_int32_t uth_pages; /* Keg: pages allocated. */
- u_int32_t uth_keg_free; /* Keg: items free. */
- u_int32_t uth_zone_free; /* Zone: items free. */
- u_int32_t uth_bucketsize; /* Zone: desired bucket size. */
- u_int32_t uth_zone_flags; /* Zone: flags. */
- u_int64_t uth_allocs; /* Zone: number of allocations. */
- u_int64_t uth_frees; /* Zone: number of frees. */
- u_int64_t uth_fails; /* Zone: number of alloc failures. */
- u_int64_t uth_sleeps; /* Zone: number of alloc sleeps. */
- u_int64_t _uth_reserved1[2]; /* Reserved. */
+ uint32_t uth_pages; /* Keg: pages allocated. */
+ uint32_t uth_keg_free; /* Keg: items free. */
+ uint32_t uth_zone_free; /* Zone: items free. */
+ uint32_t uth_bucketsize; /* Zone: desired bucket size. */
+ uint32_t uth_zone_flags; /* Zone: flags. */
+ uint64_t uth_allocs; /* Zone: number of allocations. */
+ uint64_t uth_frees; /* Zone: number of frees. */
+ uint64_t uth_fails; /* Zone: number of alloc failures. */
+ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */
+ uint64_t _uth_reserved1[2]; /* Reserved. */
};
struct uma_percpu_stat {
- u_int64_t ups_allocs; /* Cache: number of allocations. */
- u_int64_t ups_frees; /* Cache: number of frees. */
- u_int64_t ups_cache_free; /* Cache: free items in cache. */
- u_int64_t _ups_reserved[5]; /* Reserved. */
+ uint64_t ups_allocs; /* Cache: number of allocations. */
+ uint64_t ups_frees; /* Cache: number of frees. */
+ uint64_t ups_cache_free; /* Cache: free items in cache. */
+ uint64_t _ups_reserved[5]; /* Reserved. */
};
-#endif
+void uma_reclaim_wakeup(void);
+void uma_reclaim_worker(void *);
+
+#endif /* _VM_UMA_H_ */
Modified: trunk/sys/vm/uma_core.c
===================================================================
--- trunk/sys/vm/uma_core.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_core.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff at FreeBSD.org>
+ * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
* Copyright (c) 2004-2006 Robert N. M. Watson
* All rights reserved.
@@ -48,7 +49,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $");
/* I should really use ktr.. */
/*
@@ -59,9 +60,12 @@
#include "opt_ddb.h"
#include "opt_param.h"
+#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/bitset.h>
+#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/queue.h>
@@ -71,7 +75,9 @@
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/sbuf.h>
+#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>
@@ -78,6 +84,7 @@
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
@@ -88,6 +95,10 @@
#include <ddb/ddb.h>
+#ifdef DEBUG_MEMGUARD
+#include <vm/memguard.h>
+#endif
+
/*
* This is the zone and keg from which all zones are spawned. The idea is that
* even the zone & keg heads are allocated from the allocator, so we use the
@@ -122,24 +133,32 @@
/* Linked list of all kegs in the system */
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
-/* This mutex protects the keg list */
-static struct mtx uma_mtx;
+/* Linked list of all cache-only zones in the system */
+static LIST_HEAD(,uma_zone) uma_cachezones =
+ LIST_HEAD_INITIALIZER(uma_cachezones);
+/* This RW lock protects the keg list */
+static struct rwlock_padalign uma_rwlock;
+
/* Linked list of boot time pages */
static LIST_HEAD(,uma_slab) uma_boot_pages =
LIST_HEAD_INITIALIZER(uma_boot_pages);
/* This mutex protects the boot time pages list */
-static struct mtx uma_boot_pages_mtx;
+static struct mtx_padalign uma_boot_pages_mtx;
+static struct sx uma_drain_lock;
+
/* Is the VM done starting up? */
static int booted = 0;
#define UMA_STARTUP 1
#define UMA_STARTUP2 2
-/* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
-static u_int uma_max_ipers;
-static u_int uma_max_ipers_ref;
+/*
+ * Only mbuf clusters use ref zones. Just provide enough references
+ * to support the one user. New code should not use the ref facility.
+ */
+static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
/*
* This is the handle used to schedule events that need to happen
@@ -159,9 +178,12 @@
uma_dtor dtor;
uma_init uminit;
uma_fini fini;
+ uma_import import;
+ uma_release release;
+ void *arg;
uma_keg_t keg;
int align;
- u_int32_t flags;
+ uint32_t flags;
};
struct uma_kctor_args {
@@ -170,48 +192,49 @@
uma_init uminit;
uma_fini fini;
int align;
- u_int32_t flags;
+ uint32_t flags;
};
struct uma_bucket_zone {
uma_zone_t ubz_zone;
char *ubz_name;
- int ubz_entries;
+ int ubz_entries; /* Number of items it can hold. */
+ int ubz_maxsize; /* Maximum allocation size per-item. */
};
-#define BUCKET_MAX 128
+/*
+ * Compute the actual number of bucket entries to pack them in power
+ * of two sizes for more efficient space utilization.
+ */
+#define BUCKET_SIZE(n) \
+ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
+#define BUCKET_MAX BUCKET_SIZE(256)
+
struct uma_bucket_zone bucket_zones[] = {
- { NULL, "16 Bucket", 16 },
- { NULL, "32 Bucket", 32 },
- { NULL, "64 Bucket", 64 },
- { NULL, "128 Bucket", 128 },
+ { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
+ { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
+ { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
+ { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
+ { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
+ { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
+ { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
+ { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
+ { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
{ NULL, NULL, 0}
};
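
The fixed 16/32/64/128 table is replaced by allocation-size-driven sizing: BUCKET_SIZE(n) asks how many item pointers fit once the struct uma_bucket header is carved out of an allocation that is n pointer-slots long, so each bucket zone's backing allocation stays a clean multiple of sizeof(void *), and ubz_maxsize caps the per-item size that may use that bucket zone. Back-of-the-envelope check; the 24-byte header is an assumption about the LP64 struct uma_bucket layout, not something stated in this diff:

	/*
	 * BUCKET_SIZE(n) = ((sizeof(void *) * n) - sizeof(struct uma_bucket))
	 *			/ sizeof(void *)
	 * With 8-byte pointers and an assumed 24-byte header:
	 *   BUCKET_SIZE(16)  == (128  - 24) / 8 == 13 item pointers
	 *   BUCKET_SIZE(256) == (2048 - 24) / 8 == 253 item pointers (BUCKET_MAX)
	 */
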
-#define BUCKET_SHIFT 4
-#define BUCKET_ZONES ((BUCKET_MAX >> BUCKET_SHIFT) + 1)
-
/*
- * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
- * of approximately the right size.
- */
-static uint8_t bucket_size[BUCKET_ZONES];
-
-/*
* Flags and enumerations to be passed to internal functions.
*/
-enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
+enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
-#define ZFREE_STATFAIL 0x00000001 /* Update zone failure statistic. */
-#define ZFREE_STATFREE 0x00000002 /* Update zone free statistic. */
-
/* Prototypes.. */
-static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
-static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
-static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
-static void page_free(void *, int, u_int8_t);
+static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -231,21 +254,22 @@
static void uma_timeout(void *);
static void uma_startup3(void);
static void *zone_alloc_item(uma_zone_t, void *, int);
-static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
- int);
+static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
static void bucket_enable(void);
static void bucket_init(void);
-static uma_bucket_t bucket_alloc(int, int);
-static void bucket_free(uma_bucket_t);
+static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
+static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
static void bucket_zone_drain(void);
-static int zone_alloc_bucket(uma_zone_t zone, int flags);
+static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
-static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
+static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
+static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
- uma_fini fini, int align, u_int32_t flags);
-static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
-static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
+ uma_fini fini, int align, uint32_t flags);
+static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
+static void zone_release(uma_zone_t zone, void **bucket, int cnt);
+static void uma_zero_item(void *item, uma_zone_t zone);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
@@ -260,10 +284,14 @@
SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
+static int zone_warnings = 1;
+TUNABLE_INT("vm.zone_warnings", &zone_warnings);
+SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
+ "Warn when UMA zones becomes full");
+
/*
* This routine checks to see whether or not it's safe to enable buckets.
*/
-
static void
bucket_enable(void)
{
@@ -274,27 +302,20 @@
* Initialize bucket_zones, the array of zones of buckets of various sizes.
*
* For each zone, calculate the memory required for each bucket, consisting
- * of the header and an array of pointers. Initialize bucket_size[] to point
- * the range of appropriate bucket sizes at the zone.
+ * of the header and an array of pointers.
*/
static void
bucket_init(void)
{
struct uma_bucket_zone *ubz;
- int i;
- int j;
+ int size;
- for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
- int size;
-
- ubz = &bucket_zones[j];
+ for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
size = roundup(sizeof(struct uma_bucket), sizeof(void *));
size += sizeof(void *) * ubz->ubz_entries;
ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
- UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
- for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
- bucket_size[i >> BUCKET_SHIFT] = j;
+ UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
}
}
@@ -305,14 +326,33 @@
static struct uma_bucket_zone *
bucket_zone_lookup(int entries)
{
- int idx;
+ struct uma_bucket_zone *ubz;
- idx = howmany(entries, 1 << BUCKET_SHIFT);
- return (&bucket_zones[bucket_size[idx]]);
+ for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
+ if (ubz->ubz_entries >= entries)
+ return (ubz);
+ ubz--;
+ return (ubz);
}
+static int
+bucket_select(int size)
+{
+ struct uma_bucket_zone *ubz;
+
+ ubz = &bucket_zones[0];
+ if (size > ubz->ubz_maxsize)
+ return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
+
+ for (; ubz->ubz_entries != 0; ubz++)
+ if (ubz->ubz_maxsize < size)
+ break;
+ ubz--;
+ return (ubz->ubz_entries);
+}
+
static uma_bucket_t
-bucket_alloc(int entries, int bflags)
+bucket_alloc(uma_zone_t zone, void *udata, int flags)
{
struct uma_bucket_zone *ubz;
uma_bucket_t bucket;
@@ -325,9 +365,29 @@
*/
if (bucketdisable)
return (NULL);
-
- ubz = bucket_zone_lookup(entries);
- bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
+ /*
+ * To limit bucket recursion we store the original zone flags
+ * in a cookie passed via zalloc_arg/zfree_arg. This allows the
+ * NOVM flag to persist even through deep recursions. We also
+ * store ZFLAG_BUCKET once we have recursed attempting to allocate
+ * a bucket for a bucket zone so we do not allow infinite bucket
+ * recursion. This cookie will even persist to frees of unused
+ * buckets via the allocation path or bucket allocations in the
+ * free path.
+ */
+ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
+ udata = (void *)(uintptr_t)zone->uz_flags;
+ else {
+ if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
+ return (NULL);
+ udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
+ }
+ if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
+ flags |= M_NOVM;
+ ubz = bucket_zone_lookup(zone->uz_count);
+ if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
+ ubz++;
+ bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
if (bucket) {
#ifdef INVARIANTS
bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
@@ -340,13 +400,16 @@
}
static void
-bucket_free(uma_bucket_t bucket)
+bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
{
struct uma_bucket_zone *ubz;
+ KASSERT(bucket->ub_cnt == 0,
+ ("bucket_free: Freeing a non free bucket."));
+ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
+ udata = (void *)(uintptr_t)zone->uz_flags;
ubz = bucket_zone_lookup(bucket->ub_entries);
- zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
- ZFREE_STATFREE);
+ uma_zfree_arg(ubz->ubz_zone, bucket, udata);
}
static void
@@ -358,11 +421,16 @@
zone_drain(ubz->ubz_zone);
}
-static inline uma_keg_t
-zone_first_keg(uma_zone_t zone)
+static void
+zone_log_warning(uma_zone_t zone)
{
+ static const struct timeval warninterval = { 300, 0 };
- return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
+ if (!zone_warnings || zone->uz_warning == NULL)
+ return;
+
+ if (ratecheck(&zone->uz_ratecheck, &warninterval))
+ printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
}
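
zone_log_warning() throttles its output with ratecheck(9), which returns non-zero only when at least the given interval has elapsed since the timestamp it updates in place. The same idiom in isolation; the message and interval are placeholders:

	static struct timeval lasttime;
	static const struct timeval warninterval = { 300, 0 };	/* 5 minutes */

	if (ratecheck(&lasttime, &warninterval))
		printf("example resource exhausted; consider tuning\n");
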
static void
@@ -437,7 +505,7 @@
KEG_UNLOCK(keg);
hash_free(&oldhash);
- KEG_LOCK(keg);
+ return;
}
}
KEG_UNLOCK(keg);
@@ -549,8 +617,7 @@
if (hash->uh_slab_hash == NULL)
return;
if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
- zone_free_item(hashzone,
- hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
+ zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
else
free(hash->uh_slab_hash, M_UMAHASH);
}
@@ -569,21 +636,16 @@
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
- void *item;
+ int i;
if (bucket == NULL)
return;
- while (bucket->ub_cnt > 0) {
- bucket->ub_cnt--;
- item = bucket->ub_bucket[bucket->ub_cnt];
-#ifdef INVARIANTS
- bucket->ub_bucket[bucket->ub_cnt] = NULL;
- KASSERT(item != NULL,
- ("bucket_drain: botched ptr, item is NULL"));
-#endif
- zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
- }
+ if (zone->uz_fini)
+ for (i = 0; i < bucket->ub_cnt; i++)
+ zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
+ zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
+ bucket->ub_cnt = 0;
}
/*
@@ -622,9 +684,9 @@
bucket_drain(zone, cache->uc_allocbucket);
bucket_drain(zone, cache->uc_freebucket);
if (cache->uc_allocbucket != NULL)
- bucket_free(cache->uc_allocbucket);
+ bucket_free(zone, cache->uc_allocbucket, NULL);
if (cache->uc_freebucket != NULL)
- bucket_free(cache->uc_freebucket);
+ bucket_free(zone, cache->uc_freebucket, NULL);
cache->uc_allocbucket = cache->uc_freebucket = NULL;
}
ZONE_LOCK(zone);
@@ -632,7 +694,91 @@
ZONE_UNLOCK(zone);
}
+static void
+cache_shrink(uma_zone_t zone)
+{
+
+ if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+ return;
+
+ ZONE_LOCK(zone);
+ zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
+ ZONE_UNLOCK(zone);
+}
+
+static void
+cache_drain_safe_cpu(uma_zone_t zone)
+{
+ uma_cache_t cache;
+ uma_bucket_t b1, b2;
+
+ if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+ return;
+
+ b1 = b2 = NULL;
+ ZONE_LOCK(zone);
+ critical_enter();
+ cache = &zone->uz_cpu[curcpu];
+ if (cache->uc_allocbucket) {
+ if (cache->uc_allocbucket->ub_cnt != 0)
+ LIST_INSERT_HEAD(&zone->uz_buckets,
+ cache->uc_allocbucket, ub_link);
+ else
+ b1 = cache->uc_allocbucket;
+ cache->uc_allocbucket = NULL;
+ }
+ if (cache->uc_freebucket) {
+ if (cache->uc_freebucket->ub_cnt != 0)
+ LIST_INSERT_HEAD(&zone->uz_buckets,
+ cache->uc_freebucket, ub_link);
+ else
+ b2 = cache->uc_freebucket;
+ cache->uc_freebucket = NULL;
+ }
+ critical_exit();
+ ZONE_UNLOCK(zone);
+ if (b1)
+ bucket_free(zone, b1, NULL);
+ if (b2)
+ bucket_free(zone, b2, NULL);
+}
+
/*
+ * Safely drain per-CPU caches of a zone(s) to alloc bucket.
+ * This is an expensive call because it needs to bind to all CPUs
+ * one by one and enter a critical section on each of them in order
+ * to safely access their cache buckets.
+ * Zone lock must not be held when calling this function.
+ */
+static void
+cache_drain_safe(uma_zone_t zone)
+{
+ int cpu;
+
+ /*
+ * Polite bucket size shrinking was not enough, shrink aggressively.
+ */
+ if (zone)
+ cache_shrink(zone);
+ else
+ zone_foreach(cache_shrink);
+
+ CPU_FOREACH(cpu) {
+ thread_lock(curthread);
+ sched_bind(curthread, cpu);
+ thread_unlock(curthread);
+
+ if (zone)
+ cache_drain_safe_cpu(zone);
+ else
+ zone_foreach(cache_drain_safe_cpu);
+ }
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+}
+
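+
+cache_drain_safe() above relies on a general pattern for reaching another CPU's private state: bind the current thread to that CPU, then work inside a critical section while the per-CPU data is touched. A sketch of that skeleton isolated from UMA (the placeholder comment marks where the per-CPU work would go; in the diff it lives in cache_drain_safe_cpu()):
+
+	int cpu;
+
+	CPU_FOREACH(cpu) {
+		/* Migrate ourselves onto the target CPU. */
+		thread_lock(curthread);
+		sched_bind(curthread, cpu);
+		thread_unlock(curthread);
+
+		/* No preemption while this CPU's private data is touched. */
+		critical_enter();
+		/* ... operate on the per-CPU state of 'cpu' ... */
+		critical_exit();
+	}
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+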
+/*
* Drain the cached buckets from a zone. Expects a locked zone on entry.
*/
static void
@@ -644,19 +790,44 @@
* Drain the bucket queues and free the buckets, we just keep two per
* cpu (alloc/free).
*/
- while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
+ while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
LIST_REMOVE(bucket, ub_link);
ZONE_UNLOCK(zone);
bucket_drain(zone, bucket);
- bucket_free(bucket);
+ bucket_free(zone, bucket, NULL);
ZONE_LOCK(zone);
}
- /* Now we do the free queue.. */
- while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
- LIST_REMOVE(bucket, ub_link);
- bucket_free(bucket);
+ /*
+ * Shrink bucket sizes further. The price of a single zone lock collision
+ * is probably lower than the price of a global cache drain.
+ */
+ if (zone->uz_count > zone->uz_count_min)
+ zone->uz_count--;
+}
+
+static void
+keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
+{
+ uint8_t *mem;
+ int i;
+ uint8_t flags;
+
+ mem = slab->us_data;
+ flags = slab->us_flags;
+ i = start;
+ if (keg->uk_fini != NULL) {
+ for (i--; i > -1; i--)
+ keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
+ keg->uk_size);
}
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
+#ifdef UMA_DEBUG
+ printf("%s: Returning %d bytes.\n", keg->uk_name,
+ PAGE_SIZE * keg->uk_ppera);
+#endif
+ keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
}
/*
@@ -671,9 +842,6 @@
struct slabhead freeslabs = { 0 };
uma_slab_t slab;
uma_slab_t n;
- u_int8_t flags;
- u_int8_t *mem;
- int i;
/*
* We don't want to take pages from statically allocated kegs at this
@@ -715,35 +883,7 @@
while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
- if (keg->uk_fini)
- for (i = 0; i < keg->uk_ipers; i++)
- keg->uk_fini(
- slab->us_data + (keg->uk_rsize * i),
- keg->uk_size);
- flags = slab->us_flags;
- mem = slab->us_data;
-
- if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
- vm_object_t obj;
-
- if (flags & UMA_SLAB_KMEM)
- obj = kmem_object;
- else if (flags & UMA_SLAB_KERNEL)
- obj = kernel_object;
- else
- obj = NULL;
- for (i = 0; i < keg->uk_ppera; i++)
- vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
- obj);
- }
- if (keg->uk_flags & UMA_ZONE_OFFPAGE)
- zone_free_item(keg->uk_slabzone, slab, NULL,
- SKIP_NONE, ZFREE_STATFREE);
-#ifdef UMA_DEBUG
- printf("%s: Returning %d bytes.\n",
- keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
-#endif
- keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
+ keg_free_slab(keg, slab, keg->uk_ipers);
}
}
@@ -761,9 +901,7 @@
while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
if (waitok == M_NOWAIT)
goto out;
- mtx_unlock(&uma_mtx);
- msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
- mtx_lock(&uma_mtx);
+ msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
}
zone->uz_flags |= UMA_ZFLAG_DRAINING;
bucket_cache_drain(zone);
@@ -770,7 +908,7 @@
ZONE_UNLOCK(zone);
/*
* The DRAINING flag protects us from being freed while
- * we're running. Normally the uma_mtx would protect us but we
+ * we're running. Normally the uma_rwlock would protect us but we
* must be able to release and acquire the right lock for each keg.
*/
zone_foreach_keg(zone, &keg_drain);
@@ -804,15 +942,16 @@
uma_slabrefcnt_t slabref;
uma_alloc allocf;
uma_slab_t slab;
- u_int8_t *mem;
- u_int8_t flags;
+ uint8_t *mem;
+ uint8_t flags;
int i;
mtx_assert(&keg->uk_lock, MA_OWNED);
slab = NULL;
+ mem = NULL;
#ifdef UMA_DEBUG
- printf("slab_zalloc: Allocating a new slab for %s\n", keg->uk_name);
+ printf("alloc_slab: Allocating a new slab for %s\n", keg->uk_name);
#endif
allocf = keg->uk_allocf;
KEG_UNLOCK(keg);
@@ -819,10 +958,8 @@
if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
- if (slab == NULL) {
- KEG_LOCK(keg);
- return NULL;
- }
+ if (slab == NULL)
+ goto out;
}
/*
@@ -841,13 +978,12 @@
wait |= M_NODUMP;
/* zone is passed for legacy reasons. */
- mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
+ mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
if (mem == NULL) {
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
- zone_free_item(keg->uk_slabzone, slab, NULL,
- SKIP_NONE, ZFREE_STATFREE);
- KEG_LOCK(keg);
- return (NULL);
+ zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
+ slab = NULL;
+ goto out;
}
/* Point the slab into the allocated memory */
@@ -861,18 +997,15 @@
slab->us_keg = keg;
slab->us_data = mem;
slab->us_freecount = keg->uk_ipers;
- slab->us_firstfree = 0;
slab->us_flags = flags;
-
+ BIT_FILL(SLAB_SETSIZE, &slab->us_free);
+#ifdef INVARIANTS
+ BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
+#endif
if (keg->uk_flags & UMA_ZONE_REFCNT) {
slabref = (uma_slabrefcnt_t)slab;
- for (i = 0; i < keg->uk_ipers; i++) {
- slabref->us_freelist[i].us_refcnt = 0;
- slabref->us_freelist[i].us_item = i+1;
- }
- } else {
for (i = 0; i < keg->uk_ipers; i++)
- slab->us_freelist[i].us_item = i+1;
+ slabref->us_refcnt[i] = 0;
}
if (keg->uk_init != NULL) {
@@ -881,41 +1014,21 @@
keg->uk_size, wait) != 0)
break;
if (i != keg->uk_ipers) {
- if (keg->uk_fini != NULL) {
- for (i--; i > -1; i--)
- keg->uk_fini(slab->us_data +
- (keg->uk_rsize * i),
- keg->uk_size);
- }
- if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
- vm_object_t obj;
-
- if (flags & UMA_SLAB_KMEM)
- obj = kmem_object;
- else if (flags & UMA_SLAB_KERNEL)
- obj = kernel_object;
- else
- obj = NULL;
- for (i = 0; i < keg->uk_ppera; i++)
- vsetobj((vm_offset_t)mem +
- (i * PAGE_SIZE), obj);
- }
- if (keg->uk_flags & UMA_ZONE_OFFPAGE)
- zone_free_item(keg->uk_slabzone, slab,
- NULL, SKIP_NONE, ZFREE_STATFREE);
- keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
- flags);
- KEG_LOCK(keg);
- return (NULL);
+ keg_free_slab(keg, slab, i);
+ slab = NULL;
+ goto out;
}
}
+out:
KEG_LOCK(keg);
- if (keg->uk_flags & UMA_ZONE_HASH)
- UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
+ if (slab != NULL) {
+ if (keg->uk_flags & UMA_ZONE_HASH)
+ UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
- keg->uk_pages += keg->uk_ppera;
- keg->uk_free += keg->uk_ipers;
+ keg->uk_pages += keg->uk_ppera;
+ keg->uk_free += keg->uk_ipers;
+ }
return (slab);
}
@@ -926,7 +1039,7 @@
* the VM is ready.
*/
static void *
-startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
+startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
uma_keg_t keg;
uma_slab_t tmps;
@@ -986,12 +1099,12 @@
* NULL if M_NOWAIT is set.
*/
static void *
-page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
+page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
{
void *p; /* Returned page */
*pflag = UMA_SLAB_KMEM;
- p = (void *) kmem_malloc(kmem_map, bytes, wait);
+ p = (void *) kmem_malloc(kmem_arena, bytes, wait);
return (p);
}
@@ -1008,50 +1121,53 @@
* NULL if M_NOWAIT is set.
*/
static void *
-obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{
- vm_object_t object;
+ TAILQ_HEAD(, vm_page) alloctail;
+ u_long npages;
vm_offset_t retkva, zkva;
- vm_page_t p;
- int pages, startpages;
+ vm_page_t p, p_next;
uma_keg_t keg;
+ TAILQ_INIT(&alloctail);
keg = zone_first_keg(zone);
- object = keg->uk_obj;
- retkva = 0;
- /*
- * This looks a little weird since we're getting one page at a time.
- */
- VM_OBJECT_LOCK(object);
- p = TAILQ_LAST(&object->memq, pglist);
- pages = p != NULL ? p->pindex + 1 : 0;
- startpages = pages;
- zkva = keg->uk_kva + pages * PAGE_SIZE;
- for (; bytes > 0; bytes -= PAGE_SIZE) {
- p = vm_page_alloc(object, pages,
- VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
- if (p == NULL) {
- if (pages != startpages)
- pmap_qremove(retkva, pages - startpages);
- while (pages != startpages) {
- pages--;
- p = TAILQ_LAST(&object->memq, pglist);
- vm_page_unwire(p, 0);
- vm_page_free(p);
- }
- retkva = 0;
- goto done;
+ npages = howmany(bytes, PAGE_SIZE);
+ while (npages > 0) {
+ p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+ if (p != NULL) {
+ /*
+ * Since the page does not belong to an object, its
+ * listq is unused.
+ */
+ TAILQ_INSERT_TAIL(&alloctail, p, listq);
+ npages--;
+ continue;
}
+ if (wait & M_WAITOK) {
+ VM_WAIT;
+ continue;
+ }
+
+ /*
+ * Page allocation failed, free intermediate pages and
+ * exit.
+ */
+ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+ vm_page_unwire(p, 0);
+ vm_page_free(p);
+ }
+ return (NULL);
+ }
+ *flags = UMA_SLAB_PRIV;
+ zkva = keg->uk_kva +
+ atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
+ retkva = zkva;
+ TAILQ_FOREACH(p, &alloctail, listq) {
pmap_qenter(zkva, &p, 1);
- if (retkva == 0)
- retkva = zkva;
zkva += PAGE_SIZE;
- pages += 1;
}
-done:
- VM_OBJECT_UNLOCK(object);
- *flags = UMA_SLAB_PRIV;
return ((void *)retkva);
}
@@ -1068,18 +1184,18 @@
* Nothing
*/
static void
-page_free(void *mem, int size, u_int8_t flags)
+page_free(void *mem, vm_size_t size, uint8_t flags)
{
- vm_map_t map;
+ struct vmem *vmem;
if (flags & UMA_SLAB_KMEM)
- map = kmem_map;
+ vmem = kmem_arena;
else if (flags & UMA_SLAB_KERNEL)
- map = kernel_map;
+ vmem = kernel_arena;
else
panic("UMA: page_free used with invalid flags %d", flags);
- kmem_free(map, (vm_offset_t)mem, size);
+ kmem_free(vmem, (vm_offset_t)mem, size);
}
/*
@@ -1110,47 +1226,74 @@
u_int memused;
u_int wastedspace;
u_int shsize;
+ u_int slabsize;
- KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
+ if (keg->uk_flags & UMA_ZONE_PCPU) {
+ u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+ slabsize = sizeof(struct pcpu);
+ keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
+ PAGE_SIZE);
+ } else {
+ slabsize = UMA_SLAB_SIZE;
+ keg->uk_ppera = 1;
+ }
+
+ /*
+ * Calculate the size of each allocation (rsize) according to
+ * alignment. If the requested size is smaller than we have
+ * allocation bits for we round it up.
+ */
rsize = keg->uk_size;
-
- if (rsize < UMA_SMALLEST_UNIT)
- rsize = UMA_SMALLEST_UNIT;
+ if (rsize < slabsize / SLAB_SETSIZE)
+ rsize = slabsize / SLAB_SETSIZE;
if (rsize & keg->uk_align)
rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
-
keg->uk_rsize = rsize;
- keg->uk_ppera = 1;
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- rsize += UMA_FRITMREF_SZ; /* linkage & refcnt */
- shsize = sizeof(struct uma_slab_refcnt);
- } else {
- rsize += UMA_FRITM_SZ; /* Account for linkage */
+ KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
+ keg->uk_rsize < sizeof(struct pcpu),
+ ("%s: size %u too large", __func__, keg->uk_rsize));
+
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ rsize += sizeof(uint32_t);
+
+ if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+ shsize = 0;
+ else
shsize = sizeof(struct uma_slab);
- }
- keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
- KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
+ keg->uk_ipers = (slabsize - shsize) / rsize;
+ KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
+ ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
+
memused = keg->uk_ipers * rsize + shsize;
- wastedspace = UMA_SLAB_SIZE - memused;
+ wastedspace = slabsize - memused;
/*
* We can't do OFFPAGE if we're internal or if we've been
* asked to not go to the VM for buckets. If we do this we
- * may end up going to the VM (kmem_map) for slabs which we
- * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
- * result of UMA_ZONE_VM, which clearly forbids it.
+ * may end up going to the VM for slabs which we do not
+ * want to do if we're UMA_ZFLAG_CACHEONLY as a result
+ * of UMA_ZONE_VM, which clearly forbids it.
*/
if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
(keg->uk_flags & UMA_ZFLAG_CACHEONLY))
return;
- if ((wastedspace >= UMA_MAX_WASTE) &&
- (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
- keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
- KASSERT(keg->uk_ipers <= 255,
- ("keg_small_init: keg->uk_ipers too high!"));
+ /*
+ * See if using an OFFPAGE slab will limit our waste. Only do
+ * this if it permits more items per-slab.
+ *
+ * XXX We could try growing slabsize to limit max waste as well.
+ * Historically this was not done because the VM could not
+ * efficiently handle contiguous allocations.
+ */
+ if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
+ (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
+ keg->uk_ipers = slabsize / keg->uk_rsize;
+ KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
+ ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
#ifdef UMA_DEBUG
printf("UMA decided we need offpage slab headers for "
"keg: %s, calculated wastedspace = %d, "
@@ -1157,13 +1300,15 @@
"maximum wasted space allowed = %d, "
"calculated ipers = %d, "
"new wasted space = %d\n", keg->uk_name, wastedspace,
- UMA_MAX_WASTE, keg->uk_ipers,
- UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
+ slabsize / UMA_MAX_WASTE, keg->uk_ipers,
+ slabsize - keg->uk_ipers * keg->uk_rsize);
#endif
keg->uk_flags |= UMA_ZONE_OFFPAGE;
- if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
- keg->uk_flags |= UMA_ZONE_HASH;
}
+
+ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
+ (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
+ keg->uk_flags |= UMA_ZONE_HASH;
}
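
keg_small_init() now sizes items-per-slab against the SLAB_SETSIZE bitset limit and moves the slab header off-page only when doing so both reduces waste past the slabsize / UMA_MAX_WASTE threshold and yields more items per slab. A worked example under stated assumptions (a 4096-byte slab, a 32-byte in-page header and UMA_MAX_WASTE == 10 are assumptions for illustration, not values taken from this diff):

	/*
	 *   rsize = 680:  ipers = (4096 - 32) / 680 = 5,
	 *                 wasted = 4096 - (5 * 680 + 32) = 664
	 *   664 >= 4096 / 10 = 409 and 5 < 4096 / 680 = 6, so the keg
	 *   goes OFFPAGE: ipers becomes 6 and waste drops to 16 bytes.
	 */
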
/*
@@ -1180,19 +1325,15 @@
static void
keg_large_init(uma_keg_t keg)
{
- int pages;
+ u_int shsize;
KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
+ KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
+ ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
- pages = keg->uk_size / UMA_SLAB_SIZE;
-
- /* Account for remainder */
- if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
- pages++;
-
- keg->uk_ppera = pages;
+ keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
keg->uk_ipers = 1;
keg->uk_rsize = keg->uk_size;
@@ -1200,8 +1341,21 @@
if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
return;
- keg->uk_flags |= UMA_ZONE_OFFPAGE;
- if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
+ /* Check whether we have enough space to not do OFFPAGE. */
+ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
+ shsize = sizeof(struct uma_slab);
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ shsize += keg->uk_ipers * sizeof(uint32_t);
+ if (shsize & UMA_ALIGN_PTR)
+ shsize = (shsize & ~UMA_ALIGN_PTR) +
+ (UMA_ALIGN_PTR + 1);
+
+ if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+ }
+
+ if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
+ (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
keg->uk_flags |= UMA_ZONE_HASH;
}
@@ -1213,6 +1367,9 @@
int pages;
int rsize;
+ KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
+ ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
+
alignsize = keg->uk_align + 1;
rsize = keg->uk_size;
/*
@@ -1232,8 +1389,8 @@
keg->uk_ppera = pages;
keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
- KASSERT(keg->uk_ipers <= uma_max_ipers,
- ("keg_small_init: keg->uk_ipers too high(%d) increase max_ipers",
+ KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
+ ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
keg->uk_ipers));
}
@@ -1257,11 +1414,11 @@
keg->uk_fini = arg->fini;
keg->uk_align = arg->align;
keg->uk_free = 0;
+ keg->uk_reserve = 0;
keg->uk_pages = 0;
keg->uk_flags = arg->flags;
keg->uk_allocf = page_alloc;
keg->uk_freef = page_free;
- keg->uk_recurse = 0;
keg->uk_slabzone = NULL;
/*
@@ -1279,25 +1436,24 @@
if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
keg->uk_flags |= UMA_ZONE_VTOSLAB;
- /*
- * The +UMA_FRITM_SZ added to uk_size is to account for the
- * linkage that is added to the size in keg_small_init(). If
- * we don't account for this here then we may end up in
- * keg_small_init() with a calculated 'ipers' of 0.
- */
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
- keg_cachespread_init(keg);
- else if ((keg->uk_size+UMA_FRITMREF_SZ) >
- (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
+ if (arg->flags & UMA_ZONE_PCPU)
+#ifdef SMP
+ keg->uk_flags |= UMA_ZONE_OFFPAGE;
+#else
+ keg->uk_flags &= ~UMA_ZONE_PCPU;
+#endif
+
+ if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
+ keg_cachespread_init(keg);
+ } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
+ if (keg->uk_size >
+ (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
+ sizeof(uint32_t)))
keg_large_init(keg);
else
keg_small_init(keg);
} else {
- if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
- keg_cachespread_init(keg);
- else if ((keg->uk_size+UMA_FRITM_SZ) >
- (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
+ if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
keg_large_init(keg);
else
keg_small_init(keg);
@@ -1304,9 +1460,12 @@
}
if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
- if (keg->uk_flags & UMA_ZONE_REFCNT)
+ if (keg->uk_flags & UMA_ZONE_REFCNT) {
+ if (keg->uk_ipers > uma_max_ipers_ref)
+ panic("Too many ref items per zone: %d > %d\n",
+ keg->uk_ipers, uma_max_ipers_ref);
keg->uk_slabzone = slabrefzone;
- else
+ } else
keg->uk_slabzone = slabzone;
}
@@ -1330,12 +1489,9 @@
keg->uk_allocf = startup_alloc;
/*
- * Initialize keg's lock (shared among zones).
+ * Initialize keg's lock
*/
- if (arg->flags & UMA_ZONE_MTXCLASS)
- KEG_LOCK_INIT(keg, 1);
- else
- KEG_LOCK_INIT(keg, 0);
+ KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
/*
* If we're putting the slab header in the actual page we need to
@@ -1346,25 +1502,17 @@
u_int totsize;
/* Size of the slab struct and free list */
+ totsize = sizeof(struct uma_slab);
+
+ /* Size of the reference counts. */
if (keg->uk_flags & UMA_ZONE_REFCNT)
- totsize = sizeof(struct uma_slab_refcnt) +
- keg->uk_ipers * UMA_FRITMREF_SZ;
- else
- totsize = sizeof(struct uma_slab) +
- keg->uk_ipers * UMA_FRITM_SZ;
+ totsize += keg->uk_ipers * sizeof(uint32_t);
if (totsize & UMA_ALIGN_PTR)
totsize = (totsize & ~UMA_ALIGN_PTR) +
(UMA_ALIGN_PTR + 1);
- keg->uk_pgoff = (UMA_SLAB_SIZE * keg->uk_ppera) - totsize;
+ keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
- if (keg->uk_flags & UMA_ZONE_REFCNT)
- totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
- + keg->uk_ipers * UMA_FRITMREF_SZ;
- else
- totsize = keg->uk_pgoff + sizeof(struct uma_slab)
- + keg->uk_ipers * UMA_FRITM_SZ;
-
/*
* The only way the following is possible is if with our
* UMA_ALIGN_PTR adjustments we are now bigger than
@@ -1372,7 +1520,10 @@
* mathematically possible for all cases, so we make
* sure here anyway.
*/
- if (totsize > UMA_SLAB_SIZE * keg->uk_ppera) {
+ totsize = keg->uk_pgoff + sizeof(struct uma_slab);
+ if (keg->uk_flags & UMA_ZONE_REFCNT)
+ totsize += keg->uk_ipers * sizeof(uint32_t);
+ if (totsize > PAGE_SIZE * keg->uk_ppera) {
printf("zone %s ipers %d rsize %d size %d\n",
zone->uz_name, keg->uk_ipers, keg->uk_rsize,
keg->uk_size);
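
As a worked example of the layout computed above (sizes assumed): with PAGE_SIZE 4096, uk_ppera 1, a 48-byte struct uma_slab and 32 items in a REFCNT keg, totsize is 48 + 32 * 4 = 176, which is already pointer-aligned, so uk_pgoff becomes 4096 - 176 = 3920; the recomputed totsize of 3920 + 176 = 4096 does not exceed PAGE_SIZE * uk_ppera, so the sanity check above is satisfied.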
@@ -1387,14 +1538,15 @@
printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
keg->uk_ipers, keg->uk_ppera,
- (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
+ (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
+ keg->uk_free);
#endif
LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
- mtx_lock(&uma_mtx);
+ rw_wlock(&uma_rwlock);
LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
- mtx_unlock(&uma_mtx);
+ rw_wunlock(&uma_rwlock);
return (0);
}
@@ -1423,17 +1575,47 @@
zone->uz_frees = 0;
zone->uz_fails = 0;
zone->uz_sleeps = 0;
- zone->uz_fills = zone->uz_count = 0;
+ zone->uz_count = 0;
+ zone->uz_count_min = 0;
zone->uz_flags = 0;
+ zone->uz_warning = NULL;
+ timevalclear(&zone->uz_ratecheck);
keg = arg->keg;
+ ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
+
+ /*
+ * This is a pure cache zone, no kegs.
+ */
+ if (arg->import) {
+ if (arg->flags & UMA_ZONE_VM)
+ arg->flags |= UMA_ZFLAG_CACHEONLY;
+ zone->uz_flags = arg->flags;
+ zone->uz_size = arg->size;
+ zone->uz_import = arg->import;
+ zone->uz_release = arg->release;
+ zone->uz_arg = arg->arg;
+ zone->uz_lockptr = &zone->uz_lock;
+ rw_wlock(&uma_rwlock);
+ LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
+ rw_wunlock(&uma_rwlock);
+ goto out;
+ }
+
+ /*
+ * Use the regular zone/keg/slab allocator.
+ */
+ zone->uz_import = (uma_import)zone_import;
+ zone->uz_release = (uma_release)zone_release;
+ zone->uz_arg = zone;
+
if (arg->flags & UMA_ZONE_SECONDARY) {
KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
zone->uz_init = arg->uminit;
zone->uz_fini = arg->fini;
- zone->uz_lock = &keg->uk_lock;
+ zone->uz_lockptr = &keg->uk_lock;
zone->uz_flags |= UMA_ZONE_SECONDARY;
- mtx_lock(&uma_mtx);
+ rw_wlock(&uma_rwlock);
ZONE_LOCK(zone);
LIST_FOREACH(z, &keg->uk_zones, uz_link) {
if (LIST_NEXT(z, uz_link) == NULL) {
@@ -1442,7 +1624,7 @@
}
}
ZONE_UNLOCK(zone);
- mtx_unlock(&uma_mtx);
+ rw_wunlock(&uma_rwlock);
} else if (keg == NULL) {
if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
arg->align, arg->flags)) == NULL)
@@ -1463,12 +1645,13 @@
if (error)
return (error);
}
+
/*
* Link in the first keg.
*/
zone->uz_klink.kl_keg = keg;
LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
- zone->uz_lock = &keg->uk_lock;
+ zone->uz_lockptr = &keg->uk_lock;
zone->uz_size = keg->uk_size;
zone->uz_flags |= (keg->uk_flags &
(UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
@@ -1483,12 +1666,13 @@
return (0);
}
- if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
- zone->uz_count = BUCKET_MAX;
- else if (keg->uk_ipers <= BUCKET_MAX)
- zone->uz_count = keg->uk_ipers;
+out:
+ if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
+ zone->uz_count = bucket_select(zone->uz_size);
else
zone->uz_count = BUCKET_MAX;
+ zone->uz_count_min = zone->uz_count;
+
return (0);
}
@@ -1507,8 +1691,9 @@
keg = (uma_keg_t)arg;
KEG_LOCK(keg);
if (keg->uk_free != 0) {
- printf("Freed UMA keg was not empty (%d items). "
+ printf("Freed UMA keg (%s) was not empty (%d items). "
" Lost %d pages of memory.\n",
+ keg->uk_name ? keg->uk_name : "",
keg->uk_free, keg->uk_pages);
}
KEG_UNLOCK(keg);
@@ -1537,9 +1722,9 @@
if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
cache_drain(zone);
- mtx_lock(&uma_mtx);
+ rw_wlock(&uma_rwlock);
LIST_REMOVE(zone, uz_link);
- mtx_unlock(&uma_mtx);
+ rw_wunlock(&uma_rwlock);
/*
* XXX there are some races here where
* the zone can be drained but zone lock
@@ -1560,13 +1745,13 @@
/*
* We only destroy kegs from non secondary zones.
*/
- if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0) {
- mtx_lock(&uma_mtx);
+ if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) {
+ rw_wlock(&uma_rwlock);
LIST_REMOVE(keg, uk_link);
- mtx_unlock(&uma_mtx);
- zone_free_item(kegs, keg, NULL, SKIP_NONE,
- ZFREE_STATFREE);
+ rw_wunlock(&uma_rwlock);
+ zone_free_item(kegs, keg, NULL, SKIP_NONE);
}
+ ZONE_LOCK_FINI(zone);
}
/*
@@ -1585,12 +1770,12 @@
uma_keg_t keg;
uma_zone_t zone;
- mtx_lock(&uma_mtx);
+ rw_rlock(&uma_rwlock);
LIST_FOREACH(keg, &uma_kegs, uk_link) {
LIST_FOREACH(zone, &keg->uk_zones, uz_link)
zfunc(zone);
}
- mtx_unlock(&uma_mtx);
+ rw_runlock(&uma_rwlock);
}
/* Public functions */
@@ -1601,86 +1786,15 @@
struct uma_zctor_args args;
uma_slab_t slab;
u_int slabsize;
- u_int objsize, totsize, wsize;
int i;
#ifdef UMA_DEBUG
printf("Creating uma keg headers zone and keg.\n");
#endif
- mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
+ rw_init(&uma_rwlock, "UMA lock");
- /*
- * Figure out the maximum number of items-per-slab we'll have if
- * we're using the OFFPAGE slab header to track free items, given
- * all possible object sizes and the maximum desired wastage
- * (UMA_MAX_WASTE).
- *
- * We iterate until we find an object size for
- * which the calculated wastage in keg_small_init() will be
- * enough to warrant OFFPAGE. Since wastedspace versus objsize
- * is an overall increasing see-saw function, we find the smallest
- * objsize such that the wastage is always acceptable for objects
- * with that objsize or smaller. Since a smaller objsize always
- * generates a larger possible uma_max_ipers, we use this computed
- * objsize to calculate the largest ipers possible. Since the
- * ipers calculated for OFFPAGE slab headers is always larger than
- * the ipers initially calculated in keg_small_init(), we use
- * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
- * obtain the maximum ipers possible for offpage slab headers.
- *
- * It should be noted that ipers versus objsize is an inversly
- * proportional function which drops off rather quickly so as
- * long as our UMA_MAX_WASTE is such that the objsize we calculate
- * falls into the portion of the inverse relation AFTER the steep
- * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
- *
- * Note that we have 8-bits (1 byte) to use as a freelist index
- * inside the actual slab header itself and this is enough to
- * accomodate us. In the worst case, a UMA_SMALLEST_UNIT sized
- * object with offpage slab header would have ipers =
- * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
- * 1 greater than what our byte-integer freelist index can
- * accomodate, but we know that this situation never occurs as
- * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
- * that we need to go to offpage slab headers. Or, if we do,
- * then we trap that condition below and panic in the INVARIANTS case.
- */
- wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
- totsize = wsize;
- objsize = UMA_SMALLEST_UNIT;
- while (totsize >= wsize) {
- totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
- (objsize + UMA_FRITM_SZ);
- totsize *= (UMA_FRITM_SZ + objsize);
- objsize++;
- }
- if (objsize > UMA_SMALLEST_UNIT)
- objsize--;
- uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
-
- wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
- totsize = wsize;
- objsize = UMA_SMALLEST_UNIT;
- while (totsize >= wsize) {
- totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
- (objsize + UMA_FRITMREF_SZ);
- totsize *= (UMA_FRITMREF_SZ + objsize);
- objsize++;
- }
- if (objsize > UMA_SMALLEST_UNIT)
- objsize--;
- uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
-
- KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
- ("uma_startup: calculated uma_max_ipers values too large!"));
-
-#ifdef UMA_DEBUG
- printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
- printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n",
- uma_max_ipers_ref);
-#endif
-
/* "manually" create the initial zone */
+ memset(&args, 0, sizeof(args));
args.name = "UMA Kegs";
args.size = sizeof(struct uma_keg);
args.ctor = keg_ctor;
@@ -1697,8 +1811,8 @@
printf("Filling boot free list.\n");
#endif
for (i = 0; i < boot_pages; i++) {
- slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
- slab->us_data = (u_int8_t *)slab;
+ slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
+ slab->us_data = (uint8_t *)slab;
slab->us_flags = UMA_SLAB_BOOT;
LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
}
@@ -1727,16 +1841,9 @@
printf("Creating slab and hash zones.\n");
#endif
- /*
- * This is the max number of free list items we'll have with
- * offpage slabs.
- */
- slabsize = uma_max_ipers * UMA_FRITM_SZ;
- slabsize += sizeof(struct uma_slab);
-
/* Now make a zone for slab headers */
slabzone = uma_zcreate("UMA Slabs",
- slabsize,
+ sizeof(struct uma_slab),
NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
@@ -1744,8 +1851,8 @@
* We also create a zone for the bigger slabs with reference
* counts in them, to accommodate UMA_ZONE_REFCNT zones.
*/
- slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
- slabsize += sizeof(struct uma_slab_refcnt);
+ slabsize = sizeof(struct uma_slab_refcnt);
+ slabsize += uma_max_ipers_ref * sizeof(uint32_t);
slabrefzone = uma_zcreate("UMA RCntSlabs",
slabsize,
NULL, NULL, NULL, NULL,
@@ -1772,6 +1879,7 @@
{
booted = UMA_STARTUP2;
bucket_enable();
+ sx_init(&uma_drain_lock, "umadrain");
#ifdef UMA_DEBUG
printf("UMA startup2 complete.\n");
#endif
@@ -1788,7 +1896,7 @@
#ifdef UMA_DEBUG
printf("Starting callout.\n");
#endif
- callout_init(&uma_callout, CALLOUT_MPSAFE);
+ callout_init(&uma_callout, 1);
callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
#ifdef UMA_DEBUG
printf("UMA startup3 complete.\n");
@@ -1797,7 +1905,7 @@
static uma_keg_t
uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
- int align, u_int32_t flags)
+ int align, uint32_t flags)
{
struct uma_kctor_args args;
@@ -1822,12 +1930,18 @@
/* See uma.h */
uma_zone_t
uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
- uma_init uminit, uma_fini fini, int align, u_int32_t flags)
+ uma_init uminit, uma_fini fini, int align, uint32_t flags)
{
struct uma_zctor_args args;
+ uma_zone_t res;
+ bool locked;
+ KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
+ align, name));
+
/* This stuff is essential for the zone ctor */
+ memset(&args, 0, sizeof(args));
args.name = name;
args.size = size;
args.ctor = ctor;
@@ -1838,7 +1952,16 @@
args.flags = flags;
args.keg = NULL;
- return (zone_alloc_item(zones, &args, M_WAITOK));
+ if (booted < UMA_STARTUP2) {
+ locked = false;
+ } else {
+ sx_slock(&uma_drain_lock);
+ locked = true;
+ }
+ res = zone_alloc_item(zones, &args, M_WAITOK);
+ if (locked)
+ sx_sunlock(&uma_drain_lock);
+ return (res);
}
/* See uma.h */
@@ -1848,8 +1971,11 @@
{
struct uma_zctor_args args;
uma_keg_t keg;
+ uma_zone_t res;
+ bool locked;
keg = zone_first_keg(master);
+ memset(&args, 0, sizeof(args));
args.name = name;
args.size = keg->uk_size;
args.ctor = ctor;
@@ -1860,7 +1986,40 @@
args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
args.keg = keg;
+ if (booted < UMA_STARTUP2) {
+ locked = false;
+ } else {
+ sx_slock(&uma_drain_lock);
+ locked = true;
+ }
/* XXX Attaches only one keg of potentially many. */
+ res = zone_alloc_item(zones, &args, M_WAITOK);
+ if (locked)
+ sx_sunlock(&uma_drain_lock);
+ return (res);
+}
+
+/* See uma.h */
+uma_zone_t
+uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
+ uma_init zinit, uma_fini zfini, uma_import zimport,
+ uma_release zrelease, void *arg, int flags)
+{
+ struct uma_zctor_args args;
+
+ memset(&args, 0, sizeof(args));
+ args.name = name;
+ args.size = size;
+ args.ctor = ctor;
+ args.dtor = dtor;
+ args.uminit = zinit;
+ args.fini = zfini;
+ args.import = zimport;
+ args.release = zrelease;
+ args.arg = arg;
+ args.align = 0;
+ args.flags = flags;
+
return (zone_alloc_item(zones, &args, M_WAITOK));
}
@@ -1869,10 +2028,10 @@
{
if (a < b) {
ZONE_LOCK(a);
- mtx_lock_flags(b->uz_lock, MTX_DUPOK);
+ mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
} else {
ZONE_LOCK(b);
- mtx_lock_flags(a->uz_lock, MTX_DUPOK);
+ mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
}
}
@@ -1955,7 +2114,9 @@
uma_zdestroy(uma_zone_t zone)
{
- zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
+ sx_slock(&uma_drain_lock);
+ zone_free_item(zones, zone, NULL, SKIP_NONE);
+ sx_sunlock(&uma_drain_lock);
}
/* See uma.h */
@@ -1965,6 +2126,7 @@
void *item;
uma_cache_t cache;
uma_bucket_t bucket;
+ int lockfail;
int cpu;
/* This is the fast path allocation */
@@ -1978,7 +2140,30 @@
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"uma_zalloc_arg: zone \"%s\"", zone->uz_name);
}
-
+#ifdef DEBUG_MEMGUARD
+ if (memguard_cmp_zone(zone)) {
+ item = memguard_alloc(zone->uz_size, flags);
+ if (item != NULL) {
+ /*
+ * Avoid conflict with the use-after-free
+ * protecting infrastructure from INVARIANTS.
+ */
+ if (zone->uz_init != NULL &&
+ zone->uz_init != mtrash_init &&
+ zone->uz_init(item, zone->uz_size, flags) != 0)
+ return (NULL);
+ if (zone->uz_ctor != NULL &&
+ zone->uz_ctor != mtrash_ctor &&
+ zone->uz_ctor(item, zone->uz_size, udata,
+ flags) != 0) {
+ zone->uz_fini(item, zone->uz_size);
+ return (NULL);
+ }
+ return (item);
+ }
+ /* This is unfortunate but should not be fatal. */
+ }
+#endif
/*
* If possible, allocate from the per-CPU cache. There are two
* requirements for safe access to the per-CPU cache: (1) the thread
@@ -1990,7 +2175,6 @@
* the current cache; when we re-acquire the critical section, we
* must detect and handle migration if it has occurred.
*/
-zalloc_restart:
critical_enter();
cpu = curcpu;
cache = &zone->uz_cpu[cpu];
@@ -1997,54 +2181,57 @@
zalloc_start:
bucket = cache->uc_allocbucket;
-
- if (bucket) {
- if (bucket->ub_cnt > 0) {
- bucket->ub_cnt--;
- item = bucket->ub_bucket[bucket->ub_cnt];
+ if (bucket != NULL && bucket->ub_cnt > 0) {
+ bucket->ub_cnt--;
+ item = bucket->ub_bucket[bucket->ub_cnt];
#ifdef INVARIANTS
- bucket->ub_bucket[bucket->ub_cnt] = NULL;
+ bucket->ub_bucket[bucket->ub_cnt] = NULL;
#endif
- KASSERT(item != NULL,
- ("uma_zalloc: Bucket pointer mangled."));
- cache->uc_allocs++;
- critical_exit();
+ KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
+ cache->uc_allocs++;
+ critical_exit();
+ if (zone->uz_ctor != NULL &&
+ zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
+ atomic_add_long(&zone->uz_fails, 1);
+ zone_free_item(zone, item, udata, SKIP_DTOR);
+ return (NULL);
+ }
#ifdef INVARIANTS
- ZONE_LOCK(zone);
- uma_dbg_alloc(zone, NULL, item);
- ZONE_UNLOCK(zone);
+ uma_dbg_alloc(zone, NULL, item);
#endif
- if (zone->uz_ctor != NULL) {
- if (zone->uz_ctor(item, zone->uz_size,
- udata, flags) != 0) {
- zone_free_item(zone, item, udata,
- SKIP_DTOR, ZFREE_STATFAIL |
- ZFREE_STATFREE);
- return (NULL);
- }
- }
- if (flags & M_ZERO)
- bzero(item, zone->uz_size);
- return (item);
- } else if (cache->uc_freebucket) {
- /*
- * We have run out of items in our allocbucket.
- * See if we can switch with our free bucket.
- */
- if (cache->uc_freebucket->ub_cnt > 0) {
+ if (flags & M_ZERO)
+ uma_zero_item(item, zone);
+ return (item);
+ }
+
+ /*
+ * We have run out of items in our alloc bucket.
+ * See if we can switch with our free bucket.
+ */
+ bucket = cache->uc_freebucket;
+ if (bucket != NULL && bucket->ub_cnt > 0) {
#ifdef UMA_DEBUG_ALLOC
- printf("uma_zalloc: Swapping empty with"
- " alloc.\n");
+ printf("uma_zalloc: Swapping empty with alloc.\n");
#endif
- bucket = cache->uc_freebucket;
- cache->uc_freebucket = cache->uc_allocbucket;
- cache->uc_allocbucket = bucket;
+ cache->uc_freebucket = cache->uc_allocbucket;
+ cache->uc_allocbucket = bucket;
+ goto zalloc_start;
+ }
- goto zalloc_start;
- }
- }
- }
/*
+ * Discard any empty allocation bucket while we hold no locks.
+ */
+ bucket = cache->uc_allocbucket;
+ cache->uc_allocbucket = NULL;
+ critical_exit();
+ if (bucket != NULL)
+ bucket_free(zone, bucket, udata);
+
+ /* Short-circuit for zones without buckets and low memory. */
+ if (zone->uz_count == 0 || bucketdisable)
+ goto zalloc_item;
+
+ /*
* Attempt to retrieve the item from the per-CPU cache has failed, so
* we must go back to the zone. This requires the zone lock, so we
* must drop the critical section, then re-acquire it when we go back
@@ -2053,41 +2240,34 @@
* thread-local state specific to the cache from prior to releasing
* the critical section.
*/
- critical_exit();
- ZONE_LOCK(zone);
+ lockfail = 0;
+ if (ZONE_TRYLOCK(zone) == 0) {
+ /* Record contention to size the buckets. */
+ ZONE_LOCK(zone);
+ lockfail = 1;
+ }
critical_enter();
cpu = curcpu;
cache = &zone->uz_cpu[cpu];
- bucket = cache->uc_allocbucket;
- if (bucket != NULL) {
- if (bucket->ub_cnt > 0) {
- ZONE_UNLOCK(zone);
- goto zalloc_start;
- }
- bucket = cache->uc_freebucket;
- if (bucket != NULL && bucket->ub_cnt > 0) {
- ZONE_UNLOCK(zone);
- goto zalloc_start;
- }
- }
- /* Since we have locked the zone we may as well send back our stats */
- zone->uz_allocs += cache->uc_allocs;
+ /*
+ * Since we have locked the zone we may as well send back our stats.
+ */
+ atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
+ atomic_add_long(&zone->uz_frees, cache->uc_frees);
cache->uc_allocs = 0;
- zone->uz_frees += cache->uc_frees;
cache->uc_frees = 0;
- /* Our old one is now a free bucket */
- if (cache->uc_allocbucket) {
- KASSERT(cache->uc_allocbucket->ub_cnt == 0,
- ("uma_zalloc_arg: Freeing a non free bucket."));
- LIST_INSERT_HEAD(&zone->uz_free_bucket,
- cache->uc_allocbucket, ub_link);
- cache->uc_allocbucket = NULL;
+ /* See if we lost the race to fill the cache. */
+ if (cache->uc_allocbucket != NULL) {
+ ZONE_UNLOCK(zone);
+ goto zalloc_start;
}
- /* Check the free list for a new alloc bucket */
- if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
+ /*
+ * Check the zone's cache of buckets.
+ */
+ if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
KASSERT(bucket->ub_cnt != 0,
("uma_zalloc_arg: Returning an empty bucket."));
@@ -2099,19 +2279,38 @@
/* We are no longer associated with this CPU. */
critical_exit();
- /* Bump up our uz_count so we get here less */
- if (zone->uz_count < BUCKET_MAX)
+ /*
+ * We bump the uz count when the cache size is insufficient to
+ * handle the working set.
+ */
+ if (lockfail && zone->uz_count < BUCKET_MAX)
zone->uz_count++;
+ ZONE_UNLOCK(zone);
/*
* Now lets just fill a bucket and put it on the free list. If that
- * works we'll restart the allocation from the begining.
+	 * works we'll restart the allocation from the beginning and it
+ * will use the just filled bucket.
*/
- if (zone_alloc_bucket(zone, flags)) {
+ bucket = zone_alloc_bucket(zone, udata, flags);
+ if (bucket != NULL) {
+ ZONE_LOCK(zone);
+ critical_enter();
+ cpu = curcpu;
+ cache = &zone->uz_cpu[cpu];
+ /*
+ * See if we lost the race or were migrated. Cache the
+ * initialized bucket to make this less likely or claim
+ * the memory directly.
+ */
+ if (cache->uc_allocbucket == NULL)
+ cache->uc_allocbucket = bucket;
+ else
+ LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
ZONE_UNLOCK(zone);
- goto zalloc_restart;
+ goto zalloc_start;
}
- ZONE_UNLOCK(zone);
+
/*
* We may not be able to get a bucket so return an actual item.
*/
@@ -2119,7 +2318,9 @@
printf("uma_zalloc_arg: Bucketzone returned NULL\n");
#endif
+zalloc_item:
item = zone_alloc_item(zone, udata, flags);
+
return (item);
}
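
Taken together, the reworked uma_zalloc_arg() is a three-level fallback: pop from the per-CPU alloc bucket under nothing but a critical section, then look for a full bucket on the zone's uz_buckets list under the zone lock, and only then import fresh items from the keg. A much simplified, single-CPU sketch of that ordering in standalone C (every name below is invented for illustration; locking and the migration race checks are omitted):

#include <stddef.h>

struct bucket { void *items[16]; int cnt; };

struct zone {
	struct bucket *cpu_alloc;	/* per-CPU cache (one CPU shown) */
	struct bucket *cpu_free;
	struct bucket *zone_full;	/* stand-in for the uz_buckets list */
	void *(*import_one)(void);	/* stand-in for uz_import of 1 item */
};

static void *
zalloc_sketch(struct zone *z)
{
	struct bucket *b = z->cpu_alloc;

	/* 1. Lock-free fast path: per-CPU alloc bucket. */
	if (b != NULL && b->cnt > 0)
		return (b->items[--b->cnt]);

	/* 2. Swap in the per-CPU free bucket if it still has items. */
	if (z->cpu_free != NULL && z->cpu_free->cnt > 0) {
		z->cpu_alloc = z->cpu_free;
		z->cpu_free = b;
		return (zalloc_sketch(z));
	}

	/* 3. Take a full bucket from the zone (zone lock in the kernel). */
	if (z->zone_full != NULL && z->zone_full->cnt > 0) {
		z->cpu_alloc = z->zone_full;
		z->zone_full = NULL;
		return (zalloc_sketch(z));
	}

	/* 4. Last resort: import a single item from the keg layer. */
	return (z->import_one());
}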
@@ -2127,9 +2328,13 @@
keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
{
uma_slab_t slab;
+ int reserve;
mtx_assert(&keg->uk_lock, MA_OWNED);
slab = NULL;
+ reserve = 0;
+ if ((flags & M_USE_RESERVE) == 0)
+ reserve = keg->uk_reserve;
for (;;) {
/*
@@ -2137,7 +2342,7 @@
* used over those that are totally full. This helps to reduce
* fragmentation.
*/
- if (keg->uk_free != 0) {
+ if (keg->uk_free > reserve) {
if (!LIST_EMPTY(&keg->uk_part_slab)) {
slab = LIST_FIRST(&keg->uk_part_slab);
} else {
@@ -2162,8 +2367,10 @@
* If this is not a multi-zone, set the FULL bit.
* Otherwise slab_multi() takes care of it.
*/
- if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0)
+ if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
zone->uz_flags |= UMA_ZFLAG_FULL;
+ zone_log_warning(zone);
+ }
if (flags & M_NOWAIT)
break;
zone->uz_sleeps++;
@@ -2170,9 +2377,7 @@
msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
continue;
}
- keg->uk_recurse++;
slab = keg_alloc_slab(keg, zone, flags);
- keg->uk_recurse--;
/*
* If we got a slab here it's safe to mark it partially used
* and return. We assume that the caller is going to remove
@@ -2193,42 +2398,15 @@
return (slab);
}
-static inline void
-zone_relock(uma_zone_t zone, uma_keg_t keg)
-{
- if (zone->uz_lock != &keg->uk_lock) {
- KEG_UNLOCK(keg);
- ZONE_LOCK(zone);
- }
-}
-
-static inline void
-keg_relock(uma_keg_t keg, uma_zone_t zone)
-{
- if (zone->uz_lock != &keg->uk_lock) {
- ZONE_UNLOCK(zone);
- KEG_LOCK(keg);
- }
-}
-
static uma_slab_t
zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
{
uma_slab_t slab;
- if (keg == NULL)
+ if (keg == NULL) {
keg = zone_first_keg(zone);
- /*
- * This is to prevent us from recursively trying to allocate
- * buckets. The problem is that if an allocation forces us to
- * grab a new bucket we will call page_alloc, which will go off
- * and cause the vm to allocate vm_map_entries. If we need new
- * buckets there too we will recurse in kmem_alloc and bad
- * things happen. So instead we return a NULL bucket, and make
- * the code that allocates buckets smart enough to deal with it
- */
- if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
- return (NULL);
+ KEG_LOCK(keg);
+ }
for (;;) {
slab = keg_fetch_slab(keg, zone, flags);
@@ -2237,13 +2415,13 @@
if (flags & (M_NOWAIT | M_NOVM))
break;
}
+ KEG_UNLOCK(keg);
return (NULL);
}
/*
* uma_zone_fetch_slab_multi: Fetches a slab from one available keg. Returns
- * with the keg locked. Caller must call zone_relock() afterwards if the
- * zone lock is required. On NULL the zone lock is held.
+ * with the keg locked. On NULL no lock is held.
*
* The last pointer is used to seed the search. It is not required.
*/
@@ -2267,12 +2445,11 @@
* Use the last slab allocated as a hint for where to start
* the search.
*/
- if (last) {
+ if (last != NULL) {
slab = keg_fetch_slab(last, zone, flags);
if (slab)
return (slab);
- zone_relock(zone, last);
- last = NULL;
+ KEG_UNLOCK(last);
}
/*
* Loop until we have a slab in case of transient failures
@@ -2288,7 +2465,7 @@
*/
LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
keg = klink->kl_keg;
- keg_relock(keg, zone);
+ KEG_LOCK(keg);
if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
slab = keg_fetch_slab(keg, zone, flags);
if (slab)
@@ -2298,7 +2475,7 @@
full++;
else
empty++;
- zone_relock(zone, keg);
+ KEG_UNLOCK(keg);
}
if (rflags & (M_NOWAIT | M_NOVM))
break;
@@ -2308,10 +2485,14 @@
* and sleep so just sleep for a short period and retry.
*/
if (full && !empty) {
+ ZONE_LOCK(zone);
zone->uz_flags |= UMA_ZFLAG_FULL;
zone->uz_sleeps++;
- msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
+ zone_log_warning(zone);
+ msleep(zone, zone->uz_lockptr, PVM,
+ "zonelimit", hz/100);
zone->uz_flags &= ~UMA_ZFLAG_FULL;
+ ZONE_UNLOCK(zone);
continue;
}
}
@@ -2319,30 +2500,20 @@
}
static void *
-slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
+slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
{
- uma_keg_t keg;
- uma_slabrefcnt_t slabref;
void *item;
- u_int8_t freei;
+ uint8_t freei;
- keg = slab->us_keg;
+ MPASS(keg == slab->us_keg);
mtx_assert(&keg->uk_lock, MA_OWNED);
- freei = slab->us_firstfree;
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- slabref = (uma_slabrefcnt_t)slab;
- slab->us_firstfree = slabref->us_freelist[freei].us_item;
- } else {
- slab->us_firstfree = slab->us_freelist[freei].us_item;
- }
+ freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
+ BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
item = slab->us_data + (keg->uk_rsize * freei);
-
slab->us_freecount--;
keg->uk_free--;
-#ifdef INVARIANTS
- uma_dbg_alloc(zone, slab, item);
-#endif
+
/* Move this slab to the full list */
if (slab->us_freecount == 0) {
LIST_REMOVE(slab, us_link);
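
The per-slab free list bytes are gone: slab_alloc_item() now finds a free index with BIT_FFS() on the us_free bitset, clears it with BIT_CLR(), and derives the item address as us_data + rsize * index. A userland sketch of the same idea, with a single 64-bit word standing in for the SLAB_SETSIZE-wide bitset:

#include <stdint.h>
#include <stdio.h>

/* One 64-bit word standing in for the kernel's BITSET of SLAB_SETSIZE bits. */
static uint64_t freebits = ~0ULL;	/* all items start out free */

static int
alloc_index(void)
{
	if (freebits == 0)
		return (-1);			/* slab is full */
	int idx = __builtin_ffsll(freebits) - 1;	/* BIT_FFS() analogue */
	freebits &= ~(1ULL << idx);			/* BIT_CLR() analogue */
	return (idx);
}

int
main(void)
{
	char slab_data[64 * 128];		/* us_data: 64 items of rsize 128 */
	int idx = alloc_index();
	void *item = slab_data + 128 * idx;

	printf("allocated index %d at %p\n", idx, item);
	return (0);
}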
@@ -2353,82 +2524,58 @@
}
static int
-zone_alloc_bucket(uma_zone_t zone, int flags)
+zone_import(uma_zone_t zone, void **bucket, int max, int flags)
{
- uma_bucket_t bucket;
uma_slab_t slab;
uma_keg_t keg;
- int16_t saved;
- int max, origflags = flags;
+ int i;
- /*
- * Try this zone's free list first so we don't allocate extra buckets.
- */
- if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
- KASSERT(bucket->ub_cnt == 0,
- ("zone_alloc_bucket: Bucket on free list is not empty."));
- LIST_REMOVE(bucket, ub_link);
- } else {
- int bflags;
-
- bflags = (flags & ~M_ZERO);
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
- bflags |= M_NOVM;
-
- ZONE_UNLOCK(zone);
- bucket = bucket_alloc(zone->uz_count, bflags);
- ZONE_LOCK(zone);
+ slab = NULL;
+ keg = NULL;
+ /* Try to keep the buckets totally full */
+ for (i = 0; i < max; ) {
+ if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
+ break;
+ keg = slab->us_keg;
+ while (slab->us_freecount && i < max) {
+ bucket[i++] = slab_alloc_item(keg, slab);
+ if (keg->uk_free <= keg->uk_reserve)
+ break;
+ }
+ /* Don't grab more than one slab at a time. */
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
}
+ if (slab != NULL)
+ KEG_UNLOCK(keg);
- if (bucket == NULL) {
- return (0);
- }
+ return i;
+}
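
For instance (numbers assumed), a zone_import() call asking for max = 64 items from a keg with 20 items per slab and nothing on its free lists will sleep at most for the first slab if the caller passed M_WAITOK, take its 20 items, and then try further slabs with M_NOWAIT; if one of those later allocations fails, the function simply returns however many items it has gathered, and zone_alloc_bucket() ships a partially filled bucket.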
-#ifdef SMP
- /*
- * This code is here to limit the number of simultaneous bucket fills
- * for any given zone to the number of per cpu caches in this zone. This
- * is done so that we don't allocate more memory than we really need.
- */
- if (zone->uz_fills >= mp_ncpus)
- goto done;
+static uma_bucket_t
+zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
+{
+ uma_bucket_t bucket;
+ int max;
-#endif
- zone->uz_fills++;
+ /* Don't wait for buckets, preserve caller's NOVM setting. */
+ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
+ if (bucket == NULL)
+ return (NULL);
max = MIN(bucket->ub_entries, zone->uz_count);
- /* Try to keep the buckets totally full */
- saved = bucket->ub_cnt;
- slab = NULL;
- keg = NULL;
- while (bucket->ub_cnt < max &&
- (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
- keg = slab->us_keg;
- while (slab->us_freecount && bucket->ub_cnt < max) {
- bucket->ub_bucket[bucket->ub_cnt++] =
- slab_alloc_item(zone, slab);
- }
+ bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
+ max, flags);
- /* Don't block on the next fill */
- flags |= M_NOWAIT;
- }
- if (slab)
- zone_relock(zone, keg);
-
/*
- * We unlock here because we need to call the zone's init.
- * It should be safe to unlock because the slab dealt with
- * above is already on the appropriate list within the keg
- * and the bucket we filled is not yet on any list, so we
- * own it.
+ * Initialize the memory if necessary.
*/
- if (zone->uz_init != NULL) {
+ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
int i;
- ZONE_UNLOCK(zone);
- for (i = saved; i < bucket->ub_cnt; i++)
+ for (i = 0; i < bucket->ub_cnt; i++)
if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
- origflags) != 0)
+ flags) != 0)
break;
/*
* If we couldn't initialize the whole bucket, put the
@@ -2435,35 +2582,27 @@
* rest back onto the freelist.
*/
if (i != bucket->ub_cnt) {
- int j;
-
- for (j = i; j < bucket->ub_cnt; j++) {
- zone_free_item(zone, bucket->ub_bucket[j],
- NULL, SKIP_FINI, 0);
+ zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
+ bucket->ub_cnt - i);
#ifdef INVARIANTS
- bucket->ub_bucket[j] = NULL;
+ bzero(&bucket->ub_bucket[i],
+ sizeof(void *) * (bucket->ub_cnt - i));
#endif
- }
bucket->ub_cnt = i;
}
- ZONE_LOCK(zone);
}
- zone->uz_fills--;
- if (bucket->ub_cnt != 0) {
- LIST_INSERT_HEAD(&zone->uz_full_bucket,
- bucket, ub_link);
- return (1);
+ if (bucket->ub_cnt == 0) {
+ bucket_free(zone, bucket, udata);
+ atomic_add_long(&zone->uz_fails, 1);
+ return (NULL);
}
-#ifdef SMP
-done:
-#endif
- bucket_free(bucket);
- return (0);
+ return (bucket);
}
+
/*
- * Allocates an item for an internal zone
+ * Allocates a single item from a zone.
*
* Arguments
* zone The zone to alloc for.
@@ -2478,7 +2617,6 @@
static void *
zone_alloc_item(uma_zone_t zone, void *udata, int flags)
{
- uma_slab_t slab;
void *item;
item = NULL;
@@ -2486,21 +2624,10 @@
#ifdef UMA_DEBUG_ALLOC
printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif
- ZONE_LOCK(zone);
+ if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
+ goto fail;
+ atomic_add_long(&zone->uz_allocs, 1);
- slab = zone->uz_slab(zone, NULL, flags);
- if (slab == NULL) {
- zone->uz_fails++;
- ZONE_UNLOCK(zone);
- return (NULL);
- }
-
- item = slab_alloc_item(zone, slab);
-
- zone_relock(zone, slab->us_keg);
- zone->uz_allocs++;
- ZONE_UNLOCK(zone);
-
/*
* We have to call both the zone's init (not the keg's init)
* and the zone's ctor. This is because the item is going from
@@ -2509,22 +2636,27 @@
*/
if (zone->uz_init != NULL) {
if (zone->uz_init(item, zone->uz_size, flags) != 0) {
- zone_free_item(zone, item, udata, SKIP_FINI,
- ZFREE_STATFAIL | ZFREE_STATFREE);
- return (NULL);
+ zone_free_item(zone, item, udata, SKIP_FINI);
+ goto fail;
}
}
if (zone->uz_ctor != NULL) {
if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
- zone_free_item(zone, item, udata, SKIP_DTOR,
- ZFREE_STATFAIL | ZFREE_STATFREE);
- return (NULL);
+ zone_free_item(zone, item, udata, SKIP_DTOR);
+ goto fail;
}
}
+#ifdef INVARIANTS
+ uma_dbg_alloc(zone, NULL, item);
+#endif
if (flags & M_ZERO)
- bzero(item, zone->uz_size);
+ uma_zero_item(item, zone);
return (item);
+
+fail:
+ atomic_add_long(&zone->uz_fails, 1);
+ return (NULL);
}
/* See uma.h */
@@ -2533,7 +2665,7 @@
{
uma_cache_t cache;
uma_bucket_t bucket;
- int bflags;
+ int lockfail;
int cpu;
#ifdef UMA_DEBUG_ALLOC_1
@@ -2545,24 +2677,31 @@
/* uma_zfree(..., NULL) does nothing, to match free(9). */
if (item == NULL)
return;
-
- if (zone->uz_dtor)
- zone->uz_dtor(item, zone->uz_size, udata);
-
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(item)) {
+ if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
+ zone->uz_dtor(item, zone->uz_size, udata);
+ if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
+ zone->uz_fini(item, zone->uz_size);
+ memguard_free(item);
+ return;
+ }
+#endif
#ifdef INVARIANTS
- ZONE_LOCK(zone);
if (zone->uz_flags & UMA_ZONE_MALLOC)
uma_dbg_free(zone, udata, item);
else
uma_dbg_free(zone, NULL, item);
- ZONE_UNLOCK(zone);
#endif
+ if (zone->uz_dtor != NULL)
+ zone->uz_dtor(item, zone->uz_size, udata);
+
/*
* The race here is acceptable. If we miss it we'll just have to wait
* a little longer for the limits to be reset.
*/
if (zone->uz_flags & UMA_ZFLAG_FULL)
- goto zfree_internal;
+ goto zfree_item;
/*
* If possible, free to the per-CPU cache. There are two
@@ -2581,45 +2720,25 @@
cache = &zone->uz_cpu[cpu];
zfree_start:
- bucket = cache->uc_freebucket;
+ /*
+ * Try to free into the allocbucket first to give LIFO ordering
+	 * for cache-hot data structures. Spill over into the freebucket
+ * if necessary. Alloc will swap them if one runs dry.
+ */
+ bucket = cache->uc_allocbucket;
+ if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
+ bucket = cache->uc_freebucket;
+ if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
+ KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
+ ("uma_zfree: Freeing to non free bucket index."));
+ bucket->ub_bucket[bucket->ub_cnt] = item;
+ bucket->ub_cnt++;
+ cache->uc_frees++;
+ critical_exit();
+ return;
+ }
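
In practice that means an item freed and immediately reallocated by a thread staying on the same CPU comes back out of uc_allocbucket while it is still warm in that CPU's cache, instead of waiting in uc_freebucket until the buckets are swapped.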
- if (bucket) {
- /*
- * Do we have room in our bucket? It is OK for this uz count
- * check to be slightly out of sync.
- */
-
- if (bucket->ub_cnt < bucket->ub_entries) {
- KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
- ("uma_zfree: Freeing to non free bucket index."));
- bucket->ub_bucket[bucket->ub_cnt] = item;
- bucket->ub_cnt++;
- cache->uc_frees++;
- critical_exit();
- return;
- } else if (cache->uc_allocbucket) {
-#ifdef UMA_DEBUG_ALLOC
- printf("uma_zfree: Swapping buckets.\n");
-#endif
- /*
- * We have run out of space in our freebucket.
- * See if we can switch with our alloc bucket.
- */
- if (cache->uc_allocbucket->ub_cnt <
- cache->uc_freebucket->ub_cnt) {
- bucket = cache->uc_freebucket;
- cache->uc_freebucket = cache->uc_allocbucket;
- cache->uc_allocbucket = bucket;
- goto zfree_start;
- }
- }
- }
/*
- * We can get here for two reasons:
- *
- * 1) The buckets are NULL
- * 2) The alloc and free buckets are both somewhat full.
- *
* We must go back the zone, which requires acquiring the zone lock,
* which in turn means we must release and re-acquire the critical
* section. Since the critical section is released, we may be
@@ -2628,32 +2747,35 @@
* the critical section.
*/
critical_exit();
- ZONE_LOCK(zone);
+ if (zone->uz_count == 0 || bucketdisable)
+ goto zfree_item;
+
+ lockfail = 0;
+ if (ZONE_TRYLOCK(zone) == 0) {
+ /* Record contention to size the buckets. */
+ ZONE_LOCK(zone);
+ lockfail = 1;
+ }
critical_enter();
cpu = curcpu;
cache = &zone->uz_cpu[cpu];
- if (cache->uc_freebucket != NULL) {
- if (cache->uc_freebucket->ub_cnt <
- cache->uc_freebucket->ub_entries) {
- ZONE_UNLOCK(zone);
- goto zfree_start;
- }
- if (cache->uc_allocbucket != NULL &&
- (cache->uc_allocbucket->ub_cnt <
- cache->uc_freebucket->ub_cnt)) {
- ZONE_UNLOCK(zone);
- goto zfree_start;
- }
- }
- /* Since we have locked the zone we may as well send back our stats */
- zone->uz_allocs += cache->uc_allocs;
+ /*
+ * Since we have locked the zone we may as well send back our stats.
+ */
+ atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
+ atomic_add_long(&zone->uz_frees, cache->uc_frees);
cache->uc_allocs = 0;
- zone->uz_frees += cache->uc_frees;
cache->uc_frees = 0;
bucket = cache->uc_freebucket;
+ if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
+ ZONE_UNLOCK(zone);
+ goto zfree_start;
+ }
cache->uc_freebucket = NULL;
+ /* We are no longer associated with this CPU. */
+ critical_exit();
/* Can we throw this on the zone full list? */
if (bucket != NULL) {
@@ -2663,34 +2785,35 @@
/* ub_cnt is pointing to the last free item */
KASSERT(bucket->ub_cnt != 0,
("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
- LIST_INSERT_HEAD(&zone->uz_full_bucket,
- bucket, ub_link);
+ LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
}
- if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
- LIST_REMOVE(bucket, ub_link);
- ZONE_UNLOCK(zone);
- cache->uc_freebucket = bucket;
- goto zfree_start;
- }
- /* We are no longer associated with this CPU. */
- critical_exit();
- /* And the zone.. */
+ /*
+ * We bump the uz count when the cache size is insufficient to
+ * handle the working set.
+ */
+ if (lockfail && zone->uz_count < BUCKET_MAX)
+ zone->uz_count++;
ZONE_UNLOCK(zone);
#ifdef UMA_DEBUG_ALLOC
printf("uma_zfree: Allocating new free bucket.\n");
#endif
- bflags = M_NOWAIT;
-
- if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
- bflags |= M_NOVM;
- bucket = bucket_alloc(zone->uz_count, bflags);
+ bucket = bucket_alloc(zone, udata, M_NOWAIT);
if (bucket) {
- ZONE_LOCK(zone);
- LIST_INSERT_HEAD(&zone->uz_free_bucket,
- bucket, ub_link);
- ZONE_UNLOCK(zone);
+ critical_enter();
+ cpu = curcpu;
+ cache = &zone->uz_cpu[cpu];
+ if (cache->uc_freebucket == NULL) {
+ cache->uc_freebucket = bucket;
+ goto zfree_start;
+ }
+ /*
+ * We lost the race, start over. We have to drop our
+ * critical section to free the bucket.
+ */
+ critical_exit();
+ bucket_free(zone, bucket, udata);
goto zfree_restart;
}
@@ -2697,63 +2820,18 @@
/*
* If nothing else caught this, we'll just do an internal free.
*/
-zfree_internal:
- zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
+zfree_item:
+ zone_free_item(zone, item, udata, SKIP_DTOR);
return;
}
-/*
- * Frees an item to an INTERNAL zone or allocates a free bucket
- *
- * Arguments:
- * zone The zone to free to
- * item The item we're freeing
- * udata User supplied data for the dtor
- * skip Skip dtors and finis
- */
static void
-zone_free_item(uma_zone_t zone, void *item, void *udata,
- enum zfreeskip skip, int flags)
+slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
{
- uma_slab_t slab;
- uma_slabrefcnt_t slabref;
- uma_keg_t keg;
- u_int8_t *mem;
- u_int8_t freei;
- int clearfull;
+ uint8_t freei;
- if (skip < SKIP_DTOR && zone->uz_dtor)
- zone->uz_dtor(item, zone->uz_size, udata);
-
- if (skip < SKIP_FINI && zone->uz_fini)
- zone->uz_fini(item, zone->uz_size);
-
- ZONE_LOCK(zone);
-
- if (flags & ZFREE_STATFAIL)
- zone->uz_fails++;
- if (flags & ZFREE_STATFREE)
- zone->uz_frees++;
-
- if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
- mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
- keg = zone_first_keg(zone); /* Must only be one. */
- if (zone->uz_flags & UMA_ZONE_HASH) {
- slab = hash_sfind(&keg->uk_hash, mem);
- } else {
- mem += keg->uk_pgoff;
- slab = (uma_slab_t)mem;
- }
- } else {
- /* This prevents redundant lookups via free(). */
- if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
- slab = (uma_slab_t)udata;
- else
- slab = vtoslab((vm_offset_t)item);
- keg = slab->us_keg;
- keg_relock(keg, zone);
- }
+ mtx_assert(&keg->uk_lock, MA_OWNED);
MPASS(keg == slab->us_keg);
/* Do we need to remove from any lists? */
@@ -2765,51 +2843,104 @@
LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
}
- /* Slab management stuff */
- freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / keg->uk_rsize;
-
-#ifdef INVARIANTS
- if (!skip)
- uma_dbg_free(zone, slab, item);
-#endif
-
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- slabref = (uma_slabrefcnt_t)slab;
- slabref->us_freelist[freei].us_item = slab->us_firstfree;
- } else {
- slab->us_freelist[freei].us_item = slab->us_firstfree;
- }
- slab->us_firstfree = freei;
+ /* Slab management. */
+ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+ BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
slab->us_freecount++;
- /* Zone statistics */
+ /* Keg statistics. */
keg->uk_free++;
+}
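
slab_free_item() recovers the item's index purely from pointer arithmetic, (item - us_data) / uk_rsize, and then sets the corresponding bit in us_free. A small standalone sketch of that index recovery (sizes invented):

#include <stdint.h>
#include <assert.h>

/* Recover an item's slab index from its address, as slab_free_item() does. */
static unsigned
item_index(const char *us_data, const void *item, unsigned rsize)
{
	uintptr_t off = (uintptr_t)item - (uintptr_t)us_data;

	assert(off % rsize == 0);	/* otherwise the free is unaligned */
	return (off / rsize);
}

int
main(void)
{
	char us_data[32 * 256];			/* 32 items of rsize 256 */
	const void *item = us_data + 5 * 256;	/* pretend item #5 is freed */

	return (item_index(us_data, item, 256) == 5 ? 0 : 1);
}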
+static void
+zone_release(uma_zone_t zone, void **bucket, int cnt)
+{
+ void *item;
+ uma_slab_t slab;
+ uma_keg_t keg;
+ uint8_t *mem;
+ int clearfull;
+ int i;
+
clearfull = 0;
- if (keg->uk_flags & UMA_ZFLAG_FULL) {
- if (keg->uk_pages < keg->uk_maxpages) {
- keg->uk_flags &= ~UMA_ZFLAG_FULL;
- clearfull = 1;
+ keg = zone_first_keg(zone);
+ KEG_LOCK(keg);
+ for (i = 0; i < cnt; i++) {
+ item = bucket[i];
+ if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
+ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
+ if (zone->uz_flags & UMA_ZONE_HASH) {
+ slab = hash_sfind(&keg->uk_hash, mem);
+ } else {
+ mem += keg->uk_pgoff;
+ slab = (uma_slab_t)mem;
+ }
+ } else {
+ slab = vtoslab((vm_offset_t)item);
+ if (slab->us_keg != keg) {
+ KEG_UNLOCK(keg);
+ keg = slab->us_keg;
+ KEG_LOCK(keg);
+ }
}
+ slab_free_item(keg, slab, item);
+ if (keg->uk_flags & UMA_ZFLAG_FULL) {
+ if (keg->uk_pages < keg->uk_maxpages) {
+ keg->uk_flags &= ~UMA_ZFLAG_FULL;
+ clearfull = 1;
+ }
- /*
- * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
- * wake up all procs blocked on pages. This should be uncommon, so
- * keeping this simple for now (rather than adding count of blocked
- * threads etc).
- */
- wakeup(keg);
+ /*
+ * We can handle one more allocation. Since we're
+ * clearing ZFLAG_FULL, wake up all procs blocked
+ * on pages. This should be uncommon, so keeping this
+ * simple for now (rather than adding count of blocked
+ * threads etc).
+ */
+ wakeup(keg);
+ }
}
+ KEG_UNLOCK(keg);
if (clearfull) {
- zone_relock(zone, keg);
+ ZONE_LOCK(zone);
zone->uz_flags &= ~UMA_ZFLAG_FULL;
wakeup(zone);
ZONE_UNLOCK(zone);
- } else
- KEG_UNLOCK(keg);
+ }
+
}
+/*
+ * Frees a single item to any zone.
+ *
+ * Arguments:
+ * zone The zone to free to
+ * item The item we're freeing
+ * udata User supplied data for the dtor
+ * skip Skip dtors and finis
+ */
+static void
+zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
+{
+
+#ifdef INVARIANTS
+ if (skip == SKIP_NONE) {
+ if (zone->uz_flags & UMA_ZONE_MALLOC)
+ uma_dbg_free(zone, udata, item);
+ else
+ uma_dbg_free(zone, NULL, item);
+ }
+#endif
+ if (skip < SKIP_DTOR && zone->uz_dtor)
+ zone->uz_dtor(item, zone->uz_size, udata);
+
+ if (skip < SKIP_FINI && zone->uz_fini)
+ zone->uz_fini(item, zone->uz_size);
+
+ atomic_add_long(&zone->uz_frees, 1);
+ zone->uz_release(zone->uz_arg, &item, 1);
+}
+
/* See uma.h */
int
uma_zone_set_max(uma_zone_t zone, int nitems)
@@ -2816,13 +2947,15 @@
{
uma_keg_t keg;
- ZONE_LOCK(zone);
keg = zone_first_keg(zone);
+ if (keg == NULL)
+ return (0);
+ KEG_LOCK(keg);
keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
if (keg->uk_maxpages * keg->uk_ipers < nitems)
keg->uk_maxpages += keg->uk_ppera;
- nitems = keg->uk_maxpages * keg->uk_ipers;
- ZONE_UNLOCK(zone);
+ nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
+ KEG_UNLOCK(keg);
return (nitems);
}
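
A quick worked example of the new rounding: with uk_ipers = 50 and uk_ppera = 1, uma_zone_set_max(zone, 120) computes uk_maxpages = (120 / 50) * 1 = 2, sees that 2 * 50 = 100 is still below 120, bumps uk_maxpages to 3, and returns (3 / 1) * 50 = 150, i.e. the enforced limit is rounded up to the next whole slab's worth of items.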
@@ -2834,15 +2967,27 @@
int nitems;
uma_keg_t keg;
- ZONE_LOCK(zone);
keg = zone_first_keg(zone);
- nitems = keg->uk_maxpages * keg->uk_ipers;
- ZONE_UNLOCK(zone);
+ if (keg == NULL)
+ return (0);
+ KEG_LOCK(keg);
+ nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
+ KEG_UNLOCK(keg);
return (nitems);
}
/* See uma.h */
+void
+uma_zone_set_warning(uma_zone_t zone, const char *warning)
+{
+
+ ZONE_LOCK(zone);
+ zone->uz_warning = warning;
+ ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
int
uma_zone_get_cur(uma_zone_t zone)
{
@@ -2871,12 +3016,13 @@
{
uma_keg_t keg;
- ZONE_LOCK(zone);
keg = zone_first_keg(zone);
+ KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
+ KEG_LOCK(keg);
KASSERT(keg->uk_pages == 0,
("uma_zone_set_init on non-empty keg"));
keg->uk_init = uminit;
- ZONE_UNLOCK(zone);
+ KEG_UNLOCK(keg);
}
/* See uma.h */
@@ -2885,12 +3031,13 @@
{
uma_keg_t keg;
- ZONE_LOCK(zone);
keg = zone_first_keg(zone);
+ KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
+ KEG_LOCK(keg);
KASSERT(keg->uk_pages == 0,
("uma_zone_set_fini on non-empty keg"));
keg->uk_fini = fini;
- ZONE_UNLOCK(zone);
+ KEG_UNLOCK(keg);
}
/* See uma.h */
@@ -2897,6 +3044,7 @@
void
uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
{
+
ZONE_LOCK(zone);
KASSERT(zone_first_keg(zone)->uk_pages == 0,
("uma_zone_set_zinit on non-empty keg"));
@@ -2908,6 +3056,7 @@
void
uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
{
+
ZONE_LOCK(zone);
KASSERT(zone_first_keg(zone)->uk_pages == 0,
("uma_zone_set_zfini on non-empty keg"));
@@ -2920,10 +3069,13 @@
void
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
{
+ uma_keg_t keg;
- ZONE_LOCK(zone);
- zone_first_keg(zone)->uk_freef = freef;
- ZONE_UNLOCK(zone);
+ keg = zone_first_keg(zone);
+ KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
+ KEG_LOCK(keg);
+ keg->uk_freef = freef;
+ KEG_UNLOCK(keg);
}
/* See uma.h */
@@ -2933,44 +3085,67 @@
{
uma_keg_t keg;
- ZONE_LOCK(zone);
keg = zone_first_keg(zone);
- keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
+ KEG_LOCK(keg);
keg->uk_allocf = allocf;
- ZONE_UNLOCK(zone);
+ KEG_UNLOCK(keg);
}
/* See uma.h */
+void
+uma_zone_reserve(uma_zone_t zone, int items)
+{
+ uma_keg_t keg;
+
+ keg = zone_first_keg(zone);
+ if (keg == NULL)
+ return;
+ KEG_LOCK(keg);
+ keg->uk_reserve = items;
+ KEG_UNLOCK(keg);
+
+ return;
+}
+
+/* See uma.h */
int
-uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
+uma_zone_reserve_kva(uma_zone_t zone, int count)
{
uma_keg_t keg;
vm_offset_t kva;
- int pages;
+ u_int pages;
keg = zone_first_keg(zone);
+ if (keg == NULL)
+ return (0);
pages = count / keg->uk_ipers;
if (pages * keg->uk_ipers < count)
pages++;
+ pages *= keg->uk_ppera;
- kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
-
- if (kva == 0)
- return (0);
- if (obj == NULL)
- obj = vm_object_allocate(OBJT_PHYS, pages);
- else {
- VM_OBJECT_LOCK_INIT(obj, "uma object");
- _vm_object_allocate(OBJT_PHYS, pages, obj);
- }
- ZONE_LOCK(zone);
+#ifdef UMA_MD_SMALL_ALLOC
+ if (keg->uk_ppera > 1) {
+#else
+ if (1) {
+#endif
+ kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
+ if (kva == 0)
+ return (0);
+ } else
+ kva = 0;
+ KEG_LOCK(keg);
keg->uk_kva = kva;
- keg->uk_obj = obj;
+ keg->uk_offset = 0;
keg->uk_maxpages = pages;
- keg->uk_allocf = obj_alloc;
- keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
- ZONE_UNLOCK(zone);
+#ifdef UMA_MD_SMALL_ALLOC
+ keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
+#else
+ keg->uk_allocf = noobj_alloc;
+#endif
+ keg->uk_flags |= UMA_ZONE_NOFREE;
+ KEG_UNLOCK(keg);
+
return (1);
}
@@ -2983,7 +3158,9 @@
uma_keg_t keg;
keg = zone_first_keg(zone);
- ZONE_LOCK(zone);
+ if (keg == NULL)
+ return;
+ KEG_LOCK(keg);
slabs = items / keg->uk_ipers;
if (slabs * keg->uk_ipers < items)
slabs++;
@@ -2995,38 +3172,44 @@
LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
slabs--;
}
- ZONE_UNLOCK(zone);
+ KEG_UNLOCK(keg);
}
/* See uma.h */
-u_int32_t *
+uint32_t *
uma_find_refcnt(uma_zone_t zone, void *item)
{
uma_slabrefcnt_t slabref;
+ uma_slab_t slab;
uma_keg_t keg;
- u_int32_t *refcnt;
+ uint32_t *refcnt;
int idx;
- slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
- (~UMA_SLAB_MASK));
- keg = slabref->us_keg;
- KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
+ slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
+ slabref = (uma_slabrefcnt_t)slab;
+ keg = slab->us_keg;
+ KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
- idx = ((unsigned long)item - (unsigned long)slabref->us_data)
- / keg->uk_rsize;
- refcnt = &slabref->us_freelist[idx].us_refcnt;
+ idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+ refcnt = &slabref->us_refcnt[idx];
return refcnt;
}
/* See uma.h */
-void
-uma_reclaim(void)
+static void
+uma_reclaim_locked(bool kmem_danger)
{
+
#ifdef UMA_DEBUG
printf("UMA: vm asked us to release pages!\n");
#endif
+ sx_assert(&uma_drain_lock, SA_XLOCKED);
bucket_enable();
zone_foreach(zone_drain);
+ if (vm_page_count_min() || kmem_danger) {
+ cache_drain_safe(NULL);
+ zone_foreach(zone_drain);
+ }
/*
* Some slabs may have been freed but this zone will be visited early
* we visit again so that we can free pages that are empty once other
@@ -3037,6 +3220,43 @@
bucket_zone_drain();
}
+void
+uma_reclaim(void)
+{
+
+ sx_xlock(&uma_drain_lock);
+ uma_reclaim_locked(false);
+ sx_xunlock(&uma_drain_lock);
+}
+
+static int uma_reclaim_needed;
+
+void
+uma_reclaim_wakeup(void)
+{
+
+ uma_reclaim_needed = 1;
+ wakeup(&uma_reclaim_needed);
+}
+
+void
+uma_reclaim_worker(void *arg __unused)
+{
+
+ sx_xlock(&uma_drain_lock);
+ for (;;) {
+ sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM,
+ "umarcl", 0);
+ if (uma_reclaim_needed) {
+ uma_reclaim_needed = 0;
+ sx_xunlock(&uma_drain_lock);
+ EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
+ sx_xlock(&uma_drain_lock);
+ uma_reclaim_locked(true);
+ }
+ }
+}
+
/* See uma.h */
int
uma_zone_exhausted(uma_zone_t zone)
@@ -3056,11 +3276,11 @@
}
void *
-uma_large_malloc(int size, int wait)
+uma_large_malloc(vm_size_t size, int wait)
{
void *mem;
uma_slab_t slab;
- u_int8_t flags;
+ uint8_t flags;
slab = zone_alloc_item(slabzone, NULL, wait);
if (slab == NULL)
@@ -3072,8 +3292,7 @@
slab->us_flags = flags | UMA_SLAB_MALLOC;
slab->us_size = size;
} else {
- zone_free_item(slabzone, slab, NULL, SKIP_NONE,
- ZFREE_STATFAIL | ZFREE_STATFREE);
+ zone_free_item(slabzone, slab, NULL, SKIP_NONE);
}
return (mem);
@@ -3082,11 +3301,22 @@
void
uma_large_free(uma_slab_t slab)
{
- vsetobj((vm_offset_t)slab->us_data, kmem_object);
+
page_free(slab->us_data, slab->us_size, slab->us_flags);
- zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
+ zone_free_item(slabzone, slab, NULL, SKIP_NONE);
}
+static void
+uma_zero_item(void *item, uma_zone_t zone)
+{
+
+ if (zone->uz_flags & UMA_ZONE_PCPU) {
+ for (int i = 0; i < mp_ncpus; i++)
+ bzero(zpcpu_get_cpu(item, i), zone->uz_size);
+ } else
+ bzero(item, zone->uz_size);
+}
+
void
uma_print_stats(void)
{
@@ -3096,9 +3326,8 @@
static void
slab_print(uma_slab_t slab)
{
- printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
- slab->us_keg, slab->us_data, slab->us_freecount,
- slab->us_firstfree);
+ printf("slab: keg %p, data %p, freecount %d\n",
+ slab->us_keg, slab->us_data, slab->us_freecount);
}
static void
@@ -3120,8 +3349,8 @@
"out %d free %d limit %d\n",
keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
keg->uk_ipers, keg->uk_ppera,
- (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
- (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
+ (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
+ keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
printf("Part slabs:\n");
LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
slab_print(slab);
@@ -3164,11 +3393,11 @@
* directly so that we don't have to.
*/
static void
-uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
- u_int64_t *freesp, u_int64_t *sleepsp)
+uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
+ uint64_t *freesp, uint64_t *sleepsp)
{
uma_cache_t cache;
- u_int64_t allocs, frees, sleeps;
+ uint64_t allocs, frees, sleeps;
int cachefree, cpu;
allocs = frees = sleeps = 0;
@@ -3204,12 +3433,12 @@
int count;
count = 0;
- mtx_lock(&uma_mtx);
+ rw_rlock(&uma_rwlock);
LIST_FOREACH(kz, &uma_kegs, uk_link) {
LIST_FOREACH(z, &kz->uk_zones, uz_link)
count++;
}
- mtx_unlock(&uma_mtx);
+ rw_runlock(&uma_rwlock);
return (sysctl_handle_int(oidp, &count, 0, req));
}
@@ -3234,7 +3463,7 @@
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
count = 0;
- mtx_lock(&uma_mtx);
+ rw_rlock(&uma_rwlock);
LIST_FOREACH(kz, &uma_kegs, uk_link) {
LIST_FOREACH(z, &kz->uk_zones, uz_link)
count++;
@@ -3274,7 +3503,7 @@
(LIST_FIRST(&kz->uk_zones) != z))
uth.uth_zone_flags = UTH_ZONE_SECONDARY;
- LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
+ LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
uth.uth_zone_free += bucket->ub_cnt;
uth.uth_allocs = z->uz_allocs;
uth.uth_frees = z->uz_frees;
@@ -3310,23 +3539,52 @@
ZONE_UNLOCK(z);
}
}
- mtx_unlock(&uma_mtx);
+ rw_runlock(&uma_rwlock);
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
+int
+sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
+{
+ uma_zone_t zone = *(uma_zone_t *)arg1;
+ int error, max, old;
+
+ old = max = uma_zone_get_max(zone);
+ error = sysctl_handle_int(oidp, &max, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (max < old)
+ return (EINVAL);
+
+ uma_zone_set_max(zone, max);
+
+ return (0);
+}
+
+int
+sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
+{
+ uma_zone_t zone = *(uma_zone_t *)arg1;
+ int cur;
+
+ cur = uma_zone_get_cur(zone);
+ return (sysctl_handle_int(oidp, &cur, 0, req));
+}
+
#ifdef DDB
DB_SHOW_COMMAND(uma, db_show_uma)
{
- u_int64_t allocs, frees, sleeps;
+ uint64_t allocs, frees, sleeps;
uma_bucket_t bucket;
uma_keg_t kz;
uma_zone_t z;
int cachefree;
- db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
- "Requests", "Sleeps");
+ db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
+ "Free", "Requests", "Sleeps", "Bucket");
LIST_FOREACH(kz, &uma_kegs, uk_link) {
LIST_FOREACH(z, &kz->uk_zones, uz_link) {
if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
@@ -3340,15 +3598,37 @@
if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
(LIST_FIRST(&kz->uk_zones) != z)))
cachefree += kz->uk_free;
- LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
+ LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
cachefree += bucket->ub_cnt;
- db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
- (uintmax_t)kz->uk_size,
+ db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
+ z->uz_name, (uintmax_t)kz->uk_size,
(intmax_t)(allocs - frees), cachefree,
- (uintmax_t)allocs, sleeps);
+ (uintmax_t)allocs, sleeps, z->uz_count);
if (db_pager_quit)
return;
}
}
}
+
+DB_SHOW_COMMAND(umacache, db_show_umacache)
+{
+ uint64_t allocs, frees;
+ uma_bucket_t bucket;
+ uma_zone_t z;
+ int cachefree;
+
+ db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
+ "Requests", "Bucket");
+ LIST_FOREACH(z, &uma_cachezones, uz_link) {
+ uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
+ LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
+ cachefree += bucket->ub_cnt;
+ db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
+ z->uz_name, (uintmax_t)z->uz_size,
+ (intmax_t)(allocs - frees), cachefree,
+ (uintmax_t)allocs, z->uz_count);
+ if (db_pager_quit)
+ return;
+ }
+}
#endif
Modified: trunk/sys/vm/uma_dbg.c
===================================================================
--- trunk/sys/vm/uma_dbg.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_dbg.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -31,10 +32,11 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/bitset.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/queue.h>
@@ -49,7 +51,7 @@
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>
-static const u_int32_t uma_junk = 0xdeadc0de;
+static const uint32_t uma_junk = 0xdeadc0de;
/*
* Checks an item to make sure it hasn't been overwritten since it was freed,
@@ -62,7 +64,7 @@
trash_ctor(void *mem, int size, void *arg, int flags)
{
int cnt;
- u_int32_t *p;
+ uint32_t *p;
cnt = size / sizeof(uma_junk);
@@ -85,7 +87,7 @@
trash_dtor(void *mem, int size, void *arg)
{
int cnt;
- u_int32_t *p;
+ uint32_t *p;
cnt = size / sizeof(uma_junk);
@@ -122,7 +124,7 @@
mtrash_ctor(void *mem, int size, void *arg, int flags)
{
struct malloc_type **ksp;
- u_int32_t *p = mem;
+ uint32_t *p = mem;
int cnt;
size -= sizeof(struct malloc_type *);
@@ -150,7 +152,7 @@
mtrash_dtor(void *mem, int size, void *arg)
{
int cnt;
- u_int32_t *p;
+ uint32_t *p;
size -= sizeof(struct malloc_type *);
cnt = size / sizeof(uma_junk);
@@ -191,22 +193,30 @@
(void)mtrash_ctor(mem, size, NULL, 0);
}
+#ifdef INVARIANTS
static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
uma_slab_t slab;
uma_keg_t keg;
- u_int8_t *mem;
+ uint8_t *mem;
- mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
+ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
slab = vtoslab((vm_offset_t)mem);
} else {
+ /*
+ * It is safe to return the slab here even though the
+ * zone is unlocked because the item's allocation state
+ * essentially holds a reference.
+ */
+ ZONE_LOCK(zone);
keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
if (keg->uk_flags & UMA_ZONE_HASH)
slab = hash_sfind(&keg->uk_hash, mem);
else
slab = (uma_slab_t)(mem + keg->uk_pgoff);
+ ZONE_UNLOCK(zone);
}
return (slab);
@@ -216,14 +226,14 @@
* Set up the slab's freei data such that uma_dbg_free can function.
*
*/
-
void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
uma_keg_t keg;
- uma_slabrefcnt_t slabref;
int freei;
+ if (zone_first_keg(zone) == NULL)
+ return;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -231,17 +241,13 @@
item, zone->uz_name);
}
keg = slab->us_keg;
+ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
- freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / keg->uk_rsize;
+ if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+ panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
+ BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- slabref = (uma_slabrefcnt_t)slab;
- slabref->us_freelist[freei].us_item = 255;
- } else {
- slab->us_freelist[freei].us_item = 255;
- }
-
return;
}
@@ -250,14 +256,14 @@
* and duplicate frees.
*
*/
-
void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
uma_keg_t keg;
- uma_slabrefcnt_t slabref;
int freei;
+ if (zone_first_keg(zone) == NULL)
+ return;
if (slab == NULL) {
slab = uma_dbg_getslab(zone, item);
if (slab == NULL)
@@ -265,49 +271,21 @@
item, zone->uz_name);
}
keg = slab->us_keg;
+ freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
- freei = ((unsigned long)item - (unsigned long)slab->us_data)
- / keg->uk_rsize;
-
if (freei >= keg->uk_ipers)
- panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
- zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
+ panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
- if (((freei * keg->uk_rsize) + slab->us_data) != item) {
- printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
- zone->uz_name, zone, slab, item);
- panic("should be %p\n",
- (freei * keg->uk_rsize) + slab->us_data);
- }
+ if (((freei * keg->uk_rsize) + slab->us_data) != item)
+ panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
- if (keg->uk_flags & UMA_ZONE_REFCNT) {
- slabref = (uma_slabrefcnt_t)slab;
- if (slabref->us_freelist[freei].us_item != 255) {
- printf("Slab at %p, freei %d = %d.\n",
- slab, freei, slabref->us_freelist[freei].us_item);
- panic("Duplicate free of item %p from zone %p(%s)\n",
- item, zone, zone->uz_name);
- }
+ if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+ panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
+ item, zone, zone->uz_name, slab, freei);
- /*
- * When this is actually linked into the slab this will change.
- * Until then the count of valid slabs will make sure we don't
- * accidentally follow this and assume it's a valid index.
- */
- slabref->us_freelist[freei].us_item = 0;
- } else {
- if (slab->us_freelist[freei].us_item != 255) {
- printf("Slab at %p, freei %d = %d.\n",
- slab, freei, slab->us_freelist[freei].us_item);
- panic("Duplicate free of item %p from zone %p(%s)\n",
- item, zone, zone->uz_name);
- }
+ BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+}
- /*
- * When this is actually linked into the slab this will change.
- * Until then the count of valid slabs will make sure we don't
- * accidentally follow this and assume it's a valid index.
- */
- slab->us_freelist[freei].us_item = 0;
- }
-}
+#endif /* INVARIANTS */
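
With INVARIANTS enabled, the rewritten uma_dbg_alloc() and uma_dbg_free()
above track each slab item in a per-slab bitset (us_debugfree) via the
sys/bitset.h macros instead of the old per-item free-list bytes: an
allocation asserts the item's bit is clear and sets it, a free asserts the
bit is set and clears it, so both duplicate allocations and duplicate frees
are caught.  A standalone sketch of the same bookkeeping, with made-up
sizes and names; the kernel uses BIT_ISSET()/BIT_SET_ATOMIC()/
BIT_CLR_ATOMIC() on slab->us_debugfree rather than this hand-rolled bitmap.

/*
 * Userland illustration of the per-slab debug bitset.  ITEMS_PER_SLAB
 * stands in for keg->uk_ipers.
 */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define ITEMS_PER_SLAB	64
#define BPW		(sizeof(unsigned long) * CHAR_BIT)

static unsigned long debugfree[(ITEMS_PER_SLAB + BPW - 1) / BPW];

static void
dbg_alloc(int freei)
{
	assert(!(debugfree[freei / BPW] & (1UL << (freei % BPW))) &&
	    "duplicate alloc");
	debugfree[freei / BPW] |= 1UL << (freei % BPW);
}

static void
dbg_free(int freei)
{
	assert((debugfree[freei / BPW] & (1UL << (freei % BPW))) &&
	    "duplicate free");
	debugfree[freei / BPW] &= ~(1UL << (freei % BPW));
}

int
main(void)
{
	dbg_alloc(3);
	dbg_free(3);
	dbg_free(3);	/* second free of the same item trips the assert */
	printf("not reached\n");
	return (0);
}
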
Modified: trunk/sys/vm/uma_dbg.h
===================================================================
--- trunk/sys/vm/uma_dbg.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_dbg.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -24,7 +25,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $
*
*/
Modified: trunk/sys/vm/uma_int.h
===================================================================
--- trunk/sys/vm/uma_int.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_int.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff at FreeBSD.org>
+ * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
* All rights reserved.
*
@@ -24,7 +25,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $
*
*/
@@ -45,21 +46,10 @@
*
* The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
* be allocated off the page from a special slab zone. The free list within a
- * slab is managed with a linked list of indices, which are 8 bit values. If
- * UMA_SLAB_SIZE is defined to be too large I will have to switch to 16bit
- * values. Currently on alpha you can get 250 or so 32 byte items and on x86
- * you can get 250 or so 16byte items. For item sizes that would yield more
- * than 10% memory waste we potentially allocate a separate uma_slab_t if this
- * will improve the number of items per slab that will fit.
+ * slab is managed with a bitmask. For item sizes that would yield more than
+ * 10% memory waste we potentially allocate a separate uma_slab_t if this will
+ * improve the number of items per slab that will fit.
*
- * Other potential space optimizations are storing the 8bit of linkage in space
- * wasted between items due to alignment problems. This may yield a much better
- * memory footprint for certain sizes of objects. Another alternative is to
- * increase the UMA_SLAB_SIZE, or allow for dynamic slab sizes. I prefer
- * dynamic slab sizes because we could stick with 8 bit indices and only use
- * large slab sizes for zones with a lot of waste per slab. This may create
- * inefficiencies in the vm subsystem due to fragmentation in the address space.
- *
* The only really gross cases, with regards to memory waste, are for those
* items that are just over half the page size. You can get nearly 50% waste,
* so you fall back to the memory footprint of the power of two allocator. I
@@ -120,8 +110,8 @@
#define UMA_BOOT_PAGES 64 /* Pages allocated for startup */
-/* Max waste before going to off page slab management */
-#define UMA_MAX_WASTE (UMA_SLAB_SIZE / 10)
+/* Max waste percentage before going to off page slab management */
+#define UMA_MAX_WASTE 10
/*
* I doubt there will be many cases where this is exceeded. This is the initial
@@ -133,14 +123,9 @@
/*
* I should investigate other hashing algorithms. This should yield a low
* number of collisions if the pages are relatively contiguous.
- *
- * This is the same algorithm that most processor caches use.
- *
- * I'm shifting and masking instead of % because it should be faster.
*/
-#define UMA_HASH(h, s) ((((unsigned long)s) >> UMA_SLAB_SHIFT) & \
- (h)->uh_hashmask)
+#define UMA_HASH(h, s) ((((uintptr_t)s) >> UMA_SLAB_SHIFT) & (h)->uh_hashmask)
#define UMA_HASH_INSERT(h, s, mem) \
SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h), \
@@ -184,8 +169,8 @@
struct uma_cache {
uma_bucket_t uc_freebucket; /* Bucket we're freeing to */
uma_bucket_t uc_allocbucket; /* Bucket to allocate from */
- u_int64_t uc_allocs; /* Count of allocations */
- u_int64_t uc_frees; /* Count of frees */
+ uint64_t uc_allocs; /* Count of allocations */
+ uint64_t uc_frees; /* Count of frees */
} UMA_ALIGN;
typedef struct uma_cache * uma_cache_t;
@@ -197,24 +182,21 @@
*
*/
struct uma_keg {
- LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
-
- struct mtx uk_lock; /* Lock for the keg */
+ struct mtx_padalign uk_lock; /* Lock for the keg */
struct uma_hash uk_hash;
- const char *uk_name; /* Name of creating zone. */
LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
- u_int32_t uk_recurse; /* Allocation recursion count */
- u_int32_t uk_align; /* Alignment mask */
- u_int32_t uk_pages; /* Total page count */
- u_int32_t uk_free; /* Count of items free in slabs */
- u_int32_t uk_size; /* Requested size of each item */
- u_int32_t uk_rsize; /* Real size of each item */
- u_int32_t uk_maxpages; /* Maximum number of pages to alloc */
+ uint32_t uk_align; /* Alignment mask */
+ uint32_t uk_pages; /* Total page count */
+ uint32_t uk_free; /* Count of items free in slabs */
+ uint32_t uk_reserve; /* Number of reserved items. */
+ uint32_t uk_size; /* Requested size of each item */
+ uint32_t uk_rsize; /* Real size of each item */
+ uint32_t uk_maxpages; /* Maximum number of pages to alloc */
uma_init uk_init; /* Keg's init routine */
uma_fini uk_fini; /* Keg's fini routine */
@@ -221,21 +203,32 @@
uma_alloc uk_allocf; /* Allocation function */
uma_free uk_freef; /* Free routine */
- struct vm_object *uk_obj; /* Zone specific object */
- vm_offset_t uk_kva; /* Base kva for zones with objs */
+ u_long uk_offset; /* Next free offset from base KVA */
+ vm_offset_t uk_kva; /* Zone base KVA */
uma_zone_t uk_slabzone; /* Slab zone backing us, if OFFPAGE */
- u_int16_t uk_pgoff; /* Offset to uma_slab struct */
- u_int16_t uk_ppera; /* pages per allocation from backend */
- u_int16_t uk_ipers; /* Items per slab */
- u_int32_t uk_flags; /* Internal flags */
+ uint16_t uk_pgoff; /* Offset to uma_slab struct */
+ uint16_t uk_ppera; /* pages per allocation from backend */
+ uint16_t uk_ipers; /* Items per slab */
+ uint32_t uk_flags; /* Internal flags */
+
+ /* Least used fields go to the last cache line. */
+ const char *uk_name; /* Name of creating zone. */
+ LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
};
typedef struct uma_keg * uma_keg_t;
-/* Page management structure */
+/*
+ * Free bits per-slab.
+ */
+#define SLAB_SETSIZE (PAGE_SIZE / UMA_SMALLEST_UNIT)
+BITSET_DEFINE(slabbits, SLAB_SETSIZE);
-/* Sorry for the union, but space efficiency is important */
-struct uma_slab_head {
+/*
+ * The slab structure manages a single contiguous allocation from backing
+ * store and subdivides it into individually allocatable items.
+ */
+struct uma_slab {
uma_keg_t us_keg; /* Keg we live in */
union {
LIST_ENTRY(uma_slab) _us_link; /* slabs in zone */
@@ -242,19 +235,18 @@
unsigned long _us_size; /* Size of allocation */
} us_type;
SLIST_ENTRY(uma_slab) us_hlink; /* Link for hash table */
- u_int8_t *us_data; /* First item */
- u_int8_t us_flags; /* Page flags see uma.h */
- u_int8_t us_freecount; /* How many are free? */
- u_int8_t us_firstfree; /* First free item index */
+ uint8_t *us_data; /* First item */
+ struct slabbits us_free; /* Free bitmask. */
+#ifdef INVARIANTS
+ struct slabbits us_debugfree; /* Debug bitmask. */
+#endif
+ uint16_t us_freecount; /* How many are free? */
+ uint8_t us_flags; /* Page flags see uma.h */
+ uint8_t us_pad; /* Pad to 32bits, unused. */
};
-/* The standard slab structure */
-struct uma_slab {
- struct uma_slab_head us_head; /* slab header data */
- struct {
- u_int8_t us_item;
- } us_freelist[1]; /* actual number bigger */
-};
+#define us_link us_type._us_link
+#define us_size us_type._us_size
/*
* The slab structure for UMA_ZONE_REFCNT zones for whose items we
@@ -261,37 +253,14 @@
 * maintain reference counters in the slab.
*/
struct uma_slab_refcnt {
- struct uma_slab_head us_head; /* slab header data */
- struct {
- u_int8_t us_item;
- u_int32_t us_refcnt;
- } us_freelist[1]; /* actual number bigger */
+ struct uma_slab us_head; /* slab header data */
+ uint32_t us_refcnt[0]; /* Actually larger. */
};
-#define us_keg us_head.us_keg
-#define us_link us_head.us_type._us_link
-#define us_size us_head.us_type._us_size
-#define us_hlink us_head.us_hlink
-#define us_data us_head.us_data
-#define us_flags us_head.us_flags
-#define us_freecount us_head.us_freecount
-#define us_firstfree us_head.us_firstfree
-
typedef struct uma_slab * uma_slab_t;
typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
-
-/*
- * These give us the size of one free item reference within our corresponding
- * uma_slab structures, so that our calculations during zone setup are correct
- * regardless of what the compiler decides to do with padding the structure
- * arrays within uma_slab.
- */
-#define UMA_FRITM_SZ (sizeof(struct uma_slab) - sizeof(struct uma_slab_head))
-#define UMA_FRITMREF_SZ (sizeof(struct uma_slab_refcnt) - \
- sizeof(struct uma_slab_head))
-
struct uma_klink {
LIST_ENTRY(uma_klink) kl_link;
uma_keg_t kl_keg;
@@ -305,12 +274,12 @@
*
*/
struct uma_zone {
- const char *uz_name; /* Text name of the zone */
- struct mtx *uz_lock; /* Lock for the zone (keg's lock) */
+ struct mtx_padalign uz_lock; /* Lock for the zone */
+ struct mtx_padalign *uz_lockptr;
+ const char *uz_name; /* Text name of the zone */
LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
- LIST_HEAD(,uma_bucket) uz_full_bucket; /* full buckets */
- LIST_HEAD(,uma_bucket) uz_free_bucket; /* Buckets for frees */
+ LIST_HEAD(,uma_bucket) uz_buckets; /* full buckets */
LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
struct uma_klink uz_klink; /* klink for first keg. */
@@ -319,18 +288,25 @@
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
uma_init uz_init; /* Initializer for each item */
- uma_fini uz_fini; /* Discards memory */
+ uma_fini uz_fini; /* Finalizer for each item. */
+ uma_import uz_import; /* Import new memory to cache. */
+ uma_release uz_release; /* Release memory from cache. */
+ void *uz_arg; /* Import/release argument. */
- u_int32_t uz_flags; /* Flags inherited from kegs */
- u_int32_t uz_size; /* Size inherited from kegs */
+ uint32_t uz_flags; /* Flags inherited from kegs */
+ uint32_t uz_size; /* Size inherited from kegs */
- u_int64_t uz_allocs UMA_ALIGN; /* Total number of allocations */
- u_int64_t uz_frees; /* Total number of frees */
- u_int64_t uz_fails; /* Total number of alloc failures */
- u_int64_t uz_sleeps; /* Total number of alloc sleeps */
- uint16_t uz_fills; /* Outstanding bucket fills */
- uint16_t uz_count; /* Highest value ub_ptr can have */
+ volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */
+ volatile u_long uz_fails; /* Total number of alloc failures */
+ volatile u_long uz_frees; /* Total number of frees */
+ uint64_t uz_sleeps; /* Total number of alloc sleeps */
+ uint16_t uz_count; /* Amount of items in full bucket */
+ uint16_t uz_count_min; /* Minimal amount of items there */
+ /* The following fields are used to print rate-limited warnings. */
+ const char *uz_warning; /* Warning to print on failure */
+ struct timeval uz_ratecheck; /* Warnings rate-limiting */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
@@ -341,23 +317,31 @@
/*
* These flags must not overlap with the UMA_ZONE flags specified in uma.h.
*/
-#define UMA_ZFLAG_BUCKET 0x02000000 /* Bucket zone. */
#define UMA_ZFLAG_MULTI 0x04000000 /* Multiple kegs in the zone. */
#define UMA_ZFLAG_DRAINING 0x08000000 /* Running zone_drain. */
-#define UMA_ZFLAG_PRIVALLOC 0x10000000 /* Use uz_allocf. */
+#define UMA_ZFLAG_BUCKET 0x10000000 /* Bucket zone. */
#define UMA_ZFLAG_INTERNAL 0x20000000 /* No offpage no PCPU. */
#define UMA_ZFLAG_FULL 0x40000000 /* Reached uz_maxpages */
#define UMA_ZFLAG_CACHEONLY 0x80000000 /* Don't ask VM for buckets. */
-#define UMA_ZFLAG_INHERIT (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | \
- UMA_ZFLAG_BUCKET)
+#define UMA_ZFLAG_INHERIT \
+ (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET)
+static inline uma_keg_t
+zone_first_keg(uma_zone_t zone)
+{
+ uma_klink_t klink;
+
+ klink = LIST_FIRST(&zone->uz_kegs);
+ return (klink != NULL) ? klink->kl_keg : NULL;
+}
+
#undef UMA_ALIGN
#ifdef _KERNEL
/* Internal prototypes */
-static __inline uma_slab_t hash_sfind(struct uma_hash *hash, u_int8_t *data);
-void *uma_large_malloc(int size, int wait);
+static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
+void *uma_large_malloc(vm_size_t size, int wait);
void uma_large_free(uma_slab_t slab);
/* Lock Macros */
@@ -371,13 +355,26 @@
mtx_init(&(k)->uk_lock, (k)->uk_name, \
"UMA zone", MTX_DEF | MTX_DUPOK); \
} while (0)
-
+
#define KEG_LOCK_FINI(k) mtx_destroy(&(k)->uk_lock)
#define KEG_LOCK(k) mtx_lock(&(k)->uk_lock)
#define KEG_UNLOCK(k) mtx_unlock(&(k)->uk_lock)
-#define ZONE_LOCK(z) mtx_lock((z)->uz_lock)
-#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock)
+#define ZONE_LOCK_INIT(z, lc) \
+ do { \
+ if ((lc)) \
+ mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ (z)->uz_name, MTX_DEF | MTX_DUPOK); \
+ else \
+ mtx_init(&(z)->uz_lock, (z)->uz_name, \
+ "UMA zone", MTX_DEF | MTX_DUPOK); \
+ } while (0)
+
+#define ZONE_LOCK(z) mtx_lock((z)->uz_lockptr)
+#define ZONE_TRYLOCK(z) mtx_trylock((z)->uz_lockptr)
+#define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lockptr)
+#define ZONE_LOCK_FINI(z) mtx_destroy(&(z)->uz_lock)
+
/*
* Find a slab within a hash table. This is used for OFFPAGE zones to lookup
* the slab structure.
@@ -390,7 +387,7 @@
* A pointer to a slab if successful, else NULL.
*/
static __inline uma_slab_t
-hash_sfind(struct uma_hash *hash, u_int8_t *data)
+hash_sfind(struct uma_hash *hash, uint8_t *data)
{
uma_slab_t slab;
int hval;
@@ -398,7 +395,7 @@
hval = UMA_HASH(hash, data);
SLIST_FOREACH(slab, &hash->uh_slab_hash[hval], us_hlink) {
- if ((u_int8_t *)slab->us_data == data)
+ if ((uint8_t *)slab->us_data == data)
return (slab);
}
return (NULL);
@@ -408,15 +405,9 @@
vtoslab(vm_offset_t va)
{
vm_page_t p;
- uma_slab_t slab;
p = PHYS_TO_VM_PAGE(pmap_kextract(va));
- slab = (uma_slab_t )p->object;
-
- if (p->flags & PG_SLAB)
- return (slab);
- else
- return (NULL);
+ return ((uma_slab_t)p->plinks.s.pv);
}
static __inline void
@@ -425,27 +416,17 @@
vm_page_t p;
p = PHYS_TO_VM_PAGE(pmap_kextract(va));
- p->object = (vm_object_t)slab;
- p->flags |= PG_SLAB;
+ p->plinks.s.pv = slab;
}
-static __inline void
-vsetobj(vm_offset_t va, vm_object_t obj)
-{
- vm_page_t p;
-
- p = PHYS_TO_VM_PAGE(pmap_kextract(va));
- p->object = obj;
- p->flags &= ~PG_SLAB;
-}
-
/*
* The following two functions may be defined by architecture specific code
 * if they can provide more efficient allocation functions. This is useful
* for using direct mapped addresses.
*/
-void *uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait);
-void uma_small_free(void *mem, int size, u_int8_t flags);
+void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
+ int wait);
+void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
#endif /* _KERNEL */
#endif /* VM_UMA_INT_H */
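
The simplified UMA_HASH() above hashes a slab's backing address for the
OFFPAGE case by shifting away the low UMA_SLAB_SHIFT bits and masking into
a power-of-two bucket array, which is what lets hash_sfind() walk a short
SLIST per bucket.  A small standalone sketch of the same shift-and-mask
scheme; the shift value and table size here are examples, not the kernel's.

/*
 * Shift-and-mask hashing as in UMA_HASH(): nearby slabs map to distinct
 * buckets, so collisions stay low while slab memory is roughly
 * contiguous.  HASH_SIZE must be a power of two for the mask to work.
 */
#include <stdint.h>
#include <stdio.h>

#define SLAB_SHIFT	12		/* e.g. one 4 KB page per slab */
#define HASH_SIZE	64
#define HASH_MASK	(HASH_SIZE - 1)

static unsigned
slab_hash(uintptr_t data)
{
	return ((data >> SLAB_SHIFT) & HASH_MASK);
}

int
main(void)
{
	uintptr_t base = 0x100000;
	int i;

	for (i = 0; i < 4; i++)
		printf("slab at 0x%lx -> bucket %u\n",
		    (unsigned long)(base + ((uintptr_t)i << SLAB_SHIFT)),
		    slab_hash(base + ((uintptr_t)i << SLAB_SHIFT)));
	return (0);
}
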
Modified: trunk/sys/vm/vm.h
===================================================================
--- trunk/sys/vm/vm.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -55,7 +56,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $
*/
#ifndef VM_H
@@ -68,6 +69,7 @@
#define VM_INHERIT_SHARE ((vm_inherit_t) 0)
#define VM_INHERIT_COPY ((vm_inherit_t) 1)
#define VM_INHERIT_NONE ((vm_inherit_t) 2)
+#define VM_INHERIT_ZERO ((vm_inherit_t) 3)
#define VM_INHERIT_DEFAULT VM_INHERIT_COPY
typedef u_char vm_prot_t; /* protection codes */
@@ -77,6 +79,7 @@
#define VM_PROT_WRITE ((vm_prot_t) 0x02)
#define VM_PROT_EXECUTE ((vm_prot_t) 0x04)
#define VM_PROT_COPY ((vm_prot_t) 0x08) /* copy-on-read */
+#define VM_PROT_FAULT_LOOKUP ((vm_prot_t) 0x010)
#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE)
@@ -134,8 +137,6 @@
vm_offset_t buffer_eva;
vm_offset_t clean_sva;
vm_offset_t clean_eva;
- vm_offset_t pager_sva;
- vm_offset_t pager_eva;
};
extern struct kva_md_info kmi;
@@ -149,6 +150,7 @@
void swap_reserve_force(vm_ooffset_t incr);
void swap_release(vm_ooffset_t decr);
void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred);
+void swapper(void);
#endif /* VM_H */
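
The new VM_INHERIT_ZERO inheritance type added to vm.h means a map entry
marked with it is neither copied nor shared across fork(); the child
instead gets fresh zero-filled pages for that range.  A userland sketch of
the intended use, assuming the matching INHERIT_ZERO constant is exposed
to applications through <sys/mman.h>:

/*
 * After minherit(..., INHERIT_ZERO), the forked child should read
 * zeroes from the region while the parent keeps its data.
 */
#include <sys/mman.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 4096;
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	strcpy(p, "parent data");
	if (minherit(p, len, INHERIT_ZERO) == -1)
		err(1, "minherit");
	if (fork() == 0) {
		printf("child sees:  \"%s\"\n", p);	/* prints "" */
		_exit(0);
	}
	wait(NULL);
	printf("parent sees: \"%s\"\n", p);	/* prints "parent data" */
	return (0);
}
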
Modified: trunk/sys/vm/vm_extern.h
===================================================================
--- trunk/sys/vm/vm_extern.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_extern.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
@@ -27,36 +28,50 @@
* SUCH DAMAGE.
*
* @(#)vm_extern.h 8.2 (Berkeley) 1/12/94
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $
*/
#ifndef _VM_EXTERN_H_
#define _VM_EXTERN_H_
+struct pmap;
struct proc;
struct vmspace;
struct vnode;
+struct vmem;
#ifdef _KERNEL
-int kernacc(void *, int, int);
-vm_offset_t kmem_alloc(vm_map_t, vm_size_t);
-vm_offset_t kmem_alloc_attr(vm_map_t map, vm_size_t size, int flags,
+/* These operate on kernel virtual addresses only. */
+vm_offset_t kva_alloc(vm_size_t);
+void kva_free(vm_offset_t, vm_size_t);
+
+/* These operate on pageable virtual addresses. */
+vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t);
+void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
+
+/* These operate on virtual addresses backed by memory. */
+vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags,
vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
-vm_offset_t kmem_alloc_contig(vm_map_t map, vm_size_t size, int flags,
- vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
- unsigned long boundary, vm_memattr_t memattr);
-vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t);
-vm_offset_t kmem_alloc_nofault_space(vm_map_t, vm_size_t, int);
-vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t);
-void kmem_free(vm_map_t, vm_offset_t, vm_size_t);
-void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
-void kmem_init(vm_offset_t, vm_offset_t);
-vm_offset_t kmem_malloc(vm_map_t map, vm_size_t size, int flags);
-int kmem_back(vm_map_t, vm_offset_t, vm_size_t, int);
+vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags,
+ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+ vm_memattr_t memattr);
+vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags);
+void kmem_free(struct vmem *, vm_offset_t, vm_size_t);
+
+/* This provides memory for previously allocated address space. */
+int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int);
+void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
+
+/* Bootstrapping. */
vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
boolean_t);
+void kmem_init(vm_offset_t, vm_offset_t);
+void kmem_init_zero_region(void);
+void kmeminit(void);
+
void swapout_procs(int);
+int kernacc(void *, int, int);
int useracc(void *, int, int);
int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
@@ -67,8 +82,6 @@
int fault_flags, vm_page_t *m_hold);
int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
vm_prot_t prot, vm_page_t *ma, int max_count);
-void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
-int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
void vm_waitproc(struct proc *);
int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
@@ -75,7 +88,8 @@
int vm_mmap_to_errno(int rv);
void vm_set_page_size(void);
void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
-struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
+typedef int (*pmap_pinit_t)(struct pmap *pmap);
+struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *);
int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
int vmspace_unshare(struct proc *);
@@ -90,5 +104,6 @@
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
int vm_thread_new(struct thread *td, int pages);
+int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
#endif /* _KERNEL */
#endif /* !_VM_EXTERN_H_ */
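
The vm_extern.h changes above replace the old vm_map_t-based kmem KPI with
vmem-arena-based functions: kva_alloc()/kva_free() manage bare kernel
address space, kmem_malloc()/kmem_free() hand out wired, mapped memory
from an arena, and kmem_back()/kmem_unback() provide or remove the backing
pages for an already-allocated range.  A kernel-side sketch of the two
common cases; passing kmem_arena, as the in-tree malloc(9) path does, is
an assumption of this example rather than something stated in the header.

/*
 * Illustrative only: allocate and release KVA, then wired memory.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

static void
kmem_example(void)
{
	vm_offset_t kva, buf;

	/* Address space only: nothing is mapped behind kva yet. */
	kva = kva_alloc(PAGE_SIZE);
	if (kva != 0)
		kva_free(kva, PAGE_SIZE);

	/* Wired, zeroed, mapped memory from the kmem arena. */
	buf = kmem_malloc(kmem_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
	if (buf != 0)
		kmem_free(kmem_arena, buf, PAGE_SIZE);
}
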
Modified: trunk/sys/vm/vm_fault.c
===================================================================
--- trunk/sys/vm/vm_fault.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_fault.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -72,7 +73,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $");
#include "opt_ktrace.h"
#include "opt_vm.h"
@@ -81,9 +82,9 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
@@ -101,22 +102,12 @@
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
+#include <vm/vm_reserv.h>
-#include <sys/mount.h> /* XXX Temporary for VFS_LOCK_GIANT() */
-
#define PFBAK 4
#define PFFOR 4
-#define PAGEORDER_SIZE (PFBAK+PFFOR)
-static int prefault_pageorder[] = {
- -1 * PAGE_SIZE, 1 * PAGE_SIZE,
- -2 * PAGE_SIZE, 2 * PAGE_SIZE,
- -3 * PAGE_SIZE, 3 * PAGE_SIZE,
- -4 * PAGE_SIZE, 4 * PAGE_SIZE
-};
-
static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
-static void vm_fault_prefault(pmap_t, vm_offset_t, vm_map_entry_t);
#define VM_FAULT_READ_BEHIND 8
#define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
@@ -134,17 +125,19 @@
vm_map_t map;
vm_map_entry_t entry;
int lookup_still_valid;
+ int map_generation;
struct vnode *vp;
- int vfslocked;
};
static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
+ int faultcount, int reqpage);
static inline void
release_page(struct faultstate *fs)
{
- vm_page_wakeup(fs->m);
+ vm_page_xunbusy(fs->m);
vm_page_lock(fs->m);
vm_page_deactivate(fs->m);
vm_page_unlock(fs->m);
@@ -162,39 +155,141 @@
}
static void
+unlock_vp(struct faultstate *fs)
+{
+
+ if (fs->vp != NULL) {
+ vput(fs->vp);
+ fs->vp = NULL;
+ }
+}
+
+static void
unlock_and_deallocate(struct faultstate *fs)
{
vm_object_pip_wakeup(fs->object);
- VM_OBJECT_UNLOCK(fs->object);
+ VM_OBJECT_WUNLOCK(fs->object);
if (fs->object != fs->first_object) {
- VM_OBJECT_LOCK(fs->first_object);
+ VM_OBJECT_WLOCK(fs->first_object);
vm_page_lock(fs->first_m);
vm_page_free(fs->first_m);
vm_page_unlock(fs->first_m);
vm_object_pip_wakeup(fs->first_object);
- VM_OBJECT_UNLOCK(fs->first_object);
+ VM_OBJECT_WUNLOCK(fs->first_object);
fs->first_m = NULL;
}
vm_object_deallocate(fs->first_object);
- unlock_map(fs);
- if (fs->vp != NULL) {
- vput(fs->vp);
- fs->vp = NULL;
+ unlock_map(fs);
+ unlock_vp(fs);
+}
+
+static void
+vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
+ vm_prot_t fault_type, int fault_flags, bool set_wd)
+{
+ bool need_dirty;
+
+ if (((prot & VM_PROT_WRITE) == 0 &&
+ (fault_flags & VM_FAULT_DIRTY) == 0) ||
+ (m->oflags & VPO_UNMANAGED) != 0)
+ return;
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+
+ need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
+ (fault_flags & VM_FAULT_WIRE) == 0) ||
+ (fault_flags & VM_FAULT_DIRTY) != 0;
+
+ if (set_wd)
+ vm_object_set_writeable_dirty(m->object);
+ else
+ /*
+ * If two callers of vm_fault_dirty() with set_wd ==
+ * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
+ * flag set, other with flag clear, race, it is
+ * possible for the no-NOSYNC thread to see m->dirty
+ * != 0 and not clear VPO_NOSYNC. Take vm_page lock
+ * around manipulation of VPO_NOSYNC and
+ * vm_page_dirty() call, to avoid the race and keep
+ * m->oflags consistent.
+ */
+ vm_page_lock(m);
+
+ /*
+ * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
+ * if the page is already dirty to prevent data written with
+ * the expectation of being synced from not being synced.
+ * Likewise if this entry does not request NOSYNC then make
+ * sure the page isn't marked NOSYNC. Applications sharing
+ * data should use the same flags to avoid ping ponging.
+ */
+ if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
+ if (m->dirty == 0) {
+ m->oflags |= VPO_NOSYNC;
+ }
+ } else {
+ m->oflags &= ~VPO_NOSYNC;
}
- VFS_UNLOCK_GIANT(fs->vfslocked);
- fs->vfslocked = 0;
+
+ /*
+ * If the fault is a write, we know that this page is being
+ * written NOW so dirty it explicitly to save on
+ * pmap_is_modified() calls later.
+ *
+ * Also tell the backing pager, if any, that it should remove
+ * any swap backing since the page is now dirty.
+ */
+ if (need_dirty)
+ vm_page_dirty(m);
+ if (!set_wd)
+ vm_page_unlock(m);
+ if (need_dirty)
+ vm_pager_page_unswapped(m);
}
+static void
+vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m)
+{
+
+ if (m_hold != NULL) {
+ *m_hold = m;
+ vm_page_lock(m);
+ vm_page_hold(m);
+ vm_page_unlock(m);
+ }
+}
+
/*
- * TRYPAGER - used by vm_fault to calculate whether the pager for the
- * current object *might* contain the page.
- *
- * default objects are zero-fill, there is no real pager.
+ * Unlocks fs.first_object and fs.map on success.
*/
-#define TRYPAGER (fs.object->type != OBJT_DEFAULT && \
- ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired))
+static int
+vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
+ int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
+{
+ vm_page_t m;
+ int rv;
+ MPASS(fs->vp == NULL);
+ m = vm_page_lookup(fs->first_object, fs->first_pindex);
+ /* A busy page can be mapped for read|execute access. */
+ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
+ vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
+ return (KERN_FAILURE);
+ rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+ PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ vm_fault_fill_hold(m_hold, m);
+ vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+ VM_OBJECT_RUNLOCK(fs->first_object);
+ if (!wired)
+ vm_fault_prefault(fs, vaddr, 0, 0);
+ vm_map_lookup_done(fs->map, fs->entry);
+ curthread->td_ru.ru_minflt++;
+ return (KERN_SUCCESS);
+}
+
/*
* vm_fault:
*
@@ -242,8 +337,7 @@
vm_prot_t prot;
long ahead, behind;
int alloc_req, era, faultcount, nera, reqpage, result;
- boolean_t growstack, is_first_object_locked, wired;
- int map_generation;
+ boolean_t dead, is_first_object_locked, wired;
vm_object_t next_object;
vm_page_t marray[VM_FAULT_READ_MAX];
int hardfault;
@@ -252,10 +346,8 @@
int locked, error;
hardfault = 0;
- growstack = TRUE;
PCPU_INC(cnt.v_vm_faults);
fs.vp = NULL;
- fs.vfslocked = 0;
faultcount = reqpage = 0;
RetryFault:;
@@ -265,21 +357,15 @@
* search.
*/
fs.map = map;
- result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
- &fs.first_object, &fs.first_pindex, &prot, &wired);
+ result = vm_map_lookup(&fs.map, vaddr, fault_type |
+ VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object,
+ &fs.first_pindex, &prot, &wired);
if (result != KERN_SUCCESS) {
- if (growstack && result == KERN_INVALID_ADDRESS &&
- map != kernel_map) {
- result = vm_map_growstack(curproc, vaddr);
- if (result != KERN_SUCCESS)
- return (KERN_FAILURE);
- growstack = FALSE;
- goto RetryFault;
- }
+ unlock_vp(&fs);
return (result);
}
- map_generation = fs.map->timestamp;
+ fs.map_generation = fs.map->timestamp;
if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
panic("vm_fault: fault on nofault entry, addr: %lx",
@@ -286,7 +372,63 @@
(u_long)vaddr);
}
+ if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
+ fs.entry->wiring_thread != curthread) {
+ vm_map_unlock_read(fs.map);
+ vm_map_lock(fs.map);
+ if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
+ (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
+ unlock_vp(&fs);
+ fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+ vm_map_unlock_and_wait(fs.map, 0);
+ } else
+ vm_map_unlock(fs.map);
+ goto RetryFault;
+ }
+
+ MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0);
+
+ if (wired)
+ fault_type = prot | (fault_type & VM_PROT_COPY);
+ else
+ KASSERT((fault_flags & VM_FAULT_WIRE) == 0,
+ ("!wired && VM_FAULT_WIRE"));
+
/*
+ * Try to avoid lock contention on the top-level object through
+ * special-case handling of some types of page faults, specifically,
+ * those that are both (1) mapping an existing page from the top-
+ * level object and (2) not having to mark that object as containing
+ * dirty pages. Under these conditions, a read lock on the top-level
+ * object suffices, allowing multiple page faults of a similar type to
+ * run in parallel on the same top-level object.
+ */
+ if (fs.vp == NULL /* avoid locked vnode leak */ &&
+ (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 &&
+ /* avoid calling vm_object_set_writeable_dirty() */
+ ((prot & VM_PROT_WRITE) == 0 ||
+ (fs.first_object->type != OBJT_VNODE &&
+ (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
+ (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) {
+ VM_OBJECT_RLOCK(fs.first_object);
+ if ((prot & VM_PROT_WRITE) == 0 ||
+ (fs.first_object->type != OBJT_VNODE &&
+ (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
+ (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ result = vm_fault_soft_fast(&fs, vaddr, prot,
+ fault_type, fault_flags, wired, m_hold);
+ if (result == KERN_SUCCESS)
+ return (result);
+ }
+ if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
+ VM_OBJECT_RUNLOCK(fs.first_object);
+ VM_OBJECT_WLOCK(fs.first_object);
+ }
+ } else {
+ VM_OBJECT_WLOCK(fs.first_object);
+ }
+
+ /*
* Make a reference to this object to prevent its disposal while we
* are messing with it. Once we have the reference, the map is free
* to be diddled. Since objects reference their shadows (and copies),
@@ -296,15 +438,11 @@
* truncation operations) during I/O. This must be done after
* obtaining the vnode lock in order to avoid possible deadlocks.
*/
- VM_OBJECT_LOCK(fs.first_object);
vm_object_reference_locked(fs.first_object);
vm_object_pip_add(fs.first_object, 1);
fs.lookup_still_valid = TRUE;
- if (wired)
- fault_type = prot | (fault_type & VM_PROT_COPY);
-
fs.first_m = NULL;
/*
@@ -314,11 +452,18 @@
fs.pindex = fs.first_pindex;
while (TRUE) {
/*
- * If the object is dead, we stop here
+ * If the object is marked for imminent termination,
+ * we retry here, since the collapse pass has raced
+ * with us. Otherwise, if we see terminally dead
+ * object, return fail.
*/
- if (fs.object->flags & OBJ_DEAD) {
+ if ((fs.object->flags & OBJ_DEAD) != 0) {
+ dead = fs.object->type == OBJT_DEAD;
unlock_and_deallocate(&fs);
- return (KERN_PROTECTION_FAILURE);
+ if (dead)
+ return (KERN_PROTECTION_FAILURE);
+ pause("vmf_de", 1);
+ goto RetryFault;
}
/*
@@ -326,31 +471,13 @@
*/
fs.m = vm_page_lookup(fs.object, fs.pindex);
if (fs.m != NULL) {
- /*
- * check for page-based copy on write.
- * We check fs.object == fs.first_object so
- * as to ensure the legacy COW mechanism is
- * used when the page in question is part of
- * a shadow object. Otherwise, vm_page_cowfault()
- * removes the page from the backing object,
- * which is not what we want.
- */
- vm_page_lock(fs.m);
- if ((fs.m->cow) &&
- (fault_type & VM_PROT_WRITE) &&
- (fs.object == fs.first_object)) {
- vm_page_cowfault(fs.m);
- unlock_and_deallocate(&fs);
- goto RetryFault;
- }
-
/*
* Wait/Retry if the page is busy. We have to do this
- * if the page is busy via either VPO_BUSY or
- * vm_page_t->busy because the vm_pager may be using
- * vm_page_t->busy for pageouts ( and even pageins if
- * it is the vnode pager ), and we could end up trying
- * to pagein and pageout the same page simultaneously.
+ * if the page is either exclusive or shared busy
+ * because the vm_pager may be using read busy for
+ * pageouts (and even pageins if it is the vnode
+ * pager), and we could end up trying to pagein and
+ * pageout the same page simultaneously.
*
* We can theoretically allow the busy case on a read
* fault if the page is marked valid, but since such
@@ -357,10 +484,10 @@
* pages are typically already pmap'd, putting that
 * special case in might be more effort than it is
* worth. We cannot under any circumstances mess
- * around with a vm_page_t->busy page except, perhaps,
+ * around with a shared busied page except, perhaps,
* to pmap it.
*/
- if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
+ if (vm_page_busied(fs.m)) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
@@ -367,34 +494,33 @@
* likely to reclaim it.
*/
vm_page_aflag_set(fs.m, PGA_REFERENCED);
- vm_page_unlock(fs.m);
if (fs.object != fs.first_object) {
- if (!VM_OBJECT_TRYLOCK(
+ if (!VM_OBJECT_TRYWLOCK(
fs.first_object)) {
- VM_OBJECT_UNLOCK(fs.object);
- VM_OBJECT_LOCK(fs.first_object);
- VM_OBJECT_LOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
+ VM_OBJECT_WLOCK(fs.first_object);
+ VM_OBJECT_WLOCK(fs.object);
}
vm_page_lock(fs.first_m);
vm_page_free(fs.first_m);
vm_page_unlock(fs.first_m);
vm_object_pip_wakeup(fs.first_object);
- VM_OBJECT_UNLOCK(fs.first_object);
+ VM_OBJECT_WUNLOCK(fs.first_object);
fs.first_m = NULL;
}
unlock_map(&fs);
if (fs.m == vm_page_lookup(fs.object,
fs.pindex)) {
- vm_page_sleep_if_busy(fs.m, TRUE,
- "vmpfw");
+ vm_page_sleep_if_busy(fs.m, "vmpfw");
}
vm_object_pip_wakeup(fs.object);
- VM_OBJECT_UNLOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
PCPU_INC(cnt.v_intrans);
vm_object_deallocate(fs.first_object);
goto RetryFault;
}
- vm_pageq_remove(fs.m);
+ vm_page_lock(fs.m);
+ vm_page_remque(fs.m);
vm_page_unlock(fs.m);
/*
@@ -403,7 +529,7 @@
* (readable), jump to readrest, else break-out ( we
* found the page ).
*/
- vm_page_busy(fs.m);
+ vm_page_xbusy(fs.m);
if (fs.m->valid != VM_PAGE_BITS_ALL)
goto readrest;
break;
@@ -410,10 +536,12 @@
}
/*
- * Page is not resident, If this is the search termination
+ * Page is not resident. If this is the search termination
* or the pager might contain the page, allocate a new page.
+ * Default objects are zero-fill, there is no real pager.
*/
- if (TRYPAGER || fs.object == fs.first_object) {
+ if (fs.object->type != OBJT_DEFAULT ||
+ fs.object == fs.first_object) {
if (fs.pindex >= fs.object->size) {
unlock_and_deallocate(&fs);
return (KERN_PROTECTION_FAILURE);
@@ -460,9 +588,10 @@
*
* Attempt to fault-in the page if there is a chance that the
* pager has it, and potentially fault in additional pages
- * at the same time.
+ * at the same time. For default objects simply provide
+ * zero-filled pages.
*/
- if (TRYPAGER) {
+ if (fs.object->type != OBJT_DEFAULT) {
int rv;
u_char behavior = vm_map_entry_behavior(fs.entry);
@@ -509,30 +638,15 @@
/*
* Call the pager to retrieve the data, if any, after
* releasing the lock on the map. We hold a ref on
- * fs.object and the pages are VPO_BUSY'd.
+ * fs.object and the pages are exclusive busied.
*/
unlock_map(&fs);
-vnode_lock:
- if (fs.object->type == OBJT_VNODE) {
- vp = fs.object->handle;
- if (vp == fs.vp)
- goto vnode_locked;
- else if (fs.vp != NULL) {
- vput(fs.vp);
- fs.vp = NULL;
- }
+ if (fs.object->type == OBJT_VNODE &&
+ (vp = fs.object->handle) != fs.vp) {
+ unlock_vp(&fs);
locked = VOP_ISLOCKED(vp);
- if (VFS_NEEDSGIANT(vp->v_mount) && !fs.vfslocked) {
- fs.vfslocked = 1;
- if (!mtx_trylock(&Giant)) {
- VM_OBJECT_UNLOCK(fs.object);
- mtx_lock(&Giant);
- VM_OBJECT_LOCK(fs.object);
- goto vnode_lock;
- }
- }
if (locked != LK_EXCLUSIVE)
locked = LK_SHARED;
/* Do not sleep for vnode lock while fs.m is busy */
@@ -539,10 +653,6 @@
error = vget(vp, locked | LK_CANRECURSE |
LK_NOWAIT, curthread);
if (error != 0) {
- int vfslocked;
-
- vfslocked = fs.vfslocked;
- fs.vfslocked = 0; /* Keep Giant */
vhold(vp);
release_page(&fs);
unlock_and_deallocate(&fs);
@@ -550,7 +660,6 @@
LK_CANRECURSE, curthread);
vdrop(vp);
fs.vp = vp;
- fs.vfslocked = vfslocked;
KASSERT(error == 0,
("vm_fault: vget failed"));
goto RetryFault;
@@ -557,7 +666,6 @@
}
fs.vp = vp;
}
-vnode_locked:
KASSERT(fs.vp == NULL || !fs.map->system_map,
("vm_fault: vnode-backed object mapped by system map"));
@@ -573,7 +681,7 @@
* return value is the index into the marray for the
* vm_page_t passed to the routine.
*
- * fs.m plus the additional pages are VPO_BUSY'd.
+ * fs.m plus the additional pages are exclusive busied.
*/
faultcount = vm_fault_additional_pages(
fs.m, behind, ahead, marray, &reqpage);
@@ -667,12 +775,12 @@
*/
if (fs.object != fs.first_object) {
vm_object_pip_wakeup(fs.object);
- VM_OBJECT_UNLOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
fs.object = fs.first_object;
fs.pindex = fs.first_pindex;
fs.m = fs.first_m;
- VM_OBJECT_LOCK(fs.object);
+ VM_OBJECT_WLOCK(fs.object);
}
fs.first_m = NULL;
@@ -686,21 +794,22 @@
}
PCPU_INC(cnt.v_zfod);
fs.m->valid = VM_PAGE_BITS_ALL;
+ /* Don't try to prefault neighboring pages. */
+ faultcount = 1;
break; /* break to PAGE HAS BEEN FOUND */
} else {
KASSERT(fs.object != next_object,
("object loop %p", next_object));
- VM_OBJECT_LOCK(next_object);
+ VM_OBJECT_WLOCK(next_object);
vm_object_pip_add(next_object, 1);
if (fs.object != fs.first_object)
vm_object_pip_wakeup(fs.object);
- VM_OBJECT_UNLOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
fs.object = next_object;
}
}
- KASSERT((fs.m->oflags & VPO_BUSY) != 0,
- ("vm_fault: not busy after main loop"));
+ vm_page_assert_xbusied(fs.m);
/*
* PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
@@ -746,7 +855,7 @@
*/
((fs.object->type == OBJT_DEFAULT) ||
(fs.object->type == OBJT_SWAP)) &&
- (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object)) &&
+ (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
/*
* We don't chase down the shadow chain
*/
@@ -762,10 +871,20 @@
* process'es object. The page is
* automatically made dirty.
*/
- vm_page_lock(fs.m);
- vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
- vm_page_unlock(fs.m);
- vm_page_busy(fs.m);
+ if (vm_page_rename(fs.m, fs.first_object,
+ fs.first_pindex)) {
+ unlock_and_deallocate(&fs);
+ goto RetryFault;
+ }
+#if VM_NRESERVLEVEL > 0
+ /*
+ * Rename the reservation.
+ */
+ vm_reserv_rename(fs.m, fs.first_object,
+ fs.object, OFF_TO_IDX(
+ fs.first_object->backing_object_offset));
+#endif
+ vm_page_xbusy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;
PCPU_INC(cnt.v_cow_optim);
@@ -775,8 +894,12 @@
*/
pmap_copy_page(fs.m, fs.first_m);
fs.first_m->valid = VM_PAGE_BITS_ALL;
+ if ((fault_flags & VM_FAULT_WIRE) == 0) {
+ prot &= ~VM_PROT_WRITE;
+ fault_type &= ~VM_PROT_WRITE;
+ }
if (wired && (fault_flags &
- VM_FAULT_CHANGE_WIRING) == 0) {
+ VM_FAULT_WIRE) == 0) {
vm_page_lock(fs.first_m);
vm_page_wire(fs.first_m);
vm_page_unlock(fs.first_m);
@@ -795,7 +918,7 @@
* conditional
*/
vm_object_pip_wakeup(fs.object);
- VM_OBJECT_UNLOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
/*
* Only use the new page below...
*/
@@ -803,7 +926,7 @@
fs.pindex = fs.first_pindex;
fs.m = fs.first_m;
if (!is_first_object_locked)
- VM_OBJECT_LOCK(fs.object);
+ VM_OBJECT_WLOCK(fs.object);
PCPU_INC(cnt.v_cow_faults);
curthread->td_cow++;
} else {
@@ -826,7 +949,7 @@
goto RetryFault;
}
fs.lookup_still_valid = TRUE;
- if (fs.map->timestamp != map_generation) {
+ if (fs.map->timestamp != fs.map_generation) {
result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
&fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
@@ -878,53 +1001,16 @@
if (hardfault)
fs.entry->next_read = fs.pindex + faultcount - reqpage;
- if ((prot & VM_PROT_WRITE) != 0 ||
- (fault_flags & VM_FAULT_DIRTY) != 0) {
- vm_object_set_writeable_dirty(fs.object);
+ vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
+ vm_page_assert_xbusied(fs.m);
- /*
- * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
- * if the page is already dirty to prevent data written with
- * the expectation of being synced from not being synced.
- * Likewise if this entry does not request NOSYNC then make
- * sure the page isn't marked NOSYNC. Applications sharing
- * data should use the same flags to avoid ping ponging.
- */
- if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
- if (fs.m->dirty == 0)
- fs.m->oflags |= VPO_NOSYNC;
- } else {
- fs.m->oflags &= ~VPO_NOSYNC;
- }
-
- /*
- * If the fault is a write, we know that this page is being
- * written NOW so dirty it explicitly to save on
- * pmap_is_modified() calls later.
- *
- * Also tell the backing pager, if any, that it should remove
- * any swap backing since the page is now dirty.
- */
- if (((fault_type & VM_PROT_WRITE) != 0 &&
- (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
- (fault_flags & VM_FAULT_DIRTY) != 0) {
- vm_page_dirty(fs.m);
- vm_pager_page_unswapped(fs.m);
- }
- }
-
/*
- * Page had better still be busy
- */
- KASSERT(fs.m->oflags & VPO_BUSY,
- ("vm_fault: page %p not busy!", fs.m));
- /*
* Page must be completely valid or it is not fit to
* map into user space. vm_pager_get_pages() ensures this.
*/
KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
("vm_fault: page %p partially invalid", fs.m));
- VM_OBJECT_UNLOCK(fs.object);
+ VM_OBJECT_WUNLOCK(fs.object);
/*
* Put this page into the physical map. We had to do the unlock above
@@ -932,10 +1018,12 @@
* back on the active queue until later so that the pageout daemon
* won't find it (yet).
*/
- pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired);
- if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0)
- vm_fault_prefault(fs.map->pmap, vaddr, fs.entry);
- VM_OBJECT_LOCK(fs.object);
+ pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
+ fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
+ if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
+ wired == 0)
+ vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
+ VM_OBJECT_WLOCK(fs.object);
vm_page_lock(fs.m);
/*
@@ -942,11 +1030,9 @@
* If the page is not wired down, then put it where the pageout daemon
* can find it.
*/
- if (fault_flags & VM_FAULT_CHANGE_WIRING) {
- if (wired)
- vm_page_wire(fs.m);
- else
- vm_page_unwire(fs.m, 1);
+ if ((fault_flags & VM_FAULT_WIRE) != 0) {
+ KASSERT(wired, ("VM_FAULT_WIRE && !wired"));
+ vm_page_wire(fs.m);
} else
vm_page_activate(fs.m);
if (m_hold != NULL) {
@@ -954,15 +1040,16 @@
vm_page_hold(fs.m);
}
vm_page_unlock(fs.m);
- vm_page_wakeup(fs.m);
+ vm_page_xunbusy(fs.m);
/*
* Unlock everything, and return
*/
unlock_and_deallocate(&fs);
- if (hardfault)
+ if (hardfault) {
+ PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
- else
+ } else
curthread->td_ru.ru_minflt++;
return (KERN_SUCCESS);
@@ -980,17 +1067,17 @@
vm_pindex_t pindex;
object = fs->object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
first_object = fs->first_object;
if (first_object != object) {
- if (!VM_OBJECT_TRYLOCK(first_object)) {
- VM_OBJECT_UNLOCK(object);
- VM_OBJECT_LOCK(first_object);
- VM_OBJECT_LOCK(object);
+ if (!VM_OBJECT_TRYWLOCK(first_object)) {
+ VM_OBJECT_WUNLOCK(object);
+ VM_OBJECT_WLOCK(first_object);
+ VM_OBJECT_WLOCK(object);
}
}
- if (first_object->type != OBJT_DEVICE &&
- first_object->type != OBJT_PHYS && first_object->type != OBJT_SG) {
+ /* Neither fictitious nor unmanaged pages can be cached. */
+ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
if (fs->first_pindex < distance)
pindex = 0;
else
@@ -998,13 +1085,12 @@
if (pindex < OFF_TO_IDX(fs->entry->offset))
pindex = OFF_TO_IDX(fs->entry->offset);
m = first_object != object ? fs->first_m : fs->m;
- KASSERT((m->oflags & VPO_BUSY) != 0,
- ("vm_fault_cache_behind: page %p is not busy", m));
+ vm_page_assert_xbusied(m);
m_prev = vm_page_prev(m);
while ((m = m_prev) != NULL && m->pindex >= pindex &&
m->valid == VM_PAGE_BITS_ALL) {
m_prev = vm_page_prev(m);
- if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0)
+ if (vm_page_busied(m))
continue;
vm_page_lock(m);
if (m->hold_count == 0 && m->wire_count == 0) {
@@ -1019,7 +1105,7 @@
}
}
if (first_object != object)
- VM_OBJECT_UNLOCK(first_object);
+ VM_OBJECT_WUNLOCK(first_object);
}
/*
@@ -1029,31 +1115,50 @@
* of mmap time.
*/
static void
-vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
+vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
+ int faultcount, int reqpage)
{
- int i;
+ pmap_t pmap;
+ vm_map_entry_t entry;
+ vm_object_t backing_object, lobject;
vm_offset_t addr, starta;
vm_pindex_t pindex;
vm_page_t m;
- vm_object_t object;
+ int backward, forward, i;
+ pmap = fs->map->pmap;
if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
return;
- object = entry->object.vm_object;
+ if (faultcount > 0) {
+ backward = reqpage;
+ forward = faultcount - reqpage - 1;
+ } else {
+ backward = PFBAK;
+ forward = PFFOR;
+ }
+ entry = fs->entry;
- starta = addra - PFBAK * PAGE_SIZE;
- if (starta < entry->start) {
+ if (addra < backward * PAGE_SIZE) {
starta = entry->start;
- } else if (starta > addra) {
- starta = 0;
+ } else {
+ starta = addra - backward * PAGE_SIZE;
+ if (starta < entry->start)
+ starta = entry->start;
}
- for (i = 0; i < PAGEORDER_SIZE; i++) {
- vm_object_t backing_object, lobject;
-
- addr = addra + prefault_pageorder[i];
- if (addr > addra + (PFFOR * PAGE_SIZE))
+ /*
+ * Generate the sequence of virtual addresses that are candidates for
+ * prefaulting in an outward spiral from the faulting virtual address,
+ * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra
+ * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
+ * If the candidate address doesn't have a backing physical page, then
+ * the loop immediately terminates.
+ */
+ for (i = 0; i < 2 * imax(backward, forward); i++) {
+ addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
+ PAGE_SIZE);
+ if (addr > addra + forward * PAGE_SIZE)
addr = 0;
if (addr < starta || addr >= entry->end)
@@ -1063,8 +1168,8 @@
continue;
pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
- lobject = object;
- VM_OBJECT_LOCK(lobject);
+ lobject = entry->object.vm_object;
+ VM_OBJECT_RLOCK(lobject);
while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
lobject->type == OBJT_DEFAULT &&
(backing_object = lobject->backing_object) != NULL) {
@@ -1071,21 +1176,18 @@
KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
0, ("vm_fault_prefault: unaligned object offset"));
pindex += lobject->backing_object_offset >> PAGE_SHIFT;
- VM_OBJECT_LOCK(backing_object);
- VM_OBJECT_UNLOCK(lobject);
+ VM_OBJECT_RLOCK(backing_object);
+ VM_OBJECT_RUNLOCK(lobject);
lobject = backing_object;
}
- /*
- * give-up when a page is not in memory
- */
if (m == NULL) {
- VM_OBJECT_UNLOCK(lobject);
+ VM_OBJECT_RUNLOCK(lobject);
break;
}
if (m->valid == VM_PAGE_BITS_ALL &&
(m->flags & PG_FICTITIOUS) == 0)
pmap_enter_quick(pmap, addr, m, entry->protection);
- VM_OBJECT_UNLOCK(lobject);
+ VM_OBJECT_RUNLOCK(lobject);
}
}
@@ -1108,7 +1210,7 @@
if (len == 0)
return (0);
- end = round_page(addr + len);
+ end = round_page(addr + len);
addr = trunc_page(addr);
/*
@@ -1117,9 +1219,9 @@
if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
return (-1);
- count = howmany(end - addr, PAGE_SIZE);
- if (count > max_count)
+ if (atop(end - addr) > max_count)
panic("vm_fault_quick_hold_pages: count > max_count");
+ count = atop(end - addr);
/*
* Most likely, the physical pages are resident in the pmap, so it is
@@ -1168,68 +1270,6 @@
}
/*
- * vm_fault_wire:
- *
- * Wire down a range of virtual addresses in a map.
- */
-int
-vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
- boolean_t fictitious)
-{
- vm_offset_t va;
- int rv;
-
- /*
- * We simulate a fault to get the page and enter it in the physical
- * map. For user wiring, we only ask for read access on currently
- * read-only sections.
- */
- for (va = start; va < end; va += PAGE_SIZE) {
- rv = vm_fault(map, va, VM_PROT_NONE, VM_FAULT_CHANGE_WIRING);
- if (rv) {
- if (va != start)
- vm_fault_unwire(map, start, va, fictitious);
- return (rv);
- }
- }
- return (KERN_SUCCESS);
-}
-
-/*
- * vm_fault_unwire:
- *
- * Unwire a range of virtual addresses in a map.
- */
-void
-vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
- boolean_t fictitious)
-{
- vm_paddr_t pa;
- vm_offset_t va;
- vm_page_t m;
- pmap_t pmap;
-
- pmap = vm_map_pmap(map);
-
- /*
- * Since the pages are wired down, we must be able to get their
- * mappings from the physical map system.
- */
- for (va = start; va < end; va += PAGE_SIZE) {
- pa = pmap_extract(pmap, va);
- if (pa != 0) {
- pmap_change_wiring(pmap, va, FALSE);
- if (!fictitious) {
- m = PHYS_TO_VM_PAGE(pa);
- vm_page_lock(m);
- vm_page_unwire(m, TRUE);
- vm_page_unlock(m);
- }
- }
- }
-}
-
-/*
* Routine:
* vm_fault_copy_entry
* Function:
@@ -1254,7 +1294,7 @@
vm_offset_t vaddr;
vm_page_t dst_m;
vm_page_t src_m;
- boolean_t src_readonly, upgrade;
+ boolean_t upgrade;
#ifdef lint
src_map++;
@@ -1261,28 +1301,35 @@
#endif /* lint */
upgrade = src_entry == dst_entry;
+ access = prot = dst_entry->protection;
src_object = src_entry->object.vm_object;
src_pindex = OFF_TO_IDX(src_entry->offset);
- src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0;
- /*
- * Create the top-level object for the destination entry. (Doesn't
- * actually shadow anything - we copy the pages directly.)
- */
- dst_object = vm_object_allocate(OBJT_DEFAULT,
- OFF_TO_IDX(dst_entry->end - dst_entry->start));
+ if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
+ dst_object = src_object;
+ vm_object_reference(dst_object);
+ } else {
+ /*
+ * Create the top-level object for the destination entry. (Doesn't
+ * actually shadow anything - we copy the pages directly.)
+ */
+ dst_object = vm_object_allocate(OBJT_DEFAULT,
+ OFF_TO_IDX(dst_entry->end - dst_entry->start));
#if VM_NRESERVLEVEL > 0
- dst_object->flags |= OBJ_COLORED;
- dst_object->pg_color = atop(dst_entry->start);
+ dst_object->flags |= OBJ_COLORED;
+ dst_object->pg_color = atop(dst_entry->start);
#endif
+ }
- VM_OBJECT_LOCK(dst_object);
+ VM_OBJECT_WLOCK(dst_object);
KASSERT(upgrade || dst_entry->object.vm_object == NULL,
("vm_fault_copy_entry: vm_object not NULL"));
- dst_entry->object.vm_object = dst_object;
- dst_entry->offset = 0;
- dst_object->charge = dst_entry->end - dst_entry->start;
+ if (src_object != dst_object) {
+ dst_entry->object.vm_object = dst_object;
+ dst_entry->offset = 0;
+ dst_object->charge = dst_entry->end - dst_entry->start;
+ }
if (fork_charge != NULL) {
KASSERT(dst_entry->cred == NULL,
("vm_fault_copy_entry: leaked swp charge"));
@@ -1289,11 +1336,13 @@
dst_object->cred = curthread->td_ucred;
crhold(dst_object->cred);
*fork_charge += dst_object->charge;
- } else {
+ } else if (dst_object->cred == NULL) {
+ KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
+ dst_entry));
dst_object->cred = dst_entry->cred;
dst_entry->cred = NULL;
}
- access = prot = dst_entry->protection;
+
/*
* If not an upgrade, then enter the mappings in the pmap as
* read and/or execute accesses. Otherwise, enter them as
@@ -1319,75 +1368,100 @@
for (vaddr = dst_entry->start, dst_pindex = 0;
vaddr < dst_entry->end;
vaddr += PAGE_SIZE, dst_pindex++) {
-
+again:
/*
- * Allocate a page in the destination object.
- */
- do {
- dst_m = vm_page_alloc(dst_object, dst_pindex,
- VM_ALLOC_NORMAL);
- if (dst_m == NULL) {
- VM_OBJECT_UNLOCK(dst_object);
- VM_WAIT;
- VM_OBJECT_LOCK(dst_object);
- }
- } while (dst_m == NULL);
-
- /*
* Find the page in the source object, and copy it in.
- * (Because the source is wired down, the page will be in
- * memory.)
+ * Because the source is wired down, the page will be
+ * in memory.
*/
- VM_OBJECT_LOCK(src_object);
+ if (src_object != dst_object)
+ VM_OBJECT_RLOCK(src_object);
object = src_object;
pindex = src_pindex + dst_pindex;
while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
- src_readonly &&
(backing_object = object->backing_object) != NULL) {
/*
- * Allow fallback to backing objects if we are reading.
+ * Unless the source mapping is read-only or
+ * it is presently being upgraded from
+ * read-only, the first object in the shadow
+ * chain should provide all of the pages. In
+ * other words, this loop body should never be
+ * executed when the source mapping is already
+ * read/write.
*/
- VM_OBJECT_LOCK(backing_object);
+ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
+ upgrade,
+ ("vm_fault_copy_entry: main object missing page"));
+
+ VM_OBJECT_RLOCK(backing_object);
pindex += OFF_TO_IDX(object->backing_object_offset);
- VM_OBJECT_UNLOCK(object);
+ if (object != dst_object)
+ VM_OBJECT_RUNLOCK(object);
object = backing_object;
}
- if (src_m == NULL)
- panic("vm_fault_copy_wired: page missing");
- pmap_copy_page(src_m, dst_m);
- VM_OBJECT_UNLOCK(object);
- dst_m->valid = VM_PAGE_BITS_ALL;
- dst_m->dirty = VM_PAGE_BITS_ALL;
- VM_OBJECT_UNLOCK(dst_object);
+ KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
+ if (object != dst_object) {
+ /*
+ * Allocate a page in the destination object.
+ */
+ dst_m = vm_page_alloc(dst_object, (src_object ==
+ dst_object ? src_pindex : 0) + dst_pindex,
+ VM_ALLOC_NORMAL);
+ if (dst_m == NULL) {
+ VM_OBJECT_WUNLOCK(dst_object);
+ VM_OBJECT_RUNLOCK(object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(dst_object);
+ goto again;
+ }
+ pmap_copy_page(src_m, dst_m);
+ VM_OBJECT_RUNLOCK(object);
+ dst_m->valid = VM_PAGE_BITS_ALL;
+ dst_m->dirty = VM_PAGE_BITS_ALL;
+ } else {
+ dst_m = src_m;
+ if (vm_page_sleep_if_busy(dst_m, "fltupg"))
+ goto again;
+ vm_page_xbusy(dst_m);
+ KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
+ ("invalid dst page %p", dst_m));
+ }
+ VM_OBJECT_WUNLOCK(dst_object);
+
/*
* Enter it in the pmap. If a wired, copy-on-write
* mapping is being replaced by a write-enabled
* mapping, then wire that new mapping.
*/
- pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade);
+ pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
+ access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
/*
* Mark it no longer busy, and put it on the active list.
*/
- VM_OBJECT_LOCK(dst_object);
+ VM_OBJECT_WLOCK(dst_object);
if (upgrade) {
- vm_page_lock(src_m);
- vm_page_unwire(src_m, 0);
- vm_page_unlock(src_m);
-
- vm_page_lock(dst_m);
- vm_page_wire(dst_m);
- vm_page_unlock(dst_m);
+ if (src_m != dst_m) {
+ vm_page_lock(src_m);
+ vm_page_unwire(src_m, 0);
+ vm_page_unlock(src_m);
+ vm_page_lock(dst_m);
+ vm_page_wire(dst_m);
+ vm_page_unlock(dst_m);
+ } else {
+ KASSERT(dst_m->wire_count > 0,
+ ("dst_m %p is not wired", dst_m));
+ }
} else {
vm_page_lock(dst_m);
vm_page_activate(dst_m);
vm_page_unlock(dst_m);
}
- vm_page_wakeup(dst_m);
+ vm_page_xunbusy(dst_m);
}
- VM_OBJECT_UNLOCK(dst_object);
+ VM_OBJECT_WUNLOCK(dst_object);
if (upgrade) {
dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
vm_object_deallocate(src_object);
@@ -1423,7 +1497,7 @@
vm_page_t rtm;
int cbehind, cahead;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
object = m->object;
pindex = m->pindex;
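
The rewritten vm_fault_copy_entry() above replaces the old do/while allocation loop with a goto-based retry: when vm_page_alloc() fails, both object locks are dropped, the thread sleeps in VM_WAIT, and the whole iteration restarts at the "again:" label. Below is a minimal userspace sketch of that drop-lock/wait/retry idiom; resource_alloc() and resource_wait() are hypothetical stand-ins for vm_page_alloc() and VM_WAIT, not kernel interfaces.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Hypothetical stand-ins: the allocator fails twice before succeeding,
 * and "waiting" is just a short sleep.
 */
static int failures = 2;

static void *
resource_alloc(void)
{
        if (failures > 0) {
                failures--;
                return (NULL);
        }
        return (malloc(64));
}

static void
resource_wait(void)
{
        usleep(1000);
}

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
alloc_with_retry(void)
{
        void *res;

        pthread_mutex_lock(&object_lock);
again:
        res = resource_alloc();
        if (res == NULL) {
                /*
                 * Never sleep while holding the lock: drop it, wait for
                 * resources, reacquire it and restart the whole step,
                 * mirroring the kernel's "goto again".
                 */
                pthread_mutex_unlock(&object_lock);
                resource_wait();
                pthread_mutex_lock(&object_lock);
                goto again;
        }
        pthread_mutex_unlock(&object_lock);
        return (res);
}

int
main(void)
{
        void *p = alloc_with_retry();

        printf("allocated %p after retries\n", p);
        free(p);
        return (0);
}
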
Modified: trunk/sys/vm/vm_glue.c
===================================================================
--- trunk/sys/vm/vm_glue.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_glue.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,24 +58,28 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $");
#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
+#include "opt_kstack_usage_prof.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
+#include <sys/vmem.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/_kstack_cache.h>
@@ -95,15 +100,7 @@
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
-/*
- * System initialization
- *
- * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
- *
- * Note: run scheduling should be divorced from the vm system.
- */
-static void scheduler(void *);
-SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL);
+#include <machine/cpu.h>
#ifndef NO_SWAPPING
static int swapout(struct proc *);
@@ -238,9 +235,9 @@
vm_pindex_t pindex;
int rv;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
pindex = OFF_TO_IDX(offset);
- m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+ m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
if (m->valid != VM_PAGE_BITS_ALL) {
ma[0] = m;
rv = vm_pager_get_pages(object, ma, 1, 0);
@@ -255,12 +252,13 @@
goto out;
}
}
+ vm_page_xunbusy(m);
vm_page_lock(m);
vm_page_hold(m);
+ vm_page_activate(m);
vm_page_unlock(m);
- vm_page_wakeup(m);
out:
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (m);
}
@@ -307,6 +305,8 @@
static int kstack_cache_size = 128;
static int kstacks;
static struct mtx kstack_cache_mtx;
+MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
+
SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
"");
SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
@@ -364,11 +364,13 @@
* We need to align the kstack's mapped address to fit within
* a single TLB entry.
*/
- ks = kmem_alloc_nofault_space(kernel_map,
- (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, VMFS_TLB_ALIGNED_SPACE);
+ if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
+ PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
+ M_BESTFIT | M_NOWAIT, &ks)) {
+ ks = 0;
+ }
#else
- ks = kmem_alloc_nofault(kernel_map,
- (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
+ ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
#endif
if (ks == 0) {
printf("vm_thread_new: kstack allocation failed\n");
@@ -392,17 +394,17 @@
* For the length of the stack, link in a real page of ram for each
* page of stack.
*/
- VM_OBJECT_LOCK(ksobj);
+ VM_OBJECT_WLOCK(ksobj);
for (i = 0; i < pages; i++) {
/*
* Get a kernel stack page.
*/
m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
ma[i] = m;
m->valid = VM_PAGE_BITS_ALL;
}
- VM_OBJECT_UNLOCK(ksobj);
+ VM_OBJECT_WUNLOCK(ksobj);
pmap_qenter(ks, ma, pages);
return (1);
}
@@ -415,7 +417,7 @@
atomic_add_int(&kstacks, -1);
pmap_qremove(ks, pages);
- VM_OBJECT_LOCK(ksobj);
+ VM_OBJECT_WLOCK(ksobj);
for (i = 0; i < pages; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
@@ -425,9 +427,9 @@
vm_page_free(m);
vm_page_unlock(m);
}
- VM_OBJECT_UNLOCK(ksobj);
+ VM_OBJECT_WUNLOCK(ksobj);
vm_object_deallocate(ksobj);
- kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
+ kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
(pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
}
@@ -486,9 +488,54 @@
EVENTHANDLER_PRI_ANY);
}
-MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
+#ifdef KSTACK_USAGE_PROF
+/*
+ * Track maximum stack used by a thread in kernel.
+ */
+static int max_kstack_used;
+
+SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
+ &max_kstack_used, 0,
+ "Maxiumum stack depth used by a thread in kernel");
+
+void
+intr_prof_stack_use(struct thread *td, struct trapframe *frame)
+{
+ vm_offset_t stack_top;
+ vm_offset_t current;
+ int used, prev_used;
+
+ /*
+ * Testing for interrupted kernel mode isn't strictly
+ * needed. It optimizes the execution, since interrupts from
+ * usermode will have only the trap frame on the stack.
+ */
+ if (TRAPF_USERMODE(frame))
+ return;
+
+ stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
+ current = (vm_offset_t)(uintptr_t)&stack_top;
+
+ /*
+ * Try to detect if the interrupt is using the kernel thread stack.
+ * Hardware could use a dedicated stack for interrupt handling.
+ */
+ if (stack_top <= current || current < td->td_kstack)
+ return;
+
+ used = stack_top - current;
+ for (;;) {
+ prev_used = max_kstack_used;
+ if (prev_used >= used)
+ break;
+ if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
+ break;
+ }
+}
+#endif /* KSTACK_USAGE_PROF */
+
#ifndef NO_SWAPPING
/*
* Allow a thread's kernel stack to be paged out.
@@ -504,7 +551,7 @@
pages = td->td_kstack_pages;
ksobj = td->td_kstack_obj;
pmap_qremove(td->td_kstack, pages);
- VM_OBJECT_LOCK(ksobj);
+ VM_OBJECT_WLOCK(ksobj);
for (i = 0; i < pages; i++) {
m = vm_page_lookup(ksobj, i);
if (m == NULL)
@@ -514,7 +561,7 @@
vm_page_unwire(m, 0);
vm_page_unlock(m);
}
- VM_OBJECT_UNLOCK(ksobj);
+ VM_OBJECT_WUNLOCK(ksobj);
}
/*
@@ -529,19 +576,17 @@
pages = td->td_kstack_pages;
ksobj = td->td_kstack_obj;
- VM_OBJECT_LOCK(ksobj);
+ VM_OBJECT_WLOCK(ksobj);
for (i = 0; i < pages; i++)
- ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
+ ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
VM_ALLOC_WIRED);
for (i = 0; i < pages; i++) {
if (ma[i]->valid != VM_PAGE_BITS_ALL) {
- KASSERT(ma[i]->oflags & VPO_BUSY,
- ("lost busy 1"));
+ vm_page_assert_xbusied(ma[i]);
vm_object_pip_add(ksobj, 1);
for (j = i + 1; j < pages; j++) {
- KASSERT(ma[j]->valid == VM_PAGE_BITS_ALL ||
- (ma[j]->oflags & VPO_BUSY),
- ("lost busy 2"));
+ if (ma[j]->valid != VM_PAGE_BITS_ALL)
+ vm_page_assert_xbusied(ma[j]);
if (ma[j]->valid == VM_PAGE_BITS_ALL)
break;
}
@@ -552,11 +597,11 @@
vm_object_pip_wakeup(ksobj);
for (k = i; k < j; k++)
ma[k] = vm_page_lookup(ksobj, k);
- vm_page_wakeup(ma[i]);
- } else if (ma[i]->oflags & VPO_BUSY)
- vm_page_wakeup(ma[i]);
+ vm_page_xunbusy(ma[i]);
+ } else if (vm_page_xbusied(ma[i]))
+ vm_page_xunbusy(ma[i]);
}
- VM_OBJECT_UNLOCK(ksobj);
+ VM_OBJECT_WUNLOCK(ksobj);
pmap_qenter(td->td_kstack, ma, pages);
cpu_thread_swapin(td);
}
@@ -688,13 +733,9 @@
* This swapin algorithm attempts to swap-in processes only if there
* is enough space for them. Of course, if a process waits for a long
* time, it will be swapped in anyway.
- *
- * Giant is held on entry.
*/
-/* ARGSUSED*/
-static void
-scheduler(dummy)
- void *dummy;
+void
+swapper(void)
{
struct proc *p;
struct thread *td;
@@ -704,9 +745,6 @@
int ppri;
int pri;
- mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
- mtx_unlock(&Giant);
-
loop:
if (vm_page_count_min()) {
VM_WAIT;
@@ -757,7 +795,7 @@
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
- tsleep(&proc0, PVM, "sched", MAXSLP * hz / 2);
+ tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
goto loop;
}
PROC_LOCK(p);
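
The new KSTACK_USAGE_PROF code above records the deepest kernel stack use with an atomic_cmpset_int() loop instead of a lock. The following is a small C11 analogue of that lock-free maximum update, written with <stdatomic.h>; it illustrates the same compare-and-swap pattern and is not the kernel code itself.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int max_used;

/* Record "used" as the new maximum if it exceeds the current value. */
static void
record_max(int used)
{
        int prev;

        for (;;) {
                prev = atomic_load(&max_used);
                if (prev >= used)
                        break;          /* a deeper use was already recorded */
                if (atomic_compare_exchange_weak(&max_used, &prev, used))
                        break;          /* we published the new maximum */
                /* Another thread raced us; re-read and try again. */
        }
}

int
main(void)
{
        record_max(512);
        record_max(256);        /* smaller than the current maximum: ignored */
        record_max(1024);
        printf("max observed: %d\n", atomic_load(&max_used));
        return (0);
}

The weak compare-exchange may fail spuriously, which is why the loop always re-reads the current maximum before retrying.
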
Modified: trunk/sys/vm/vm_init.c
===================================================================
--- trunk/sys/vm/vm_init.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_init.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -63,13 +64,14 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/selinfo.h>
@@ -76,6 +78,7 @@
#include <sys/pipe.h>
#include <sys/bio.h>
#include <sys/buf.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -100,6 +103,26 @@
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL);
/*
+ * Import kva into the kernel arena.
+ */
+static int
+kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+ vm_offset_t addr;
+ int result;
+
+ addr = vm_map_min(kernel_map);
+ result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0,
+ VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+ if (result != KERN_SUCCESS)
+ return (ENOMEM);
+
+ *addrp = addr;
+
+ return (0);
+}
+
+/*
* vm_init initializes the virtual memory system.
* This is done only by the first cpu up.
*
@@ -110,6 +133,7 @@
vm_mem_init(dummy)
void *dummy;
{
+
/*
* Initializes resident memory structures. From here on, all physical
* memory is accounted for, and we use only virtual addresses.
@@ -120,9 +144,24 @@
/*
* Initialize other VM packages
*/
+ vmem_startup();
vm_object_init();
vm_map_startup();
kmem_init(virtual_avail, virtual_end);
+
+ /*
+ * Initialize the kernel_arena. This can grow on demand.
+ */
+ vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0);
+ vmem_set_import(kernel_arena, kva_import, NULL, NULL,
+#if VM_NRESERVLEVEL > 0
+ 1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT));
+#else
+ /* On non-superpage architectures, we want large import sizes. */
+ PAGE_SIZE * 1024);
+#endif
+
+ kmem_init_zero_region();
pmap_init();
vm_pager_init();
}
@@ -136,7 +175,6 @@
long physmem_est;
vm_offset_t minaddr;
vm_offset_t maxaddr;
- vm_map_t clean_map;
/*
* Allocate space for system data structures.
@@ -144,8 +182,6 @@
* As pages of kernel virtual memory are allocated, "v" is incremented.
* As pages of memory are allocated and cleared,
* "firstaddr" is incremented.
- * An index into the kernel page table corresponding to the
- * virtual memory address maintained in "v" is kept in "mapaddr".
*/
/*
@@ -157,8 +193,6 @@
again:
v = (caddr_t)firstaddr;
- v = kern_timeout_callwheel_alloc(v);
-
/*
* Discount the physical memory larger than the size of kernel_map
* to avoid eating up all of KVA space.
@@ -173,7 +207,8 @@
*/
if (firstaddr == 0) {
size = (vm_size_t)v;
- firstaddr = kmem_alloc(kernel_map, round_page(size));
+ firstaddr = kmem_malloc(kernel_arena, round_page(size),
+ M_ZERO | M_WAITOK);
if (firstaddr == 0)
panic("startup: no room for tables");
goto again;
@@ -185,27 +220,49 @@
if ((vm_size_t)((char *)v - firstaddr) != size)
panic("startup: table size inconsistency");
- clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva,
- (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE);
- buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva,
- &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE);
- buffer_map->system_map = 1;
- pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva,
- (long)nswbuf * MAXPHYS, FALSE);
- pager_map->system_map = 1;
- exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
- exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
- pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
- FALSE);
+ /*
+ * Allocate the clean map to hold all of the paging and I/O virtual
+ * memory.
+ */
+ size = (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
+ (long)bio_transient_maxcnt * MAXPHYS;
+ kmi->clean_sva = firstaddr = kva_alloc(size);
+ kmi->clean_eva = firstaddr + size;
/*
- * XXX: Mbuf system machine-specific initializations should
- * go here, if anywhere.
+ * Allocate the buffer arena.
*/
+ size = (long)nbuf * BKVASIZE;
+ kmi->buffer_sva = firstaddr;
+ kmi->buffer_eva = kmi->buffer_sva + size;
+ vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
+ PAGE_SIZE, 0, 0);
+ firstaddr += size;
/*
- * Initialize the callouts we just allocated.
+ * Now swap kva.
*/
- kern_timeout_callwheel_init();
+ swapbkva = firstaddr;
+ size = (long)nswbuf * MAXPHYS;
+ firstaddr += size;
+
+ /*
+ * And optionally transient bio space.
+ */
+ if (bio_transient_maxcnt != 0) {
+ size = (long)bio_transient_maxcnt * MAXPHYS;
+ vmem_init(transient_arena, "transient arena",
+ firstaddr, size, PAGE_SIZE, 0, 0);
+ firstaddr += size;
+ }
+ if (firstaddr != kmi->clean_eva)
+ panic("Clean map calculation incorrect");
+
+ /*
+ * Allocate the pageable submaps.
+ */
+ exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
+ exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
+ pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
+ FALSE);
}
-
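
vm_mem_init() now creates kernel_arena and registers kva_import() through vmem_set_import(), so the arena refills itself from vm_map_find() in large chunks instead of relying on fixed-size submaps. The toy userspace arena below sketches only that import idea under simplified assumptions: all names here (toy_arena, arena_alloc, malloc_import, IMPORT_CHUNK) are invented for the example, and real vmem(9) does far more (best-fit allocation, quantum caches, freeing).

#include <stdio.h>
#include <stdlib.h>

#define IMPORT_CHUNK    (64 * 1024)     /* stand-in for the large import size */

/* Import callback: supply the arena with a new contiguous chunk on demand. */
typedef int (*import_fn)(size_t size, void **chunkp);

struct toy_arena {
        char      *cur;         /* unused space in the current chunk */
        size_t     left;        /* bytes remaining in the current chunk */
        import_fn  import;
};

static int
malloc_import(size_t size, void **chunkp)
{
        *chunkp = malloc(size);
        return (*chunkp == NULL ? -1 : 0);
}

/* Satisfy small requests from the cached chunk, importing when it runs dry. */
static void *
arena_alloc(struct toy_arena *a, size_t size)
{
        void *chunk, *p;

        if (size > a->left) {
                if (size > IMPORT_CHUNK || a->import(IMPORT_CHUNK, &chunk) != 0)
                        return (NULL);
                a->cur = chunk;
                a->left = IMPORT_CHUNK;
        }
        p = a->cur;
        a->cur += size;
        a->left -= size;
        return (p);
}

int
main(void)
{
        struct toy_arena a = { NULL, 0, malloc_import };

        printf("%p %p\n", arena_alloc(&a, 4096), arena_alloc(&a, 4096));
        return (0);
}
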
Modified: trunk/sys/vm/vm_kern.c
===================================================================
--- trunk/sys/vm/vm_kern.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_kern.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -63,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -70,13 +71,15 @@
#include <sys/kernel.h> /* for ticks and hz */
#include <sys/eventhandler.h>
#include <sys/lock.h>
-#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
@@ -86,16 +89,28 @@
#include <vm/uma.h>
vm_map_t kernel_map;
-vm_map_t kmem_map;
vm_map_t exec_map;
vm_map_t pipe_map;
-vm_map_t buffer_map;
const void *zero_region;
CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);
+/* NB: Used by kernel debuggers. */
+const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
+
+SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
+ SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
+
+SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
+#if defined(__arm__) || defined(__sparc64__)
+ &vm_max_kernel_address, 0,
+#else
+ SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
+#endif
+ "Max kernel address");
+
/*
- * kmem_alloc_nofault:
+ * kva_alloc:
*
* Allocate a virtual address range with no underlying object and
* no initial mapping to physical memory. Any mapping from this
@@ -104,113 +119,137 @@
* a mapping on demand through vm_fault() will result in a panic.
*/
vm_offset_t
-kmem_alloc_nofault(map, size)
- vm_map_t map;
- vm_size_t size;
+kva_alloc(vm_size_t size)
{
vm_offset_t addr;
- int result;
size = round_page(size);
- addr = vm_map_min(map);
- result = vm_map_find(map, NULL, 0, &addr, size, VMFS_ANY_SPACE,
- VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
- if (result != KERN_SUCCESS) {
+ if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr))
return (0);
- }
+
return (addr);
}
/*
- * kmem_alloc_nofault_space:
+ * kva_free:
*
- * Allocate a virtual address range with no underlying object and
- * no initial mapping to physical memory within the specified
- * address space. Any mapping from this range to physical memory
- * must be explicitly created prior to its use, typically with
- * pmap_qenter(). Any attempt to create a mapping on demand
- * through vm_fault() will result in a panic.
+ * Release a region of kernel virtual memory allocated
+ * with kva_alloc, and return the physical pages
+ * associated with that region.
+ *
+ * This routine may not block on kernel maps.
*/
-vm_offset_t
-kmem_alloc_nofault_space(map, size, find_space)
- vm_map_t map;
- vm_size_t size;
- int find_space;
+void
+kva_free(vm_offset_t addr, vm_size_t size)
{
- vm_offset_t addr;
- int result;
size = round_page(size);
- addr = vm_map_min(map);
- result = vm_map_find(map, NULL, 0, &addr, size, find_space,
- VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
- if (result != KERN_SUCCESS) {
- return (0);
- }
- return (addr);
+ vmem_free(kernel_arena, addr, size);
}
/*
- * Allocate wired-down memory in the kernel's address map
- * or a submap.
+ * Allocates a region from the kernel address map and physical pages
+ * within the specified address range to the kernel object. Creates a
+ * wired mapping from this region to these pages, and returns the
+ * region's starting virtual address. The allocated pages are not
+ * necessarily physically contiguous. If M_ZERO is specified through the
+ * given flags, then the pages are zeroed before they are mapped.
*/
vm_offset_t
-kmem_alloc(map, size)
- vm_map_t map;
- vm_size_t size;
+kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
+ vm_paddr_t high, vm_memattr_t memattr)
{
- vm_offset_t addr;
- vm_offset_t offset;
+ vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
+ vm_offset_t addr, i;
+ vm_ooffset_t offset;
+ vm_page_t m;
+ int pflags, tries;
size = round_page(size);
-
- /*
- * Use the kernel object for wired-down kernel pages. Assume that no
- * region of the kernel object is referenced more than once.
- */
-
- /*
- * Locate sufficient space in the map. This will give us the final
- * virtual address for the new memory, and thus will tell us the
- * offset within the kernel map.
- */
- vm_map_lock(map);
- if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
- vm_map_unlock(map);
+ if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
return (0);
+ offset = addr - VM_MIN_KERNEL_ADDRESS;
+ pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+ VM_OBJECT_WLOCK(object);
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ tries = 0;
+retry:
+ m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
+ pflags, 1, low, high, PAGE_SIZE, 0, memattr);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+ vm_pageout_grow_cache(tries, low, high);
+ VM_OBJECT_WLOCK(object);
+ tries++;
+ goto retry;
+ }
+ kmem_unback(object, addr, i);
+ vmem_free(vmem, addr, size);
+ return (0);
+ }
+ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ m->valid = VM_PAGE_BITS_ALL;
+ pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
+ VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
}
- offset = addr - VM_MIN_KERNEL_ADDRESS;
- vm_object_reference(kernel_object);
- vm_map_insert(map, kernel_object, offset, addr, addr + size,
- VM_PROT_ALL, VM_PROT_ALL, 0);
- vm_map_unlock(map);
-
- /*
- * And finally, mark the data as non-pageable.
- */
- (void) vm_map_wire(map, addr, addr + size,
- VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
-
+ VM_OBJECT_WUNLOCK(object);
return (addr);
}
/*
- * kmem_free:
- *
- * Release a region of kernel virtual memory allocated
- * with kmem_alloc, and return the physical pages
- * associated with that region.
- *
- * This routine may not block on kernel maps.
+ * Allocates a region from the kernel address map and physically
+ * contiguous pages within the specified address range to the kernel
+ * object. Creates a wired mapping from this region to these pages, and
+ * returns the region's starting virtual address. If M_ZERO is specified
+ * through the given flags, then the pages are zeroed before they are
+ * mapped.
*/
-void
-kmem_free(map, addr, size)
- vm_map_t map;
- vm_offset_t addr;
- vm_size_t size;
+vm_offset_t
+kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
+ vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+ vm_memattr_t memattr)
{
-
- (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size));
+ vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
+ vm_offset_t addr, tmp;
+ vm_ooffset_t offset;
+ vm_page_t end_m, m;
+ int pflags, tries;
+
+ size = round_page(size);
+ if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
+ return (0);
+ offset = addr - VM_MIN_KERNEL_ADDRESS;
+ pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+ VM_OBJECT_WLOCK(object);
+ tries = 0;
+retry:
+ m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
+ atop(size), low, high, alignment, boundary, memattr);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+ vm_pageout_grow_cache(tries, low, high);
+ VM_OBJECT_WLOCK(object);
+ tries++;
+ goto retry;
+ }
+ vmem_free(vmem, addr, size);
+ return (0);
+ }
+ end_m = m + atop(size);
+ tmp = addr;
+ for (; m < end_m; m++) {
+ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ m->valid = VM_PAGE_BITS_ALL;
+ pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL,
+ VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
+ tmp += PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(object);
+ return (addr);
}
/*
@@ -236,8 +275,8 @@
size = round_page(size);
*min = vm_map_min(parent);
- ret = vm_map_find(parent, NULL, 0, min, size, superpage_align ?
- VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
+ ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ?
+ VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
MAP_ACC_NO_CHARGE);
if (ret != KERN_SUCCESS)
panic("kmem_suballoc: bad status return of %d", ret);
@@ -253,65 +292,25 @@
/*
* kmem_malloc:
*
- * Allocate wired-down memory in the kernel's address map for the higher
- * level kernel memory allocator (kern/kern_malloc.c). We cannot use
- * kmem_alloc() because we may need to allocate memory at interrupt
- * level where we cannot block (canwait == FALSE).
- *
- * This routine has its own private kernel submap (kmem_map) and object
- * (kmem_object). This, combined with the fact that only malloc uses
- * this routine, ensures that we will never block in map or object waits.
- *
- * We don't worry about expanding the map (adding entries) since entries
- * for wired maps are statically allocated.
- *
- * `map' is ONLY allowed to be kmem_map or one of the mbuf submaps to
- * which we never free.
+ * Allocate wired-down pages in the kernel's address space.
*/
vm_offset_t
-kmem_malloc(map, size, flags)
- vm_map_t map;
- vm_size_t size;
- int flags;
+kmem_malloc(struct vmem *vmem, vm_size_t size, int flags)
{
vm_offset_t addr;
- int i, rv;
+ int rv;
size = round_page(size);
- addr = vm_map_min(map);
+ if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
+ return (0);
- /*
- * Locate sufficient space in the map. This will give us the final
- * virtual address for the new memory, and thus will tell us the
- * offset within the kernel map.
- */
- vm_map_lock(map);
- if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
- vm_map_unlock(map);
- if ((flags & M_NOWAIT) == 0) {
- for (i = 0; i < 8; i++) {
- EVENTHANDLER_INVOKE(vm_lowmem, 0);
- uma_reclaim();
- vm_map_lock(map);
- if (vm_map_findspace(map, vm_map_min(map),
- size, &addr) == 0) {
- break;
- }
- vm_map_unlock(map);
- tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
- }
- if (i == 8) {
- panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
- (long)size, (long)map->size);
- }
- } else {
- return (0);
- }
+ rv = kmem_back((vmem == kmem_arena) ? kmem_object : kernel_object,
+ addr, size, flags);
+ if (rv != KERN_SUCCESS) {
+ vmem_free(vmem, addr, size);
+ return (0);
}
-
- rv = kmem_back(map, addr, size, flags);
- vm_map_unlock(map);
- return (rv == KERN_SUCCESS ? addr : 0);
+ return (addr);
}
/*
@@ -320,45 +319,22 @@
* Allocate physical pages for the specified virtual address range.
*/
int
-kmem_back(vm_map_t map, vm_offset_t addr, vm_size_t size, int flags)
+kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
{
vm_offset_t offset, i;
- vm_map_entry_t entry;
vm_page_t m;
int pflags;
- boolean_t found;
- KASSERT(vm_map_locked(map), ("kmem_back: map %p is not locked", map));
+ KASSERT(object == kmem_object || object == kernel_object,
+ ("kmem_back: only supports kernel objects."));
+
offset = addr - VM_MIN_KERNEL_ADDRESS;
- vm_object_reference(kmem_object);
- vm_map_insert(map, kmem_object, offset, addr, addr + size,
- VM_PROT_ALL, VM_PROT_ALL, 0);
+ pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
- /*
- * Assert: vm_map_insert() will never be able to extend the
- * previous entry so vm_map_lookup_entry() will find a new
- * entry exactly corresponding to this address range and it
- * will have wired_count == 0.
- */
- found = vm_map_lookup_entry(map, addr, &entry);
- KASSERT(found && entry->start == addr && entry->end == addr + size &&
- entry->wired_count == 0 && (entry->eflags & MAP_ENTRY_IN_TRANSITION)
- == 0, ("kmem_back: entry not found or misaligned"));
-
- if ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
- pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED;
- else
- pflags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED;
-
- if (flags & M_ZERO)
- pflags |= VM_ALLOC_ZERO;
- if (flags & M_NODUMP)
- pflags |= VM_ALLOC_NODUMP;
-
- VM_OBJECT_LOCK(kmem_object);
+ VM_OBJECT_WLOCK(object);
for (i = 0; i < size; i += PAGE_SIZE) {
retry:
- m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags);
+ m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags);
/*
* Ran out of space, free everything up and return. Don't need
@@ -366,80 +342,75 @@
* aren't on any queues.
*/
if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
if ((flags & M_NOWAIT) == 0) {
- VM_OBJECT_UNLOCK(kmem_object);
- entry->eflags |= MAP_ENTRY_IN_TRANSITION;
- vm_map_unlock(map);
VM_WAIT;
- vm_map_lock(map);
- KASSERT(
-(entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_NEEDS_WAKEUP)) ==
- MAP_ENTRY_IN_TRANSITION,
- ("kmem_back: volatile entry"));
- entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
- VM_OBJECT_LOCK(kmem_object);
+ VM_OBJECT_WLOCK(object);
goto retry;
}
- /*
- * Free the pages before removing the map entry.
- * They are already marked busy. Calling
- * vm_map_delete before the pages has been freed or
- * unbusied will cause a deadlock.
- */
- while (i != 0) {
- i -= PAGE_SIZE;
- m = vm_page_lookup(kmem_object,
- OFF_TO_IDX(offset + i));
- vm_page_unwire(m, 0);
- vm_page_free(m);
- }
- VM_OBJECT_UNLOCK(kmem_object);
- vm_map_delete(map, addr, addr + size);
+ kmem_unback(object, addr, i);
return (KERN_NO_SPACE);
}
if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
- m->valid = VM_PAGE_BITS_ALL;
KASSERT((m->oflags & VPO_UNMANAGED) != 0,
("kmem_malloc: page %p is managed", m));
+ m->valid = VM_PAGE_BITS_ALL;
+ pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
+ VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
}
- VM_OBJECT_UNLOCK(kmem_object);
+ VM_OBJECT_WUNLOCK(object);
- /*
- * Mark map entry as non-pageable. Repeat the assert.
- */
- KASSERT(entry->start == addr && entry->end == addr + size &&
- entry->wired_count == 0,
- ("kmem_back: entry not found or misaligned after allocation"));
- entry->wired_count = 1;
+ return (KERN_SUCCESS);
+}
- /*
- * At this point, the kmem_object must be unlocked because
- * vm_map_simplify_entry() calls vm_object_deallocate(), which
- * locks the kmem_object.
- */
- vm_map_simplify_entry(map, entry);
+/*
+ * kmem_unback:
+ *
+ * Unmap and free the physical pages underlying the specified virtual
+ * address range.
+ *
+ * A physical page must exist within the specified object at each index
+ * that is being unmapped.
+ */
+void
+kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
+{
+ vm_page_t m;
+ vm_offset_t i, offset;
- /*
- * Loop thru pages, entering them in the pmap.
- */
- VM_OBJECT_LOCK(kmem_object);
+ KASSERT(object == kmem_object || object == kernel_object,
+ ("kmem_unback: only supports kernel objects."));
+
+ pmap_remove(kernel_pmap, addr, addr + size);
+ offset = addr - VM_MIN_KERNEL_ADDRESS;
+ VM_OBJECT_WLOCK(object);
for (i = 0; i < size; i += PAGE_SIZE) {
- m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
- /*
- * Because this is kernel_pmap, this call will not block.
- */
- pmap_enter(kernel_pmap, addr + i, VM_PROT_ALL, m, VM_PROT_ALL,
- TRUE);
- vm_page_wakeup(m);
+ m = vm_page_lookup(object, OFF_TO_IDX(offset + i));
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
}
- VM_OBJECT_UNLOCK(kmem_object);
+ VM_OBJECT_WUNLOCK(object);
+}
- return (KERN_SUCCESS);
+/*
+ * kmem_free:
+ *
+ * Free memory allocated with kmem_malloc. The size must match the
+ * original allocation.
+ */
+void
+kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size)
+{
+
+ size = round_page(size);
+ kmem_unback((vmem == kmem_arena) ? kmem_object : kernel_object,
+ addr, size);
+ vmem_free(vmem, addr, size);
}
/*
- * kmem_alloc_wait:
+ * kmap_alloc_wait:
*
* Allocates pageable memory from a sub-map of the kernel. If the submap
* has no room, the caller sleeps waiting for more memory in the submap.
@@ -447,9 +418,7 @@
* This routine may block.
*/
vm_offset_t
-kmem_alloc_wait(map, size)
- vm_map_t map;
- vm_size_t size;
+kmap_alloc_wait(vm_map_t map, vm_size_t size)
{
vm_offset_t addr;
@@ -481,16 +450,13 @@
}
/*
- * kmem_free_wakeup:
+ * kmap_free_wakeup:
*
* Returns memory to a submap of the kernel, and wakes up any processes
* waiting for memory in that map.
*/
void
-kmem_free_wakeup(map, addr, size)
- vm_map_t map;
- vm_offset_t addr;
- vm_size_t size;
+kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
vm_map_lock(map);
@@ -502,12 +468,11 @@
vm_map_unlock(map);
}
-static void
+void
kmem_init_zero_region(void)
{
vm_offset_t addr, i;
vm_page_t m;
- int error;
/*
* Map a single physical page of zeros to a larger virtual range.
@@ -514,7 +479,7 @@
* This requires less looping in places that want large amounts of
* zeros, while not using much more physical resources.
*/
- addr = kmem_alloc_nofault(kernel_map, ZERO_REGION_SIZE);
+ addr = kva_alloc(ZERO_REGION_SIZE);
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
if ((m->flags & PG_ZERO) == 0)
@@ -521,9 +486,7 @@
pmap_zero_page(m);
for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
pmap_qenter(addr + i, &m, 1);
- error = vm_map_protect(kernel_map, addr, addr + ZERO_REGION_SIZE,
- VM_PROT_READ, TRUE);
- KASSERT(error == 0, ("error=%d", error));
+ pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ);
zero_region = (const void *)addr;
}
@@ -537,8 +500,7 @@
* `start' as allocated, and the range between `start' and `end' as free.
*/
void
-kmem_init(start, end)
- vm_offset_t start, end;
+kmem_init(vm_offset_t start, vm_offset_t end)
{
vm_map_t m;
@@ -556,8 +518,6 @@
start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
/* ... and ending with the completion of the above `insert' */
vm_map_unlock(m);
-
- kmem_init_zero_region();
}
#ifdef DIAGNOSTIC
@@ -574,11 +534,13 @@
error = sysctl_handle_int(oidp, &i, 0, req);
if (error)
return (error);
- if (i)
- EVENTHANDLER_INVOKE(vm_lowmem, 0);
+ if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0)
+ return (EINVAL);
+ if (i != 0)
+ EVENTHANDLER_INVOKE(vm_lowmem, i);
return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
- debug_vm_lowmem, "I", "set to trigger vm_lowmem event");
+ debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags");
#endif
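
With the changes above, kmem_malloc() becomes a two-step operation: vmem_alloc() reserves a virtual range, kmem_back() wires pages into it, and on failure kmem_unback() plus vmem_free() undo the partial work. The short POSIX program below is a rough userspace analogue of that reserve-then-commit split, assuming mmap()/mprotect() as stand-ins; it mirrors only the structure, not the kernel behaviour.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define RESERVE_SIZE    (1024 * 1024)
#define BACKED_SIZE     (64 * 1024)

int
main(void)
{
        char *base;

        /* Step 1: reserve address space only (analogue of vmem_alloc()). */
        base = mmap(NULL, RESERVE_SIZE, PROT_NONE,
            MAP_PRIVATE | MAP_ANON, -1, 0);
        if (base == MAP_FAILED)
                return (1);

        /* Step 2: make part of the range usable (analogue of kmem_back()). */
        if (mprotect(base, BACKED_SIZE, PROT_READ | PROT_WRITE) != 0) {
                /* On failure, return the whole reservation (vmem_free()). */
                munmap(base, RESERVE_SIZE);
                return (1);
        }
        memset(base, 0, BACKED_SIZE);
        printf("backed %d bytes at %p\n", BACKED_SIZE, (void *)base);

        /* Step 3: drop the backing and the reservation together. */
        munmap(base, RESERVE_SIZE);
        return (0);
}
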
Modified: trunk/sys/vm/vm_kern.h
===================================================================
--- trunk/sys/vm/vm_kern.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_kern.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $
*/
#ifndef _VM_VM_KERN_H_
@@ -64,11 +65,15 @@
#define _VM_VM_KERN_H_ 1
/* Kernel memory management definitions. */
-extern vm_map_t buffer_map;
extern vm_map_t kernel_map;
-extern vm_map_t kmem_map;
extern vm_map_t exec_map;
extern vm_map_t pipe_map;
+extern struct vmem *kernel_arena;
+extern struct vmem *kmem_arena;
+extern struct vmem *buffer_arena;
+extern struct vmem *transient_arena;
+extern struct vmem *memguard_arena;
+extern vm_offset_t swapbkva;
extern u_long vm_kmem_size;
#endif /* _VM_VM_KERN_H_ */
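
vm_kern.h now exports vmem arenas instead of the old buffer_map/kmem_map submaps, so callers pass a struct vmem * where they used to pass a vm_map_t. A before/after sketch of a typical caller, based only on the kmem_malloc()/kmem_free() signatures visible in this diff (a fragment, not a complete compilable unit):

        vm_offset_t addr;

        /* Before this sync: wired kernel memory came from a kernel submap. */
        addr = kmem_malloc(kmem_map, PAGE_SIZE, M_WAITOK | M_ZERO);
        if (addr != 0)
                kmem_free(kmem_map, addr, PAGE_SIZE);

        /* After this sync: the same request is served by a vmem arena. */
        addr = kmem_malloc(kmem_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
        if (addr != 0)
                kmem_free(kmem_arena, addr, PAGE_SIZE);
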
Modified: trunk/sys/vm/vm_map.c
===================================================================
--- trunk/sys/vm/vm_map.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_map.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -63,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -77,6 +78,7 @@
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/file.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -125,19 +127,24 @@
static uma_zone_t kmapentzone;
static uma_zone_t mapzone;
static uma_zone_t vmspace_zone;
-static struct vm_object kmapentobj;
static int vmspace_zinit(void *mem, int size, int flags);
-static void vmspace_zfini(void *mem, int size);
static int vm_map_zinit(void *mem, int size, int flags);
-static void vm_map_zfini(void *mem, int size);
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
vm_offset_t max);
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
+static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
+static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
+ vm_map_entry_t gap_entry);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
+static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
+ vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
+ int cow);
+static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
+ vm_offset_t failed_addr);
#define ENTRY_CHARGED(e) ((e)->cred != NULL || \
((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
@@ -192,25 +199,22 @@
#else
NULL,
#endif
- vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_prealloc(mapzone, MAX_KMAP);
kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
- uma_prealloc(kmapentzone, MAX_KMAPENT);
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
+#ifdef INVARIANTS
+ vmspace_zdtor,
+#else
+ NULL,
+#endif
+ vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
-static void
-vmspace_zfini(void *mem, int size)
-{
- struct vmspace *vm;
-
- vm = (struct vmspace *)mem;
- vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
-}
-
static int
vmspace_zinit(void *mem, int size, int flags)
{
@@ -220,19 +224,10 @@
vm->vm_map.pmap = NULL;
(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
+ PMAP_LOCK_INIT(vmspace_pmap(vm));
return (0);
}
-static void
-vm_map_zfini(void *mem, int size)
-{
- vm_map_t map;
-
- map = (vm_map_t)mem;
- mtx_destroy(&map->system_mtx);
- sx_destroy(&map->lock);
-}
-
static int
vm_map_zinit(void *mem, int size, int flags)
{
@@ -239,8 +234,7 @@
vm_map_t map;
map = (vm_map_t)mem;
- map->nentries = 0;
- map->size = 0;
+ memset(map, 0, sizeof(*map));
mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
sx_init(&map->lock, "vm map (user)");
return (0);
@@ -274,15 +268,22 @@
/*
* Allocate a vmspace structure, including a vm_map and pmap,
* and initialize those structures. The refcnt is set to 1.
+ *
+ * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
*/
struct vmspace *
-vmspace_alloc(min, max)
- vm_offset_t min, max;
+vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
{
struct vmspace *vm;
vm = uma_zalloc(vmspace_zone, M_WAITOK);
- if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
+
+ KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
+
+ if (pinit == NULL)
+ pinit = &pmap_pinit;
+
+ if (!pinit(vmspace_pmap(vm))) {
uma_zfree(vmspace_zone, vm);
return (NULL);
}
@@ -300,26 +301,11 @@
return (vm);
}
-void
-vm_init2(void)
-{
- uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
- (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE) / 8 +
- maxproc * 2 + maxfiles);
- vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
-#ifdef INVARIANTS
- vmspace_zdtor,
-#else
- NULL,
-#endif
- vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-}
-
+#ifdef RACCT
static void
vmspace_container_reset(struct proc *p)
{
-#ifdef RACCT
PROC_LOCK(p);
racct_set(p, RACCT_DATA, 0);
racct_set(p, RACCT_STACK, 0);
@@ -327,8 +313,8 @@
racct_set(p, RACCT_MEMLOCK, 0);
racct_set(p, RACCT_VMEM, 0);
PROC_UNLOCK(p);
+}
#endif
-}
static inline void
vmspace_dofree(struct vmspace *vm)
@@ -359,6 +345,9 @@
vmspace_free(struct vmspace *vm)
{
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "vmspace_free() called with non-sleepable lock held");
+
if (vm->vm_refcnt == 0)
panic("vmspace_free: attempt to free already freed vmspace");
@@ -427,7 +416,10 @@
pmap_activate(td);
vmspace_dofree(vm);
}
- vmspace_container_reset(p);
+#ifdef RACCT
+ if (racct_enable)
+ vmspace_container_reset(p);
+#endif
}
/* Acquire reference to vmspace owned by another process. */
@@ -960,6 +952,15 @@
"vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
map->nentries, entry, after_where);
VM_MAP_ASSERT_LOCKED(map);
+ KASSERT(after_where == &map->header ||
+ after_where->end <= entry->start,
+ ("vm_map_entry_link: prev end %jx new start %jx overlap",
+ (uintmax_t)after_where->end, (uintmax_t)entry->start));
+ KASSERT(after_where->next == &map->header ||
+ entry->end <= after_where->next->start,
+ ("vm_map_entry_link: new end %jx next start %jx overlap",
+ (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
+
map->nentries++;
entry->prev = after_where;
entry->next = after_where->next;
@@ -1132,24 +1133,26 @@
*/
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
- vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
- int cow)
+ vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
{
- vm_map_entry_t new_entry;
- vm_map_entry_t prev_entry;
- vm_map_entry_t temp_entry;
+ vm_map_entry_t new_entry, prev_entry, temp_entry;
+ struct ucred *cred;
vm_eflags_t protoeflags;
- struct ucred *cred;
vm_inherit_t inheritance;
- boolean_t charge_prev_obj;
VM_MAP_ASSERT_LOCKED(map);
+ KASSERT((object != kmem_object && object != kernel_object) ||
+ (cow & MAP_COPY_ON_WRITE) == 0,
+ ("vm_map_insert: kmem or kernel object and COW"));
+ KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
+ ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+ KASSERT((prot & ~max) == 0,
+ ("prot %#x is not subset of max_prot %#x", prot, max));
/*
* Check that the start and end points are not bogus.
*/
- if ((start < map->min_offset) || (end > map->max_offset) ||
- (start >= end))
+ if (start < map->min_offset || end > map->max_offset || start >= end)
return (KERN_INVALID_ADDRESS);
/*
@@ -1164,28 +1167,34 @@
/*
* Assert that the next entry doesn't overlap the end point.
*/
- if ((prev_entry->next != &map->header) &&
- (prev_entry->next->start < end))
+ if (prev_entry->next != &map->header && prev_entry->next->start < end)
return (KERN_NO_SPACE);
+ if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
+ max != VM_PROT_NONE))
+ return (KERN_INVALID_ARGUMENT);
+
protoeflags = 0;
- charge_prev_obj = FALSE;
-
if (cow & MAP_COPY_ON_WRITE)
- protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
-
- if (cow & MAP_NOFAULT) {
+ protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
+ if (cow & MAP_NOFAULT)
protoeflags |= MAP_ENTRY_NOFAULT;
-
- KASSERT(object == NULL,
- ("vm_map_insert: paradoxical MAP_NOFAULT request"));
- }
if (cow & MAP_DISABLE_SYNCER)
protoeflags |= MAP_ENTRY_NOSYNC;
if (cow & MAP_DISABLE_COREDUMP)
protoeflags |= MAP_ENTRY_NOCOREDUMP;
+ if (cow & MAP_STACK_GROWS_DOWN)
+ protoeflags |= MAP_ENTRY_GROWS_DOWN;
+ if (cow & MAP_STACK_GROWS_UP)
+ protoeflags |= MAP_ENTRY_GROWS_UP;
if (cow & MAP_VN_WRITECOUNT)
protoeflags |= MAP_ENTRY_VN_WRITECNT;
+ if ((cow & MAP_CREATE_GUARD) != 0)
+ protoeflags |= MAP_ENTRY_GUARD;
+ if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
+ protoeflags |= MAP_ENTRY_STACK_GAP_DN;
+ if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
+ protoeflags |= MAP_ENTRY_STACK_GAP_UP;
if (cow & MAP_INHERIT_SHARE)
inheritance = VM_INHERIT_SHARE;
else
@@ -1192,23 +1201,17 @@
inheritance = VM_INHERIT_DEFAULT;
cred = NULL;
- KASSERT((object != kmem_object && object != kernel_object) ||
- ((object == kmem_object || object == kernel_object) &&
- !(protoeflags & MAP_ENTRY_NEEDS_COPY)),
- ("kmem or kernel object and cow"));
- if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
+ if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
goto charged;
if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
return (KERN_RESOURCE_SHORTAGE);
- KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) ||
+ KASSERT(object == NULL ||
+ (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
object->cred == NULL,
- ("OVERCOMMIT: vm_map_insert o %p", object));
+ ("overcommit: vm_map_insert o %p", object));
cred = curthread->td_ucred;
- crhold(cred);
- if (object == NULL && !(protoeflags & MAP_ENTRY_NEEDS_COPY))
- charge_prev_obj = TRUE;
}
charged:
@@ -1223,37 +1226,35 @@
* reference counting is insufficient to recognize
* aliases with precision.)
*/
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->ref_count > 1 || object->shadow_count != 0)
vm_object_clear_flag(object, OBJ_ONEMAPPING);
- VM_OBJECT_UNLOCK(object);
- }
- else if ((prev_entry != &map->header) &&
- (prev_entry->eflags == protoeflags) &&
- (cow & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) == 0 &&
- (prev_entry->end == start) &&
- (prev_entry->wired_count == 0) &&
- (prev_entry->cred == cred ||
- (prev_entry->object.vm_object != NULL &&
- (prev_entry->object.vm_object->cred == cred))) &&
- vm_object_coalesce(prev_entry->object.vm_object,
- prev_entry->offset,
- (vm_size_t)(prev_entry->end - prev_entry->start),
- (vm_size_t)(end - prev_entry->end), charge_prev_obj)) {
+ VM_OBJECT_WUNLOCK(object);
+ } else if (prev_entry != &map->header &&
+ prev_entry->eflags == protoeflags &&
+ (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
+ prev_entry->end == start && prev_entry->wired_count == 0 &&
+ (prev_entry->cred == cred ||
+ (prev_entry->object.vm_object != NULL &&
+ prev_entry->object.vm_object->cred == cred)) &&
+ vm_object_coalesce(prev_entry->object.vm_object,
+ prev_entry->offset,
+ (vm_size_t)(prev_entry->end - prev_entry->start),
+ (vm_size_t)(end - prev_entry->end), cred != NULL &&
+ (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
/*
* We were able to extend the object. Determine if we
* can extend the previous map entry to include the
* new range as well.
*/
- if ((prev_entry->inheritance == inheritance) &&
- (prev_entry->protection == prot) &&
- (prev_entry->max_protection == max)) {
- map->size += (end - prev_entry->end);
+ if (prev_entry->inheritance == inheritance &&
+ prev_entry->protection == prot &&
+ prev_entry->max_protection == max) {
+ if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
+ map->size += end - prev_entry->end;
prev_entry->end = end;
vm_map_entry_resize_free(map, prev_entry);
vm_map_simplify_entry(map, prev_entry);
- if (cred != NULL)
- crfree(cred);
return (KERN_SUCCESS);
}
@@ -1265,23 +1266,18 @@
*/
object = prev_entry->object.vm_object;
offset = prev_entry->offset +
- (prev_entry->end - prev_entry->start);
+ (prev_entry->end - prev_entry->start);
vm_object_reference(object);
if (cred != NULL && object != NULL && object->cred != NULL &&
!(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
/* Object already accounts for this uid. */
- crfree(cred);
cred = NULL;
}
}
+ if (cred != NULL)
+ crhold(cred);
/*
- * NOTE: if conditionals fail, object can be NULL here. This occurs
- * in things like the buffer map where we manage kva but do not manage
- * backing objects.
- */
-
- /*
* Create a new entry
*/
new_entry = vm_map_entry_create(map);
@@ -1292,17 +1288,17 @@
new_entry->eflags = protoeflags;
new_entry->object.vm_object = object;
new_entry->offset = offset;
- new_entry->avail_ssize = 0;
new_entry->inheritance = inheritance;
new_entry->protection = prot;
new_entry->max_protection = max;
new_entry->wired_count = 0;
+ new_entry->wiring_thread = NULL;
new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
new_entry->next_read = OFF_TO_IDX(offset);
KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
- ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
+ ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
new_entry->cred = cred;
/*
@@ -1309,20 +1305,20 @@
* Insert the new entry into the list
*/
vm_map_entry_link(map, prev_entry, new_entry);
- map->size += new_entry->end - new_entry->start;
+ if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
+ map->size += new_entry->end - new_entry->start;
/*
- * It may be possible to merge the new entry with the next and/or
- * previous entries. However, due to MAP_STACK_* being a hack, a
- * panic can result from merging such entries.
+ * Try to coalesce the new entry with both the previous and next
+ * entries in the list. Previously, we only attempted to coalesce
+ * with the previous entry when object is NULL. Here, we handle the
+ * other cases, which are less common.
*/
- if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)
- vm_map_simplify_entry(map, new_entry);
+ vm_map_simplify_entry(map, new_entry);
- if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
- vm_map_pmap_enter(map, start, prot,
- object, OFF_TO_IDX(offset), end - start,
- cow & MAP_PREFAULT_PARTIAL);
+ if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
+ vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
+ end - start, cow & MAP_PREFAULT_PARTIAL);
}
return (KERN_SUCCESS);
@@ -1421,11 +1417,20 @@
int result;
end = start + length;
+ KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
+ object == NULL,
+ ("vm_map_fixed: non-NULL backing object for stack"));
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
- (void) vm_map_delete(map, start, end);
- result = vm_map_insert(map, object, offset, start, end, prot,
- max, cow);
+ if ((cow & MAP_CHECK_EXCL) == 0)
+ vm_map_delete(map, start, end);
+ if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
+ result = vm_map_stack_locked(map, start, length, sgrowsiz,
+ prot, max, cow);
+ } else {
+ result = vm_map_insert(map, object, offset, start, end,
+ prot, max, cow);
+ }
vm_map_unlock(map);
return (result);
}
@@ -1442,48 +1447,101 @@
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, /* IN/OUT */
- vm_size_t length, int find_space, vm_prot_t prot,
- vm_prot_t max, int cow)
+ vm_size_t length, vm_offset_t max_addr, int find_space,
+ vm_prot_t prot, vm_prot_t max, int cow)
{
- vm_offset_t start;
+ vm_offset_t alignment, initial_addr, start;
int result;
- start = *addr;
+ KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
+ object == NULL,
+ ("vm_map_find: non-NULL backing object for stack"));
+ if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
+ (object->flags & OBJ_COLORED) == 0))
+ find_space = VMFS_ANY_SPACE;
+ if (find_space >> 8 != 0) {
+ KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
+ alignment = (vm_offset_t)1 << (find_space >> 8);
+ } else
+ alignment = 0;
+ initial_addr = *addr;
+again:
+ start = initial_addr;
vm_map_lock(map);
do {
if (find_space != VMFS_NO_SPACE) {
- if (vm_map_findspace(map, start, length, addr)) {
+ if (vm_map_findspace(map, start, length, addr) ||
+ (max_addr != 0 && *addr + length > max_addr)) {
vm_map_unlock(map);
+ if (find_space == VMFS_OPTIMAL_SPACE) {
+ find_space = VMFS_ANY_SPACE;
+ goto again;
+ }
return (KERN_NO_SPACE);
}
switch (find_space) {
- case VMFS_ALIGNED_SPACE:
+ case VMFS_SUPER_SPACE:
+ case VMFS_OPTIMAL_SPACE:
pmap_align_superpage(object, offset, addr,
length);
break;
-#ifdef VMFS_TLB_ALIGNED_SPACE
- case VMFS_TLB_ALIGNED_SPACE:
- pmap_align_tlb(addr);
+ case VMFS_ANY_SPACE:
break;
-#endif
default:
+ if ((*addr & (alignment - 1)) != 0) {
+ *addr &= ~(alignment - 1);
+ *addr += alignment;
+ }
break;
}
start = *addr;
}
- result = vm_map_insert(map, object, offset, start, start +
- length, prot, max, cow);
- } while (result == KERN_NO_SPACE && (find_space == VMFS_ALIGNED_SPACE
-#ifdef VMFS_TLB_ALIGNED_SPACE
- || find_space == VMFS_TLB_ALIGNED_SPACE
-#endif
- ));
+ if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
+ result = vm_map_stack_locked(map, start, length,
+ sgrowsiz, prot, max, cow);
+ } else {
+ result = vm_map_insert(map, object, offset, start,
+ start + length, prot, max, cow);
+ }
+ } while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
+ find_space != VMFS_ANY_SPACE);
vm_map_unlock(map);
return (result);
}
/*
+ * vm_map_find_min() is a variant of vm_map_find() that takes an
+ * additional parameter (min_addr) and treats the given address
+ * (*addr) differently. Specifically, it treats *addr as a hint
+ * and not as the minimum address where the mapping is created.
+ *
+ * This function works in two phases. First, it tries to
+ * allocate above the hint. If that fails and the hint is
+ * greater than min_addr, it performs a second pass, replacing
+ * the hint with min_addr as the minimum address for the
+ * allocation.
+ */
+int
+vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
+ vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
+ int cow)
+{
+ vm_offset_t hint;
+ int rv;
+
+ hint = *addr;
+ for (;;) {
+ rv = vm_map_find(map, object, offset, addr, length, max_addr,
+ find_space, prot, max, cow);
+ if (rv == KERN_SUCCESS || min_addr >= hint)
+ return (rv);
+ *addr = hint = min_addr;
+ }
+}
+
+/*
* vm_map_simplify_entry:
*
* Simplify the given map entry by merging with either neighbor. This
@@ -1501,7 +1559,8 @@
vm_map_entry_t next, prev;
vm_size_t prevsize, esize;
- if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
+ if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
+ MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
return;
prev = entry->prev;
@@ -1611,7 +1670,8 @@
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
- if (entry->object.vm_object == NULL && !map->system_map) {
+ if (entry->object.vm_object == NULL && !map->system_map &&
+ (entry->eflags & MAP_ENTRY_GUARD) == 0) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
@@ -1625,12 +1685,12 @@
} else if (entry->object.vm_object != NULL &&
((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
entry->cred != NULL) {
- VM_OBJECT_LOCK(entry->object.vm_object);
+ VM_OBJECT_WLOCK(entry->object.vm_object);
KASSERT(entry->object.vm_object->cred == NULL,
("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
entry->object.vm_object->cred = entry->cred;
entry->object.vm_object->charge = entry->end - entry->start;
- VM_OBJECT_UNLOCK(entry->object.vm_object);
+ VM_OBJECT_WUNLOCK(entry->object.vm_object);
entry->cred = NULL;
}
@@ -1688,7 +1748,8 @@
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
- if (entry->object.vm_object == NULL && !map->system_map) {
+ if (entry->object.vm_object == NULL && !map->system_map &&
+ (entry->eflags & MAP_ENTRY_GUARD) == 0) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
@@ -1702,12 +1763,12 @@
} else if (entry->object.vm_object != NULL &&
((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
entry->cred != NULL) {
- VM_OBJECT_LOCK(entry->object.vm_object);
+ VM_OBJECT_WLOCK(entry->object.vm_object);
KASSERT(entry->object.vm_object->cred == NULL,
("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
entry->object.vm_object->cred = entry->cred;
entry->object.vm_object->charge = entry->end - entry->start;
- VM_OBJECT_UNLOCK(entry->object.vm_object);
+ VM_OBJECT_WUNLOCK(entry->object.vm_object);
entry->cred = NULL;
}
@@ -1781,7 +1842,7 @@
}
/*
- * The maximum number of pages to map
+ * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
*/
#define MAX_INIT_PT 96
@@ -1788,10 +1849,16 @@
/*
* vm_map_pmap_enter:
*
- * Preload read-only mappings for the given object's resident pages into
- * the given map. This eliminates the soft faults on process startup and
- * immediately after an mmap(2). Because these are speculative mappings,
- * cached pages are not reactivated and mapped.
+ * Preload the specified map's pmap with mappings to the specified
+ * object's memory-resident pages. No further physical pages are
+ * allocated, and no further virtual pages are retrieved from secondary
+ * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
+ * limited number of page mappings are created at the low-end of the
+ * specified address range. (For this purpose, a superpage mapping
+ * counts as one page mapping.) Otherwise, all resident pages within
+ * the specified address range are mapped. Because these mappings are
+ * being created speculatively, cached pages are not reactivated and
+ * mapped.
*/
void
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
@@ -1799,30 +1866,35 @@
{
vm_offset_t start;
vm_page_t p, p_start;
- vm_pindex_t psize, tmpidx;
+ vm_pindex_t mask, psize, threshold, tmpidx;
if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
return;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_RLOCK(object);
if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
- pmap_object_init_pt(map->pmap, addr, object, pindex, size);
- goto unlock_return;
+ VM_OBJECT_RUNLOCK(object);
+ VM_OBJECT_WLOCK(object);
+ if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
+ pmap_object_init_pt(map->pmap, addr, object, pindex,
+ size);
+ VM_OBJECT_WUNLOCK(object);
+ return;
+ }
+ VM_OBJECT_LOCK_DOWNGRADE(object);
}
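The block above upgrades from a read lock to a write lock by dropping the read lock, acquiring the write lock, and re-testing the object type, because the object may have changed while no lock was held; if the recheck still sees a device or SG object the work is done write-locked, otherwise the lock is downgraded back to read mode. A rough userland sketch of the same upgrade-and-recheck pattern using POSIX rwlocks (which, unlike the kernel locks, have no downgrade primitive, so this sketch simply releases the write lock); the object and its fields are hypothetical:

#include <pthread.h>
#include <stdio.h>

struct toy_object {
	pthread_rwlock_t lock;
	int type;		/* 1 == needs write-locked setup */
	int initialized;
};

static void
toy_prepare(struct toy_object *o)
{

	pthread_rwlock_rdlock(&o->lock);
	if (o->type == 1) {
		/*
		 * "Upgrade" by dropping the read lock and taking the write
		 * lock.  The predicate must be re-evaluated under the write
		 * lock because another thread may have changed the object
		 * in the unlocked window.
		 */
		pthread_rwlock_unlock(&o->lock);
		pthread_rwlock_wrlock(&o->lock);
		if (o->type == 1)
			o->initialized = 1;	/* write-locked work */
		pthread_rwlock_unlock(&o->lock);
		return;
	}
	/* Read-locked fast path. */
	pthread_rwlock_unlock(&o->lock);
}

int
main(void)
{
	struct toy_object o = { .type = 1 };

	pthread_rwlock_init(&o.lock, NULL);
	toy_prepare(&o);
	printf("initialized = %d\n", o.initialized);
	pthread_rwlock_destroy(&o.lock);
	return (0);
}

(Build with -pthread.)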
psize = atop(size);
-
- if ((flags & MAP_PREFAULT_PARTIAL) && psize > MAX_INIT_PT &&
- object->resident_page_count > MAX_INIT_PT)
- goto unlock_return;
-
if (psize + pindex > object->size) {
- if (object->size < pindex)
- goto unlock_return;
+ if (object->size < pindex) {
+ VM_OBJECT_RUNLOCK(object);
+ return;
+ }
psize = object->size - pindex;
}
start = 0;
p_start = NULL;
+ threshold = MAX_INIT_PT;
p = vm_page_find_least(object, pindex);
/*
@@ -1837,8 +1909,10 @@
* don't allow an madvise to blow away our really
* free pages allocating pv entries.
*/
- if ((flags & MAP_PREFAULT_MADVISE) &&
- cnt.v_free_count < cnt.v_free_reserved) {
+ if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
+ cnt.v_free_count < cnt.v_free_reserved) ||
+ ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
+ tmpidx >= threshold)) {
psize = tmpidx;
break;
}
@@ -1847,6 +1921,16 @@
start = addr + ptoa(tmpidx);
p_start = p;
}
+ /* Jump ahead if a superpage mapping is possible. */
+ if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
+ (pagesizes[p->psind] - 1)) == 0) {
+ mask = atop(pagesizes[p->psind]) - 1;
+ if (tmpidx + mask < psize &&
+ vm_page_ps_is_valid(p)) {
+ p += mask;
+ threshold += mask;
+ }
+ }
} else if (p_start != NULL) {
pmap_enter_object(map->pmap, start, addr +
ptoa(tmpidx), p_start, prot);
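The superpage check above jumps over the base-page indices already covered by a resident superpage and credits them against the MAP_PREFAULT_PARTIAL threshold, so a superpage counts as a single mapping. A small arithmetic sketch with assumed amd64-like sizes (4 KB base pages, 2 MB superpages); atop()/ptoa() are redefined here only for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumed 4 KB base pages */
#define atop(x)		((unsigned long)(x) >> PAGE_SHIFT)
#define ptoa(x)		((unsigned long)(x) << PAGE_SHIFT)

int
main(void)
{
	unsigned long addr = 0x200000;			/* 2 MB-aligned mapping start */
	unsigned long superpage = 2UL * 1024 * 1024;	/* assumed superpage size */
	unsigned long tmpidx = 0, mask;

	/* A superpage mapping is only possible at a superpage-aligned VA. */
	if (((addr + ptoa(tmpidx)) & (superpage - 1)) == 0) {
		mask = atop(superpage) - 1;		/* 511 base pages follow */
		printf("skip %lu base-page indices; next index is %lu\n",
		    mask, tmpidx + mask + 1);
	}
	return (0);
}

The kernel advances p by mask rather than mask + 1 since the enclosing loop is expected to step to the next page itself on the following iteration.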
@@ -1856,8 +1940,7 @@
if (p_start != NULL)
pmap_enter_object(map->pmap, start, addr + ptoa(psize),
p_start, prot);
-unlock_return:
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_RUNLOCK(object);
}
/*
@@ -1877,6 +1960,9 @@
struct ucred *cred;
vm_prot_t old_prot;
+ if (start == end)
+ return (KERN_SUCCESS);
+
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
@@ -1890,8 +1976,10 @@
/*
* Make a first pass to check for protection violations.
*/
- current = entry;
- while ((current != &map->header) && (current->start < end)) {
+ for (current = entry; current != &map->header && current->start < end;
+ current = current->next) {
+ if ((current->eflags & MAP_ENTRY_GUARD) != 0)
+ continue;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_unlock(map);
return (KERN_INVALID_ARGUMENT);
@@ -1900,23 +1988,22 @@
vm_map_unlock(map);
return (KERN_PROTECTION_FAILURE);
}
- current = current->next;
}
-
/*
* Do an accounting pass for private read-only mappings that
* now will do cow due to allowed write (e.g. debugger sets
* breakpoint on text segment)
*/
- for (current = entry; (current != &map->header) &&
- (current->start < end); current = current->next) {
+ for (current = entry; current != &map->header && current->start < end;
+ current = current->next) {
vm_map_clip_end(map, current, end);
if (set_max ||
((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
- ENTRY_CHARGED(current)) {
+ ENTRY_CHARGED(current) ||
+ (current->eflags & MAP_ENTRY_GUARD) != 0) {
continue;
}
@@ -1933,9 +2020,9 @@
continue;
}
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
continue;
}
@@ -1945,9 +2032,10 @@
* charged clipped mapping of the same object later.
*/
KASSERT(obj->charge == 0,
- ("vm_map_protect: object %p overcharged\n", obj));
+ ("vm_map_protect: object %p overcharged (entry %p)",
+ obj, current));
if (!swap_reserve(ptoa(obj->size))) {
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
vm_map_unlock(map);
return (KERN_RESOURCE_SHORTAGE);
}
@@ -1955,7 +2043,7 @@
crhold(cred);
obj->cred = cred;
obj->charge = ptoa(obj->size);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
/*
@@ -1962,8 +2050,11 @@
* Go back and fix up protections. [Note that clipping is not
* necessary the second time.]
*/
- current = entry;
- while ((current != &map->header) && (current->start < end)) {
+ for (current = entry; current != &map->header && current->start < end;
+ current = current->next) {
+ if ((current->eflags & MAP_ENTRY_GUARD) != 0)
+ continue;
+
old_prot = current->protection;
if (set_max)
@@ -1973,12 +2064,16 @@
else
current->protection = new_prot;
- if ((current->eflags & (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED))
- == (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED) &&
+ /*
+ * For user wired map entries, the normal lazy evaluation of
+ * write access upgrades through soft page faults is
+ * undesirable. Instead, immediately copy any pages that are
+ * copy-on-write and enable write access in the physical map.
+ */
+ if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
(current->protection & VM_PROT_WRITE) != 0 &&
- (old_prot & VM_PROT_WRITE) == 0) {
+ (old_prot & VM_PROT_WRITE) == 0)
vm_fault_copy_entry(map, map, current, current, NULL);
- }
/*
* When restricting access, update the physical map. Worry
@@ -1993,7 +2088,6 @@
#undef MASK
}
vm_map_simplify_entry(map, current);
- current = current->next;
}
vm_map_unlock(map);
return (KERN_SUCCESS);
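The comment above describes the situation being handled: a user-wired, copy-on-write mapping that gains write permission has its pages copied immediately rather than on a later soft fault. A hedged userland setup that creates exactly that situation (temporary file name and sizes are arbitrary, mlock() may fail when RLIMIT_MEMLOCK is zero, and the eager copy itself is not directly observable from userland):

#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char path[] = "/tmp/cow_wire_demo.XXXXXX";	/* hypothetical temp file */
	char buf[4096];
	char *p;
	int fd;

	if ((fd = mkstemp(path)) == -1)
		err(1, "mkstemp");
	memset(buf, 'A', sizeof(buf));
	if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
		err(1, "write");

	/* Private (copy-on-write) mapping, initially read-only. */
	p = mmap(NULL, sizeof(buf), PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	if (mlock(p, sizeof(buf)) != 0)			/* user-wire the entry */
		warn("mlock (RLIMIT_MEMLOCK may be 0)");
	if (mprotect(p, sizeof(buf), PROT_READ | PROT_WRITE) != 0)
		err(1, "mprotect");			/* triggers the eager copy */
	p[0] = 'B';					/* writes the private copy */

	printf("mapping now starts with '%c'\n", p[0]);
	unlink(path);
	return (0);
}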
@@ -2031,6 +2125,8 @@
case MADV_AUTOSYNC:
case MADV_NOCORE:
case MADV_CORE:
+ if (start == end)
+ return (KERN_SUCCESS);
modify_map = 1;
vm_map_lock(map);
break;
@@ -2037,6 +2133,8 @@
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ if (start == end)
+ return (KERN_SUCCESS);
vm_map_lock_read(map);
break;
default:
@@ -2113,7 +2211,7 @@
(current != &map->header) && (current->start < end);
current = current->next
) {
- vm_offset_t useStart;
+ vm_offset_t useEnd, useStart;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
@@ -2121,20 +2219,44 @@
pstart = OFF_TO_IDX(current->offset);
pend = pstart + atop(current->end - current->start);
useStart = current->start;
+ useEnd = current->end;
if (current->start < start) {
pstart += atop(start - current->start);
useStart = start;
}
- if (current->end > end)
+ if (current->end > end) {
pend -= atop(current->end - end);
+ useEnd = end;
+ }
if (pstart >= pend)
continue;
+ /*
+ * Perform the pmap_advise() before clearing
+ * PGA_REFERENCED in vm_page_advise(). Otherwise, a
+ * concurrent pmap operation, such as pmap_remove(),
+ * could clear a reference in the pmap and set
+ * PGA_REFERENCED on the page before the pmap_advise()
+ * had completed. Consequently, the page would appear
+ * referenced based upon an old reference that
+ * occurred before this pmap_advise() ran.
+ */
+ if (behav == MADV_DONTNEED || behav == MADV_FREE)
+ pmap_advise(map->pmap, useStart, useEnd,
+ behav);
+
vm_object_madvise(current->object.vm_object, pstart,
pend, behav);
- if (behav == MADV_WILLNEED) {
+
+ /*
+ * Pre-populate paging structures in the
+ * WILLNEED case. For wired entries, the
+ * paging structures are already populated.
+ */
+ if (behav == MADV_WILLNEED &&
+ current->wired_count == 0) {
vm_map_pmap_enter(map,
useStart,
current->protection,
@@ -2170,10 +2292,13 @@
case VM_INHERIT_NONE:
case VM_INHERIT_COPY:
case VM_INHERIT_SHARE:
+ case VM_INHERIT_ZERO:
break;
default:
return (KERN_INVALID_ARGUMENT);
}
+ if (start == end)
+ return (KERN_SUCCESS);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &temp_entry)) {
@@ -2183,7 +2308,9 @@
entry = temp_entry->next;
while ((entry != &map->header) && (entry->start < end)) {
vm_map_clip_end(map, entry, end);
- entry->inheritance = new_inheritance;
+ if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
+ new_inheritance != VM_INHERIT_ZERO)
+ entry->inheritance = new_inheritance;
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
@@ -2206,6 +2333,8 @@
int rv;
boolean_t need_wakeup, result, user_unwire;
+ if (start == end)
+ return (KERN_SUCCESS);
user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
@@ -2272,7 +2401,11 @@
* Mark the entry in case the map lock is released. (See
* above.)
*/
+ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
+ entry->wiring_thread == NULL,
+ ("owned map entry %p", entry));
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
+ entry->wiring_thread = curthread;
/*
* Check the map for holes in the specified region.
* If VM_MAP_WIRE_HOLESOK was specified, skip this check.
@@ -2305,32 +2438,44 @@
else
KASSERT(result, ("vm_map_unwire: lookup failed"));
}
- entry = first_entry;
- while (entry != &map->header && entry->start < end) {
+ for (entry = first_entry; entry != &map->header && entry->start < end;
+ entry = entry->next) {
+ /*
+ * If VM_MAP_WIRE_HOLESOK was specified, an empty
+ * space in the unwired region could have been mapped
+ * while the map lock was dropped for draining
+ * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
+ * could be simultaneously wiring this new mapping
+ * entry. Detect these cases and skip any entries
+	 * entry.  Detect these cases and skip any entries
+	 * not marked as in transition by us.

+ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
+ entry->wiring_thread != curthread) {
+ KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
+ ("vm_map_unwire: !HOLESOK and new/changed entry"));
+ continue;
+ }
+
if (rv == KERN_SUCCESS && (!user_unwire ||
(entry->eflags & MAP_ENTRY_USER_WIRED))) {
if (user_unwire)
entry->eflags &= ~MAP_ENTRY_USER_WIRED;
- entry->wired_count--;
- if (entry->wired_count == 0) {
- /*
- * Retain the map lock.
- */
- vm_fault_unwire(map, entry->start, entry->end,
- entry->object.vm_object != NULL &&
- (entry->object.vm_object->type == OBJT_DEVICE ||
- entry->object.vm_object->type == OBJT_SG));
- }
+ if (entry->wired_count == 1)
+ vm_map_entry_unwire(map, entry);
+ else
+ entry->wired_count--;
}
- KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
- ("vm_map_unwire: in-transition flag missing"));
+ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
+ ("vm_map_unwire: in-transition flag missing %p", entry));
+ KASSERT(entry->wiring_thread == curthread,
+ ("vm_map_unwire: alien wire %p", entry));
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
+ entry->wiring_thread = NULL;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
- entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
@@ -2339,6 +2484,42 @@
}
/*
+ * vm_map_wire_entry_failure:
+ *
+ * Handle a wiring failure on the given entry.
+ *
+ * The map should be locked.
+ */
+static void
+vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
+ vm_offset_t failed_addr)
+{
+
+ VM_MAP_ASSERT_LOCKED(map);
+ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
+ entry->wired_count == 1,
+ ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
+ KASSERT(failed_addr < entry->end,
+ ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
+
+ /*
+ * If any pages at the start of this entry were successfully wired,
+ * then unwire them.
+ */
+ if (failed_addr > entry->start) {
+ pmap_unwire(map->pmap, entry->start, failed_addr);
+ vm_object_unwire(entry->object.vm_object, entry->offset,
+ failed_addr - entry->start, PQ_ACTIVE);
+ }
+
+ /*
+ * Assign an out-of-range value to represent the failure to wire this
+ * entry.
+ */
+ entry->wired_count = -1;
+}
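The new helper releases only the successfully wired prefix of a partially wired entry and then records -1, an out-of-range count, as a failure marker. The same unwind-the-prefix pattern reduced to a standalone sketch; wire_one()/unwire_one() are hypothetical per-page operations:

#include <stdio.h>

#define NPAGES	8

static int
wire_one(int idx)
{

	return (idx < 5 ? 0 : -1);	/* pretend the sixth page fails */
}

static void
unwire_one(int idx)
{

	printf("unwiring page %d\n", idx);
}

int
main(void)
{
	int idx, wired_count;

	wired_count = 1;		/* wiring in progress */
	for (idx = 0; idx < NPAGES; idx++)
		if (wire_one(idx) != 0)
			break;
	if (idx < NPAGES) {
		/*
		 * Failure: undo only the pages wired so far, then store an
		 * out-of-range count as the failure marker, mirroring
		 * entry->wired_count = -1 above.
		 */
		while (idx-- > 0)
			unwire_one(idx);
		wired_count = -1;
	}
	printf("wired_count = %d\n", wired_count);
	return (0);
}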
+
+/*
* vm_map_wire:
*
* Implements both kernel and user wiring.
@@ -2348,12 +2529,14 @@
int flags)
{
vm_map_entry_t entry, first_entry, tmp_entry;
- vm_offset_t saved_end, saved_start;
+ vm_offset_t faddr, saved_end, saved_start;
unsigned int last_timestamp;
int rv;
- boolean_t fictitious, need_wakeup, result, user_wire;
+ boolean_t need_wakeup, result, user_wire;
vm_prot_t prot;
+ if (start == end)
+ return (KERN_SUCCESS);
prot = 0;
if (flags & VM_MAP_WIRE_WRITE)
prot |= VM_PROT_WRITE;
@@ -2423,7 +2606,11 @@
* Mark the entry in case the map lock is released. (See
* above.)
*/
+ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
+ entry->wiring_thread == NULL,
+ ("owned map entry %p", entry));
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
+ entry->wiring_thread = curthread;
if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
|| (entry->protection & prot) != prot) {
entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
@@ -2438,9 +2625,7 @@
entry->wired_count++;
saved_start = entry->start;
saved_end = entry->end;
- fictitious = entry->object.vm_object != NULL &&
- (entry->object.vm_object->type == OBJT_DEVICE ||
- entry->object.vm_object->type == OBJT_SG);
+
/*
* Release the map lock, relying on the in-transition
* mark. Mark the map busy for fork.
@@ -2447,8 +2632,17 @@
*/
vm_map_busy(map);
vm_map_unlock(map);
- rv = vm_fault_wire(map, saved_start, saved_end,
- fictitious);
+
+ faddr = saved_start;
+ do {
+ /*
+ * Simulate a fault to get the page and enter
+ * it into the physical map.
+ */
+ if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
+ VM_FAULT_WIRE)) != KERN_SUCCESS)
+ break;
+ } while ((faddr += PAGE_SIZE) < saved_end);
vm_map_lock(map);
vm_map_unbusy(map);
if (last_timestamp + 1 != map->timestamp) {
@@ -2467,23 +2661,22 @@
first_entry = NULL;
entry = tmp_entry;
while (entry->end < saved_end) {
- if (rv != KERN_SUCCESS) {
- KASSERT(entry->wired_count == 1,
- ("vm_map_wire: bad count"));
- entry->wired_count = -1;
- }
+ /*
+ * In case of failure, handle entries
+ * that were not fully wired here;
+ * fully wired entries are handled
+ * later.
+ */
+ if (rv != KERN_SUCCESS &&
+ faddr < entry->end)
+ vm_map_wire_entry_failure(map,
+ entry, faddr);
entry = entry->next;
}
}
last_timestamp = map->timestamp;
if (rv != KERN_SUCCESS) {
- KASSERT(entry->wired_count == 1,
- ("vm_map_wire: bad count"));
- /*
- * Assign an out-of-range value to represent
- * the failure to wire this entry.
- */
- entry->wired_count = -1;
+ vm_map_wire_entry_failure(map, entry, faddr);
end = entry->end;
goto done;
}
@@ -2496,9 +2689,9 @@
* If VM_MAP_WIRE_HOLESOK was specified, skip this check.
*/
next_entry:
- if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
- (entry->end < end && (entry->next == &map->header ||
- entry->next->start > entry->end))) {
+ if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
+ entry->end < end && (entry->next == &map->header ||
+ entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
@@ -2515,10 +2708,27 @@
else
KASSERT(result, ("vm_map_wire: lookup failed"));
}
- entry = first_entry;
- while (entry != &map->header && entry->start < end) {
+ for (entry = first_entry; entry != &map->header && entry->start < end;
+ entry = entry->next) {
+ /*
+ * If VM_MAP_WIRE_HOLESOK was specified, an empty
+ * space in the unwired region could have been mapped
+ * while the map lock was dropped for faulting in the
+ * pages or draining MAP_ENTRY_IN_TRANSITION.
+ * Moreover, another thread could be simultaneously
+ * wiring this new mapping entry. Detect these cases
+ * and skip any entries marked as in transition not by us.
+ */
+ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
+ entry->wiring_thread != curthread) {
+ KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
+ ("vm_map_wire: !HOLESOK and new/changed entry"));
+ continue;
+ }
+
if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
goto next_entry_done;
+
if (rv == KERN_SUCCESS) {
if (user_wire)
entry->eflags |= MAP_ENTRY_USER_WIRED;
@@ -2528,30 +2738,30 @@
* unnecessary.
*/
entry->wired_count = 0;
- } else {
- if (!user_wire ||
- (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
+ } else if (!user_wire ||
+ (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
+ /*
+ * Undo the wiring. Wiring succeeded on this entry
+ * but failed on a later entry.
+ */
+ if (entry->wired_count == 1)
+ vm_map_entry_unwire(map, entry);
+ else
entry->wired_count--;
- if (entry->wired_count == 0) {
- /*
- * Retain the map lock.
- */
- vm_fault_unwire(map, entry->start, entry->end,
- entry->object.vm_object != NULL &&
- (entry->object.vm_object->type == OBJT_DEVICE ||
- entry->object.vm_object->type == OBJT_SG));
- }
}
next_entry_done:
- KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
- ("vm_map_wire: in-transition flag missing"));
- entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION|MAP_ENTRY_WIRE_SKIPPED);
+ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
+ ("vm_map_wire: in-transition flag missing %p", entry));
+ KASSERT(entry->wiring_thread == curthread,
+ ("vm_map_wire: alien wire %p", entry));
+ entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
+ MAP_ENTRY_WIRE_SKIPPED);
+ entry->wiring_thread = NULL;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
- entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
@@ -2673,10 +2883,13 @@
static void
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
{
- vm_fault_unwire(map, entry->start, entry->end,
- entry->object.vm_object != NULL &&
- (entry->object.vm_object->type == OBJT_DEVICE ||
- entry->object.vm_object->type == OBJT_SG));
+
+ VM_MAP_ASSERT_LOCKED(map);
+ KASSERT(entry->wired_count > 0,
+ ("vm_map_entry_unwire: entry %p isn't wired", entry));
+ pmap_unwire(map->pmap, entry->start, entry->end);
+ vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
+ entry->start, PQ_ACTIVE);
entry->wired_count = 0;
}
@@ -2703,6 +2916,15 @@
vm_map_entry_unlink(map, entry);
object = entry->object.vm_object;
+
+ if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
+ MPASS(entry->cred == NULL);
+ MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
+ MPASS(object == NULL);
+ vm_map_entry_deallocate(entry, map->system_map);
+ return;
+ }
+
size = entry->end - entry->start;
map->size -= size;
@@ -2719,9 +2941,9 @@
count = OFF_TO_IDX(size);
offidxstart = OFF_TO_IDX(entry->offset);
offidxend = offidxstart + count;
- VM_OBJECT_LOCK(object);
- if (object->ref_count != 1 &&
- ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
+ VM_OBJECT_WLOCK(object);
+ if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
+ OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
object == kernel_object || object == kmem_object)) {
vm_object_collapse(object);
@@ -2734,7 +2956,8 @@
vm_object_page_remove(object, offidxstart, offidxend,
OBJPR_NOTMAPPED);
if (object->type == OBJT_SWAP)
- swap_pager_freespace(object, offidxstart, count);
+ swap_pager_freespace(object, offidxstart,
+ count);
if (offidxend >= object->size &&
offidxstart < object->size) {
size1 = object->size;
@@ -2742,13 +2965,14 @@
if (object->cred != NULL) {
size1 -= object->size;
KASSERT(object->charge >= ptoa(size1),
- ("vm_map_entry_delete: object->charge < 0"));
- swap_release_by_cred(ptoa(size1), object->cred);
+ ("object %p charge < 0", object));
+ swap_release_by_cred(ptoa(size1),
+ object->cred);
object->charge -= ptoa(size1);
}
}
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
} else
entry->object.vm_object = NULL;
if (map->system_map)
@@ -2772,6 +2996,8 @@
vm_map_entry_t first_entry;
VM_MAP_ASSERT_LOCKED(map);
+ if (start == end)
+ return (KERN_SUCCESS);
/*
* Find the start of the region, and clip it
@@ -2938,13 +3164,14 @@
if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
return;
- if (src_entry->wired_count == 0) {
-
+ if (src_entry->wired_count == 0 ||
+ (src_entry->protection & VM_PROT_WRITE) == 0) {
/*
* If the source entry is marked needs_copy, it is already
* write-protected.
*/
- if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
+ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
+ (src_entry->protection & VM_PROT_WRITE) != 0) {
pmap_protect(src_map->pmap,
src_entry->start,
src_entry->end,
@@ -2956,15 +3183,17 @@
*/
size = src_entry->end - src_entry->start;
if ((src_object = src_entry->object.vm_object) != NULL) {
- VM_OBJECT_LOCK(src_object);
+ VM_OBJECT_WLOCK(src_object);
charged = ENTRY_CHARGED(src_entry);
- if ((src_object->handle == NULL) &&
- (src_object->type == OBJT_DEFAULT ||
- src_object->type == OBJT_SWAP)) {
+ if (src_object->handle == NULL &&
+ (src_object->type == OBJT_DEFAULT ||
+ src_object->type == OBJT_SWAP)) {
vm_object_collapse(src_object);
- if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
+ if ((src_object->flags & (OBJ_NOSPLIT |
+ OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
vm_object_split(src_entry);
- src_object = src_entry->object.vm_object;
+ src_object =
+ src_entry->object.vm_object;
}
}
vm_object_reference_locked(src_object);
@@ -2977,7 +3206,7 @@
src_object->cred = src_entry->cred;
src_object->charge = size;
}
- VM_OBJECT_UNLOCK(src_object);
+ VM_OBJECT_WUNLOCK(src_object);
dst_entry->object.vm_object = src_object;
if (charged) {
cred = curthread->td_ucred;
@@ -2991,8 +3220,10 @@
*fork_charge += size;
}
}
- src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
- dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
+ src_entry->eflags |= MAP_ENTRY_COW |
+ MAP_ENTRY_NEEDS_COPY;
+ dst_entry->eflags |= MAP_ENTRY_COW |
+ MAP_ENTRY_NEEDS_COPY;
dst_entry->offset = src_entry->offset;
if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
/*
@@ -3015,6 +3246,10 @@
fake_entry->next = curthread->td_map_def_user;
curthread->td_map_def_user = fake_entry;
}
+
+ pmap_copy(dst_map->pmap, src_map->pmap,
+ dst_entry->start, dst_entry->end - dst_entry->start,
+ src_entry->start);
} else {
dst_entry->object.vm_object = NULL;
dst_entry->offset = 0;
@@ -3024,14 +3259,11 @@
*fork_charge += size;
}
}
-
- pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
- dst_entry->end - dst_entry->start, src_entry->start);
} else {
/*
- * Of course, wired down pages can't be set copy-on-write.
- * Cause wired pages to be copied into the new map by
- * simulating faults (the new pages are pageable)
+ * We don't want to make writeable wired pages copy-on-write.
+ * Immediately copy these pages into the new map by simulating
+ * page faults. The new pages are pageable.
*/
vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
fork_charge);
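Per the rewritten comment, writable wired pages are copied into the child's map at fork time instead of being made copy-on-write. A small userland program that sets up that case (MAP_ANON and a usable RLIMIT_MEMLOCK are assumed; the eager copy is internal, so the program only shows that parent and child end up with independent contents):

#include <sys/mman.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	pid_t pid;

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	strcpy(p, "parent data");
	if (mlock(p, 4096) != 0)		/* wire the page in the parent */
		warn("mlock");

	if ((pid = fork()) == 0) {
		strcpy(p, "child data");	/* writes the child's own copy */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent still sees: %s\n", p);	/* "parent data" */
	return (0);
}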
@@ -3051,6 +3283,8 @@
vm_size_t entrysize;
vm_offset_t newend;
+ if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
+ return;
entrysize = entry->end - entry->start;
vm2->vm_map.size += entrysize;
if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
@@ -3087,10 +3321,11 @@
vm_map_entry_t new_entry, old_entry;
vm_object_t object;
int locked;
+ vm_inherit_t inh;
old_map = &vm1->vm_map;
/* Copy immutable fields of vm1 to vm2. */
- vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
+ vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
if (vm2 == NULL)
return (NULL);
vm2->vm_taddr = vm1->vm_taddr;
@@ -3109,7 +3344,12 @@
if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
panic("vm_map_fork: encountered a submap");
- switch (old_entry->inheritance) {
+ inh = old_entry->inheritance;
+ if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
+ inh != VM_INHERIT_NONE)
+ inh = VM_INHERIT_COPY;
+
+ switch (inh) {
case VM_INHERIT_NONE:
break;
@@ -3153,7 +3393,7 @@
vm_object_deallocate(object);
object = old_entry->object.vm_object;
}
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_object_clear_flag(object, OBJ_ONEMAPPING);
if (old_entry->cred != NULL) {
KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
@@ -3177,7 +3417,7 @@
("vmspace_fork: vnp.writecount %p",
object));
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* Clone the entry, referencing the shared object.
@@ -3186,6 +3426,7 @@
*new_entry = *old_entry;
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
MAP_ENTRY_IN_TRANSITION);
+ new_entry->wiring_thread = NULL;
new_entry->wired_count = 0;
if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
vnode_pager_update_writecount(object,
@@ -3220,6 +3461,7 @@
*/
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
+ new_entry->wiring_thread = NULL;
new_entry->wired_count = 0;
new_entry->object.vm_object = NULL;
new_entry->cred = NULL;
@@ -3229,6 +3471,33 @@
vm_map_copy_entry(old_map, new_map, old_entry,
new_entry, fork_charge);
break;
+
+ case VM_INHERIT_ZERO:
+ /*
+ * Create a new anonymous mapping entry modelled from
+ * the old one.
+ */
+ new_entry = vm_map_entry_create(new_map);
+ memset(new_entry, 0, sizeof(*new_entry));
+
+ new_entry->start = old_entry->start;
+ new_entry->end = old_entry->end;
+ new_entry->eflags = old_entry->eflags &
+ ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
+ MAP_ENTRY_VN_WRITECNT);
+ new_entry->protection = old_entry->protection;
+ new_entry->max_protection = old_entry->max_protection;
+ new_entry->inheritance = VM_INHERIT_ZERO;
+
+ vm_map_entry_link(new_map, new_map->header.prev,
+ new_entry);
+ vmspace_map_entry_forked(vm1, vm2, new_entry);
+
+ new_entry->cred = curthread->td_ucred;
+ crhold(new_entry->cred);
+ *fork_charge += (new_entry->end - new_entry->start);
+
+ break;
}
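The new VM_INHERIT_ZERO case gives a forked child a fresh, zero-filled anonymous entry covering the same range instead of a copy or a share. Assuming the matching INHERIT_ZERO flag of minherit(2) is exposed to userland on systems that carry this change, usage looks roughly like this:

#include <sys/mman.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	pid_t pid;

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	strcpy(p, "secret");

	/* Ask that children see zero-filled memory here, not a copy. */
	if (minherit(p, 4096, INHERIT_ZERO) != 0)
		err(1, "minherit");

	if ((pid = fork()) == 0) {
		printf("child sees:  \"%s\"\n", p);	/* empty string */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent sees: \"%s\"\n", p);		/* "secret" */
	return (0);
}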
old_entry = old_entry->next;
}
@@ -3244,73 +3513,83 @@
return (vm2);
}
+/*
+ * Create a process's stack for exec_new_vmspace(). This function is never
+ * asked to wire the newly created stack.
+ */
int
vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
vm_prot_t prot, vm_prot_t max, int cow)
{
+ vm_size_t growsize, init_ssize;
+ rlim_t vmemlim;
+ int rv;
+
+ MPASS((map->flags & MAP_WIREFUTURE) == 0);
+ growsize = sgrowsiz;
+ init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
+ vm_map_lock(map);
+ PROC_LOCK(curproc);
+ vmemlim = lim_cur(curproc, RLIMIT_VMEM);
+ PROC_UNLOCK(curproc);
+ /* If we would blow our VMEM resource limit, no go */
+ if (map->size + init_ssize > vmemlim) {
+ rv = KERN_NO_SPACE;
+ goto out;
+ }
+ rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
+ max, cow);
+out:
+ vm_map_unlock(map);
+ return (rv);
+}
+
+static int stack_guard_page = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
+ &stack_guard_page, 0,
+ "Specifies the number of guard pages for a stack that grows");
+
+static int
+vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
+ vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
+{
vm_map_entry_t new_entry, prev_entry;
- vm_offset_t bot, top;
- vm_size_t growsize, init_ssize;
+ vm_offset_t bot, gap_bot, gap_top, top;
+ vm_size_t init_ssize, sgp;
int orient, rv;
- rlim_t lmemlim, vmemlim;
/*
* The stack orientation is piggybacked with the cow argument.
* Extract it into orient and mask the cow argument so that we
* don't pass it around further.
- * NOTE: We explicitly allow bi-directional stacks.
*/
- orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
+ orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
KASSERT(orient != 0, ("No stack grow direction"));
+ KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
+ ("bi-dir stack"));
if (addrbos < vm_map_min(map) ||
- addrbos > vm_map_max(map) ||
- addrbos + max_ssize < addrbos)
- return (KERN_NO_SPACE);
+ addrbos + max_ssize > vm_map_max(map) ||
+ addrbos + max_ssize <= addrbos)
+ return (KERN_INVALID_ADDRESS);
+ sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+ if (sgp >= max_ssize)
+ return (KERN_INVALID_ARGUMENT);
- growsize = sgrowsiz;
- init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
+ init_ssize = growsize;
+ if (max_ssize < init_ssize + sgp)
+ init_ssize = max_ssize - sgp;
- PROC_LOCK(curproc);
- lmemlim = lim_cur(curproc, RLIMIT_MEMLOCK);
- vmemlim = lim_cur(curproc, RLIMIT_VMEM);
- PROC_UNLOCK(curproc);
-
- vm_map_lock(map);
-
/* If addr is already mapped, no go */
- if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
- vm_map_unlock(map);
+ if (vm_map_lookup_entry(map, addrbos, &prev_entry))
return (KERN_NO_SPACE);
- }
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) {
- vm_map_unlock(map);
- return (KERN_NO_SPACE);
- }
- }
-
- /* If we would blow our VMEM resource limit, no go */
- if (map->size + init_ssize > vmemlim) {
- vm_map_unlock(map);
- return (KERN_NO_SPACE);
- }
-
/*
	 * If we can't accommodate max_ssize in the current mapping, no go.
- * However, we need to be aware that subsequent user mappings might
- * map into the space we have reserved for stack, and currently this
- * space is not protected.
- *
- * Hopefully we will at least detect this condition when we try to
- * grow the stack.
*/
if ((prev_entry->next != &map->header) &&
- (prev_entry->next->start < addrbos + max_ssize)) {
- vm_map_unlock(map);
+ (prev_entry->next->start < addrbos + max_ssize))
return (KERN_NO_SPACE);
- }
/*
* We initially map a stack of only init_ssize. We will grow as
@@ -3322,59 +3601,53 @@
* and cow to be 0. Possibly we should eliminate these as input
* parameters, and just pass these values here in the insert call.
*/
- if (orient == MAP_STACK_GROWS_DOWN)
+ if (orient == MAP_STACK_GROWS_DOWN) {
bot = addrbos + max_ssize - init_ssize;
- else if (orient == MAP_STACK_GROWS_UP)
+ top = bot + init_ssize;
+ gap_bot = addrbos;
+ gap_top = bot;
+ } else /* if (orient == MAP_STACK_GROWS_UP) */ {
bot = addrbos;
- else
- bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
- top = bot + init_ssize;
+ top = bot + init_ssize;
+ gap_bot = top;
+ gap_top = addrbos + max_ssize;
+ }
rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
-
- /* Now set the avail_ssize amount. */
- if (rv == KERN_SUCCESS) {
- if (prev_entry != &map->header)
- vm_map_clip_end(map, prev_entry, bot);
- new_entry = prev_entry->next;
- if (new_entry->end != top || new_entry->start != bot)
- panic("Bad entry start/end for new stack entry");
-
- new_entry->avail_ssize = max_ssize - init_ssize;
- if (orient & MAP_STACK_GROWS_DOWN)
- new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
- if (orient & MAP_STACK_GROWS_UP)
- new_entry->eflags |= MAP_ENTRY_GROWS_UP;
- }
-
- vm_map_unlock(map);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ new_entry = prev_entry->next;
+ KASSERT(new_entry->end == top || new_entry->start == bot,
+ ("Bad entry start/end for new stack entry"));
+ KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
+ (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
+ ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
+ KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
+ (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
+ ("new entry lacks MAP_ENTRY_GROWS_UP"));
+ rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
+ VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
+ MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
+ if (rv != KERN_SUCCESS)
+ (void)vm_map_delete(map, bot, top);
return (rv);
}
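vm_map_stack_locked() now reserves the entire max_ssize range up front: for a downward-growing stack the top of the range gets the initially mapped init_ssize bytes and the remainder becomes an unmapped guard/growth gap entry. A worked example of the MAP_STACK_GROWS_DOWN layout with assumed numbers (4 KB pages, 128 KB sgrowsiz, one guard page, an 8 MB reservation, and a made-up base address):

#include <stdio.h>

#define PAGE_SIZE	4096UL				/* assumed */

int
main(void)
{
	unsigned long addrbos = 0x7fffff800000UL;	/* hypothetical stack base */
	unsigned long max_ssize = 8UL << 20;		/* full reservation */
	unsigned long growsize = 128UL << 10;		/* assumed sgrowsiz */
	unsigned long sgp = 1 * PAGE_SIZE;		/* one guard page */
	unsigned long init_ssize, bot, top, gap_bot, gap_top;

	init_ssize = growsize;
	if (max_ssize < init_ssize + sgp)
		init_ssize = max_ssize - sgp;

	/* MAP_STACK_GROWS_DOWN: the gap sits below the mapped stack. */
	bot = addrbos + max_ssize - init_ssize;
	top = bot + init_ssize;
	gap_bot = addrbos;
	gap_top = bot;

	printf("stack entry: [%#lx, %#lx)\n", bot, top);
	printf("guard gap:   [%#lx, %#lx), %lu KB reserved for growth\n",
	    gap_bot, gap_top, (gap_top - gap_bot) >> 10);
	return (0);
}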
-static int stack_guard_page = 0;
-TUNABLE_INT("security.bsd.stack_guard_page", &stack_guard_page);
-SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RW,
- &stack_guard_page, 0,
- "Insert stack guard page ahead of the growable segments.");
-
-/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
- * desired address is already mapped, or if we successfully grow
- * the stack. Also returns KERN_SUCCESS if addr is outside the
- * stack range (this is strange, but preserves compatibility with
- * the grow function in vm_machdep.c).
+/*
+ * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
+ * successfully grow the stack.
*/
-int
-vm_map_growstack(struct proc *p, vm_offset_t addr)
+static int
+vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
{
- vm_map_entry_t next_entry, prev_entry;
- vm_map_entry_t new_entry, stack_entry;
- struct vmspace *vm = p->p_vmspace;
- vm_map_t map = &vm->vm_map;
- vm_offset_t end;
- vm_size_t growsize;
- size_t grow_amount, max_grow;
+ vm_map_entry_t stack_entry;
+ struct proc *p;
+ struct vmspace *vm;
+ struct ucred *cred;
+ vm_offset_t gap_end, gap_start, grow_start;
+ size_t grow_amount, guard, max_grow;
rlim_t lmemlim, stacklim, vmemlim;
- int is_procstack, rv;
- struct ucred *cred;
+ int rv, rv1;
+ bool gap_deleted, grow_down, is_procstack;
#ifdef notyet
uint64_t limit;
#endif
@@ -3382,124 +3655,84 @@
int error;
#endif
-Retry:
+ p = curproc;
+ vm = p->p_vmspace;
+
+ /*
+ * Disallow stack growth when the access is performed by a
+ * debugger or AIO daemon. The reason is that the wrong
+ * resource limits are applied.
+ */
+ if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
+ return (KERN_FAILURE);
+
+ MPASS(!map->system_map);
+
+ guard = stack_guard_page * PAGE_SIZE;
PROC_LOCK(p);
lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
stacklim = lim_cur(p, RLIMIT_STACK);
vmemlim = lim_cur(p, RLIMIT_VMEM);
PROC_UNLOCK(p);
-
- vm_map_lock_read(map);
-
- /* If addr is already in the entry range, no need to grow.*/
- if (vm_map_lookup_entry(map, addr, &prev_entry)) {
- vm_map_unlock_read(map);
+retry:
+ /* If addr is not in a hole for a stack grow area, no need to grow. */
+ if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
+ return (KERN_FAILURE);
+ if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
return (KERN_SUCCESS);
- }
-
- next_entry = prev_entry->next;
- if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
- /*
- * This entry does not grow upwards. Since the address lies
- * beyond this entry, the next entry (if one exists) has to
- * be a downward growable entry. The entry list header is
- * never a growable entry, so it suffices to check the flags.
- */
- if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
- vm_map_unlock_read(map);
- return (KERN_SUCCESS);
- }
- stack_entry = next_entry;
+ if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
+ stack_entry = gap_entry->next;
+ if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
+ stack_entry->start != gap_entry->end)
+ return (KERN_FAILURE);
+ grow_amount = round_page(stack_entry->start - addr);
+ grow_down = true;
+ } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
+ stack_entry = gap_entry->prev;
+ if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
+ stack_entry->end != gap_entry->start)
+ return (KERN_FAILURE);
+ grow_amount = round_page(addr + 1 - stack_entry->end);
+ grow_down = false;
} else {
- /*
- * This entry grows upward. If the next entry does not at
- * least grow downwards, this is the entry we need to grow.
- * otherwise we have two possible choices and we have to
- * select one.
- */
- if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
- /*
- * We have two choices; grow the entry closest to
- * the address to minimize the amount of growth.
- */
- if (addr - prev_entry->end <= next_entry->start - addr)
- stack_entry = prev_entry;
- else
- stack_entry = next_entry;
- } else
- stack_entry = prev_entry;
+ return (KERN_FAILURE);
}
-
- if (stack_entry == next_entry) {
- KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
- KASSERT(addr < stack_entry->start, ("foo"));
- end = (prev_entry != &map->header) ? prev_entry->end :
- stack_entry->start - stack_entry->avail_ssize;
- grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
- max_grow = stack_entry->start - end;
- } else {
- KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
- KASSERT(addr >= stack_entry->end, ("foo"));
- end = (next_entry != &map->header) ? next_entry->start :
- stack_entry->end + stack_entry->avail_ssize;
- grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
- max_grow = end - stack_entry->end;
- }
-
- if (grow_amount > stack_entry->avail_ssize) {
- vm_map_unlock_read(map);
+ max_grow = gap_entry->end - gap_entry->start;
+ if (guard > max_grow)
return (KERN_NO_SPACE);
- }
-
- /*
- * If there is no longer enough space between the entries nogo, and
- * adjust the available space. Note: this should only happen if the
- * user has mapped into the stack area after the stack was created,
- * and is probably an error.
- *
- * This also effectively destroys any guard page the user might have
- * intended by limiting the stack size.
- */
- if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) {
- if (vm_map_lock_upgrade(map))
- goto Retry;
-
- stack_entry->avail_ssize = max_grow;
-
- vm_map_unlock(map);
+ max_grow -= guard;
+ if (grow_amount > max_grow)
return (KERN_NO_SPACE);
- }
- is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
-
/*
* If this is the main process stack, see if we're over the stack
* limit.
*/
- if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
- vm_map_unlock_read(map);
+ is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
+ addr < (vm_offset_t)p->p_sysent->sv_usrstack;
+ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
return (KERN_NO_SPACE);
- }
+
#ifdef RACCT
- PROC_LOCK(p);
- if (is_procstack &&
- racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) {
+ if (racct_enable) {
+ PROC_LOCK(p);
+ if (is_procstack && racct_set(p, RACCT_STACK,
+ ctob(vm->vm_ssize) + grow_amount)) {
+ PROC_UNLOCK(p);
+ return (KERN_NO_SPACE);
+ }
PROC_UNLOCK(p);
- vm_map_unlock_read(map);
- return (KERN_NO_SPACE);
}
- PROC_UNLOCK(p);
#endif
- /* Round up the grow amount modulo sgrowsiz */
- growsize = sgrowsiz;
- grow_amount = roundup(grow_amount, growsize);
- if (grow_amount > stack_entry->avail_ssize)
- grow_amount = stack_entry->avail_ssize;
+ grow_amount = roundup(grow_amount, sgrowsiz);
+ if (grow_amount > max_grow)
+ grow_amount = max_grow;
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
grow_amount = trunc_page((vm_size_t)stacklim) -
ctob(vm->vm_ssize);
}
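A growth request is rounded up to page granularity by the fault path, rounded again to sgrowsiz chunks here, and clamped so it never consumes the guard pages or exceeds the stack rlimit. The arithmetic in isolation, with assumed sizes:

#include <stdio.h>

#define PAGE_SIZE	4096UL				/* assumed */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
	unsigned long fault_below = 5000;	/* bytes below the current stack bottom */
	unsigned long sgrowsiz = 128UL << 10;	/* assumed growth increment */
	unsigned long max_grow = 256UL << 10;	/* room left in the gap minus guard */
	unsigned long grow_amount;

	grow_amount = roundup(fault_below, PAGE_SIZE);	/* page granularity */
	grow_amount = roundup(grow_amount, sgrowsiz);	/* grow in sgrowsiz chunks */
	if (grow_amount > max_grow)
		grow_amount = max_grow;			/* never eat the guard */
	printf("grow by %lu KB\n", grow_amount >> 10);
	return (0);
}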
+
#ifdef notyet
PROC_LOCK(p);
limit = racct_get_available(p, RACCT_STACK);
@@ -3507,97 +3740,79 @@
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
grow_amount = limit - ctob(vm->vm_ssize);
#endif
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+
+ if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
- vm_map_unlock_read(map);
rv = KERN_NO_SPACE;
goto out;
}
#ifdef RACCT
- PROC_LOCK(p);
- if (racct_set(p, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
+ if (racct_enable) {
+ PROC_LOCK(p);
+ if (racct_set(p, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
+ PROC_UNLOCK(p);
+ rv = KERN_NO_SPACE;
+ goto out;
+ }
PROC_UNLOCK(p);
- vm_map_unlock_read(map);
- rv = KERN_NO_SPACE;
- goto out;
}
- PROC_UNLOCK(p);
#endif
}
+
/* If we would blow our VMEM resource limit, no go */
if (map->size + grow_amount > vmemlim) {
- vm_map_unlock_read(map);
rv = KERN_NO_SPACE;
goto out;
}
#ifdef RACCT
- PROC_LOCK(p);
- if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
+ if (racct_enable) {
+ PROC_LOCK(p);
+ if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
+ PROC_UNLOCK(p);
+ rv = KERN_NO_SPACE;
+ goto out;
+ }
PROC_UNLOCK(p);
- vm_map_unlock_read(map);
- rv = KERN_NO_SPACE;
- goto out;
}
- PROC_UNLOCK(p);
#endif
- if (vm_map_lock_upgrade(map))
- goto Retry;
+ if (vm_map_lock_upgrade(map)) {
+ gap_entry = NULL;
+ vm_map_lock_read(map);
+ goto retry;
+ }
- if (stack_entry == next_entry) {
- /*
- * Growing downward.
- */
- /* Get the preliminary new entry start value */
- addr = stack_entry->start - grow_amount;
-
- /*
- * If this puts us into the previous entry, cut back our
- * growth to the available space. Also, see the note above.
- */
- if (addr < end) {
- stack_entry->avail_ssize = max_grow;
- addr = end;
- if (stack_guard_page)
- addr += PAGE_SIZE;
+ if (grow_down) {
+ grow_start = gap_entry->end - grow_amount;
+ if (gap_entry->start + grow_amount == gap_entry->end) {
+ gap_start = gap_entry->start;
+ gap_end = gap_entry->end;
+ vm_map_entry_delete(map, gap_entry);
+ gap_deleted = true;
+ } else {
+ MPASS(gap_entry->start < gap_entry->end - grow_amount);
+ gap_entry->end -= grow_amount;
+ vm_map_entry_resize_free(map, gap_entry);
+ gap_deleted = false;
}
-
- rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
- next_entry->protection, next_entry->max_protection, 0);
-
- /* Adjust the available stack space by the amount we grew. */
- if (rv == KERN_SUCCESS) {
- if (prev_entry != &map->header)
- vm_map_clip_end(map, prev_entry, addr);
- new_entry = prev_entry->next;
- KASSERT(new_entry == stack_entry->prev, ("foo"));
- KASSERT(new_entry->end == stack_entry->start, ("foo"));
- KASSERT(new_entry->start == addr, ("foo"));
- grow_amount = new_entry->end - new_entry->start;
- new_entry->avail_ssize = stack_entry->avail_ssize -
- grow_amount;
- stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
- new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
+ rv = vm_map_insert(map, NULL, 0, grow_start,
+ grow_start + grow_amount,
+ stack_entry->protection, stack_entry->max_protection,
+ MAP_STACK_GROWS_DOWN);
+ if (rv != KERN_SUCCESS) {
+ if (gap_deleted) {
+ rv1 = vm_map_insert(map, NULL, 0, gap_start,
+ gap_end, VM_PROT_NONE, VM_PROT_NONE,
+ MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
+ MPASS(rv1 == KERN_SUCCESS);
+ } else {
+ gap_entry->end += grow_amount;
+ vm_map_entry_resize_free(map, gap_entry);
+ }
}
} else {
- /*
- * Growing upward.
- */
- addr = stack_entry->end + grow_amount;
-
- /*
- * If this puts us into the next entry, cut back our growth
- * to the available space. Also, see the note above.
- */
- if (addr > end) {
- stack_entry->avail_ssize = end - stack_entry->end;
- addr = end;
- if (stack_guard_page)
- addr -= PAGE_SIZE;
- }
-
- grow_amount = addr - stack_entry->end;
+ grow_start = stack_entry->end;
cred = stack_entry->cred;
if (cred == NULL && stack_entry->object.vm_object != NULL)
cred = stack_entry->object.vm_object->cred;
@@ -3605,43 +3820,40 @@
rv = KERN_NO_SPACE;
/* Grow the underlying object if applicable. */
else if (stack_entry->object.vm_object == NULL ||
- vm_object_coalesce(stack_entry->object.vm_object,
- stack_entry->offset,
- (vm_size_t)(stack_entry->end - stack_entry->start),
- (vm_size_t)grow_amount, cred != NULL)) {
- map->size += (addr - stack_entry->end);
- /* Update the current entry. */
- stack_entry->end = addr;
- stack_entry->avail_ssize -= grow_amount;
+ vm_object_coalesce(stack_entry->object.vm_object,
+ stack_entry->offset,
+ (vm_size_t)(stack_entry->end - stack_entry->start),
+ (vm_size_t)grow_amount, cred != NULL)) {
+ if (gap_entry->start + grow_amount == gap_entry->end)
+ vm_map_entry_delete(map, gap_entry);
+ else
+ gap_entry->start += grow_amount;
+ stack_entry->end += grow_amount;
+ map->size += grow_amount;
vm_map_entry_resize_free(map, stack_entry);
rv = KERN_SUCCESS;
-
- if (next_entry != &map->header)
- vm_map_clip_start(map, next_entry, addr);
} else
rv = KERN_FAILURE;
}
-
if (rv == KERN_SUCCESS && is_procstack)
vm->vm_ssize += btoc(grow_amount);
- vm_map_unlock(map);
-
/*
* Heed the MAP_WIREFUTURE flag if it was set for this process.
*/
- if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
- vm_map_wire(map,
- (stack_entry == next_entry) ? addr : addr - grow_amount,
- (stack_entry == next_entry) ? stack_entry->start : addr,
+ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
+ vm_map_unlock(map);
+ vm_map_wire(map, grow_start, grow_start + grow_amount,
(p->p_flag & P_SYSTEM)
? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
: VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
- }
+ vm_map_lock_read(map);
+ } else
+ vm_map_lock_downgrade(map);
out:
#ifdef RACCT
- if (rv != KERN_SUCCESS) {
+ if (racct_enable && rv != KERN_SUCCESS) {
PROC_LOCK(p);
error = racct_set(p, RACCT_VMEM, map->size);
KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
@@ -3669,7 +3881,9 @@
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
- newvmspace = vmspace_alloc(minuser, maxuser);
+ KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
+ ("vmspace_exec recursed"));
+ newvmspace = vmspace_alloc(minuser, maxuser, NULL);
if (newvmspace == NULL)
return (ENOMEM);
newvmspace->vm_swrss = oldvmspace->vm_swrss;
@@ -3685,7 +3899,7 @@
PROC_VMSPACE_UNLOCK(p);
if (p == curthread->td_proc)
pmap_activate(curthread);
- vmspace_free(oldvmspace);
+ curthread->td_pflags |= TDP_EXECVMSPC;
return (0);
}
@@ -3759,10 +3973,11 @@
vm_size_t size;
struct ucred *cred;
-RetryLookup:;
+RetryLookup:
vm_map_lock_read(map);
+RetryLookupLocked:
/*
* Lookup the faulting address.
*/
@@ -3788,17 +4003,24 @@
* Check whether this task is allowed to have this page.
*/
prot = entry->protection;
- fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
+ if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
+ fault_typea &= ~VM_PROT_FAULT_LOOKUP;
+ if (prot == VM_PROT_NONE && map != kernel_map &&
+ (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
+ (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
+ MAP_ENTRY_STACK_GAP_UP)) != 0 &&
+ vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
+ goto RetryLookupLocked;
+ }
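With this change, a fault that lands on a stack gap entry (VM_PROT_NONE plus one of the MAP_ENTRY_STACK_GAP_* flags) and carries VM_PROT_FAULT_LOOKUP grows the stack directly from the lookup, so ordinary stack growth is nothing more than a page fault below the mapped stack. A userland demonstration, assuming the default 8 MB stack limit and a build without aggressive optimization:

#include <stdio.h>
#include <string.h>

/*
 * Each call touches a fresh 64 KB frame; the first accesses to the untouched
 * pages fault into the stack gap entry and are resolved by the automatic
 * stack growth exercised above.
 */
static unsigned long
burn(int depth)
{
	char frame[64 * 1024];

	memset(frame, depth, sizeof(frame));
	if (depth == 0)
		return ((unsigned long)frame[0]);
	return (burn(depth - 1) + (unsigned long)frame[1]);
}

int
main(void)
{

	/* Roughly 4 MB of stack, well inside a typical 8 MB RLIMIT_STACK. */
	printf("sum = %lu after using about 4 MB of stack\n", burn(63));
	return (0);
}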
+ fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
vm_map_unlock_read(map);
return (KERN_PROTECTION_FAILURE);
}
- if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
- (entry->eflags & MAP_ENTRY_COW) &&
- (fault_type & VM_PROT_WRITE)) {
- vm_map_unlock_read(map);
- return (KERN_PROTECTION_FAILURE);
- }
+ KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
+ (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
+ (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
+ ("entry %p flags %x", entry, entry->eflags));
if ((fault_typea & VM_PROT_COPY) != 0 &&
(entry->max_protection & VM_PROT_WRITE) == 0 &&
(entry->eflags & MAP_ENTRY_COW) == 0) {
@@ -3862,10 +4084,10 @@
crfree(entry->cred);
entry->cred = NULL;
} else if (entry->cred != NULL) {
- VM_OBJECT_LOCK(eobject);
+ VM_OBJECT_WLOCK(eobject);
eobject->cred = entry->cred;
eobject->charge = size;
- VM_OBJECT_UNLOCK(eobject);
+ VM_OBJECT_WUNLOCK(eobject);
entry->cred = NULL;
}
@@ -3890,10 +4112,10 @@
atop(size));
entry->offset = 0;
if (entry->cred != NULL) {
- VM_OBJECT_LOCK(entry->object.vm_object);
+ VM_OBJECT_WLOCK(entry->object.vm_object);
entry->object.vm_object->cred = entry->cred;
entry->object.vm_object->charge = size;
- VM_OBJECT_UNLOCK(entry->object.vm_object);
+ VM_OBJECT_WUNLOCK(entry->object.vm_object);
entry->cred = NULL;
}
vm_map_lock_downgrade(map);
@@ -3952,10 +4174,6 @@
fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
if ((fault_type & prot) != fault_type)
return (KERN_PROTECTION_FAILURE);
- if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
- (entry->eflags & MAP_ENTRY_COW) &&
- (fault_type & VM_PROT_WRITE))
- return (KERN_PROTECTION_FAILURE);
/*
* If this page is not pageable, we have to get it for all possible
@@ -4016,32 +4234,21 @@
#include <ddb/ddb.h>
-/*
- * vm_map_print: [ debug ]
- */
-DB_SHOW_COMMAND(map, vm_map_print)
+static void
+vm_map_print(vm_map_t map)
{
- static int nlines;
- /* XXX convert args. */
- vm_map_t map = (vm_map_t)addr;
- boolean_t full = have_addr;
-
vm_map_entry_t entry;
db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
(void *)map,
(void *)map->pmap, map->nentries, map->timestamp);
- nlines++;
- if (!full && db_indent)
- return;
-
db_indent += 2;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
- db_iprintf("map entry %p: start=%p, end=%p\n",
- (void *)entry, (void *)entry->start, (void *)entry->end);
- nlines++;
+ db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
+ (void *)entry, (void *)entry->start, (void *)entry->end,
+ entry->eflags);
{
static char *inheritance_name[4] =
{"share", "copy", "none", "donate_copy"};
@@ -4057,14 +4264,11 @@
db_printf(", share=%p, offset=0x%jx\n",
(void *)entry->object.sub_map,
(uintmax_t)entry->offset);
- nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.sub_map !=
entry->object.sub_map)) {
db_indent += 2;
- vm_map_print((db_expr_t)(intptr_t)
- entry->object.sub_map,
- full, 0, (char *)0);
+ vm_map_print((vm_map_t)entry->object.sub_map);
db_indent -= 2;
}
} else {
@@ -4081,7 +4285,6 @@
db_printf(", copy (%s)",
(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
db_printf("\n");
- nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.vm_object !=
@@ -4089,24 +4292,30 @@
db_indent += 2;
vm_object_print((db_expr_t)(intptr_t)
entry->object.vm_object,
- full, 0, (char *)0);
- nlines += 4;
+ 0, 0, (char *)0);
db_indent -= 2;
}
}
}
db_indent -= 2;
- if (db_indent == 0)
- nlines = 0;
}
+DB_SHOW_COMMAND(map, map)
+{
+ if (!have_addr) {
+ db_printf("usage: show map <addr>\n");
+ return;
+ }
+ vm_map_print((vm_map_t)addr);
+}
+
DB_SHOW_COMMAND(procvm, procvm)
{
struct proc *p;
if (have_addr) {
- p = (struct proc *) addr;
+ p = db_lookup_proc(addr);
} else {
p = curproc;
}
@@ -4115,7 +4324,7 @@
(void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
(void *)vmspace_pmap(p->p_vmspace));
- vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
+ vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
}
#endif /* DDB */
Modified: trunk/sys/vm/vm_map.h
===================================================================
--- trunk/sys/vm/vm_map.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_map.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $
*/
/*
@@ -103,7 +104,7 @@
struct vm_map_entry *right; /* right child in binary search tree */
vm_offset_t start; /* start address */
vm_offset_t end; /* end address */
- vm_offset_t avail_ssize; /* amt can grow if this is a stack */
+ vm_offset_t pad0;
vm_size_t adj_free; /* amount of adjacent free space */
vm_size_t max_free; /* max free space in subtree */
union vm_map_object object; /* object I point to */
@@ -116,6 +117,7 @@
int wired_count; /* can be paged if = 0 */
vm_pindex_t next_read; /* index of the next sequential read */
struct ucred *cred; /* tmp storage for creator ref */
+ struct thread *wiring_thread;
};
#define MAP_ENTRY_NOSYNC 0x0001
@@ -141,6 +143,9 @@
#define MAP_ENTRY_WIRE_SKIPPED 0x4000
#define MAP_ENTRY_VN_WRITECNT 0x8000 /* writeable vnode mapping */
+#define MAP_ENTRY_GUARD 0x10000
+#define MAP_ENTRY_STACK_GAP_DN 0x20000
+#define MAP_ENTRY_STACK_GAP_UP 0x40000
#ifdef _KERNEL
static __inline u_char
@@ -314,6 +319,8 @@
#define MAP_PREFAULT 0x0008
#define MAP_PREFAULT_PARTIAL 0x0010
#define MAP_DISABLE_SYNCER 0x0020
+#define MAP_CHECK_EXCL 0x0040
+#define MAP_CREATE_GUARD 0x0080
#define MAP_DISABLE_COREDUMP 0x0100
#define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */
#define MAP_VN_WRITECOUNT 0x0400
@@ -321,13 +328,15 @@
#define MAP_STACK_GROWS_UP 0x2000
#define MAP_ACC_CHARGED 0x4000
#define MAP_ACC_NO_CHARGE 0x8000
+#define MAP_CREATE_STACK_GAP_UP 0x10000
+#define MAP_CREATE_STACK_GAP_DN 0x20000
/*
* vm_fault option flags
*/
-#define VM_FAULT_NORMAL 0 /* Nothing special */
-#define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */
-#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
+#define VM_FAULT_NORMAL 0 /* Nothing special */
+#define VM_FAULT_WIRE 1 /* Wire the mapped page */
+#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */
/*
* Initially, mappings are slightly sequential. The maximum window size must
@@ -338,14 +347,16 @@
#define VM_FAULT_READ_AHEAD_MAX min(atop(MAXPHYS) - 1, UINT8_MAX)
/*
- * The following "find_space" options are supported by vm_map_find()
+ * The following "find_space" options are supported by vm_map_find().
+ *
+ * For VMFS_ALIGNED_SPACE, the desired alignment is specified to
+ * the macro argument as log base 2 of the desired alignment.
*/
#define VMFS_NO_SPACE 0 /* don't find; use the given range */
#define VMFS_ANY_SPACE 1 /* find a range with any alignment */
-#define VMFS_ALIGNED_SPACE 2 /* find a superpage-aligned range */
-#if defined(__mips__)
-#define VMFS_TLB_ALIGNED_SPACE 3 /* find a TLB entry aligned range */
-#endif
+#define VMFS_OPTIMAL_SPACE 2 /* find a range with optimal alignment*/
+#define VMFS_SUPER_SPACE 3 /* find a superpage-aligned range */
+#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find a range with fixed alignment */
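As the comment says, VMFS_ALIGNED_SPACE() encodes the requested alignment as its log2, shifted into the upper bits of find_space so it cannot collide with the small enumerated values below it. For example (the definitions are duplicated here only to show the encoding):

#include <stdio.h>

#define VMFS_NO_SPACE		0
#define VMFS_ANY_SPACE		1
#define VMFS_OPTIMAL_SPACE	2
#define VMFS_SUPER_SPACE	3
#define VMFS_ALIGNED_SPACE(x)	((x) << 8)

int
main(void)
{
	int find_space;

	/* Request a 2 MB-aligned range: pass log2(2 MB) = 21. */
	find_space = VMFS_ALIGNED_SPACE(21);
	printf("find_space = %#x, alignment = %lu bytes\n",
	    find_space, 1UL << (find_space >> 8));
	return (0);
}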
/*
* vm_map_wire and vm_map_unwire option flags
@@ -363,7 +374,9 @@
vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
- int, vm_prot_t, vm_prot_t, int);
+ vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
+ vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *);
@@ -385,9 +398,7 @@
int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
-void vm_init2 (void);
int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
-int vm_map_growstack (struct proc *p, vm_offset_t addr);
int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
int flags);
int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
Modified: trunk/sys/vm/vm_meter.c
===================================================================
--- trunk/sys/vm/vm_meter.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_meter.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -30,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -39,6 +40,7 @@
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
+#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/smp.h>
@@ -55,21 +57,21 @@
struct vmmeter cnt;
SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
- CTLFLAG_RW, &cnt.v_free_min, 0, "");
+ CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold");
SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
- CTLFLAG_RW, &cnt.v_free_target, 0, "");
+ CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages");
SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
- CTLFLAG_RW, &cnt.v_free_reserved, 0, "");
+ CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock");
SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
- CTLFLAG_RW, &cnt.v_inactive_target, 0, "");
+ CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive");
SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min,
- CTLFLAG_RW, &cnt.v_cache_min, 0, "");
+ CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue");
SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max,
- CTLFLAG_RW, &cnt.v_cache_max, 0, "");
+ CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue");
SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
- CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
+ CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
- CTLFLAG_RW, &cnt.v_free_severe, 0, "");
+ CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point");
static int
sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
@@ -92,50 +94,40 @@
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg",
"Machine loadaverage history");
+/*
+ * This function aims to determine if the object is mapped,
+ * specifically, if it is referenced by a vm_map_entry. Because
+ * objects occasionally acquire transient references that do not
+ * represent a mapping, the method used here is inexact. However, it
+ * has very low overhead and is good enough for the advisory
+ * vm.vmtotal sysctl.
+ */
+static bool
+is_object_active(vm_object_t obj)
+{
+
+ return (obj->ref_count > obj->shadow_count);
+}
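The heuristic treats an object as mapped when it holds more references than shadow objects: each shadow accounts for one reference, so any surplus most likely comes from a map entry. A toy illustration with made-up counter values:

#include <stdbool.h>
#include <stdio.h>

struct toy_object {
	const char *name;
	int ref_count;		/* map entries + shadows + transient refs */
	int shadow_count;	/* objects shadowing this one */
};

static bool
toy_is_active(const struct toy_object *obj)
{

	return (obj->ref_count > obj->shadow_count);
}

int
main(void)
{
	struct toy_object mapped = { "mapped object", 3, 1 };
	struct toy_object backing = { "fully shadowed backing object", 1, 1 };

	printf("%s: %sactive\n", mapped.name, toy_is_active(&mapped) ? "" : "in");
	printf("%s: %sactive\n", backing.name, toy_is_active(&backing) ? "" : "in");
	return (0);
}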
+
static int
vmtotal(SYSCTL_HANDLER_ARGS)
{
- struct proc *p;
struct vmtotal total;
- vm_map_entry_t entry;
vm_object_t object;
- vm_map_t map;
- int paging;
+ struct proc *p;
struct thread *td;
- struct vmspace *vm;
bzero(&total, sizeof(total));
+
/*
- * Mark all objects as inactive.
- */
- mtx_lock(&vm_object_list_mtx);
- TAILQ_FOREACH(object, &vm_object_list, object_list) {
- if (!VM_OBJECT_TRYLOCK(object)) {
- /*
- * Avoid a lock-order reversal. Consequently,
- * the reported number of active pages may be
- * greater than the actual number.
- */
- continue;
- }
- vm_object_clear_flag(object, OBJ_ACTIVE);
- VM_OBJECT_UNLOCK(object);
- }
- mtx_unlock(&vm_object_list_mtx);
- /*
* Calculate process statistics.
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_flag & P_SYSTEM)
+ if ((p->p_flag & P_SYSTEM) != 0)
continue;
PROC_LOCK(p);
- switch (p->p_state) {
- case PRS_NEW:
- PROC_UNLOCK(p);
- continue;
- break;
- default:
+ if (p->p_state != PRS_NEW) {
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
switch (td->td_state) {
@@ -142,13 +134,16 @@
case TDS_INHIBITED:
if (TD_IS_SWAPPED(td))
total.t_sw++;
- else if (TD_IS_SLEEPING(td) &&
- td->td_priority <= PZERO)
- total.t_dw++;
- else
- total.t_sl++;
+ else if (TD_IS_SLEEPING(td)) {
+ if (td->td_priority <= PZERO)
+ total.t_dw++;
+ else
+ total.t_sl++;
+ if (td->td_wchan ==
+ &cnt.v_free_count)
+ total.t_pw++;
+ }
break;
-
case TDS_CAN_RUN:
total.t_sw++;
break;
@@ -155,8 +150,7 @@
case TDS_RUNQ:
case TDS_RUNNING:
total.t_rq++;
- thread_unlock(td);
- continue;
+ break;
default:
break;
}
@@ -164,29 +158,6 @@
}
}
PROC_UNLOCK(p);
- /*
- * Note active objects.
- */
- paging = 0;
- vm = vmspace_acquire_ref(p);
- if (vm == NULL)
- continue;
- map = &vm->vm_map;
- vm_map_lock_read(map);
- for (entry = map->header.next;
- entry != &map->header; entry = entry->next) {
- if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
- (object = entry->object.vm_object) == NULL)
- continue;
- VM_OBJECT_LOCK(object);
- vm_object_set_flag(object, OBJ_ACTIVE);
- paging |= object->paging_in_progress;
- VM_OBJECT_UNLOCK(object);
- }
- vm_map_unlock_read(map);
- vmspace_free(vm);
- if (paging)
- total.t_pw++;
}
sx_sunlock(&allproc_lock);
/*
@@ -195,12 +166,11 @@
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list) {
/*
- * Perform unsynchronized reads on the object to avoid
- * a lock-order reversal. In this case, the lack of
- * synchronization should not impair the accuracy of
- * the reported statistics.
+ * Perform unsynchronized reads on the object. In
+ * this case, the lack of synchronization should not
+ * impair the accuracy of the reported statistics.
*/
- if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
+ if ((object->flags & OBJ_FICTITIOUS) != 0) {
/*
* Devices, like /dev/mem, will badly skew our totals.
*/
@@ -213,9 +183,18 @@
*/
continue;
}
+ if (object->ref_count == 1 &&
+ (object->flags & OBJ_NOSPLIT) != 0) {
+ /*
+ * Also skip otherwise unreferenced swap
+ * objects backing tmpfs vnodes, and POSIX or
+ * SysV shared memory.
+ */
+ continue;
+ }
total.t_vm += object->size;
total.t_rm += object->resident_page_count;
- if (object->flags & OBJ_ACTIVE) {
+ if (is_object_active(object)) {
total.t_avm += object->size;
total.t_arm += object->resident_page_count;
}
@@ -223,7 +202,7 @@
/* shared object */
total.t_vmshr += object->size;
total.t_rmshr += object->resident_page_count;
- if (object->flags & OBJ_ACTIVE) {
+ if (is_object_active(object)) {
total.t_avmshr += object->size;
total.t_armshr += object->resident_page_count;
}
@@ -270,104 +249,63 @@
"VM meter vm stats");
SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_swtch, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_swtch, 0, vcnt, "IU", "Context switches");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_trap, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_trap, 0, vcnt, "IU", "Traps");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_syscall, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_syscall, 0, vcnt, "IU", "Syscalls");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_intr, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_intr, 0, vcnt, "IU", "Hardware interrupts");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_soft, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_soft, 0, vcnt, "IU", "Software interrupts");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vm_faults, 0, vcnt, "IU", "VM faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_cow_faults, 0, vcnt, "IU", "COW faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_cow_optim, 0, vcnt, "IU", "Optimized COW faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_zfod, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_zfod, 0, vcnt, "IU", "Zero fill");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_ozfod, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_ozfod, 0, vcnt, "IU", "Optimized zero fill");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_swapin, 0, vcnt, "IU", "Swapin operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_swapout, 0, vcnt, "IU", "Swapout operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_swappgsin, 0, vcnt, "IU", "Swapin pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_swappgsout, 0, vcnt, "IU", "Swapout pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodein, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vnodein, 0, vcnt, "IU", "Vnodein operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vnodeout, 0, vcnt, "IU", "Vnodeout operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vnodepgsin, 0, vcnt, "IU", "Vnodein pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vnodepgsout, 0, vcnt, "IU", "Vnodeout pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_intrans, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_intrans, 0, vcnt, "IU", "In transit page blocking");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_reactivated, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_reactivated, 0, vcnt, "IU", "Reactivated pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tcached, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_tcached, 0, vcnt, "IU", "Total pages cached");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_dfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_pfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_tfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_size, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_page_size, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_page_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_free_reserved, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_target, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_free_target, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_free_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_wire_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_wire_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_active_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_active_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_inactive_target, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_inactive_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_cache_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_cache_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_max, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_cache_max, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_pageout_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_interrupt_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_forks, 0, vcnt, "IU", "Number of fork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vforks, 0, vcnt, "IU", "Number of vfork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_rforks, 0, vcnt, "IU", "Number of rfork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreads, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_kthreads, 0, vcnt, "IU", "Number of fork() calls by kernel");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_forkpages, 0, vcnt, "IU", "VM pages affected by fork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_vforkpages, 0, vcnt, "IU", "VM pages affected by vfork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_rforkpages, 0, vcnt, "IU", "VM pages affected by rfork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
- &cnt.v_kthreadpages, 0, vcnt, "IU", "VM pages affected by fork() by kernel");
+#define VM_STATS(parent, var, descr) \
+ SYSCTL_PROC(parent, OID_AUTO, var, \
+ CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \
+ "IU", descr)
+#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr)
+#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr)
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
- zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, "");
+VM_STATS_SYS(v_swtch, "Context switches");
+VM_STATS_SYS(v_trap, "Traps");
+VM_STATS_SYS(v_syscall, "System calls");
+VM_STATS_SYS(v_intr, "Device interrupts");
+VM_STATS_SYS(v_soft, "Software interrupts");
+VM_STATS_VM(v_vm_faults, "Address memory faults");
+VM_STATS_VM(v_io_faults, "Page faults requiring I/O");
+VM_STATS_VM(v_cow_faults, "Copy-on-write faults");
+VM_STATS_VM(v_cow_optim, "Optimized COW faults");
+VM_STATS_VM(v_zfod, "Pages zero-filled on demand");
+VM_STATS_VM(v_ozfod, "Optimized zero fill pages");
+VM_STATS_VM(v_swapin, "Swap pager pageins");
+VM_STATS_VM(v_swapout, "Swap pager pageouts");
+VM_STATS_VM(v_swappgsin, "Swap pages swapped in");
+VM_STATS_VM(v_swappgsout, "Swap pages swapped out");
+VM_STATS_VM(v_vnodein, "Vnode pager pageins");
+VM_STATS_VM(v_vnodeout, "Vnode pager pageouts");
+VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
+VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
+VM_STATS_VM(v_intrans, "In transit page faults");
+VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
+VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_tcached, "Total pages cached");
+VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
+VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
+VM_STATS_VM(v_tfree, "Total pages freed");
+VM_STATS_VM(v_page_size, "Page size in bytes");
+VM_STATS_VM(v_page_count, "Total number of pages in system");
+VM_STATS_VM(v_free_reserved, "Pages reserved for deadlock");
+VM_STATS_VM(v_free_target, "Pages desired free");
+VM_STATS_VM(v_free_min, "Minimum low-free-pages threshold");
+VM_STATS_VM(v_free_count, "Free pages");
+VM_STATS_VM(v_wire_count, "Wired pages");
+VM_STATS_VM(v_active_count, "Active pages");
+VM_STATS_VM(v_inactive_target, "Desired inactive pages");
+VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_cache_count, "Pages on cache queue");
+VM_STATS_VM(v_cache_min, "Min pages on cache queue");
+VM_STATS_VM(v_cache_max, "Max pages on cached queue");
+VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
+VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
+VM_STATS_VM(v_forks, "Number of fork() calls");
+VM_STATS_VM(v_vforks, "Number of vfork() calls");
+VM_STATS_VM(v_rforks, "Number of rfork() calls");
+VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel");
+VM_STATS_VM(v_forkpages, "VM pages affected by fork()");
+VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()");
+VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()");
+VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel");
+
+SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD,
+ &vm_page_zero_count, 0, "Number of zero-ed free pages");
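[Reviewer sketch] The rewritten vmtotal() handler still backs the advisory vm.vmtotal sysctl, so its output -- including the new t_pw accounting for threads sleeping on the free-page count -- can be observed from userland. A minimal sketch, assuming a FreeBSD/MidnightBSD userland where struct vmtotal comes from <sys/vmmeter.h>; field widths vary between releases, hence the intmax_t casts.

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct vmtotal vmt;
	size_t len = sizeof(vmt);

	if (sysctlbyname("vm.vmtotal", &vmt, &len, NULL, 0) == -1) {
		perror("sysctlbyname(vm.vmtotal)");
		return (1);
	}
	/* Thread states counted by vmtotal(). */
	printf("runnable %jd, disk wait %jd, page wait %jd, sleeping %jd\n",
	    (intmax_t)vmt.t_rq, (intmax_t)vmt.t_dw, (intmax_t)vmt.t_pw,
	    (intmax_t)vmt.t_sl);
	/* Object totals; "active" now means ref_count > shadow_count. */
	printf("active virtual pages %jd, active resident pages %jd\n",
	    (intmax_t)vmt.t_avm, (intmax_t)vmt.t_arm);
	return (0);
}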
Modified: trunk/sys/vm/vm_mmap.c
===================================================================
--- trunk/sys/vm/vm_mmap.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_mmap.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
@@ -41,7 +42,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
@@ -48,7 +49,7 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -56,9 +57,11 @@
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
@@ -67,6 +70,7 @@
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
@@ -88,15 +92,13 @@
#include <sys/pmckern.h>
#endif
-int old_mlock = 1;
+int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
"Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);
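[Reviewer sketch] With the default flipped to 0, RLIMIT_MEMLOCK is now enforced for mlockall() unless the administrator re-enables the old behaviour through the vm.old_mlock tunable/sysctl declared above. A minimal userland sketch that reads the knob (setting it back would need privilege):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int old_mlock;
	size_t len = sizeof(old_mlock);

	if (sysctlbyname("vm.old_mlock", &old_mlock, &len, NULL, 0) == -1) {
		perror("sysctlbyname(vm.old_mlock)");
		return (1);
	}
	printf("vm.old_mlock=%d: RLIMIT_MEMLOCK %s applied to mlockall()\n",
	    old_mlock, old_mlock ? "is not" : "is");
	return (0);
}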
-#ifndef _SYS_SYSPROTO_H_
-struct sbrk_args {
- int incr;
-};
+#ifdef MAP_32BIT
+#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
#endif
static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
@@ -106,14 +108,14 @@
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
-/*
- * MPSAFE
- */
-/* ARGSUSED */
+#ifndef _SYS_SYSPROTO_H_
+struct sbrk_args {
+ int incr;
+};
+#endif
+
int
-sys_sbrk(td, uap)
- struct thread *td;
- struct sbrk_args *uap;
+sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
/* Not yet implemented */
return (EOPNOTSUPP);
@@ -125,14 +127,8 @@
};
#endif
-/*
- * MPSAFE
- */
-/* ARGSUSED */
int
-sys_sstk(td, uap)
- struct thread *td;
- struct sstk_args *uap;
+sys_sstk(struct thread *td, struct sstk_args *uap)
{
/* Not yet implemented */
return (EOPNOTSUPP);
@@ -145,13 +141,10 @@
};
#endif
-/* ARGSUSED */
int
-ogetpagesize(td, uap)
- struct thread *td;
- struct getpagesize_args *uap;
+ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{
- /* MP SAFE */
+
td->td_retval[0] = PAGE_SIZE;
return (0);
}
@@ -183,9 +176,6 @@
};
#endif
-/*
- * MPSAFE
- */
int
sys_mmap(td, uap)
struct thread *td;
@@ -201,7 +191,7 @@
vm_prot_t cap_maxprot, prot, maxprot;
void *handle;
objtype_t handle_type;
- int flags, error;
+ int align, error, flags;
off_t pos;
struct vmspace *vms = td->td_proc->p_vmspace;
cap_rights_t rights;
@@ -239,6 +229,12 @@
flags |= MAP_ANON;
pos = 0;
}
+ if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
+ return (EINVAL);
+ if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
+ pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
+ MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
+ return (EINVAL);
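[Reviewer sketch] The validation above spells out the MAP_GUARD contract: PROT_NONE only, no descriptor, zero offset, and none of the listed mapping flags. A minimal conforming call from userland, guarded by #ifdef since MAP_GUARD only exists in newer <sys/mman.h> headers; the 4096-byte length assumes the common 4 KB page size.

#include <sys/mman.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
#ifdef MAP_GUARD
	/* Reserve a hole that faults on any access (one 4 KB page assumed). */
	void *guard = mmap(NULL, 4096, PROT_NONE, MAP_GUARD, -1, 0);

	if (guard == MAP_FAILED) {
		perror("mmap(MAP_GUARD)");
		return (EXIT_FAILURE);
	}
	printf("guard region at %p\n", guard);
	munmap(guard, 4096);
#else
	printf("MAP_GUARD not available in this sys/mman.h\n");
#endif
	return (0);
}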
/*
* Align the file position to a page boundary,
@@ -251,6 +247,13 @@
size += pageoff; /* low end... */
size = (vm_size_t) round_page(size); /* hi end */
+ /* Ensure alignment is at least a page and fits in a pointer. */
+ align = flags & MAP_ALIGNMENT_MASK;
+ if (align != 0 && align != MAP_ALIGNED_SUPER &&
+ (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
+ align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
+ return (EINVAL);
+
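[Reviewer sketch] The new check accepts either MAP_ALIGNED_SUPER or an explicit MAP_ALIGNED(n) whose log2 value lies between PAGE_SHIFT and the pointer width. A hedged userland sketch requesting a 2 MB-aligned anonymous mapping; MAP_ALIGNED() encodes n through MAP_ALIGNMENT_SHIFT, exactly the field decoded above.

#include <sys/mman.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
#ifdef MAP_ALIGNED
	size_t len = 1 << 21;			/* 2 MB */
	/* Ask the kernel for a 2^21-byte aligned placement. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_ALIGNED(21), -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_ALIGNED(21))");
		return (1);
	}
	printf("mapping at %p, aligned: %s\n", p,
	    ((uintptr_t)p & ((1UL << 21) - 1)) == 0 ? "yes" : "no");
	munmap(p, len);
#else
	printf("MAP_ALIGNED not available in this sys/mman.h\n");
#endif
	return (0);
}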
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
@@ -271,6 +274,18 @@
return (EINVAL);
if (addr + size < addr)
return (EINVAL);
+#ifdef MAP_32BIT
+ if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
+ return (EINVAL);
+ } else if (flags & MAP_32BIT) {
+ /*
+ * For MAP_32BIT, override the hint if it is too high and
+ * do not bother moving the mapping past the heap (since
+ * the heap is usually above 2GB).
+ */
+ if (addr + size > MAP_32BIT_MAX_ADDR)
+ addr = 0;
+#endif
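[Reviewer sketch] Where MAP_32BIT exists (amd64), the block above rejects fixed mappings crossing the 2 GB boundary and clamps overly high hints for non-fixed ones. A small userland sketch; the 4096-byte length again assumes 4 KB pages.

#include <sys/mman.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
#ifdef MAP_32BIT
	/* Ask for anonymous memory placed below the 2 GB mark. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_32BIT, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_32BIT)");
		return (1);
	}
	printf("mapping at %p (%s 2GB)\n", p,
	    (uintptr_t)p < ((uintptr_t)1 << 31) ? "below" : "above");
	munmap(p, 4096);
#else
	printf("MAP_32BIT not available on this platform\n");
#endif
	return (0);
}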
} else {
/*
* XXX for non-fixed mappings where no hint is provided or
@@ -289,7 +304,12 @@
lim_max(td->td_proc, RLIMIT_DATA));
PROC_UNLOCK(td->td_proc);
}
- if (flags & MAP_ANON) {
+ if ((flags & MAP_GUARD) != 0) {
+ handle = NULL;
+ handle_type = OBJT_DEFAULT;
+ maxprot = VM_PROT_NONE;
+ cap_maxprot = VM_PROT_NONE;
+ } else if ((flags & MAP_ANON) != 0) {
/*
* Mapping blank space is trivial.
*/
@@ -304,17 +324,17 @@
* rights, but also return the maximum rights to be combined
* with maxprot later.
*/
- rights = CAP_MMAP;
+ cap_rights_init(&rights, CAP_MMAP);
if (prot & PROT_READ)
- rights |= CAP_READ;
+ cap_rights_set(&rights, CAP_MMAP_R);
if ((flags & MAP_SHARED) != 0) {
if (prot & PROT_WRITE)
- rights |= CAP_WRITE;
+ cap_rights_set(&rights, CAP_MMAP_W);
}
if (prot & PROT_EXEC)
- rights |= CAP_MAPEXEC;
- if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
- &fp)) != 0)
+ cap_rights_set(&rights, CAP_MMAP_X);
+ error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+ if (error != 0)
goto done;
if (fp->f_type == DTYPE_SHM) {
handle = fp->f_data;
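[Reviewer sketch] The cap_rights_init()/cap_rights_set() conversion above mirrors the userland Capsicum API: the rights carried by a descriptor bound what a later mmap() may do with it. A minimal sketch limiting a descriptor to read-only mapping before mmap(); /etc/motd is an arbitrary example path, and ENOSYS is tolerated for kernels built without Capsicum.

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	struct stat sb;
	void *p;
	int fd = open("/etc/motd", O_RDONLY);

	if (fd == -1 || fstat(fd, &sb) == -1) {
		perror("open/fstat");
		return (1);
	}
	/* Allow only mmap-for-read (plus fstat) on this descriptor. */
	cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT);
	if (cap_rights_limit(fd, &rights) == -1 && errno != ENOSYS) {
		perror("cap_rights_limit");
		return (1);
	}
	p = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");
	else
		munmap(p, (size_t)sb.st_size);
	close(fd);
	return (0);
}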
@@ -492,9 +512,6 @@
int flags;
};
#endif
-/*
- * MPSAFE
- */
int
sys_msync(td, uap)
struct thread *td;
@@ -531,7 +548,7 @@
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
- return (EINVAL); /* Sun returns ENOMEM? */
+ return (ENOMEM);
case KERN_INVALID_ARGUMENT:
return (EBUSY);
case KERN_FAILURE:
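[Reviewer sketch] The KERN_INVALID_ADDRESS case now maps to ENOMEM, the errno POSIX specifies when an msync() range falls outside any mapping. A small sketch that exercises the path by msync()ing a just-unmapped page (single-threaded, so the hole cannot be refilled in between):

#include <sys/mman.h>

#include <errno.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	size_t len = 4096;	/* one page on common 4 KB configurations */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	munmap(p, len);		/* leave a hole at p */
	if (msync(p, len, MS_SYNC) == -1)
		/* Expected: ENOMEM (formerly EINVAL) for an unmapped range. */
		printf("msync on unmapped range: %s\n", strerror(errno));
	return (0);
}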
@@ -547,9 +564,6 @@
size_t len;
};
#endif
-/*
- * MPSAFE
- */
int
sys_munmap(td, uap)
struct thread *td;
@@ -623,9 +637,6 @@
int prot;
};
#endif
-/*
- * MPSAFE
- */
int
sys_mprotect(td, uap)
struct thread *td;
@@ -665,13 +676,8 @@
int inherit;
};
#endif
-/*
- * MPSAFE
- */
int
-sys_minherit(td, uap)
- struct thread *td;
- struct minherit_args *uap;
+sys_minherit(struct thread *td, struct minherit_args *uap)
{
vm_offset_t addr;
vm_size_t size, pageoff;
@@ -706,19 +712,12 @@
};
#endif
-/*
- * MPSAFE
- */
-/* ARGSUSED */
int
-sys_madvise(td, uap)
- struct thread *td;
- struct madvise_args *uap;
+sys_madvise(struct thread *td, struct madvise_args *uap)
{
vm_offset_t start, end;
vm_map_t map;
- struct proc *p;
- int error;
+ int flags;
/*
* Check for our special case, advising the swap pager we are
@@ -725,15 +724,11 @@
* "immortal."
*/
if (uap->behav == MADV_PROTECT) {
- error = priv_check(td, PRIV_VM_MADV_PROTECT);
- if (error == 0) {
- p = td->td_proc;
- PROC_LOCK(p);
- p->p_flag |= P_PROTECTED;
- PROC_UNLOCK(p);
- }
- return (error);
+ flags = PPROT_SET;
+ return (kern_procctl(td, P_PID, td->td_proc->p_pid,
+ PROC_SPROTECT, &flags));
}
+
/*
* Check for illegal behavior
*/
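[Reviewer sketch] madvise(MADV_PROTECT) is now a thin wrapper around the procctl(2) PROC_SPROTECT machinery, so the same effect can be requested directly. A hedged sketch; it needs the privilege the old PRIV_VM_MADV_PROTECT check demanded, so expect EPERM when run unprivileged.

#include <sys/types.h>
#include <sys/procctl.h>
#include <sys/wait.h>

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int flags = PPROT_SET;	/* shield this process from OOM/swap kill */

	if (procctl(P_PID, getpid(), PROC_SPROTECT, &flags) == -1) {
		perror("procctl(PROC_SPROTECT)");	/* EPERM if unprivileged */
		return (1);
	}
	printf("process %ld marked protected\n", (long)getpid());
	return (0);
}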
@@ -770,14 +765,8 @@
};
#endif
-/*
- * MPSAFE
- */
-/* ARGSUSED */
int
-sys_mincore(td, uap)
- struct thread *td;
- struct mincore_args *uap;
+sys_mincore(struct thread *td, struct mincore_args *uap)
{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
@@ -883,12 +872,12 @@
m = PHYS_TO_VM_PAGE(locked_pa);
if (m->object != object) {
if (object != NULL)
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
object = m->object;
- locked = VM_OBJECT_TRYLOCK(object);
+ locked = VM_OBJECT_TRYWLOCK(object);
vm_page_unlock(m);
if (!locked) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_page_lock(m);
goto retry;
}
@@ -906,9 +895,9 @@
*/
if (current->object.vm_object != object) {
if (object != NULL)
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
object = current->object.vm_object;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
if (object->type == OBJT_DEFAULT ||
object->type == OBJT_SWAP ||
@@ -945,7 +934,7 @@
mincoreinfo |= MINCORE_REFERENCED_OTHER;
}
if (object != NULL)
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* subyte may page fault. In case it needs to modify
@@ -963,12 +952,12 @@
* the byte vector is zeroed for those skipped entries.
*/
while ((lastvecindex + 1) < vecindex) {
+ ++lastvecindex;
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
- ++lastvecindex;
}
/*
@@ -1004,12 +993,12 @@
*/
vecindex = OFF_TO_IDX(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
+ ++lastvecindex;
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
- ++lastvecindex;
}
/*
@@ -1030,15 +1019,16 @@
size_t len;
};
#endif
-/*
- * MPSAFE
- */
int
-sys_mlock(td, uap)
- struct thread *td;
- struct mlock_args *uap;
+sys_mlock(struct thread *td, struct mlock_args *uap)
{
- struct proc *proc;
+
+ return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+}
+
+int
+vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+{
vm_offset_t addr, end, last, start;
vm_size_t npages, size;
vm_map_t map;
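[Reviewer sketch] sys_mlock() is now a wrapper around vm_mlock(), which checks PRIV_VM_MLOCK against the supplied credential and RLIMIT_MEMLOCK before wiring. The userland interface is unchanged; a minimal sketch that reports the limit and wires one buffer:

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resource.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct rlimit rl;
	size_t len = 4096;
	void *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0)
		printf("RLIMIT_MEMLOCK soft limit: %ju bytes\n",
		    (uintmax_t)rl.rlim_cur);
	buf = malloc(len);
	if (buf == NULL)
		return (1);
	if (mlock(buf, len) == -1) {	/* EAGAIN/ENOMEM/EPERM on failure */
		perror("mlock");
		free(buf);
		return (1);
	}
	munlock(buf, len);
	free(buf);
	return (0);
}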
@@ -1045,11 +1035,11 @@
unsigned long nsize;
int error;
- error = priv_check(td, PRIV_VM_MLOCK);
+ error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
if (error)
return (error);
- addr = (vm_offset_t)uap->addr;
- size = uap->len;
+ addr = (vm_offset_t)addr0;
+ size = len;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
@@ -1058,7 +1048,6 @@
npages = atop(end - start);
if (npages > vm_page_max_wired)
return (ENOMEM);
- proc = td->td_proc;
map = &proc->p_vmspace->vm_map;
PROC_LOCK(proc);
nsize = ptoa(npages + pmap_wired_count(map->pmap));
@@ -1070,16 +1059,18 @@
if (npages + cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef RACCT
- PROC_LOCK(proc);
- error = racct_set(proc, RACCT_MEMLOCK, nsize);
- PROC_UNLOCK(proc);
- if (error != 0)
- return (ENOMEM);
+ if (racct_enable) {
+ PROC_LOCK(proc);
+ error = racct_set(proc, RACCT_MEMLOCK, nsize);
+ PROC_UNLOCK(proc);
+ if (error != 0)
+ return (ENOMEM);
+ }
#endif
error = vm_map_wire(map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
- if (error != KERN_SUCCESS) {
+ if (racct_enable && error != KERN_SUCCESS) {
PROC_LOCK(proc);
racct_set(proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(map->pmap)));
@@ -1095,13 +1086,8 @@
};
#endif
-/*
- * MPSAFE
- */
int
-sys_mlockall(td, uap)
- struct thread *td;
- struct mlockall_args *uap;
+sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
vm_map_t map;
int error;
@@ -1127,11 +1113,13 @@
PROC_UNLOCK(td->td_proc);
}
#ifdef RACCT
- PROC_LOCK(td->td_proc);
- error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
- PROC_UNLOCK(td->td_proc);
- if (error != 0)
- return (ENOMEM);
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0)
+ return (ENOMEM);
+ }
#endif
if (uap->how & MCL_FUTURE) {
@@ -1153,7 +1141,7 @@
error = (error == KERN_SUCCESS ? 0 : EAGAIN);
}
#ifdef RACCT
- if (error != KERN_SUCCESS) {
+ if (racct_enable && error != KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(map->pmap)));
@@ -1170,13 +1158,8 @@
};
#endif
-/*
- * MPSAFE
- */
int
-sys_munlockall(td, uap)
- struct thread *td;
- struct munlockall_args *uap;
+sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
vm_map_t map;
int error;
@@ -1195,7 +1178,7 @@
error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
- if (error == KERN_SUCCESS) {
+ if (racct_enable && error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK, 0);
PROC_UNLOCK(td->td_proc);
@@ -1211,9 +1194,6 @@
size_t len;
};
#endif
-/*
- * MPSAFE
- */
int
sys_munlock(td, uap)
struct thread *td;
@@ -1221,6 +1201,9 @@
{
vm_offset_t addr, end, last, start;
vm_size_t size;
+#ifdef RACCT
+ vm_map_t map;
+#endif
int error;
error = priv_check(td, PRIV_VM_MUNLOCK);
@@ -1236,9 +1219,11 @@
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
- if (error == KERN_SUCCESS) {
+ if (racct_enable && error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
- racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
+ map = &td->td_proc->p_vmspace->vm_map;
+ racct_set(td->td_proc, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)));
PROC_UNLOCK(td->td_proc);
}
#endif
@@ -1263,21 +1248,16 @@
struct vattr va;
vm_object_t obj;
vm_offset_t foff;
- struct mount *mp;
struct ucred *cred;
- int error, flags, locktype, vfslocked;
+ int error, flags, locktype;
- mp = vp->v_mount;
cred = td->td_ucred;
if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
locktype = LK_EXCLUSIVE;
else
locktype = LK_SHARED;
- vfslocked = VFS_LOCK_GIANT(mp);
- if ((error = vget(vp, locktype, td)) != 0) {
- VFS_UNLOCK_GIANT(vfslocked);
+ if ((error = vget(vp, locktype, td)) != 0)
return (error);
- }
foff = *foffp;
flags = *flagsp;
obj = vp->v_object;
@@ -1289,18 +1269,16 @@
error = EINVAL;
goto done;
}
- if (obj->handle != vp) {
+ if (obj->type == OBJT_VNODE && obj->handle != vp) {
vput(vp);
vp = (struct vnode *)obj->handle;
/*
* Bypass filesystems obey the mpsafety of the
- * underlying fs.
+ * underlying fs. Tmpfs never bypasses.
*/
error = vget(vp, locktype, td);
- if (error != 0) {
- VFS_UNLOCK_GIANT(vfslocked);
+ if (error != 0)
return (error);
- }
}
if (locktype == LK_EXCLUSIVE) {
*writecounted = TRUE;
@@ -1340,7 +1318,14 @@
objsize = round_page(va.va_size);
if (va.va_nlink == 0)
flags |= MAP_NOSYNC;
- obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
+ if (obj->type == OBJT_VNODE)
+ obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
+ cred);
+ else {
+ KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
+ ("wrong object type"));
+ vm_object_reference(obj);
+ }
if (obj == NULL) {
error = ENOMEM;
goto done;
@@ -1357,7 +1342,6 @@
vnode_pager_update_writecount(obj, objsize, 0);
}
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1364,8 +1348,6 @@
/*
* vm_mmap_cdev()
*
- * MPSAFE
- *
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on cdevs.
*/
@@ -1478,10 +1460,11 @@
objtype_t handle_type, void *handle,
vm_ooffset_t foff)
{
- boolean_t fitit;
+ boolean_t curmap, fitit;
+ vm_offset_t max_addr;
vm_object_t object = NULL;
struct thread *td = curthread;
- int docow, error, rv;
+ int docow, error, findspace, rv;
boolean_t writecounted;
if (size == 0)
@@ -1489,9 +1472,17 @@
size = round_page(size);
- PROC_LOCK(td->td_proc);
- if (td->td_proc->p_vmspace->vm_map.size + size >
- lim_cur(td->td_proc, RLIMIT_VMEM)) {
+ curmap = map == &td->td_proc->p_vmspace->vm_map;
+ if (curmap) {
+ PROC_LOCK(td->td_proc);
+ if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
+ PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
+ PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
if (!old_mlock && map->flags & MAP_WIREFUTURE) {
if (ptoa(pmap_wired_count(map->pmap)) + size >
lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
@@ -1510,14 +1501,7 @@
}
}
PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
}
- if (racct_set(td->td_proc, RACCT_VMEM,
- td->td_proc->p_vmspace->vm_map.size + size)) {
- PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- PROC_UNLOCK(td->td_proc);
/*
* We currently can only deal with page aligned file offsets.
@@ -1592,17 +1576,48 @@
docow |= MAP_INHERIT_SHARE;
if (writecounted)
docow |= MAP_VN_WRITECOUNT;
+ if (flags & MAP_STACK) {
+ if (object != NULL)
+ return (EINVAL);
+ docow |= MAP_STACK_GROWS_DOWN;
+ }
+ if ((flags & MAP_EXCL) != 0)
+ docow |= MAP_CHECK_EXCL;
+ if ((flags & MAP_GUARD) != 0)
+ docow |= MAP_CREATE_GUARD;
- if (flags & MAP_STACK)
- rv = vm_map_stack(map, *addr, size, prot, maxprot,
- docow | MAP_STACK_GROWS_DOWN);
- else if (fitit)
- rv = vm_map_find(map, object, foff, addr, size,
- object != NULL && object->type == OBJT_DEVICE ?
- VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
- else
+ if (fitit) {
+ if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
+ findspace = VMFS_SUPER_SPACE;
+ else if ((flags & MAP_ALIGNMENT_MASK) != 0)
+ findspace = VMFS_ALIGNED_SPACE(flags >>
+ MAP_ALIGNMENT_SHIFT);
+ else
+ findspace = VMFS_OPTIMAL_SPACE;
+ max_addr = 0;
+#ifdef MAP_32BIT
+ if ((flags & MAP_32BIT) != 0)
+ max_addr = MAP_32BIT_MAX_ADDR;
+#endif
+ if (curmap) {
+ vm_offset_t min_addr;
+
+ PROC_LOCK(td->td_proc);
+ min_addr = round_page((vm_offset_t)td->td_proc->
+ p_vmspace->vm_daddr + lim_max(td->td_proc,
+ RLIMIT_DATA));
+ PROC_UNLOCK(td->td_proc);
+ rv = vm_map_find_min(map, object, foff, addr, size,
+ min_addr, max_addr,
+ findspace, prot, maxprot, docow);
+ } else {
+ rv = vm_map_find(map, object, foff, addr, size,
+ max_addr, findspace, prot, maxprot, docow);
+ }
+ } else {
rv = vm_map_fixed(map, object, foff, *addr, size,
- prot, maxprot, docow);
+ prot, maxprot, docow);
+ }
if (rv == KERN_SUCCESS) {
/*
Modified: trunk/sys/vm/vm_object.c
===================================================================
--- trunk/sys/vm/vm_object.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_object.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -63,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $");
#include "opt_vm.h"
@@ -78,6 +79,8 @@
#include <sys/proc.h> /* for curproc, pageproc */
#include <sys/socket.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
@@ -93,6 +96,7 @@
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>
@@ -164,15 +168,18 @@
vm_object_t object;
object = (vm_object_t)mem;
+ KASSERT(object->ref_count == 0,
+ ("object %p ref_count = %d", object, object->ref_count));
KASSERT(TAILQ_EMPTY(&object->memq),
- ("object %p has resident pages",
- object));
+ ("object %p has resident pages in its memq", object));
+ KASSERT(vm_radix_is_empty(&object->rtree),
+ ("object %p has resident pages in its trie", object));
#if VM_NRESERVLEVEL > 0
KASSERT(LIST_EMPTY(&object->rvq),
("object %p has reservations",
object));
#endif
- KASSERT(object->cache == NULL,
+ KASSERT(vm_object_cache_is_empty(object),
("object %p has cached pages",
object));
KASSERT(object->paging_in_progress == 0,
@@ -184,6 +191,9 @@
KASSERT(object->shadow_count == 0,
("object %p shadow_count = %d",
object, object->shadow_count));
+ KASSERT(object->type == OBJT_DEAD,
+ ("object %p has non-dead type %d",
+ object, object->type));
}
#endif
@@ -193,17 +203,27 @@
vm_object_t object;
object = (vm_object_t)mem;
- bzero(&object->mtx, sizeof(object->mtx));
- VM_OBJECT_LOCK_INIT(object, "standard object");
+ bzero(&object->lock, sizeof(object->lock));
+ rw_init_flags(&object->lock, "vm object", RW_DUPOK);
/* These are true for any object that has been freed */
+ object->type = OBJT_DEAD;
+ object->ref_count = 0;
+ object->rtree.rt_root = 0;
+ object->rtree.rt_flags = 0;
object->paging_in_progress = 0;
object->resident_page_count = 0;
object->shadow_count = 0;
+ object->cache.rt_root = 0;
+ object->cache.rt_flags = 0;
+
+ mtx_lock(&vm_object_list_mtx);
+ TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
+ mtx_unlock(&vm_object_list_mtx);
return (0);
}
-void
+static void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
{
@@ -210,18 +230,36 @@
TAILQ_INIT(&object->memq);
LIST_INIT(&object->shadow_head);
- object->root = NULL;
object->type = type;
+ switch (type) {
+ case OBJT_DEAD:
+ panic("_vm_object_allocate: can't create OBJT_DEAD");
+ case OBJT_DEFAULT:
+ case OBJT_SWAP:
+ object->flags = OBJ_ONEMAPPING;
+ break;
+ case OBJT_DEVICE:
+ case OBJT_SG:
+ object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
+ break;
+ case OBJT_MGTDEVICE:
+ object->flags = OBJ_FICTITIOUS;
+ break;
+ case OBJT_PHYS:
+ object->flags = OBJ_UNMANAGED;
+ break;
+ case OBJT_VNODE:
+ object->flags = 0;
+ break;
+ default:
+ panic("_vm_object_allocate: type %d is undefined", type);
+ }
object->size = size;
object->generation = 1;
object->ref_count = 1;
object->memattr = VM_MEMATTR_DEFAULT;
- object->flags = 0;
object->cred = NULL;
object->charge = 0;
- if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
- object->flags = OBJ_ONEMAPPING;
- object->pg_color = 0;
object->handle = NULL;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
@@ -228,11 +266,6 @@
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
- object->cache = NULL;
-
- mtx_lock(&vm_object_list_mtx);
- TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
- mtx_unlock(&vm_object_list_mtx);
}
/*
@@ -246,7 +279,7 @@
TAILQ_INIT(&vm_object_list);
mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
- VM_OBJECT_LOCK_INIT(kernel_object, "kernel object");
+ rw_init(&kernel_object->lock, "kernel vm object");
_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
kernel_object);
#if VM_NRESERVLEVEL > 0
@@ -254,7 +287,7 @@
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
- VM_OBJECT_LOCK_INIT(kmem_object, "kmem object");
+ rw_init(&kmem_object->lock, "kmem vm object");
_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
kmem_object);
#if VM_NRESERVLEVEL > 0
@@ -273,7 +306,9 @@
#else
NULL,
#endif
- vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
+ vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ vm_radix_init();
}
void
@@ -280,7 +315,7 @@
vm_object_clear_flag(vm_object_t object, u_short bits)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
object->flags &= ~bits;
}
@@ -297,10 +332,11 @@
vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
switch (object->type) {
case OBJT_DEFAULT:
case OBJT_DEVICE:
+ case OBJT_MGTDEVICE:
case OBJT_PHYS:
case OBJT_SG:
case OBJT_SWAP:
@@ -310,6 +346,9 @@
break;
case OBJT_DEAD:
return (KERN_INVALID_ARGUMENT);
+ default:
+ panic("vm_object_set_memattr: object %p is of undefined type",
+ object);
}
object->memattr = memattr;
return (KERN_SUCCESS);
@@ -319,7 +358,7 @@
vm_object_pip_add(vm_object_t object, short i)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
object->paging_in_progress += i;
}
@@ -327,7 +366,7 @@
vm_object_pip_subtract(vm_object_t object, short i)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
object->paging_in_progress -= i;
}
@@ -335,7 +374,7 @@
vm_object_pip_wakeup(vm_object_t object)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
object->paging_in_progress--;
if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
vm_object_clear_flag(object, OBJ_PIPWNT);
@@ -347,7 +386,7 @@
vm_object_pip_wakeupn(vm_object_t object, short i)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
if (i)
object->paging_in_progress -= i;
if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
@@ -360,10 +399,10 @@
vm_object_pip_wait(vm_object_t object, char *waitid)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
while (object->paging_in_progress) {
object->flags |= OBJ_PIPWNT;
- msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
+ VM_OBJECT_SLEEP(object, object, PVM, waitid, 0);
}
}
@@ -394,9 +433,9 @@
{
if (object == NULL)
return;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_object_reference_locked(object);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
/*
@@ -411,7 +450,7 @@
{
struct vnode *vp;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
object->ref_count++;
if (object->type == OBJT_VNODE) {
vp = object->handle;
@@ -427,8 +466,7 @@
{
struct vnode *vp = (struct vnode *) object->handle;
- VFS_ASSERT_GIANT(vp->v_mount);
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_VNODE,
("vm_object_vndeallocate: not a vnode object"));
KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
@@ -439,25 +477,30 @@
}
#endif
- if (object->ref_count > 1) {
+ /*
+ * The test for text of vp vnode does not need a bypass to
+ * reach right VV_TEXT there, since it is obtained from
+ * object->handle.
+ */
+ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) {
object->ref_count--;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/* vrele may need the vnode lock. */
vrele(vp);
} else {
vhold(vp);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vdrop(vp);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
object->ref_count--;
if (object->type == OBJT_DEAD) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
VOP_UNLOCK(vp, 0);
} else {
if (object->ref_count == 0)
VOP_UNSET_TEXT(vp);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vput(vp);
}
}
@@ -478,40 +521,14 @@
vm_object_deallocate(vm_object_t object)
{
vm_object_t temp;
+ struct vnode *vp;
while (object != NULL) {
- int vfslocked;
-
- vfslocked = 0;
- restart:
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->type == OBJT_VNODE) {
- struct vnode *vp = (struct vnode *) object->handle;
-
- /*
- * Conditionally acquire Giant for a vnode-backed
- * object. We have to be careful since the type of
- * a vnode object can change while the object is
- * unlocked.
- */
- if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
- vfslocked = 1;
- if (!mtx_trylock(&Giant)) {
- VM_OBJECT_UNLOCK(object);
- mtx_lock(&Giant);
- goto restart;
- }
- }
vm_object_vndeallocate(object);
- VFS_UNLOCK_GIANT(vfslocked);
return;
- } else
- /*
- * This is to handle the case that the object
- * changed type while we dropped its lock to
- * obtain Giant.
- */
- VFS_UNLOCK_GIANT(vfslocked);
+ }
KASSERT(object->ref_count != 0,
("vm_object_deallocate: object deallocated too many times: %d", object->type));
@@ -524,13 +541,33 @@
*/
object->ref_count--;
if (object->ref_count > 1) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
} else if (object->ref_count == 1) {
+ if (object->type == OBJT_SWAP &&
+ (object->flags & OBJ_TMPFS) != 0) {
+ vp = object->un_pager.swp.swp_tmpfs;
+ vhold(vp);
+ VM_OBJECT_WUNLOCK(object);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VM_OBJECT_WLOCK(object);
+ if (object->type == OBJT_DEAD ||
+ object->ref_count != 1) {
+ VM_OBJECT_WUNLOCK(object);
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ return;
+ }
+ if ((object->flags & OBJ_TMPFS) != 0)
+ VOP_UNSET_TEXT(vp);
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ }
if (object->shadow_count == 0 &&
object->handle == NULL &&
(object->type == OBJT_DEFAULT ||
- object->type == OBJT_SWAP)) {
+ (object->type == OBJT_SWAP &&
+ (object->flags & OBJ_TMPFS_NODE) == 0))) {
vm_object_set_flag(object, OBJ_ONEMAPPING);
} else if ((object->shadow_count == 1) &&
(object->handle == NULL) &&
@@ -543,12 +580,14 @@
("vm_object_deallocate: ref_count: %d, shadow_count: %d",
object->ref_count,
object->shadow_count));
- if (!VM_OBJECT_TRYLOCK(robject)) {
+ KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
+ ("shadowed tmpfs v_object %p", object));
+ if (!VM_OBJECT_TRYWLOCK(robject)) {
/*
* Avoid a potential deadlock.
*/
object->ref_count++;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* More likely than not the thread
* holding robject's lock has lower
@@ -572,28 +611,27 @@
robject->ref_count++;
retry:
if (robject->paging_in_progress) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vm_object_pip_wait(robject,
"objde1");
temp = robject->backing_object;
if (object == temp) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
goto retry;
}
} else if (object->paging_in_progress) {
- VM_OBJECT_UNLOCK(robject);
+ VM_OBJECT_WUNLOCK(robject);
object->flags |= OBJ_PIPWNT;
- msleep(object,
- VM_OBJECT_MTX(object),
+ VM_OBJECT_SLEEP(object, object,
PDROP | PVM, "objde2", 0);
- VM_OBJECT_LOCK(robject);
+ VM_OBJECT_WLOCK(robject);
temp = robject->backing_object;
if (object == temp) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
goto retry;
}
} else
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
if (robject->ref_count == 1) {
robject->ref_count--;
@@ -602,21 +640,23 @@
}
object = robject;
vm_object_collapse(object);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
continue;
}
- VM_OBJECT_UNLOCK(robject);
+ VM_OBJECT_WUNLOCK(robject);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
doterm:
temp = object->backing_object;
if (temp != NULL) {
- VM_OBJECT_LOCK(temp);
+ KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
+ ("shadowed tmpfs v_object 2 %p", object));
+ VM_OBJECT_WLOCK(temp);
LIST_REMOVE(object, shadow_list);
temp->shadow_count--;
- VM_OBJECT_UNLOCK(temp);
+ VM_OBJECT_WUNLOCK(temp);
object->backing_object = NULL;
}
/*
@@ -627,7 +667,7 @@
if ((object->flags & OBJ_DEAD) == 0)
vm_object_terminate(object);
else
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
object = temp;
}
}
@@ -641,20 +681,9 @@
{
/*
- * Remove the object from the global object list.
- */
- mtx_lock(&vm_object_list_mtx);
- TAILQ_REMOVE(&vm_object_list, object, object_list);
- mtx_unlock(&vm_object_list_mtx);
-
- /*
* Release the allocation charge.
*/
if (object->cred != NULL) {
- KASSERT(object->type == OBJT_DEFAULT ||
- object->type == OBJT_SWAP,
- ("vm_object_terminate: non-swap obj %p has cred",
- object));
swap_release_by_cred(object->charge, object->cred);
object->charge = 0;
crfree(object->cred);
@@ -679,7 +708,7 @@
{
vm_page_t p, p_next;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* Make sure no one uses us.
@@ -705,11 +734,15 @@
* Clean pages and flush buffers.
*/
vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vinvalbuf(vp, V_SAVE, 0, 0);
- VM_OBJECT_LOCK(object);
+ BO_LOCK(&vp->v_bufobj);
+ vp->v_bufobj.bo_flag |= BO_DEAD;
+ BO_UNLOCK(&vp->v_bufobj);
+
+ VM_OBJECT_WLOCK(object);
}
KASSERT(object->ref_count == 0,
@@ -723,8 +756,7 @@
* the object, the page and object are reset to any empty state.
*/
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
- KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
- ("vm_object_terminate: freeing busy page %p", p));
+ vm_page_assert_unbusied(p);
vm_page_lock(p);
/*
* Optimize the page's removal from the object by resetting
@@ -746,7 +778,7 @@
* modified by the preceding loop.
*/
if (object->resident_page_count != 0) {
- object->root = NULL;
+ vm_radix_reclaim_allnodes(&object->rtree);
TAILQ_INIT(&object->memq);
object->resident_page_count = 0;
if (object->type == OBJT_VNODE)
@@ -757,14 +789,18 @@
if (__predict_false(!LIST_EMPTY(&object->rvq)))
vm_reserv_break_all(object);
#endif
- if (__predict_false(object->cache != NULL))
+ if (__predict_false(!vm_object_cache_is_empty(object)))
vm_page_cache_free(object, 0, 0);
+ KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
+ object->type == OBJT_SWAP,
+ ("%s: non-swap obj %p has cred", __func__, object));
+
/*
* Let the pager know object is dead.
*/
vm_pager_deallocate(object);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vm_object_destroy(object);
}
@@ -820,9 +856,13 @@
int curgeneration, n, pagerflags;
boolean_t clearobjflags, eio, res;
- mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE
+ * objects. The check below prevents the function from
+ * operating on non-vnode objects.
+ */
if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 ||
object->resident_page_count == 0)
return (TRUE);
@@ -846,7 +886,7 @@
np = TAILQ_NEXT(p, listq);
if (p->valid == 0)
continue;
- if (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
+ if (vm_page_sleep_if_busy(p, "vpcwai")) {
if (object->generation != curgeneration) {
if ((flags & OBJPC_SYNC) != 0)
goto rescan;
@@ -906,9 +946,8 @@
vm_page_t ma[vm_pageout_page_count], p_first, tp;
int count, i, mreq, runlen;
- mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
vm_page_lock_assert(p, MA_NOTOWNED);
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
count = 1;
mreq = 0;
@@ -915,7 +954,7 @@
for (tp = p; count < vm_pageout_page_count; count++) {
tp = vm_page_next(tp);
- if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
+ if (tp == NULL || vm_page_busied(tp))
break;
if (!vm_object_page_remove_write(tp, flags, clearobjflags))
break;
@@ -923,7 +962,7 @@
for (p_first = p; count < vm_pageout_page_count; count++) {
tp = vm_page_prev(p_first);
- if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
+ if (tp == NULL || vm_page_busied(tp))
break;
if (!vm_object_page_remove_write(tp, flags, clearobjflags))
break;
@@ -966,11 +1005,11 @@
return (TRUE);
res = TRUE;
error = 0;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
while ((backing_object = object->backing_object) != NULL) {
- VM_OBJECT_LOCK(backing_object);
+ VM_OBJECT_WLOCK(backing_object);
offset += object->backing_object_offset;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
object = backing_object;
if (object->size < OFF_TO_IDX(offset + size))
size = IDX_TO_OFF(object->size) - offset;
@@ -989,11 +1028,9 @@
*/
if (object->type == OBJT_VNODE &&
(object->flags & OBJ_MIGHTBEDIRTY) != 0) {
- int vfslocked;
vp = object->handle;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
(void) vn_start_write(vp, &mp, V_WAIT);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (syncio && !invalidate && offset == 0 &&
OFF_TO_IDX(size) == object->size) {
@@ -1010,18 +1047,17 @@
flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
fsync_after = FALSE;
}
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
res = vm_object_page_clean(object, offset, offset + size,
flags);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
if (fsync_after)
error = VOP_FSYNC(vp, MNT_WAIT, curthread);
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
vn_finished_write(mp);
if (error != 0)
res = FALSE;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
if ((object->type == OBJT_VNODE ||
object->type == OBJT_DEVICE) && invalidate) {
@@ -1039,7 +1075,7 @@
vm_object_page_remove(object, OFF_TO_IDX(offset),
OFF_TO_IDX(offset + size + PAGE_MASK), flags);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (res);
}
@@ -1074,7 +1110,7 @@
if (object == NULL)
return;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
/*
* Locate and adjust resident pages
*/
@@ -1093,7 +1129,7 @@
(tobject->flags & OBJ_ONEMAPPING) == 0) {
goto unlock_tobject;
}
- } else if (tobject->type == OBJT_PHYS)
+ } else if ((tobject->flags & OBJ_UNMANAGED) != 0)
goto unlock_tobject;
m = vm_page_lookup(tobject, tpindex);
if (m == NULL && advise == MADV_WILLNEED) {
@@ -1115,10 +1151,10 @@
backing_object = tobject->backing_object;
if (backing_object == NULL)
goto unlock_tobject;
- VM_OBJECT_LOCK(backing_object);
+ VM_OBJECT_WLOCK(backing_object);
tpindex += OFF_TO_IDX(tobject->backing_object_offset);
if (tobject != object)
- VM_OBJECT_UNLOCK(tobject);
+ VM_OBJECT_WUNLOCK(tobject);
tobject = backing_object;
goto shadowlookup;
} else if (m->valid != VM_PAGE_BITS_ALL)
@@ -1135,7 +1171,7 @@
("vm_object_madvise: page %p is fictitious", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("vm_object_madvise: page %p is not managed", m));
- if ((m->oflags & VPO_BUSY) || m->busy) {
+ if (vm_page_busied(m)) {
if (advise == MADV_WILLNEED) {
/*
* Reference the page before unlocking and
@@ -1144,39 +1180,17 @@
*/
vm_page_aflag_set(m, PGA_REFERENCED);
}
- vm_page_unlock(m);
if (object != tobject)
- VM_OBJECT_UNLOCK(object);
- m->oflags |= VPO_WANTED;
- msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
- 0);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WUNLOCK(object);
+ VM_OBJECT_WUNLOCK(tobject);
+ vm_page_busy_sleep(m, "madvpo", false);
+ VM_OBJECT_WLOCK(object);
goto relookup;
}
if (advise == MADV_WILLNEED) {
vm_page_activate(m);
- } else if (advise == MADV_DONTNEED) {
- vm_page_dontneed(m);
- } else if (advise == MADV_FREE) {
- /*
- * Mark the page clean. This will allow the page
- * to be freed up by the system. However, such pages
- * are often reused quickly by malloc()/free()
- * so we do not do anything that would cause
- * a page fault if we can help it.
- *
- * Specifically, we do not try to actually free
- * the page now nor do we try to put it in the
- * cache (which would cause a page fault on reuse).
- *
- * But we do make the page is freeable as we
- * can without actually taking the step of unmapping
- * it.
- */
- pmap_clear_modify(m);
- m->dirty = 0;
- m->act_count = 0;
- vm_page_dontneed(m);
+ } else {
+ vm_page_advise(m, advise);
}
vm_page_unlock(m);
if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
@@ -1183,9 +1197,9 @@
swap_pager_freespace(tobject, tpindex, 1);
unlock_tobject:
if (tobject != object)
- VM_OBJECT_UNLOCK(tobject);
+ VM_OBJECT_WUNLOCK(tobject);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
/*
@@ -1213,15 +1227,15 @@
* Don't create the new object if the old object isn't shared.
*/
if (source != NULL) {
- VM_OBJECT_LOCK(source);
+ VM_OBJECT_WLOCK(source);
if (source->ref_count == 1 &&
source->handle == NULL &&
(source->type == OBJT_DEFAULT ||
source->type == OBJT_SWAP)) {
- VM_OBJECT_UNLOCK(source);
+ VM_OBJECT_WUNLOCK(source);
return;
}
- VM_OBJECT_UNLOCK(source);
+ VM_OBJECT_WUNLOCK(source);
}
/*
@@ -1246,7 +1260,7 @@
*/
result->backing_object_offset = *offset;
if (source != NULL) {
- VM_OBJECT_LOCK(source);
+ VM_OBJECT_WLOCK(source);
LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
source->shadow_count++;
#if VM_NRESERVLEVEL > 0
@@ -1254,7 +1268,7 @@
result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
((1 << (VM_NFREEORDER - 1)) - 1);
#endif
- VM_OBJECT_UNLOCK(source);
+ VM_OBJECT_WUNLOCK(source);
}
@@ -1285,7 +1299,7 @@
return;
if (orig_object->ref_count <= 1)
return;
- VM_OBJECT_UNLOCK(orig_object);
+ VM_OBJECT_WUNLOCK(orig_object);
offidxstart = OFF_TO_IDX(entry->offset);
size = atop(entry->end - entry->start);
@@ -1300,17 +1314,17 @@
* At this point, the new object is still private, so the order in
* which the original and new objects are locked does not matter.
*/
- VM_OBJECT_LOCK(new_object);
- VM_OBJECT_LOCK(orig_object);
+ VM_OBJECT_WLOCK(new_object);
+ VM_OBJECT_WLOCK(orig_object);
source = orig_object->backing_object;
if (source != NULL) {
- VM_OBJECT_LOCK(source);
+ VM_OBJECT_WLOCK(source);
if ((source->flags & OBJ_DEAD) != 0) {
- VM_OBJECT_UNLOCK(source);
- VM_OBJECT_UNLOCK(orig_object);
- VM_OBJECT_UNLOCK(new_object);
+ VM_OBJECT_WUNLOCK(source);
+ VM_OBJECT_WUNLOCK(orig_object);
+ VM_OBJECT_WUNLOCK(new_object);
vm_object_deallocate(new_object);
- VM_OBJECT_LOCK(orig_object);
+ VM_OBJECT_WLOCK(orig_object);
return;
}
LIST_INSERT_HEAD(&source->shadow_head,
@@ -1318,7 +1332,7 @@
source->shadow_count++;
vm_object_reference_locked(source); /* for new_object */
vm_object_clear_flag(source, OBJ_ONEMAPPING);
- VM_OBJECT_UNLOCK(source);
+ VM_OBJECT_WUNLOCK(source);
new_object->backing_object_offset =
orig_object->backing_object_offset + entry->offset;
new_object->backing_object = source;
@@ -1344,18 +1358,42 @@
* We do not have to VM_PROT_NONE the page as mappings should
* not be changed by this operation.
*/
- if ((m->oflags & VPO_BUSY) || m->busy) {
- VM_OBJECT_UNLOCK(new_object);
- m->oflags |= VPO_WANTED;
- msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
- VM_OBJECT_LOCK(new_object);
+ if (vm_page_busied(m)) {
+ VM_OBJECT_WUNLOCK(new_object);
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(orig_object);
+ vm_page_busy_sleep(m, "spltwt", false);
+ VM_OBJECT_WLOCK(orig_object);
+ VM_OBJECT_WLOCK(new_object);
goto retry;
}
- vm_page_lock(m);
- vm_page_rename(m, new_object, idx);
- vm_page_unlock(m);
- /* page automatically made dirty by rename and cache handled */
- vm_page_busy(m);
+
+ /* vm_page_rename() will handle dirty and cache. */
+ if (vm_page_rename(m, new_object, idx)) {
+ VM_OBJECT_WUNLOCK(new_object);
+ VM_OBJECT_WUNLOCK(orig_object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(orig_object);
+ VM_OBJECT_WLOCK(new_object);
+ goto retry;
+ }
+#if VM_NRESERVLEVEL > 0
+ /*
+ * If some of the reservation's allocated pages remain with
+ * the original object, then transferring the reservation to
+ * the new object is neither particularly beneficial nor
+ * particularly harmful as compared to leaving the reservation
+ * with the original object. If, however, all of the
+ * reservation's allocated pages are transferred to the new
+ * object, then transferring the reservation is typically
+ * beneficial. Determining which of these two cases applies
+ * would be more costly than unconditionally renaming the
+ * reservation.
+ */
+ vm_reserv_rename(m, new_object, orig_object, offidxstart);
+#endif
+ if (orig_object->type == OBJT_SWAP)
+ vm_page_xbusy(m);
}
if (orig_object->type == OBJT_SWAP) {
/*
@@ -1363,22 +1401,28 @@
* and new_object's locks are released and reacquired.
*/
swap_pager_copy(orig_object, new_object, offidxstart, 0);
+ TAILQ_FOREACH(m, &new_object->memq, listq)
+ vm_page_xunbusy(m);
/*
* Transfer any cached pages from orig_object to new_object.
+ * If swap_pager_copy() found swapped out pages within the
+ * specified range of orig_object, then it changed
+ * new_object's type to OBJT_SWAP when it transferred those
+ * pages to new_object. Otherwise, new_object's type
+ * should still be OBJT_DEFAULT and orig_object should not
+ * contain any cached pages within the specified range.
*/
- if (__predict_false(orig_object->cache != NULL))
+ if (__predict_false(!vm_object_cache_is_empty(orig_object)))
vm_page_cache_transfer(orig_object, offidxstart,
new_object);
}
- VM_OBJECT_UNLOCK(orig_object);
- TAILQ_FOREACH(m, &new_object->memq, listq)
- vm_page_wakeup(m);
- VM_OBJECT_UNLOCK(new_object);
+ VM_OBJECT_WUNLOCK(orig_object);
+ VM_OBJECT_WUNLOCK(new_object);
entry->object.vm_object = new_object;
entry->offset = 0LL;
vm_object_deallocate(orig_object);
- VM_OBJECT_LOCK(new_object);
+ VM_OBJECT_WLOCK(new_object);
}
#define OBSC_TEST_ALL_SHADOWED 0x0001
@@ -1385,16 +1429,43 @@
#define OBSC_COLLAPSE_NOWAIT 0x0002
#define OBSC_COLLAPSE_WAIT 0x0004
-static int
+static vm_page_t
+vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
+ int op)
+{
+ vm_object_t backing_object;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ backing_object = object->backing_object;
+ VM_OBJECT_ASSERT_WLOCKED(backing_object);
+
+ KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p));
+ KASSERT(p == NULL || p->object == object || p->object == backing_object,
+ ("invalid ownership %p %p %p", p, object, backing_object));
+ if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
+ return (next);
+ if (p != NULL)
+ vm_page_lock(p);
+ VM_OBJECT_WUNLOCK(object);
+ VM_OBJECT_WUNLOCK(backing_object);
+ if (p == NULL)
+ VM_WAIT;
+ else
+ vm_page_busy_sleep(p, "vmocol", false);
+ VM_OBJECT_WLOCK(object);
+ VM_OBJECT_WLOCK(backing_object);
+ return (TAILQ_FIRST(&backing_object->memq));
+}
+
+static bool
vm_object_backing_scan(vm_object_t object, int op)
{
- int r = 1;
- vm_page_t p;
vm_object_t backing_object;
- vm_pindex_t backing_offset_index;
+ vm_page_t next, p, pp;
+ vm_pindex_t backing_offset_index, new_pindex;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
backing_object = object->backing_object;
backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
@@ -1413,7 +1484,7 @@
* shadow test may succeed! XXX
*/
if (backing_object->type != OBJT_DEFAULT) {
- return (0);
+ return (false);
}
}
if (op & OBSC_COLLAPSE_WAIT) {
@@ -1425,24 +1496,19 @@
*/
p = TAILQ_FIRST(&backing_object->memq);
while (p) {
- vm_page_t next = TAILQ_NEXT(p, listq);
- vm_pindex_t new_pindex = p->pindex - backing_offset_index;
-
+ next = TAILQ_NEXT(p, listq);
+ new_pindex = p->pindex - backing_offset_index;
if (op & OBSC_TEST_ALL_SHADOWED) {
- vm_page_t pp;
-
/*
* Ignore pages outside the parent object's range
* and outside the parent object's mapping of the
* backing object.
*
- * note that we do not busy the backing object's
+ * Note that we do not busy the backing object's
* page.
*/
- if (
- p->pindex < backing_offset_index ||
- new_pindex >= object->size
- ) {
+ if (p->pindex < backing_offset_index ||
+ new_pindex >= object->size) {
p = next;
continue;
}
@@ -1458,13 +1524,9 @@
*/
pp = vm_page_lookup(object, new_pindex);
- if (
- (pp == NULL || pp->valid == 0) &&
- !vm_pager_has_page(object, new_pindex, NULL, NULL)
- ) {
- r = 0;
- break;
- }
+ if ((pp == NULL || pp->valid == 0) &&
+ !vm_pager_has_page(object, new_pindex, NULL, NULL))
+ return (false);
}
/*
@@ -1471,55 +1533,21 @@
* Check for busy page
*/
if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
- vm_page_t pp;
-
- if (op & OBSC_COLLAPSE_NOWAIT) {
- if ((p->oflags & VPO_BUSY) ||
- !p->valid ||
- p->busy) {
- p = next;
- continue;
- }
- } else if (op & OBSC_COLLAPSE_WAIT) {
- if ((p->oflags & VPO_BUSY) || p->busy) {
- VM_OBJECT_UNLOCK(object);
- p->oflags |= VPO_WANTED;
- msleep(p, VM_OBJECT_MTX(backing_object),
- PDROP | PVM, "vmocol", 0);
- VM_OBJECT_LOCK(object);
- VM_OBJECT_LOCK(backing_object);
- /*
- * If we slept, anything could have
- * happened. Since the object is
- * marked dead, the backing offset
- * should not have changed so we
- * just restart our scan.
- */
- p = TAILQ_FIRST(&backing_object->memq);
- continue;
- }
+ if (vm_page_busied(p)) {
+ p = vm_object_backing_scan_wait(object, p,
+ next, op);
+ continue;
}
- KASSERT(
- p->object == backing_object,
- ("vm_object_backing_scan: object mismatch")
- );
+ KASSERT(p->object == backing_object,
+ ("vm_object_backing_scan: object mismatch"));
- /*
- * Destroy any associated swap
- */
- if (backing_object->type == OBJT_SWAP) {
- swap_pager_freespace(
- backing_object,
- p->pindex,
- 1
- );
- }
+ if (p->pindex < backing_offset_index ||
+ new_pindex >= object->size) {
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ p->pindex, 1);
- if (
- p->pindex < backing_offset_index ||
- new_pindex >= object->size
- ) {
/*
* Page is out of the parent object's range, we
* can simply destroy it.
@@ -1537,35 +1565,45 @@
}
pp = vm_page_lookup(object, new_pindex);
- if (
- (op & OBSC_COLLAPSE_NOWAIT) != 0 &&
- (pp != NULL && pp->valid == 0)
- ) {
+ if (pp != NULL && vm_page_busied(pp)) {
/*
- * The page in the parent is not (yet) valid.
- * We don't know anything about the state of
- * the original page. It might be mapped,
- * so we must avoid the next if here.
+ * The page in the parent is busy and
+ * possibly not (yet) valid. Until
+ * its state is finalized by the busy
+ * bit owner, we can't tell whether it
+ * shadows the original page.
+ * Therefore, we must either skip it
+ * and the original (backing_object)
+ * page or wait for its state to be
+ * finalized.
*
- * This is due to a race in vm_fault() where
- * we must unbusy the original (backing_obj)
- * page before we can (re)lock the parent.
- * Hence we can get here.
+ * This is due to a race with vm_fault()
+ * where we must unbusy the original
+ * (backing_obj) page before we can
+ * (re)lock the parent. Hence we can
+ * get here.
*/
- p = next;
+ p = vm_object_backing_scan_wait(object, pp,
+ next, op);
continue;
}
- if (
- pp != NULL ||
- vm_pager_has_page(object, new_pindex, NULL, NULL)
- ) {
+
+ KASSERT(pp == NULL || pp->valid != 0,
+ ("unbusy invalid page %p", pp));
+
+ if (pp != NULL || vm_pager_has_page(object,
+ new_pindex, NULL, NULL)) {
/*
- * page already exists in parent OR swap exists
- * for this location in the parent. Destroy
- * the original page from the backing object.
- *
- * Leave the parent's page alone
+ * The page already exists in the
+ * parent OR swap exists for this
+ * location in the parent. Leave the
+ * parent's page alone. Destroy the
+ * original page from the backing
+ * object.
*/
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ p->pindex, 1);
vm_page_lock(p);
KASSERT(!pmap_page_is_mapped(p),
("freeing mapped page %p", p));
@@ -1578,6 +1616,25 @@
continue;
}
+ /*
+ * Page does not exist in parent, rename the
+ * page from the backing object to the main object.
+ *
+ * If the page was mapped to a process, it can remain
+ * mapped through the rename.
+ * vm_page_rename() will handle dirty and cache.
+ */
+ if (vm_page_rename(p, object, new_pindex)) {
+ p = vm_object_backing_scan_wait(object, NULL,
+ next, op);
+ continue;
+ }
+
+ /* Use the old pindex to free the right page. */
+ if (backing_object->type == OBJT_SWAP)
+ swap_pager_freespace(backing_object,
+ new_pindex + backing_offset_index, 1);
+
#if VM_NRESERVLEVEL > 0
/*
* Rename the reservation.
@@ -1585,22 +1642,10 @@
vm_reserv_rename(p, object, backing_object,
backing_offset_index);
#endif
-
- /*
- * Page does not exist in parent, rename the
- * page from the backing object to the main object.
- *
- * If the page was mapped to a process, it can remain
- * mapped through the rename.
- */
- vm_page_lock(p);
- vm_page_rename(p, object, new_pindex);
- vm_page_unlock(p);
- /* page automatically made dirty by rename */
}
p = next;
}
- return (r);
+ return (true);
}
@@ -1614,8 +1659,8 @@
{
vm_object_t backing_object = object->backing_object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ VM_OBJECT_ASSERT_WLOCKED(backing_object);
if (backing_object->ref_count != 1)
return;
@@ -1633,11 +1678,11 @@
void
vm_object_collapse(vm_object_t object)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-
+ vm_object_t backing_object, new_backing_object;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
while (TRUE) {
- vm_object_t backing_object;
-
/*
* Verify that the conditions are right for collapse:
*
@@ -1650,7 +1695,7 @@
* we check the backing object first, because it is most likely
* not collapsable.
*/
- VM_OBJECT_LOCK(backing_object);
+ VM_OBJECT_WLOCK(backing_object);
if (backing_object->handle != NULL ||
(backing_object->type != OBJT_DEFAULT &&
backing_object->type != OBJT_SWAP) ||
@@ -1659,18 +1704,17 @@
(object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP) ||
(object->flags & OBJ_DEAD)) {
- VM_OBJECT_UNLOCK(backing_object);
+ VM_OBJECT_WUNLOCK(backing_object);
break;
}
- if (
- object->paging_in_progress != 0 ||
- backing_object->paging_in_progress != 0
- ) {
+ if (object->paging_in_progress != 0 ||
+ backing_object->paging_in_progress != 0) {
vm_object_qcollapse(object);
- VM_OBJECT_UNLOCK(backing_object);
+ VM_OBJECT_WUNLOCK(backing_object);
break;
}
+
/*
* We know that we can either collapse the backing object (if
* the parent is the only reference to it) or (perhaps) have
@@ -1682,6 +1726,9 @@
* case.
*/
if (backing_object->ref_count == 1) {
+ vm_object_pip_add(object, 1);
+ vm_object_pip_add(backing_object, 1);
+
/*
* If there is exactly one reference to the backing
* object, we can collapse it into the parent.
@@ -1704,6 +1751,9 @@
* swap_pager_copy() can sleep, in which case
* the backing_object's and object's locks are
* released and reacquired.
+ * Since swap_pager_copy() is being asked to
+ * destroy the source, it will change the
+ * backing_object's type to OBJT_DEFAULT.
*/
swap_pager_copy(
backing_object,
@@ -1713,7 +1763,8 @@
/*
* Free any cached pages from backing_object.
*/
- if (__predict_false(backing_object->cache != NULL))
+ if (__predict_false(
+ !vm_object_cache_is_empty(backing_object)))
vm_page_cache_free(backing_object, 0, 0);
}
/*
@@ -1725,7 +1776,7 @@
LIST_REMOVE(object, shadow_list);
backing_object->shadow_count--;
if (backing_object->backing_object) {
- VM_OBJECT_LOCK(backing_object->backing_object);
+ VM_OBJECT_WLOCK(backing_object->backing_object);
LIST_REMOVE(backing_object, shadow_list);
LIST_INSERT_HEAD(
&backing_object->backing_object->shadow_head,
@@ -1733,7 +1784,7 @@
/*
* The shadow_count has not changed.
*/
- VM_OBJECT_UNLOCK(backing_object->backing_object);
+ VM_OBJECT_WUNLOCK(backing_object->backing_object);
}
object->backing_object = backing_object->backing_object;
object->backing_object_offset +=
@@ -1749,21 +1800,23 @@
KASSERT(backing_object->ref_count == 1, (
"backing_object %p was somehow re-referenced during collapse!",
backing_object));
- VM_OBJECT_UNLOCK(backing_object);
+ vm_object_pip_wakeup(backing_object);
+ backing_object->type = OBJT_DEAD;
+ backing_object->ref_count = 0;
+ VM_OBJECT_WUNLOCK(backing_object);
vm_object_destroy(backing_object);
+ vm_object_pip_wakeup(object);
object_collapses++;
} else {
- vm_object_t new_backing_object;
-
/*
* If we do not entirely shadow the backing object,
* there is nothing we can do so we give up.
*/
if (object->resident_page_count != object->size &&
- vm_object_backing_scan(object,
- OBSC_TEST_ALL_SHADOWED) == 0) {
- VM_OBJECT_UNLOCK(backing_object);
+ !vm_object_backing_scan(object,
+ OBSC_TEST_ALL_SHADOWED)) {
+ VM_OBJECT_WUNLOCK(backing_object);
break;
}
@@ -1777,7 +1830,7 @@
new_backing_object = backing_object->backing_object;
if ((object->backing_object = new_backing_object) != NULL) {
- VM_OBJECT_LOCK(new_backing_object);
+ VM_OBJECT_WLOCK(new_backing_object);
LIST_INSERT_HEAD(
&new_backing_object->shadow_head,
object,
@@ -1785,7 +1838,7 @@
);
new_backing_object->shadow_count++;
vm_object_reference_locked(new_backing_object);
- VM_OBJECT_UNLOCK(new_backing_object);
+ VM_OBJECT_WUNLOCK(new_backing_object);
object->backing_object_offset +=
backing_object->backing_object_offset;
}
@@ -1795,7 +1848,7 @@
* its ref_count was at least 2, it will not vanish.
*/
backing_object->ref_count--;
- VM_OBJECT_UNLOCK(backing_object);
+ VM_OBJECT_WUNLOCK(backing_object);
object_bypasses++;
}
@@ -1836,10 +1889,9 @@
int options)
{
vm_page_t p, next;
- int wirings;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_PHYS) ||
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
(options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
("vm_object_page_remove: illegal options for object %p", object));
if (object->resident_page_count == 0)
@@ -1864,50 +1916,44 @@
* not specified.
*/
vm_page_lock(p);
- if ((wirings = p->wire_count) != 0 &&
- (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
- if ((options & OBJPR_NOTMAPPED) == 0) {
+ if (vm_page_xbusied(p)) {
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(p, "vmopax", true);
+ VM_OBJECT_WLOCK(object);
+ goto again;
+ }
+ if (p->wire_count != 0) {
+ if ((options & OBJPR_NOTMAPPED) == 0)
pmap_remove_all(p);
- /* Account for removal of wired mappings. */
- if (wirings != 0)
- p->wire_count -= wirings;
- }
if ((options & OBJPR_CLEANONLY) == 0) {
p->valid = 0;
vm_page_undirty(p);
}
- vm_page_unlock(p);
- continue;
+ goto next;
}
- if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
+ if (vm_page_busied(p)) {
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(p, "vmopar", false);
+ VM_OBJECT_WLOCK(object);
goto again;
+ }
KASSERT((p->flags & PG_FICTITIOUS) == 0,
("vm_object_page_remove: page %p is fictitious", p));
if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
if ((options & OBJPR_NOTMAPPED) == 0)
pmap_remove_write(p);
- if (p->dirty) {
- vm_page_unlock(p);
- continue;
- }
+ if (p->dirty)
+ goto next;
}
- if ((options & OBJPR_NOTMAPPED) == 0) {
+ if ((options & OBJPR_NOTMAPPED) == 0)
pmap_remove_all(p);
- /* Account for removal of wired mappings. */
- if (wirings != 0) {
- KASSERT(p->wire_count == wirings,
- ("inconsistent wire count %d %d %p",
- p->wire_count, wirings, p));
- p->wire_count = 0;
- atomic_subtract_int(&cnt.v_wire_count, 1);
- }
- }
vm_page_free(p);
+next:
vm_page_unlock(p);
}
vm_object_pip_wakeup(object);
skipmemq:
- if (__predict_false(object->cache != NULL))
+ if (__predict_false(!vm_object_cache_is_empty(object)))
vm_page_cache_free(object, start, end);
}
@@ -1923,7 +1969,7 @@
* pages are moved to the cache queue.
*
* This operation should only be performed on objects that
- * contain managed pages.
+ * contain non-fictitious, managed pages.
*
* The object must be locked.
*/
@@ -1933,9 +1979,8 @@
struct mtx *mtx, *new_mtx;
vm_page_t p, next;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG &&
- object->type != OBJT_PHYS),
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
("vm_object_page_cache: illegal object %p", object));
if (object->resident_page_count == 0)
return;
@@ -1982,10 +2027,9 @@
vm_pindex_t pindex;
int rv;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
for (pindex = start; pindex < end; pindex++) {
- m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
- VM_ALLOC_RETRY);
+ m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
if (m->valid != VM_PAGE_BITS_ALL) {
ma[0] = m;
rv = vm_pager_get_pages(object, ma, 1, 0);
@@ -2007,7 +2051,7 @@
if (pindex > start) {
m = vm_page_lookup(object, start);
while (m != NULL && m->pindex < pindex) {
- vm_page_wakeup(m);
+ vm_page_xunbusy(m);
m = TAILQ_NEXT(m, listq);
}
}
@@ -2043,10 +2087,11 @@
if (prev_object == NULL)
return (TRUE);
- VM_OBJECT_LOCK(prev_object);
- if (prev_object->type != OBJT_DEFAULT &&
- prev_object->type != OBJT_SWAP) {
- VM_OBJECT_UNLOCK(prev_object);
+ VM_OBJECT_WLOCK(prev_object);
+ if ((prev_object->type != OBJT_DEFAULT &&
+ prev_object->type != OBJT_SWAP) ||
+ (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
+ VM_OBJECT_WUNLOCK(prev_object);
return (FALSE);
}
@@ -2061,7 +2106,7 @@
* pages not mapped to prev_entry may be in use anyway)
*/
if (prev_object->backing_object != NULL) {
- VM_OBJECT_UNLOCK(prev_object);
+ VM_OBJECT_WUNLOCK(prev_object);
return (FALSE);
}
@@ -2071,7 +2116,7 @@
if ((prev_object->ref_count > 1) &&
(prev_object->size != next_pindex)) {
- VM_OBJECT_UNLOCK(prev_object);
+ VM_OBJECT_WUNLOCK(prev_object);
return (FALSE);
}
@@ -2092,6 +2137,7 @@
*/
if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
prev_object->cred)) {
+ VM_OBJECT_WUNLOCK(prev_object);
return (FALSE);
}
prev_object->charge += ptoa(next_size);
@@ -2125,7 +2171,7 @@
if (next_pindex + next_size > prev_object->size)
prev_object->size = next_pindex + next_size;
- VM_OBJECT_UNLOCK(prev_object);
+ VM_OBJECT_WUNLOCK(prev_object);
return (TRUE);
}
@@ -2133,9 +2179,14 @@
vm_object_set_writeable_dirty(vm_object_t object)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (object->type != OBJT_VNODE)
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (object->type != OBJT_VNODE) {
+ if ((object->flags & OBJ_TMPFS_NODE) != 0) {
+ KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs"));
+ vm_object_set_flag(object, OBJ_TMPFS_DIRTY);
+ }
return;
+ }
object->generation++;
if ((object->flags & OBJ_MIGHTBEDIRTY) != 0)
return;
@@ -2142,6 +2193,228 @@
vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
}
+/*
+ * vm_object_unwire:
+ *
+ * For each page offset within the specified range of the given object,
+ * find the highest-level page in the shadow chain and unwire it. A page
+ * must exist at every page offset, and the highest-level page must be
+ * wired.
+ */
+void
+vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
+ uint8_t queue)
+{
+ vm_object_t tobject;
+ vm_page_t m, tm;
+ vm_pindex_t end_pindex, pindex, tpindex;
+ int depth, locked_depth;
+
+ KASSERT((offset & PAGE_MASK) == 0,
+ ("vm_object_unwire: offset is not page aligned"));
+ KASSERT((length & PAGE_MASK) == 0,
+ ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
+ /* The wired count of a fictitious page never changes. */
+ if ((object->flags & OBJ_FICTITIOUS) != 0)
+ return;
+ pindex = OFF_TO_IDX(offset);
+ end_pindex = pindex + atop(length);
+ locked_depth = 1;
+ VM_OBJECT_RLOCK(object);
+ m = vm_page_find_least(object, pindex);
+ while (pindex < end_pindex) {
+ if (m == NULL || pindex < m->pindex) {
+ /*
+ * The first object in the shadow chain doesn't
+ * contain a page at the current index. Therefore,
+ * the page must exist in a backing object.
+ */
+ tobject = object;
+ tpindex = pindex;
+ depth = 0;
+ do {
+ tpindex +=
+ OFF_TO_IDX(tobject->backing_object_offset);
+ tobject = tobject->backing_object;
+ KASSERT(tobject != NULL,
+ ("vm_object_unwire: missing page"));
+ if ((tobject->flags & OBJ_FICTITIOUS) != 0)
+ goto next_page;
+ depth++;
+ if (depth == locked_depth) {
+ locked_depth++;
+ VM_OBJECT_RLOCK(tobject);
+ }
+ } while ((tm = vm_page_lookup(tobject, tpindex)) ==
+ NULL);
+ } else {
+ tm = m;
+ m = TAILQ_NEXT(m, listq);
+ }
+ vm_page_lock(tm);
+ vm_page_unwire(tm, queue);
+ vm_page_unlock(tm);
+next_page:
+ pindex++;
+ }
+ /* Release the accumulated object locks. */
+ for (depth = 0; depth < locked_depth; depth++) {
+ tobject = object->backing_object;
+ VM_OBJECT_RUNLOCK(object);
+ object = tobject;
+ }
+}
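
vm_object_unwire() accumulates read locks as it descends the shadow chain and releases them all once the walk is done; locked_depth tracks how many objects are currently read-locked. The sketch below models that accumulate-then-release pattern with pthread_rwlock_t, simplified to a single walk of the whole chain; the types are stand-ins, not the kernel structures:

    /*
     * Model of accumulate-then-release read locking along a chain of
     * backing objects, simplified to a single walk.
     */
    #include <pthread.h>
    #include <stddef.h>

    struct obj {
        pthread_rwlock_t lock;
        struct obj *backing;
    };

    void
    walk_chain(struct obj *o)
    {
        struct obj *t, *next;
        int depth, locked_depth;

        locked_depth = 1;
        pthread_rwlock_rdlock(&o->lock);
        depth = 0;
        for (t = o; t->backing != NULL;) {
            t = t->backing;
            depth++;
            /* Read-lock each newly reached level once; in the kernel
             * this guard matters because the walk restarts per page. */
            if (depth == locked_depth) {
                locked_depth++;
                pthread_rwlock_rdlock(&t->lock);
            }
        }
        /* Release the accumulated read locks, top-most object first. */
        for (t = o, depth = 0; depth < locked_depth; depth++) {
            next = t->backing;          /* read before unlocking */
            pthread_rwlock_unlock(&t->lock);
            t = next;
        }
    }
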
+
+struct vnode *
+vm_object_vnode(vm_object_t object)
+{
+
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if (object->type == OBJT_VNODE)
+ return (object->handle);
+ if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0)
+ return (object->un_pager.swp.swp_tmpfs);
+ return (NULL);
+}
+
+static int
+sysctl_vm_object_list(SYSCTL_HANDLER_ARGS)
+{
+ struct kinfo_vmobject *kvo;
+ char *fullpath, *freepath;
+ struct vnode *vp;
+ struct vattr va;
+ vm_object_t obj;
+ vm_page_t m;
+ int count, error;
+
+ if (req->oldptr == NULL) {
+ /*
+ * If an old buffer has not been provided, generate an
+ * estimate of the space needed for a subsequent call.
+ */
+ mtx_lock(&vm_object_list_mtx);
+ count = 0;
+ TAILQ_FOREACH(obj, &vm_object_list, object_list) {
+ if (obj->type == OBJT_DEAD)
+ continue;
+ count++;
+ }
+ mtx_unlock(&vm_object_list_mtx);
+ return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) *
+ count * 11 / 10));
+ }
+
+ kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK);
+ error = 0;
+
+ /*
+ * VM objects are type stable and are never removed from the
+ * list once added. This allows us to safely read obj->object_list
+ * after reacquiring the VM object lock.
+ */
+ mtx_lock(&vm_object_list_mtx);
+ TAILQ_FOREACH(obj, &vm_object_list, object_list) {
+ if (obj->type == OBJT_DEAD)
+ continue;
+ VM_OBJECT_RLOCK(obj);
+ if (obj->type == OBJT_DEAD) {
+ VM_OBJECT_RUNLOCK(obj);
+ continue;
+ }
+ mtx_unlock(&vm_object_list_mtx);
+ kvo->kvo_size = ptoa(obj->size);
+ kvo->kvo_resident = obj->resident_page_count;
+ kvo->kvo_ref_count = obj->ref_count;
+ kvo->kvo_shadow_count = obj->shadow_count;
+ kvo->kvo_memattr = obj->memattr;
+ kvo->kvo_active = 0;
+ kvo->kvo_inactive = 0;
+ TAILQ_FOREACH(m, &obj->memq, listq) {
+ /*
+ * A page may belong to the object but be
+ * dequeued and set to PQ_NONE while the
+ * object lock is not held. This makes the
+ * reads of m->queue below racy, and we do not
+ * count pages set to PQ_NONE. However, this
+ * sysctl is only meant to give an
+ * approximation of the system anyway.
+ */
+ if (m->queue == PQ_ACTIVE)
+ kvo->kvo_active++;
+ else if (m->queue == PQ_INACTIVE)
+ kvo->kvo_inactive++;
+ }
+
+ kvo->kvo_vn_fileid = 0;
+ kvo->kvo_vn_fsid = 0;
+ freepath = NULL;
+ fullpath = "";
+ vp = NULL;
+ switch (obj->type) {
+ case OBJT_DEFAULT:
+ kvo->kvo_type = KVME_TYPE_DEFAULT;
+ break;
+ case OBJT_VNODE:
+ kvo->kvo_type = KVME_TYPE_VNODE;
+ vp = obj->handle;
+ vref(vp);
+ break;
+ case OBJT_SWAP:
+ kvo->kvo_type = KVME_TYPE_SWAP;
+ break;
+ case OBJT_DEVICE:
+ kvo->kvo_type = KVME_TYPE_DEVICE;
+ break;
+ case OBJT_PHYS:
+ kvo->kvo_type = KVME_TYPE_PHYS;
+ break;
+ case OBJT_DEAD:
+ kvo->kvo_type = KVME_TYPE_DEAD;
+ break;
+ case OBJT_SG:
+ kvo->kvo_type = KVME_TYPE_SG;
+ break;
+ case OBJT_MGTDEVICE:
+ kvo->kvo_type = KVME_TYPE_MGTDEVICE;
+ break;
+ default:
+ kvo->kvo_type = KVME_TYPE_UNKNOWN;
+ break;
+ }
+ VM_OBJECT_RUNLOCK(obj);
+ if (vp != NULL) {
+ vn_fullpath(curthread, vp, &fullpath, &freepath);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) {
+ kvo->kvo_vn_fileid = va.va_fileid;
+ kvo->kvo_vn_fsid = va.va_fsid;
+ }
+ vput(vp);
+ }
+
+ strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ /* Pack record size down */
+ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path)
+ + strlen(kvo->kvo_path) + 1;
+ kvo->kvo_structsize = roundup(kvo->kvo_structsize,
+ sizeof(uint64_t));
+ error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize);
+ mtx_lock(&vm_object_list_mtx);
+ if (error)
+ break;
+ }
+ mtx_unlock(&vm_object_list_mtx);
+ free(kvo, M_TEMP);
+ return (error);
+}
+SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
+ "List of VM objects");
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
@@ -2206,12 +2479,6 @@
/* sx_sunlock(&allproc_lock); */
if (_vm_object_in_map(kernel_map, object, 0))
return 1;
- if (_vm_object_in_map(kmem_map, object, 0))
- return 1;
- if (_vm_object_in_map(pager_map, object, 0))
- return 1;
- if (_vm_object_in_map(buffer_map, object, 0))
- return 1;
return 0;
}
Modified: trunk/sys/vm/vm_object.h
===================================================================
--- trunk/sys/vm/vm_object.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_object.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $
*/
/*
@@ -70,24 +71,40 @@
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <sys/_rwlock.h>
+#include <vm/_vm_radix.h>
+
/*
* Types defined:
*
* vm_object_t Virtual memory object.
*
+ * The root of the cached pages pool is protected by both the per-object lock
+ * and the free pages queue mutex.
+ * On insert in the cache radix trie, the per-object lock is expected
+ * to be already held and the free pages queue mutex will be
+ * acquired during the operation too.
+ * On remove and lookup from the cache radix trie, only the free
+ * pages queue mutex is expected to be locked.
+ * These rules allow for reliably checking for the presence of cached
+ * pages with only the per-object lock held, thereby reducing contention
+ * for the free pages queue mutex.
+ *
* List of locks
* (c) const until freed
+ * (o) per-object lock
+ * (f) free pages queue mutex
*
*/
struct vm_object {
- struct mtx mtx;
+ struct rwlock lock;
TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
- TAILQ_HEAD(, vm_page) memq; /* list of resident pages */
- vm_page_t root; /* root of the resident page splay tree */
+ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+ struct vm_radix rtree; /* root of the resident page radix trie*/
vm_pindex_t size; /* Object size */
int generation; /* generation ID */
int ref_count; /* How many refs?? */
@@ -96,13 +113,13 @@
objtype_t type; /* type of pager */
u_short flags; /* see below */
u_short pg_color; /* (c) color of first page in obj */
- u_short pad1; /* Old pip counter */
+ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
int resident_page_count; /* number of resident pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
LIST_HEAD(, vm_reserv) rvq; /* list of reservations */
- vm_page_t cache; /* root of the cache page splay tree */
+ struct vm_radix cache; /* (o + f) root of the cache page radix trie */
void *handle;
union {
/*
@@ -123,6 +140,7 @@
struct {
TAILQ_HEAD(, vm_page) devp_pglist;
struct cdev_pager_ops *ops;
+ struct cdev *dev;
} devp;
/*
@@ -137,33 +155,46 @@
/*
* Swap pager
*
+ * swp_tmpfs - back-pointer to the tmpfs vnode,
+ * if any, which uses the vm object
+ * as backing store. The handle
+ * cannot be reused for linking,
+ * because the vnode can be
+ * reclaimed and recreated, making
+ * the handle changed and hash-chain
+ * invalid.
+ *
* swp_bcount - number of swap 'swblock' metablocks, each
* contains up to 16 swapblk assignments.
* see vm/swap_pager.h
*/
struct {
+ void *swp_tmpfs;
int swp_bcount;
} swp;
} un_pager;
struct ucred *cred;
vm_ooffset_t charge;
- u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
};
/*
* Flags
*/
-#define OBJ_ACTIVE 0x0004 /* active objects */
+#define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */
+#define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */
#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
#define OBJ_NOSPLIT 0x0010 /* dont split this object */
#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */
+#define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */
+#define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */
#define OBJ_COLORED 0x1000 /* pg_color is defined */
#define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */
#define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */
+#define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */
-#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
-#define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
+#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
#ifdef _KERNEL
@@ -188,16 +219,32 @@
#define kernel_object (&kernel_object_store)
#define kmem_object (&kmem_object_store)
-#define VM_OBJECT_LOCK(object) mtx_lock(&(object)->mtx)
-#define VM_OBJECT_LOCK_ASSERT(object, type) \
- mtx_assert(&(object)->mtx, (type))
-#define VM_OBJECT_LOCK_INIT(object, type) \
- mtx_init(&(object)->mtx, "vm object", \
- (type), MTX_DEF | MTX_DUPOK)
-#define VM_OBJECT_LOCKED(object) mtx_owned(&(object)->mtx)
-#define VM_OBJECT_MTX(object) (&(object)->mtx)
-#define VM_OBJECT_TRYLOCK(object) mtx_trylock(&(object)->mtx)
-#define VM_OBJECT_UNLOCK(object) mtx_unlock(&(object)->mtx)
+#define VM_OBJECT_ASSERT_LOCKED(object) \
+ rw_assert(&(object)->lock, RA_LOCKED)
+#define VM_OBJECT_ASSERT_RLOCKED(object) \
+ rw_assert(&(object)->lock, RA_RLOCKED)
+#define VM_OBJECT_ASSERT_WLOCKED(object) \
+ rw_assert(&(object)->lock, RA_WLOCKED)
+#define VM_OBJECT_ASSERT_UNLOCKED(object) \
+ rw_assert(&(object)->lock, RA_UNLOCKED)
+#define VM_OBJECT_LOCK_DOWNGRADE(object) \
+ rw_downgrade(&(object)->lock)
+#define VM_OBJECT_RLOCK(object) \
+ rw_rlock(&(object)->lock)
+#define VM_OBJECT_RUNLOCK(object) \
+ rw_runlock(&(object)->lock)
+#define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \
+ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo))
+#define VM_OBJECT_TRYRLOCK(object) \
+ rw_try_rlock(&(object)->lock)
+#define VM_OBJECT_TRYWLOCK(object) \
+ rw_try_wlock(&(object)->lock)
+#define VM_OBJECT_TRYUPGRADE(object) \
+ rw_try_upgrade(&(object)->lock)
+#define VM_OBJECT_WLOCK(object) \
+ rw_wlock(&(object)->lock)
+#define VM_OBJECT_WUNLOCK(object) \
+ rw_wunlock(&(object)->lock)
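
The macro rework above is the heart of the patch: the per-object mutex becomes an rwlock, so read-mostly consumers (lookups, the sysctl handler) can take a shared lock while mutators still take the exclusive lock. A sketch of that discipline with pthread_rwlock_t and hypothetical OBJ_*LOCK wrappers, not the kernel macros:

    /*
     * Sketch of the reader/writer discipline enabled by the new macros.
     */
    #include <pthread.h>

    struct vobj {
        pthread_rwlock_t lock;
        int resident_count;             /* some read-mostly state */
    };

    #define OBJ_RLOCK(o)    pthread_rwlock_rdlock(&(o)->lock)
    #define OBJ_RUNLOCK(o)  pthread_rwlock_unlock(&(o)->lock)
    #define OBJ_WLOCK(o)    pthread_rwlock_wrlock(&(o)->lock)
    #define OBJ_WUNLOCK(o)  pthread_rwlock_unlock(&(o)->lock)

    /* Read-only consumers (lookups, stats) share the lock. */
    int
    read_resident_count(struct vobj *o)
    {
        int n;

        OBJ_RLOCK(o);
        n = o->resident_count;
        OBJ_RUNLOCK(o);
        return (n);
    }

    /* Mutators (insert, remove, collapse) still take it exclusively. */
    void
    bump_resident_count(struct vobj *o)
    {
        OBJ_WLOCK(o);
        o->resident_count++;
        OBJ_WUNLOCK(o);
    }
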
/*
* The object must be locked or thread private.
@@ -216,8 +263,14 @@
void vm_object_pip_wakeupn(vm_object_t object, short i);
void vm_object_pip_wait(vm_object_t object, char *waitid);
+static __inline boolean_t
+vm_object_cache_is_empty(vm_object_t object)
+{
+
+ return (vm_radix_is_empty(&object->cache));
+}
+
vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
-void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t);
boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
boolean_t);
void vm_object_collapse (vm_object_t);
@@ -242,6 +295,9 @@
void vm_object_split(vm_map_entry_t);
boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
boolean_t);
+void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
+ vm_size_t length, uint8_t queue);
+struct vnode *vm_object_vnode(vm_object_t object);
#endif /* _KERNEL */
#endif /* _VM_OBJECT_ */
Modified: trunk/sys/vm/vm_page.c
===================================================================
--- trunk/sys/vm/vm_page.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_page.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
@@ -63,11 +64,16 @@
/*
* GENERAL RULES ON VM_PAGE MANIPULATION
*
- * - a pageq mutex is required when adding or removing a page from a
- * page queue (vm_page_queue[]), regardless of other mutexes or the
- * busy state of a page.
+ * - A page queue lock is required when adding or removing a page from a
+ * page queue regardless of other locks or the busy state of a page.
*
- * - The object mutex is held when inserting or removing
+ * * In general, no thread besides the page daemon can acquire or
+ * hold more than one page queue lock at a time.
+ *
+ * * The page daemon can acquire and hold any pair of page queue
+ * locks in any order.
+ *
+ * - The object lock is required when inserting or removing
* pages from an object (vm_page_insert() or vm_page_remove()).
*
*/
@@ -77,7 +83,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $");
#include "opt_vm.h"
@@ -87,9 +93,11 @@
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
+#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
@@ -103,6 +111,7 @@
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -115,11 +124,10 @@
* page structure.
*/
-struct vpgqueues vm_page_queues[PQ_COUNT];
-struct vpglocks vm_page_queue_lock;
-struct vpglocks vm_page_queue_free_lock;
+struct vm_domain vm_dom[MAXMEMDOM];
+struct mtx_padalign vm_page_queue_free_mtx;
-struct vpglocks pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign pa_lock[PA_LOCK_COUNT];
vm_page_t vm_page_array;
long vm_page_array_size;
@@ -131,16 +139,21 @@
SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
"number of pages allocated for bootstrapping the VM system");
-int pa_tryrelock_restart;
+static int pa_tryrelock_restart;
SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
&pa_tryrelock_restart, 0, "Number of tryrelock restarts");
static uma_zone_t fakepg_zone;
+static struct vnode *vm_page_alloc_init(vm_page_t m);
+static void vm_page_cache_turn_free(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_queue_remove(int queue, vm_page_t m);
static void vm_page_enqueue(int queue, vm_page_t m);
static void vm_page_init_fakepg(void *dummy);
+static int vm_page_insert_after(vm_page_t m, vm_object_t object,
+ vm_pindex_t pindex, vm_page_t mpred);
+static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
+ vm_page_t mpred);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
@@ -233,20 +246,46 @@
return (0);
}
+static void
+vm_page_domain_init(struct vm_domain *vmd)
+{
+ struct vm_pagequeue *pq;
+ int i;
+
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
+ "vm inactive pagequeue";
+ *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
+ &cnt.v_inactive_count;
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
+ "vm active pagequeue";
+ *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
+ &cnt.v_active_count;
+ vmd->vmd_page_count = 0;
+ vmd->vmd_free_count = 0;
+ vmd->vmd_segs = 0;
+ vmd->vmd_oom = FALSE;
+ vmd->vmd_pass = 0;
+ for (i = 0; i < PQ_COUNT; i++) {
+ pq = &vmd->vmd_pagequeues[i];
+ TAILQ_INIT(&pq->pq_pl);
+ mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
+ MTX_DEF | MTX_DUPOK);
+ }
+}
+
/*
* vm_page_startup:
*
- * Initializes the resident memory module.
- *
- * Allocates memory for the page cells, and
- * for the object/offset-to-page hash table headers.
- * Each page cell is initialized and placed on the free list.
+ * Initializes the resident memory module. Allocates physical memory for
+ * bootstrapping UMA and some data structures that are used to manage
+ * physical pages. Initializes these structures, and populates the free
+ * page queues.
*/
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
vm_offset_t mapped;
- vm_paddr_t page_range;
+ vm_paddr_t high_avail, low_avail, page_range, size;
vm_paddr_t new_end;
int i;
vm_paddr_t pa;
@@ -256,7 +295,6 @@
/* the biggest memory array is the second group of pages */
vm_paddr_t end;
vm_paddr_t biggestsize;
- vm_paddr_t low_water, high_water;
int biggestone;
biggestsize = 0;
@@ -268,48 +306,34 @@
phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
}
- low_water = phys_avail[0];
- high_water = phys_avail[1];
+#ifdef XEN
+ /*
+ * There is no obvious reason why i386 PV Xen needs vm_page structs
+ * created for these pseudo-physical addresses. XXX
+ */
+ vm_phys_add_seg(0, phys_avail[0]);
+#endif
for (i = 0; phys_avail[i + 1]; i += 2) {
- vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
-
+ size = phys_avail[i + 1] - phys_avail[i];
if (size > biggestsize) {
biggestone = i;
biggestsize = size;
}
- if (phys_avail[i] < low_water)
- low_water = phys_avail[i];
- if (phys_avail[i + 1] > high_water)
- high_water = phys_avail[i + 1];
}
-#ifdef XEN
- low_water = 0;
-#endif
-
end = phys_avail[biggestone+1];
/*
* Initialize the page and queue locks.
*/
- mtx_init(&vm_page_queue_mtx, "vm page queue", NULL, MTX_DEF |
- MTX_RECURSE);
mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
for (i = 0; i < PA_LOCK_COUNT; i++)
- mtx_init(&pa_lock[i].data, "vm page", NULL, MTX_DEF);
+ mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
+ for (i = 0; i < vm_ndomains; i++)
+ vm_page_domain_init(&vm_dom[i]);
/*
- * Initialize the queue headers for the hold queue, the active queue,
- * and the inactive queue.
- */
- for (i = 0; i < PQ_COUNT; i++)
- TAILQ_INIT(&vm_page_queues[i].pl);
- vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
- vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
- vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
-
- /*
* Allocate memory for use when boot strapping the kernel memory
* allocator.
*/
@@ -344,6 +368,16 @@
new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
bzero((void *)vm_page_dump, vm_page_dump_size);
#endif
+#if defined(__amd64__) || defined(__mips__)
+ /*
+ * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
+ * When pmap_map() uses the direct map, they are not automatically
+ * included.
+ */
+ for (pa = new_end; pa < end; pa += PAGE_SIZE)
+ dump_add_page(pa);
+#endif
+ phys_avail[biggestone + 1] = new_end;
#ifdef __amd64__
/*
* Request that the physical pages underlying the message buffer be
@@ -359,29 +393,80 @@
#endif
/*
* Compute the number of pages of memory that will be available for
- * use (taking into account the overhead of a page structure per
- * page).
+ * use, taking into account the overhead of a page structure per page.
+ * In other words, solve
+ * "available physical memory" - round_page(page_range *
+ * sizeof(struct vm_page)) = page_range * PAGE_SIZE
+ * for page_range.
*/
- first_page = low_water / PAGE_SIZE;
+ low_avail = phys_avail[0];
+ high_avail = phys_avail[1];
+ for (i = 0; i < vm_phys_nsegs; i++) {
+ if (vm_phys_segs[i].start < low_avail)
+ low_avail = vm_phys_segs[i].start;
+ if (vm_phys_segs[i].end > high_avail)
+ high_avail = vm_phys_segs[i].end;
+ }
+ /* Skip the first chunk. It is already accounted for. */
+ for (i = 2; phys_avail[i + 1] != 0; i += 2) {
+ if (phys_avail[i] < low_avail)
+ low_avail = phys_avail[i];
+ if (phys_avail[i + 1] > high_avail)
+ high_avail = phys_avail[i + 1];
+ }
+ first_page = low_avail / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
- page_range = 0;
+ size = 0;
+ for (i = 0; i < vm_phys_nsegs; i++)
+ size += vm_phys_segs[i].end - vm_phys_segs[i].start;
for (i = 0; phys_avail[i + 1] != 0; i += 2)
- page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+ size += phys_avail[i + 1] - phys_avail[i];
#elif defined(VM_PHYSSEG_DENSE)
- page_range = high_water / PAGE_SIZE - first_page;
+ size = high_avail - low_avail;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
+
+#ifdef VM_PHYSSEG_DENSE
+ /*
+ * In the VM_PHYSSEG_DENSE case, the number of pages can account for
+ * the overhead of a page structure per page only if vm_page_array is
+ * allocated from the last physical memory chunk. Otherwise, we must
+ * allocate page structures representing the physical memory
+ * underlying vm_page_array, even though they will not be used.
+ */
+ if (new_end != high_avail)
+ page_range = size / PAGE_SIZE;
+ else
+#endif
+ {
+ page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
+
+ /*
+ * If the partial bytes remaining are large enough for
+ * a page (PAGE_SIZE) without a corresponding
+ * 'struct vm_page', then new_end will contain an
+ * extra page after subtracting the length of the VM
+ * page array. Compensate by subtracting an extra
+ * page from new_end.
+ */
+ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
+ if (new_end == high_avail)
+ high_avail -= PAGE_SIZE;
+ new_end -= PAGE_SIZE;
+ }
+ }
end = new_end;
/*
* Reserve an unmapped guard page to trap access to vm_page_array[-1].
+ * However, because this page is allocated from KVM, out-of-bounds
+ * accesses using the direct map will not be trapped.
*/
vaddr += PAGE_SIZE;
/*
- * Initialize the mem entry structures now, and put them in the free
- * queue.
+ * Allocate physical memory for the page structures, and map it.
*/
new_end = trunc_page(end - page_range * sizeof(struct vm_page));
mapped = pmap_map(&vaddr, new_end, end,
@@ -389,24 +474,30 @@
vm_page_array = (vm_page_t) mapped;
#if VM_NRESERVLEVEL > 0
/*
- * Allocate memory for the reservation management system's data
- * structures.
+ * Allocate physical memory for the reservation management system's
+ * data structures, and map it.
*/
- new_end = vm_reserv_startup(&vaddr, new_end, high_water);
+ if (high_avail == end)
+ high_avail = new_end;
+ new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
#endif
#if defined(__amd64__) || defined(__mips__)
/*
- * pmap_map on amd64 and mips can come out of the direct-map, not kvm
- * like i386, so the pages must be tracked for a crashdump to include
- * this data. This includes the vm_page_array and the early UMA
- * bootstrap pages.
+ * Include vm_page_array and vm_reserv_array in a crash dump.
*/
- for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
+ for (pa = new_end; pa < end; pa += PAGE_SIZE)
dump_add_page(pa);
#endif
phys_avail[biggestone + 1] = new_end;
/*
+ * Add physical memory segments corresponding to the available
+ * physical pages.
+ */
+ for (i = 0; phys_avail[i + 1] != 0; i += 2)
+ vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
+
+ /*
* Clear all of the page structures
*/
bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
@@ -449,130 +540,191 @@
return (vaddr);
}
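
The page_range computation in vm_page_startup() solves "available memory = page_range * PAGE_SIZE + page_range * sizeof(struct vm_page)" by dividing the segment size by PAGE_SIZE plus the page structure size. A standalone worked example with assumed sizes (4 KiB pages, a 104-byte page structure, a 1 GiB segment):

    /*
     * Worked example: every usable page must also pay for its own
     * struct vm_page, so
     * page_range = size / (PAGE_SIZE + sizeof(struct vm_page)).
     * The sizes below are assumptions for illustration only.
     */
    #include <stdio.h>

    int
    main(void)
    {
        const unsigned long page_size = 4096;       /* 4 KiB pages */
        const unsigned long vm_page_size = 104;     /* hypothetical */
        const unsigned long size = 1UL << 30;       /* 1 GiB segment */
        unsigned long page_range, array_bytes;

        page_range = size / (page_size + vm_page_size);
        array_bytes = page_range * vm_page_size;
        printf("%lu usable pages, %lu bytes of page structures\n",
            page_range, array_bytes);
        /* Sanity check: structures plus usable pages fit in the segment. */
        printf("consumed %lu of %lu bytes\n",
            array_bytes + page_range * page_size, size);
        return (0);
    }
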
-
-CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
-
void
-vm_page_aflag_set(vm_page_t m, uint8_t bits)
+vm_page_reference(vm_page_t m)
{
- uint32_t *addr, val;
- /*
- * The PGA_WRITEABLE flag can only be set if the page is managed and
- * VPO_BUSY. Currently, this flag is only set by pmap_enter().
- */
- KASSERT((bits & PGA_WRITEABLE) == 0 ||
- (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
- ("PGA_WRITEABLE and !VPO_BUSY"));
+ vm_page_aflag_set(m, PGA_REFERENCED);
+}
- /*
- * We want to use atomic updates for m->aflags, which is a
- * byte wide. Not all architectures provide atomic operations
- * on the single-byte destination. Punt and access the whole
- * 4-byte word with an atomic update. Parallel non-atomic
- * updates to the fields included in the update by proximity
- * are handled properly by atomics.
- */
- addr = (void *)&m->aflags;
- MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
- val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
- val <<= 24;
-#endif
- atomic_set_32(addr, val);
-}
-
+/*
+ * vm_page_busy_downgrade:
+ *
+ * Downgrade an exclusive busy page into a single shared busy page.
+ */
void
-vm_page_aflag_clear(vm_page_t m, uint8_t bits)
+vm_page_busy_downgrade(vm_page_t m)
{
- uint32_t *addr, val;
+ u_int x;
+ bool locked;
- /*
- * The PGA_REFERENCED flag can only be cleared if the object
- * containing the page is locked.
- */
- KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
- ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
+ vm_page_assert_xbusied(m);
+ locked = mtx_owned(vm_page_lockptr(m));
- /*
- * See the comment in vm_page_aflag_set().
- */
- addr = (void *)&m->aflags;
- MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
- val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
- val <<= 24;
-#endif
- atomic_clear_32(addr, val);
+ for (;;) {
+ x = m->busy_lock;
+ x &= VPB_BIT_WAITERS;
+ if (x != 0 && !locked)
+ vm_page_lock(m);
+ if (atomic_cmpset_rel_int(&m->busy_lock,
+ VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
+ break;
+ if (x != 0 && !locked)
+ vm_page_unlock(m);
+ }
+ if (x != 0) {
+ wakeup(m);
+ if (!locked)
+ vm_page_unlock(m);
+ }
}
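
The new busy_lock word replaces VPO_BUSY and m->busy: a single integer encodes unbusied, exclusive, a shared-holder count, and a waiters bit, and transitions such as the downgrade above are compare-and-swap loops. The sketch below models two of those transitions with C11 atomics; the bit layout and names are illustrative, not the kernel's VPB_* values:

    /*
     * Userland model of the busy_lock word.  Bit layout is illustrative.
     */
    #include <stdatomic.h>
    #include <stdbool.h>

    #define B_SHARED        0x1u    /* shared-busy mode */
    #define B_EXCLUSIVE     0x2u    /* exclusive-busy mode, one owner */
    #define B_WAITERS       0x4u    /* a thread sleeps on this page */
    #define B_ONE_SHARER    0x8u    /* one shared-busy reference */
    #define B_SHARERS(n)    (B_SHARED | (n) * B_ONE_SHARER)
    #define B_UNBUSIED      B_SHARERS(0)

    /*
     * Downgrade an exclusive busy into a single shared busy and report
     * whether waiters need to be woken up by the caller.
     */
    bool
    busy_downgrade(atomic_uint *busy, bool *had_waiters)
    {
        unsigned int x;

        for (;;) {
            x = atomic_load(busy);
            if ((x & B_EXCLUSIVE) == 0)
                return (false);         /* caller does not own it */
            if (atomic_compare_exchange_weak(busy, &x, B_SHARERS(1))) {
                *had_waiters = (x & B_WAITERS) != 0;
                return (true);
            }
        }
    }

    /* Try to take one shared-busy reference; never sleeps. */
    bool
    busy_trysbusy(atomic_uint *busy)
    {
        unsigned int x;

        for (;;) {
            x = atomic_load(busy);
            if ((x & B_SHARED) == 0)
                return (false);         /* exclusively busied */
            if (atomic_compare_exchange_weak(busy, &x, x + B_ONE_SHARER))
                return (true);
        }
    }
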
-void
-vm_page_reference(vm_page_t m)
+/*
+ * vm_page_sbusied:
+ *
+ * Return a positive value if the page is shared busied, 0 otherwise.
+ */
+int
+vm_page_sbusied(vm_page_t m)
{
+ u_int x;
- vm_page_aflag_set(m, PGA_REFERENCED);
+ x = m->busy_lock;
+ return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
}
+/*
+ * vm_page_sunbusy:
+ *
+ * Shared unbusy a page.
+ */
void
-vm_page_busy(vm_page_t m)
+vm_page_sunbusy(vm_page_t m)
{
+ u_int x;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT((m->oflags & VPO_BUSY) == 0,
- ("vm_page_busy: page already busy!!!"));
- m->oflags |= VPO_BUSY;
+ vm_page_assert_sbusied(m);
+
+ for (;;) {
+ x = m->busy_lock;
+ if (VPB_SHARERS(x) > 1) {
+ if (atomic_cmpset_int(&m->busy_lock, x,
+ x - VPB_ONE_SHARER))
+ break;
+ continue;
+ }
+ if ((x & VPB_BIT_WAITERS) == 0) {
+ KASSERT(x == VPB_SHARERS_WORD(1),
+ ("vm_page_sunbusy: invalid lock state"));
+ if (atomic_cmpset_int(&m->busy_lock,
+ VPB_SHARERS_WORD(1), VPB_UNBUSIED))
+ break;
+ continue;
+ }
+ KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
+ ("vm_page_sunbusy: invalid lock state for waiters"));
+
+ vm_page_lock(m);
+ if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
+ vm_page_unlock(m);
+ continue;
+ }
+ wakeup(m);
+ vm_page_unlock(m);
+ break;
+ }
}
/*
- * vm_page_flash:
+ * vm_page_busy_sleep:
*
- * wakeup anyone waiting for the page.
+ * Sleep and release the page lock, using the page pointer as wchan.
+ * This is used to implement the hard-path of busying mechanism.
+ *
+ * The given page must be locked.
+ *
+ * If nonshared is true, sleep only if the page is xbusy.
*/
void
-vm_page_flash(vm_page_t m)
+vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
{
+ u_int x;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if (m->oflags & VPO_WANTED) {
- m->oflags &= ~VPO_WANTED;
- wakeup(m);
+ vm_page_assert_locked(m);
+
+ x = m->busy_lock;
+ if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
+ ((x & VPB_BIT_WAITERS) == 0 &&
+ !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
+ vm_page_unlock(m);
+ return;
}
+ msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
}
/*
- * vm_page_wakeup:
+ * vm_page_trysbusy:
*
- * clear the VPO_BUSY flag and wakeup anyone waiting for the
- * page.
- *
+ * Try to shared busy a page.
+ * If the operation succeeds, 1 is returned; otherwise, 0 is returned.
+ * The operation never sleeps.
*/
-void
-vm_page_wakeup(vm_page_t m)
+int
+vm_page_trysbusy(vm_page_t m)
{
+ u_int x;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
- m->oflags &= ~VPO_BUSY;
- vm_page_flash(m);
+ for (;;) {
+ x = m->busy_lock;
+ if ((x & VPB_BIT_SHARED) == 0)
+ return (0);
+ if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
+ return (1);
+ }
}
+/*
+ * vm_page_xunbusy_hard:
+ *
+ * Called after the first attempt to exclusively unbusy a page has failed.
+ * It is assumed that the waiters bit is on.
+ */
void
-vm_page_io_start(vm_page_t m)
+vm_page_xunbusy_hard(vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- m->busy++;
+ vm_page_assert_xbusied(m);
+
+ vm_page_lock(m);
+ atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+ wakeup(m);
+ vm_page_unlock(m);
}
+/*
+ * vm_page_flash:
+ *
+ * Wakeup anyone waiting for the page.
+ * The ownership bits do not change.
+ *
+ * The given page must be locked.
+ */
void
-vm_page_io_finish(vm_page_t m)
+vm_page_flash(vm_page_t m)
{
+ u_int x;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
- m->busy--;
- if (m->busy == 0)
- vm_page_flash(m);
+ vm_page_lock_assert(m, MA_OWNED);
+
+ for (;;) {
+ x = m->busy_lock;
+ if ((x & VPB_BIT_WAITERS) == 0)
+ return;
+ if (atomic_cmpset_int(&m->busy_lock, x,
+ x & (~VPB_BIT_WAITERS)))
+ break;
+ }
+ wakeup(m);
}
/*
@@ -594,9 +746,9 @@
{
vm_page_lock_assert(mem, MA_OWNED);
+ KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
--mem->hold_count;
- KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
- if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
+ if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
vm_page_free_toq(mem);
}
@@ -687,8 +839,10 @@
/* Fictitious pages don't use "segind". */
m->flags = PG_FICTITIOUS;
/* Fictitious pages don't use "order" or "pool". */
- m->oflags = VPO_BUSY | VPO_UNMANAGED;
+ m->oflags = VPO_UNMANAGED;
+ m->busy_lock = VPB_SINGLE_EXCLUSIVER;
m->wire_count = 1;
+ pmap_page_init(m);
memattr:
pmap_page_set_memattr(m, memattr);
}
@@ -766,16 +920,13 @@
* deactivating the page is usually the best choice,
* unless the page is wanted by another thread.
*/
- if (m->oflags & VPO_WANTED) {
- vm_page_lock(m);
+ vm_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
vm_page_activate(m);
- vm_page_unlock(m);
- } else {
- vm_page_lock(m);
+ else
vm_page_deactivate(m);
- vm_page_unlock(m);
- }
- vm_page_wakeup(m);
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
} else {
/*
* Free the completely invalid page. Such page state
@@ -790,35 +941,42 @@
}
/*
- * vm_page_sleep:
+ * vm_page_sleep_if_busy:
*
- * Sleep and release the page and page queues locks.
+ * Sleep and release the page queues lock if the page is busied.
+ * Returns TRUE if the thread slept.
*
- * The object containing the given page must be locked.
+ * The given page must be unlocked and object containing it must
+ * be locked.
*/
-void
-vm_page_sleep(vm_page_t m, const char *msg)
+int
+vm_page_sleep_if_busy(vm_page_t m, const char *msg)
{
+ vm_object_t obj;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if (mtx_owned(&vm_page_queue_mtx))
- vm_page_unlock_queues();
- if (mtx_owned(vm_page_lockptr(m)))
- vm_page_unlock(m);
+ vm_page_lock_assert(m, MA_NOTOWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
- /*
- * It's possible that while we sleep, the page will get
- * unbusied and freed. If we are holding the object
- * lock, we will assume we hold a reference to the object
- * such that even if m->object changes, we can re-lock
- * it.
- */
- m->oflags |= VPO_WANTED;
- msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
+ if (vm_page_busied(m)) {
+ /*
+ * The page-specific object must be cached because page
+ * identity can change during the sleep, causing the
+ * re-lock of a different object.
+ * It is assumed that a reference to the object is already
+ * held by the callers.
+ */
+ obj = m->object;
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ vm_page_busy_sleep(m, msg, false);
+ VM_OBJECT_WLOCK(obj);
+ return (TRUE);
+ }
+ return (FALSE);
}
/*
- * vm_page_dirty:
+ * vm_page_dirty_KBI: [ internal use only ]
*
* Set all bits in the page's dirty field.
*
@@ -826,11 +984,14 @@
* call is made from the machine-independent layer.
*
* See vm_page_clear_dirty_mask().
+ *
+ * This function should only be called by vm_page_dirty().
*/
void
-vm_page_dirty(vm_page_t m)
+vm_page_dirty_KBI(vm_page_t m)
{
+ /* These assertions refer to this operation by its public name. */
KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_dirty: page in cache!"));
KASSERT(!VM_PAGE_IS_FREE(m),
@@ -841,77 +1002,52 @@
}
/*
- * vm_page_splay:
+ * vm_page_insert: [ internal use only ]
*
- * Implements Sleator and Tarjan's top-down splay algorithm. Returns
- * the vm_page containing the given pindex. If, however, that
- * pindex is not found in the vm_object, returns a vm_page that is
- * adjacent to the pindex, coming before or after it.
+ * Inserts the given mem entry into the object and object list.
+ *
+ * The object must be locked.
*/
-vm_page_t
-vm_page_splay(vm_pindex_t pindex, vm_page_t root)
+int
+vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
- struct vm_page dummy;
- vm_page_t lefttreemax, righttreemin, y;
+ vm_page_t mpred;
- if (root == NULL)
- return (root);
- lefttreemax = righttreemin = &dummy;
- for (;; root = y) {
- if (pindex < root->pindex) {
- if ((y = root->left) == NULL)
- break;
- if (pindex < y->pindex) {
- /* Rotate right. */
- root->left = y->right;
- y->right = root;
- root = y;
- if ((y = root->left) == NULL)
- break;
- }
- /* Link into the new root's right tree. */
- righttreemin->left = root;
- righttreemin = root;
- } else if (pindex > root->pindex) {
- if ((y = root->right) == NULL)
- break;
- if (pindex > y->pindex) {
- /* Rotate left. */
- root->right = y->left;
- y->left = root;
- root = y;
- if ((y = root->right) == NULL)
- break;
- }
- /* Link into the new root's left tree. */
- lefttreemax->right = root;
- lefttreemax = root;
- } else
- break;
- }
- /* Assemble the new root. */
- lefttreemax->right = root->left;
- righttreemin->left = root->right;
- root->left = dummy.right;
- root->right = dummy.left;
- return (root);
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ mpred = vm_radix_lookup_le(&object->rtree, pindex);
+ return (vm_page_insert_after(m, object, pindex, mpred));
}
/*
- * vm_page_insert: [ internal use only ]
+ * vm_page_insert_after:
*
- * Inserts the given mem entry into the object and object list.
+ * Inserts the page "m" into the specified object at offset "pindex".
*
+ * The page "mpred" must immediately precede the offset "pindex" within
+ * the specified object.
+ *
* The object must be locked.
*/
-void
-vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
+static int
+vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
+ vm_page_t mpred)
{
- vm_page_t root;
+ vm_page_t msucc;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (m->object != NULL)
- panic("vm_page_insert: page already inserted");
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(m->object == NULL,
+ ("vm_page_insert_after: page already inserted"));
+ if (mpred != NULL) {
+ KASSERT(mpred->object == object,
+ ("vm_page_insert_after: object doesn't contain mpred"));
+ KASSERT(mpred->pindex < pindex,
+ ("vm_page_insert_after: mpred doesn't precede pindex"));
+ msucc = TAILQ_NEXT(mpred, listq);
+ } else
+ msucc = TAILQ_FIRST(&object->memq);
+ if (msucc != NULL)
+ KASSERT(msucc->pindex > pindex,
+ ("vm_page_insert_after: msucc doesn't succeed pindex"));
/*
* Record the object/offset pair in this page
@@ -922,29 +1058,45 @@
/*
* Now link into the object's ordered list of backed pages.
*/
- root = object->root;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
- TAILQ_INSERT_TAIL(&object->memq, m, listq);
- } else {
- root = vm_page_splay(pindex, root);
- if (pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- TAILQ_INSERT_BEFORE(root, m, listq);
- } else if (pindex == root->pindex)
- panic("vm_page_insert: offset already allocated");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
- }
+ if (vm_radix_insert(&object->rtree, m)) {
+ m->object = NULL;
+ m->pindex = 0;
+ return (1);
}
- object->root = m;
+ vm_page_insert_radixdone(m, object, mpred);
+ return (0);
+}
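
vm_page_insert_after() makes the ordered-insert contract explicit: the caller supplies the page that immediately precedes the new pindex, both neighbours are sanity-checked, and the radix insert can still fail (on trie-node allocation), in which case the object/pindex fields are rolled back. A plain-C sketch of the list half of that contract; in this toy version the insert always succeeds:

    /*
     * Sketch of the ordered-insert contract: "mpred" must be the element
     * immediately preceding the new index.  Toy list code only.
     */
    #include <assert.h>
    #include <stddef.h>
    #include <sys/queue.h>

    struct page {
        TAILQ_ENTRY(page) listq;
        unsigned long pindex;
    };
    TAILQ_HEAD(pglist, page);

    int
    page_insert_after(struct pglist *memq, struct page *m,
        unsigned long pindex, struct page *mpred)
    {
        struct page *msucc;

        if (mpred != NULL) {
            /* mpred must precede the new index. */
            assert(mpred->pindex < pindex);
            msucc = TAILQ_NEXT(mpred, listq);
        } else
            msucc = TAILQ_FIRST(memq);
        /* The successor, if any, must come after the new index. */
        assert(msucc == NULL || msucc->pindex > pindex);

        m->pindex = pindex;
        if (mpred != NULL)
            TAILQ_INSERT_AFTER(memq, mpred, m, listq);
        else
            TAILQ_INSERT_HEAD(memq, m, listq);
        return (0);     /* the kernel version can also return failure */
    }
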
+/*
+ * vm_page_insert_radixdone:
+ *
+ * Complete page "m" insertion into the specified object after the
+ * radix trie hooking.
+ *
+ * The page "mpred" must precede the offset "m->pindex" within the
+ * specified object.
+ *
+ * The object must be locked.
+ */
+static void
+vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
+{
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(object != NULL && m->object == object,
+ ("vm_page_insert_radixdone: page %p has inconsistent object", m));
+ if (mpred != NULL) {
+ KASSERT(mpred->object == object,
+ ("vm_page_insert_after: object doesn't contain mpred"));
+ KASSERT(mpred->pindex < m->pindex,
+ ("vm_page_insert_after: mpred doesn't precede pindex"));
+ }
+
+ if (mpred != NULL)
+ TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
+ else
+ TAILQ_INSERT_HEAD(&object->memq, m, listq);
+
/*
* Show that the object has one more resident page.
*/
@@ -977,57 +1129,30 @@
vm_page_remove(vm_page_t m)
{
vm_object_t object;
- vm_page_t next, prev, root;
+ boolean_t lockacq;
if ((m->oflags & VPO_UNMANAGED) == 0)
vm_page_lock_assert(m, MA_OWNED);
if ((object = m->object) == NULL)
return;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (m->oflags & VPO_BUSY) {
- m->oflags &= ~VPO_BUSY;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (vm_page_xbusied(m)) {
+ lockacq = FALSE;
+ if ((m->oflags & VPO_UNMANAGED) != 0 &&
+ !mtx_owned(vm_page_lockptr(m))) {
+ lockacq = TRUE;
+ vm_page_lock(m);
+ }
vm_page_flash(m);
+ atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+ if (lockacq)
+ vm_page_unlock(m);
}
/*
* Now remove from the object's list of backed pages.
*/
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_remove: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_remove: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
+ vm_radix_remove(&object->rtree, m->pindex);
TAILQ_REMOVE(&object->memq, m, listq);
/*
@@ -1055,15 +1180,9 @@
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((m = object->root) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->root = m)->pindex != pindex)
- m = NULL;
- }
- return (m);
+ VM_OBJECT_ASSERT_LOCKED(object);
+ return (vm_radix_lookup(&object->rtree, pindex));
}
/*
@@ -1079,14 +1198,9 @@
{
vm_page_t m;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
- if (m->pindex < pindex) {
- m = vm_page_splay(pindex, object->root);
- if ((object->root = m)->pindex < pindex)
- m = TAILQ_NEXT(m, listq);
- }
- }
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
+ m = vm_radix_lookup_ge(&object->rtree, pindex);
return (m);
}
@@ -1101,10 +1215,12 @@
{
vm_page_t next;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((next = TAILQ_NEXT(m, listq)) != NULL &&
- next->pindex != m->pindex + 1)
- next = NULL;
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if ((next = TAILQ_NEXT(m, listq)) != NULL) {
+ MPASS(next->object == m->object);
+ if (next->pindex != m->pindex + 1)
+ next = NULL;
+ }
return (next);
}
@@ -1119,14 +1235,64 @@
{
vm_page_t prev;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->pindex != m->pindex - 1)
- prev = NULL;
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
+ MPASS(prev->object == m->object);
+ if (prev->pindex != m->pindex - 1)
+ prev = NULL;
+ }
return (prev);
}
/*
+ * Uses the page mnew as a replacement for an existing page at index
+ * pindex which must be already present in the object.
+ *
+ * The existing page must not be on a paging queue.
+ */
+vm_page_t
+vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
+{
+ vm_page_t mold, mpred;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * This function mostly follows vm_page_insert() and
+ * vm_page_remove() without the radix, object count and vnode
+ * dance. Double check such functions for more comments.
+ */
+ mpred = vm_radix_lookup(&object->rtree, pindex);
+ KASSERT(mpred != NULL,
+ ("vm_page_replace: replacing page not present with pindex"));
+ mpred = TAILQ_PREV(mpred, respgs, listq);
+ if (mpred != NULL)
+ KASSERT(mpred->pindex < pindex,
+ ("vm_page_insert_after: mpred doesn't precede pindex"));
+
+ mnew->object = object;
+ mnew->pindex = pindex;
+ mold = vm_radix_replace(&object->rtree, mnew);
+ KASSERT(mold->queue == PQ_NONE,
+ ("vm_page_replace: mold is on a paging queue"));
+
+ /* Detach the old page from the resident tailq. */
+ TAILQ_REMOVE(&object->memq, mold, listq);
+
+ mold->object = NULL;
+ vm_page_xunbusy(mold);
+
+ /* Insert the new page in the resident tailq. */
+ if (mpred != NULL)
+ TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
+ else
+ TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+ if (pmap_page_is_write_mapped(mnew))
+ vm_object_set_writeable_dirty(object);
+ return (mold);
+}
+
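
vm_page_replace() hands the displaced page back to the caller, already detached from the object and exclusive-unbusied. A hypothetical caller sketch, assuming "mnew" is exclusive busied and not yet inserted anywhere, and that the old page is neither wired nor held so it can simply be freed:

    static void
    example_replace(vm_object_t object, vm_pindex_t pindex, vm_page_t mnew)
    {
    	vm_page_t mold;

    	VM_OBJECT_ASSERT_WLOCKED(object);
    	mold = vm_page_replace(mnew, object, pindex);
    	/* mold is unbusied and has no object; free it (page lock needed). */
    	vm_page_lock(mold);
    	vm_page_free(mold);
    	vm_page_unlock(mold);
    }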
+/*
* vm_page_rename:
*
* Move the given memory entry from its
@@ -1144,15 +1310,47 @@
* or vm_page_dirty() will panic. Dirty pages are not allowed
* on the cache.
*
- * The objects must be locked. The page must be locked if it is managed.
+ * The objects must be locked.
*/
-void
+int
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
+ vm_page_t mpred;
+ vm_pindex_t opidx;
+ VM_OBJECT_ASSERT_WLOCKED(new_object);
+
+ mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
+ KASSERT(mpred == NULL || mpred->pindex != new_pindex,
+ ("vm_page_rename: pindex already renamed"));
+
+ /*
+ * Create a custom version of vm_page_insert() which does not depend
+ * on m_prev and can cheat on the implementation aspects of the
+ * function.
+ */
+ opidx = m->pindex;
+ m->pindex = new_pindex;
+ if (vm_radix_insert(&new_object->rtree, m)) {
+ m->pindex = opidx;
+ return (1);
+ }
+
+ /*
+ * The operation cannot fail anymore. The removal must happen before
+ * the listq iterator is tainted.
+ */
+ m->pindex = opidx;
+ vm_page_lock(m);
vm_page_remove(m);
- vm_page_insert(m, new_object, new_pindex);
+
+ /* Return back to the new pindex to complete vm_page_insert(). */
+ m->pindex = new_pindex;
+ m->object = new_object;
+ vm_page_unlock(m);
+ vm_page_insert_radixdone(m, new_object, mpred);
vm_page_dirty(m);
+ return (0);
}
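
Because the radix insertion into the destination object can fail, vm_page_rename() now returns an int that callers must check. A hypothetical wrapper, illustrative only:

    static int
    example_move_page(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
    {

    	VM_OBJECT_ASSERT_WLOCKED(m->object);
    	VM_OBJECT_ASSERT_WLOCKED(new_object);
    	if (vm_page_rename(m, new_object, new_pindex) != 0) {
    		/*
    		 * The radix node allocation failed.  Real callers such as
    		 * vm_object_split() drop both object locks, VM_WAIT, and
    		 * retry after re-looking the page up.
    		 */
    		return (0);
    	}
    	return (1);
    }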
/*
@@ -1166,55 +1364,21 @@
void
vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
- vm_page_t m, m_next;
+ vm_page_t m;
boolean_t empty;
mtx_lock(&vm_page_queue_free_mtx);
- if (__predict_false(object->cache == NULL)) {
+ if (__predict_false(vm_radix_is_empty(&object->cache))) {
mtx_unlock(&vm_page_queue_free_mtx);
return;
}
- m = object->cache = vm_page_splay(start, object->cache);
- if (m->pindex < start) {
- if (m->right == NULL)
- m = NULL;
- else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m;
- m->right = NULL;
- m = object->cache = m_next;
- }
+ while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
+ if (end != 0 && m->pindex >= end)
+ break;
+ vm_radix_remove(&object->cache, m->pindex);
+ vm_page_cache_turn_free(m);
}
-
- /*
- * At this point, "m" is either (1) a reference to the page
- * with the least pindex that is greater than or equal to
- * "start" or (2) NULL.
- */
- for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
- /*
- * Find "m"'s successor and remove "m" from the
- * object's cache.
- */
- if (m->right == NULL) {
- object->cache = m->left;
- m_next = NULL;
- } else {
- m_next = vm_page_splay(start, m->right);
- m_next->left = m->left;
- object->cache = m_next;
- }
- /* Convert "m" to a free page. */
- m->object = NULL;
- m->valid = 0;
- /* Clear PG_CACHED and set PG_FREE. */
- m->flags ^= PG_CACHED | PG_FREE;
- KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
- ("vm_page_cache_free: page %p has inconsistent flags", m));
- cnt.v_cache_count--;
- cnt.v_free_count++;
- }
- empty = object->cache == NULL;
+ empty = vm_radix_is_empty(&object->cache);
mtx_unlock(&vm_page_queue_free_mtx);
if (object->type == OBJT_VNODE && empty)
vdrop(object->handle);
@@ -1229,15 +1393,9 @@
static inline vm_page_t
vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
{
- vm_page_t m;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- if ((m = object->cache) != NULL && m->pindex != pindex) {
- m = vm_page_splay(pindex, m);
- if ((object->cache = m)->pindex != pindex)
- m = NULL;
- }
- return (m);
+ return (vm_radix_lookup(&object->cache, pindex));
}
/*
@@ -1246,31 +1404,14 @@
*
* The free page queue must be locked.
*/
-void
+static void
vm_page_cache_remove(vm_page_t m)
{
- vm_object_t object;
- vm_page_t root;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
KASSERT((m->flags & PG_CACHED) != 0,
("vm_page_cache_remove: page %p is not cached", m));
- object = m->object;
- if (m != object->cache) {
- root = vm_page_splay(m->pindex, object->cache);
- KASSERT(root == m,
- ("vm_page_cache_remove: page %p is not cached in object %p",
- m, object));
- }
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- root = vm_page_splay(m->pindex, m->left);
- root->right = m->right;
- }
- object->cache = root;
+ vm_radix_remove(&m->object->cache, m->pindex);
m->object = NULL;
cnt.v_cache_count--;
}
@@ -1290,7 +1431,7 @@
vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
vm_object_t new_object)
{
- vm_page_t m, m_next;
+ vm_page_t m;
/*
* Insertion into an object's collection of cached pages
@@ -1297,54 +1438,26 @@
* requires the object to be locked. In contrast, removal does
* not.
*/
- VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
- KASSERT(new_object->cache == NULL,
+ VM_OBJECT_ASSERT_WLOCKED(new_object);
+ KASSERT(vm_radix_is_empty(&new_object->cache),
("vm_page_cache_transfer: object %p has cached pages",
new_object));
mtx_lock(&vm_page_queue_free_mtx);
- if ((m = orig_object->cache) != NULL) {
+ while ((m = vm_radix_lookup_ge(&orig_object->cache,
+ offidxstart)) != NULL) {
/*
* Transfer all of the pages with offset greater than or
* equal to 'offidxstart' from the original object's
* cache to the new object's cache.
*/
- m = vm_page_splay(offidxstart, m);
- if (m->pindex < offidxstart) {
- orig_object->cache = m;
- new_object->cache = m->right;
- m->right = NULL;
- } else {
- orig_object->cache = m->left;
- new_object->cache = m;
- m->left = NULL;
- }
- while ((m = new_object->cache) != NULL) {
- if ((m->pindex - offidxstart) >= new_object->size) {
- /*
- * Return all of the cached pages with
- * offset greater than or equal to the
- * new object's size to the original
- * object's cache.
- */
- new_object->cache = m->left;
- m->left = orig_object->cache;
- orig_object->cache = m;
- break;
- }
- m_next = vm_page_splay(m->pindex, m->right);
- /* Update the page's object and offset. */
- m->object = new_object;
- m->pindex -= offidxstart;
- if (m_next == NULL)
- break;
- m->right = NULL;
- m_next->left = m;
- new_object->cache = m_next;
- }
- KASSERT(new_object->cache == NULL ||
- new_object->type == OBJT_SWAP,
- ("vm_page_cache_transfer: object %p's type is incompatible"
- " with cached pages", new_object));
+ if ((m->pindex - offidxstart) >= new_object->size)
+ break;
+ vm_radix_remove(&orig_object->cache, m->pindex);
+ /* Update the page's object and offset. */
+ m->object = new_object;
+ m->pindex -= offidxstart;
+ if (vm_radix_insert(&new_object->cache, m))
+ vm_page_cache_turn_free(m);
}
mtx_unlock(&vm_page_queue_free_mtx);
}
@@ -1367,8 +1480,8 @@
* page queues lock in order to prove that the specified page doesn't
* exist.
*/
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (object->cache == NULL)
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (__predict_true(vm_object_cache_is_empty(object)))
return (FALSE);
mtx_lock(&vm_page_queue_free_mtx);
m = vm_page_cache_lookup(object, pindex);
@@ -1379,8 +1492,8 @@
/*
* vm_page_alloc:
*
- * Allocate and return a memory cell associated
- * with this VM object/offset pair.
+ * Allocate and return a page that is associated with the specified
+ * object and offset pair. By default, this page is exclusive busied.
*
* The caller must always specify an allocation class.
*
@@ -1390,13 +1503,18 @@
* VM_ALLOC_INTERRUPT interrupt time request
*
* optional allocation flags:
- * VM_ALLOC_ZERO prefer a zeroed page
- * VM_ALLOC_WIRED wire the allocated page
- * VM_ALLOC_NOOBJ page is not associated with a vm object
- * VM_ALLOC_NOBUSY do not set the page busy
+ * VM_ALLOC_COUNT(number) the number of additional pages that the caller
+ * intends to allocate
* VM_ALLOC_IFCACHED return page only if it is cached
* VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
* is cached
+ * VM_ALLOC_NOBUSY do not exclusive busy the page
+ * VM_ALLOC_NODUMP do not include the page in a kernel core dump
+ * VM_ALLOC_NOOBJ page is not associated with an object and
+ * should not be exclusive busy
+ * VM_ALLOC_SBUSY shared busy the allocated page
+ * VM_ALLOC_WIRED wire the allocated page
+ * VM_ALLOC_ZERO prefer a zeroed page
*
* This routine may not sleep.
*/
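
A hypothetical example of the updated interface, using only flags documented above (headers as in the first sketch); note that VM_ALLOC_ZERO is a preference, so the caller still has to zero the page if PG_ZERO did not come back set:

    static vm_page_t
    example_alloc_page(vm_object_t object, vm_pindex_t pindex)
    {
    	vm_page_t m;

    	VM_OBJECT_ASSERT_WLOCKED(object);
    	/* Returned page is wired and exclusive busied on success. */
    	m = vm_page_alloc(object, pindex,
    	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
    	if (m != NULL && (m->flags & PG_ZERO) == 0)
    		pmap_zero_page(m);
    	/* NULL means the caller should VM_WAIT (or back off) and retry. */
    	return (m);
    }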
@@ -1405,28 +1523,43 @@
{
struct vnode *vp = NULL;
vm_object_t m_object;
- vm_page_t m;
- int flags, page_req;
+ vm_page_t m, mpred;
+ int flags, req_class;
- if ((req & VM_ALLOC_NOOBJ) == 0) {
- KASSERT(object != NULL,
- ("vm_page_alloc: NULL object."));
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- }
+ mpred = 0; /* XXX: pacify gcc */
+ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
+ (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
+ ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
+ (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
+ ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+ req));
+ if (object != NULL)
+ VM_OBJECT_ASSERT_WLOCKED(object);
- page_req = req & VM_ALLOC_CLASS_MASK;
+ req_class = req & VM_ALLOC_CLASS_MASK;
/*
- * The pager is allowed to eat deeper into the free page list.
+ * The page daemon is allowed to dig deeper into the free page list.
*/
- if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT))
- page_req = VM_ALLOC_SYSTEM;
+ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+ req_class = VM_ALLOC_SYSTEM;
- mtx_lock(&vm_page_queue_free_mtx);
+ if (object != NULL) {
+ mpred = vm_radix_lookup_le(&object->rtree, pindex);
+ KASSERT(mpred == NULL || mpred->pindex != pindex,
+ ("vm_page_alloc: pindex already allocated"));
+ }
+
+ /*
+ * The page allocation request can come from consumers that already
+ * hold the free page queue mutex, like vm_page_insert() in
+ * vm_page_cache().
+ */
+ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
- (page_req == VM_ALLOC_SYSTEM &&
+ (req_class == VM_ALLOC_SYSTEM &&
cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
- (page_req == VM_ALLOC_INTERRUPT &&
+ (req_class == VM_ALLOC_INTERRUPT &&
cnt.v_free_count + cnt.v_cache_count > 0)) {
/*
* Allocate from the free queue if the number of free pages
@@ -1451,10 +1584,9 @@
mtx_unlock(&vm_page_queue_free_mtx);
return (NULL);
#if VM_NRESERVLEVEL > 0
- } else if (object == NULL || object->type == OBJT_DEVICE ||
- object->type == OBJT_SG ||
- (object->flags & OBJ_COLORED) == 0 ||
- (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
+ } else if (object == NULL || (object->flags & (OBJ_COLORED |
+ OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
+ vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
#else
} else {
#endif
@@ -1474,7 +1606,7 @@
*/
mtx_unlock(&vm_page_queue_free_mtx);
atomic_add_int(&vm_pageout_deficit,
- MAX((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+ max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
pagedaemon_wakeup();
return (NULL);
}
@@ -1482,18 +1614,19 @@
/*
* At this point we had better have found a good page.
*/
-
KASSERT(m != NULL, ("vm_page_alloc: missing page"));
KASSERT(m->queue == PQ_NONE,
("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
- KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
+ KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m));
KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
("vm_page_alloc: page %p has unexpected memattr %d", m,
pmap_page_get_memattr(m)));
if ((m->flags & PG_CACHED) != 0) {
+ KASSERT((m->flags & PG_ZERO) == 0,
+ ("vm_page_alloc: cached page %p is PG_ZERO", m));
KASSERT(m->valid != 0,
("vm_page_alloc: cached page %p is invalid", m));
if (m->object == object && m->pindex == pindex)
@@ -1502,7 +1635,8 @@
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
- if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+ if (m_object->type == OBJT_VNODE &&
+ vm_object_cache_is_empty(m_object))
vp = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -1509,7 +1643,7 @@
("vm_page_alloc: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc: free page %p is valid", m));
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
}
/*
@@ -1517,22 +1651,23 @@
* must be cleared before the free page queues lock is released.
*/
flags = 0;
- if (req & VM_ALLOC_NODUMP)
- flags |= PG_NODUMP;
if (m->flags & PG_ZERO) {
vm_page_zero_count--;
if (req & VM_ALLOC_ZERO)
flags = PG_ZERO;
}
+ if (req & VM_ALLOC_NODUMP)
+ flags |= PG_NODUMP;
m->flags = flags;
mtx_unlock(&vm_page_queue_free_mtx);
m->aflags = 0;
- if (object == NULL || object->type == OBJT_PHYS)
- m->oflags = VPO_UNMANAGED;
- else
- m->oflags = 0;
- if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
- m->oflags |= VPO_BUSY;
+ m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
+ VPO_UNMANAGED : 0;
+ m->busy_lock = VPB_UNBUSIED;
+ if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
+ m->busy_lock = VPB_SINGLE_EXCLUSIVER;
+ if ((req & VM_ALLOC_SBUSY) != 0)
+ m->busy_lock = VPB_SHARERS_WORD(1);
if (req & VM_ALLOC_WIRED) {
/*
* The page lock is not required for wiring a page until that
@@ -1544,11 +1679,26 @@
m->act_count = 0;
if (object != NULL) {
+ if (vm_page_insert_after(m, object, pindex, mpred)) {
+ /* See the comment below about hold count. */
+ if (vp != NULL)
+ vdrop(vp);
+ pagedaemon_wakeup();
+ if (req & VM_ALLOC_WIRED) {
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ m->wire_count = 0;
+ }
+ m->object = NULL;
+ m->oflags = VPO_UNMANAGED;
+ m->busy_lock = VPB_UNBUSIED;
+ vm_page_free(m);
+ return (NULL);
+ }
+
/* Ignore device objects; the pager sets "memattr" for them. */
if (object->memattr != VM_MEMATTR_DEFAULT &&
- object->type != OBJT_DEVICE && object->type != OBJT_SG)
+ (object->flags & OBJ_FICTITIOUS) == 0)
pmap_page_set_memattr(m, object->memattr);
- vm_page_insert(m, object, pindex);
} else
m->pindex = pindex;
@@ -1571,13 +1721,204 @@
return (m);
}
+static void
+vm_page_alloc_contig_vdrop(struct spglist *lst)
+{
+
+ while (!SLIST_EMPTY(lst)) {
+ vdrop((struct vnode *)SLIST_FIRST(lst)->plinks.s.pv);
+ SLIST_REMOVE_HEAD(lst, plinks.s.ss);
+ }
+}
+
/*
+ * vm_page_alloc_contig:
+ *
+ * Allocate a contiguous set of physical pages of the given size "npages"
+ * from the free lists. All of the physical pages must be at or above
+ * the given physical address "low" and below the given physical address
+ * "high". The given value "alignment" determines the alignment of the
+ * first physical page in the set. If the given value "boundary" is
+ * non-zero, then the set of physical pages cannot cross any physical
+ * address boundary that is a multiple of that value. Both "alignment"
+ * and "boundary" must be a power of two.
+ *
+ * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
+ * then the memory attribute setting for the physical pages is configured
+ * to the object's memory attribute setting. Otherwise, the memory
+ * attribute setting for the physical pages is configured to "memattr",
+ * overriding the object's memory attribute setting. However, if the
+ * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
+ * memory attribute setting for the physical pages cannot be configured
+ * to VM_MEMATTR_DEFAULT.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ * VM_ALLOC_NORMAL normal process request
+ * VM_ALLOC_SYSTEM system *really* needs a page
+ * VM_ALLOC_INTERRUPT interrupt time request
+ *
+ * optional allocation flags:
+ * VM_ALLOC_NOBUSY do not exclusive busy the page
+ * VM_ALLOC_NODUMP do not include the page in a kernel core dump
+ * VM_ALLOC_NOOBJ page is not associated with an object and
+ * should not be exclusive busy
+ * VM_ALLOC_SBUSY shared busy the allocated page
+ * VM_ALLOC_WIRED wire the allocated page
+ * VM_ALLOC_ZERO prefer a zeroed page
+ *
+ * This routine may not sleep.
+ */
+vm_page_t
+vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
+ u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+ vm_paddr_t boundary, vm_memattr_t memattr)
+{
+ struct vnode *drop;
+ struct spglist deferred_vdrop_list;
+ vm_page_t m, m_tmp, m_ret;
+ u_int flags, oflags;
+ int req_class;
+
+ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
+ (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
+ ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
+ (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
+ ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+ req));
+ if (object != NULL) {
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(object->type == OBJT_PHYS,
+ ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
+ object));
+ }
+ KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
+ req_class = req & VM_ALLOC_CLASS_MASK;
+
+ /*
+ * The page daemon is allowed to dig deeper into the free page list.
+ */
+ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+ req_class = VM_ALLOC_SYSTEM;
+
+ SLIST_INIT(&deferred_vdrop_list);
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (cnt.v_free_count + cnt.v_cache_count >= npages +
+ cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
+ cnt.v_free_count + cnt.v_cache_count >= npages +
+ cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
+ cnt.v_free_count + cnt.v_cache_count >= npages)) {
+#if VM_NRESERVLEVEL > 0
+retry:
+ if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
+ (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
+ low, high, alignment, boundary)) == NULL)
+#endif
+ m_ret = vm_phys_alloc_contig(npages, low, high,
+ alignment, boundary);
+ } else {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ atomic_add_int(&vm_pageout_deficit, npages);
+ pagedaemon_wakeup();
+ return (NULL);
+ }
+ if (m_ret != NULL)
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ drop = vm_page_alloc_init(m);
+ if (drop != NULL) {
+ /*
+ * Enqueue the vnode for deferred vdrop().
+ */
+ m->plinks.s.pv = drop;
+ SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
+ plinks.s.ss);
+ }
+ }
+ else {
+#if VM_NRESERVLEVEL > 0
+ if (vm_reserv_reclaim_contig(npages, low, high, alignment,
+ boundary))
+ goto retry;
+#endif
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (m_ret == NULL)
+ return (NULL);
+
+ /*
+ * Initialize the pages. Only the PG_ZERO flag is inherited.
+ */
+ flags = 0;
+ if ((req & VM_ALLOC_ZERO) != 0)
+ flags = PG_ZERO;
+ if ((req & VM_ALLOC_NODUMP) != 0)
+ flags |= PG_NODUMP;
+ if ((req & VM_ALLOC_WIRED) != 0)
+ atomic_add_int(&cnt.v_wire_count, npages);
+ oflags = VPO_UNMANAGED;
+ if (object != NULL) {
+ if (object->memattr != VM_MEMATTR_DEFAULT &&
+ memattr == VM_MEMATTR_DEFAULT)
+ memattr = object->memattr;
+ }
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ m->aflags = 0;
+ m->flags = (m->flags | PG_NODUMP) & flags;
+ m->busy_lock = VPB_UNBUSIED;
+ if (object != NULL) {
+ if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
+ m->busy_lock = VPB_SINGLE_EXCLUSIVER;
+ if ((req & VM_ALLOC_SBUSY) != 0)
+ m->busy_lock = VPB_SHARERS_WORD(1);
+ }
+ if ((req & VM_ALLOC_WIRED) != 0)
+ m->wire_count = 1;
+ /* Unmanaged pages don't use "act_count". */
+ m->oflags = oflags;
+ if (object != NULL) {
+ if (vm_page_insert(m, object, pindex)) {
+ vm_page_alloc_contig_vdrop(
+ &deferred_vdrop_list);
+ if (vm_paging_needed())
+ pagedaemon_wakeup();
+ if ((req & VM_ALLOC_WIRED) != 0)
+ atomic_subtract_int(&cnt.v_wire_count,
+ npages);
+ for (m_tmp = m, m = m_ret;
+ m < &m_ret[npages]; m++) {
+ if ((req & VM_ALLOC_WIRED) != 0)
+ m->wire_count = 0;
+ if (m >= m_tmp) {
+ m->object = NULL;
+ m->oflags |= VPO_UNMANAGED;
+ }
+ m->busy_lock = VPB_UNBUSIED;
+ vm_page_free(m);
+ }
+ return (NULL);
+ }
+ } else
+ m->pindex = pindex;
+ if (memattr != VM_MEMATTR_DEFAULT)
+ pmap_page_set_memattr(m, memattr);
+ pindex++;
+ }
+ vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
+ if (vm_paging_needed())
+ pagedaemon_wakeup();
+ return (m_ret);
+}
+
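
A hypothetical call of vm_page_alloc_contig(), illustrative only (same header assumptions as the earlier sketches); the object must be OBJT_PHYS per the KASSERT above, and VM_ALLOC_NOBUSY is passed so the caller does not have to unbusy each page:

    static vm_page_t
    example_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages)
    {

    	VM_OBJECT_ASSERT_WLOCKED(object);
    	/* Any physical address, page-aligned, no boundary restriction. */
    	return (vm_page_alloc_contig(object, pindex,
    	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NOBUSY,
    	    npages, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
    }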
+/*
* Initialize a page that has been freshly dequeued from a freelist.
* The caller has to drop the vnode returned, if it is not NULL.
*
+ * This function may only be used to initialize unmanaged pages.
+ *
* To be called with vm_page_queue_free_mtx held.
*/
-struct vnode *
+static struct vnode *
vm_page_alloc_init(vm_page_t m)
{
struct vnode *drop;
@@ -1590,7 +1931,7 @@
("vm_page_alloc_init: page %p is wired", m));
KASSERT(m->hold_count == 0,
("vm_page_alloc_init: page %p is held", m));
- KASSERT(m->busy == 0,
+ KASSERT(!vm_page_busied(m),
("vm_page_alloc_init: page %p is busy", m));
KASSERT(m->dirty == 0,
("vm_page_alloc_init: page %p is dirty", m));
@@ -1600,11 +1941,13 @@
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
drop = NULL;
if ((m->flags & PG_CACHED) != 0) {
+ KASSERT((m->flags & PG_ZERO) == 0,
+ ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
if (m_object->type == OBJT_VNODE &&
- m_object->cache == NULL)
+ vm_object_cache_is_empty(m_object))
drop = m_object->handle;
} else {
KASSERT(VM_PAGE_IS_FREE(m),
@@ -1611,24 +1954,34 @@
("vm_page_alloc_init: page %p is not free", m));
KASSERT(m->valid == 0,
("vm_page_alloc_init: free page %p is valid", m));
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
+ if ((m->flags & PG_ZERO) != 0)
+ vm_page_zero_count--;
}
- if (m->flags & PG_ZERO)
- vm_page_zero_count--;
/* Don't clear the PG_ZERO flag; we'll need it later. */
m->flags &= PG_ZERO;
- m->aflags = 0;
- m->oflags = VPO_UNMANAGED;
- /* Unmanaged pages don't use "act_count". */
return (drop);
}
/*
* vm_page_alloc_freelist:
- *
- * Allocate a page from the specified freelist.
- * Only the ALLOC_CLASS values in req are honored, other request flags
- * are ignored.
+ *
+ * Allocate a physical page from the specified free page list.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ * VM_ALLOC_NORMAL normal process request
+ * VM_ALLOC_SYSTEM system *really* needs a page
+ * VM_ALLOC_INTERRUPT interrupt time request
+ *
+ * optional allocation flags:
+ * VM_ALLOC_COUNT(number) the number of additional pages that the caller
+ * intends to allocate
+ * VM_ALLOC_WIRED wire the allocated page
+ * VM_ALLOC_ZERO prefer a zeroed page
+ *
+ * This routine may not sleep.
*/
vm_page_t
vm_page_alloc_freelist(int flind, int req)
@@ -1635,20 +1988,33 @@
{
struct vnode *drop;
vm_page_t m;
- int page_req;
+ u_int flags;
+ int req_class;
- m = NULL;
- page_req = req & VM_ALLOC_CLASS_MASK;
- mtx_lock(&vm_page_queue_free_mtx);
+ req_class = req & VM_ALLOC_CLASS_MASK;
+
/*
+ * The page daemon is allowed to dig deeper into the free page list.
+ */
+ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+ req_class = VM_ALLOC_SYSTEM;
+
+ /*
* Do not allocate reserved pages unless the req has asked for it.
*/
+ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
- (page_req == VM_ALLOC_SYSTEM &&
+ (req_class == VM_ALLOC_SYSTEM &&
cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
- (page_req == VM_ALLOC_INTERRUPT &&
- cnt.v_free_count + cnt.v_cache_count > 0)) {
+ (req_class == VM_ALLOC_INTERRUPT &&
+ cnt.v_free_count + cnt.v_cache_count > 0))
m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
+ else {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ atomic_add_int(&vm_pageout_deficit,
+ max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+ pagedaemon_wakeup();
+ return (NULL);
}
if (m == NULL) {
mtx_unlock(&vm_page_queue_free_mtx);
@@ -1656,8 +2022,29 @@
}
drop = vm_page_alloc_init(m);
mtx_unlock(&vm_page_queue_free_mtx);
- if (drop)
+
+ /*
+ * Initialize the page. Only the PG_ZERO flag is inherited.
+ */
+ m->aflags = 0;
+ flags = 0;
+ if ((req & VM_ALLOC_ZERO) != 0)
+ flags = PG_ZERO;
+ m->flags &= flags;
+ if ((req & VM_ALLOC_WIRED) != 0) {
+ /*
+ * The page lock is not required for wiring a page that does
+ * not belong to an object.
+ */
+ atomic_add_int(&cnt.v_wire_count, 1);
+ m->wire_count = 1;
+ }
+ /* Unmanaged pages don't use "act_count". */
+ m->oflags = VPO_UNMANAGED;
+ if (drop != NULL)
vdrop(drop);
+ if (vm_paging_needed())
+ pagedaemon_wakeup();
return (m);
}
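
A hypothetical use of vm_page_alloc_freelist(), illustrative only; the freelist index is machine-dependent (VM_FREELIST_DEFAULT is assumed here), and the returned page is unmanaged, so the consumer handles mappings itself:

    static vm_page_t
    example_alloc_from_freelist(void)
    {

    	return (vm_page_alloc_freelist(VM_FREELIST_DEFAULT,
    	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO));
    }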
@@ -1709,84 +2096,117 @@
"pfault", 0);
}
+struct vm_pagequeue *
+vm_page_pagequeue(vm_page_t m)
+{
+
+ return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+}
+
/*
- * vm_page_requeue:
+ * vm_page_dequeue:
*
- * Move the given page to the tail of its present page queue.
+ * Remove the given page from its current page queue.
*
- * The page queues must be locked.
+ * The page must be locked.
*/
void
-vm_page_requeue(vm_page_t m)
+vm_page_dequeue(vm_page_t m)
{
- struct vpgqueues *vpq;
- int queue;
+ struct vm_pagequeue *pq;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- queue = m->queue;
- KASSERT(queue != PQ_NONE,
- ("vm_page_requeue: page %p is not queued", m));
- vpq = &vm_page_queues[queue];
- TAILQ_REMOVE(&vpq->pl, m, pageq);
- TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
+ vm_page_lock_assert(m, MA_OWNED);
+ KASSERT(m->queue != PQ_NONE,
+ ("vm_page_dequeue: page %p is not queued", m));
+ pq = vm_page_pagequeue(m);
+ vm_pagequeue_lock(pq);
+ m->queue = PQ_NONE;
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_dec(pq);
+ vm_pagequeue_unlock(pq);
}
/*
- * vm_page_queue_remove:
+ * vm_page_dequeue_locked:
*
- * Remove the given page from the specified queue.
+ * Remove the given page from its current page queue.
*
- * The page and page queues must be locked.
+ * The page and page queue must be locked.
*/
-static __inline void
-vm_page_queue_remove(int queue, vm_page_t m)
+void
+vm_page_dequeue_locked(vm_page_t m)
{
- struct vpgqueues *pq;
+ struct vm_pagequeue *pq;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
vm_page_lock_assert(m, MA_OWNED);
- pq = &vm_page_queues[queue];
- TAILQ_REMOVE(&pq->pl, m, pageq);
- (*pq->cnt)--;
+ pq = vm_page_pagequeue(m);
+ vm_pagequeue_assert_locked(pq);
+ m->queue = PQ_NONE;
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_dec(pq);
}
/*
- * vm_pageq_remove:
+ * vm_page_enqueue:
*
- * Remove a page from its queue.
+ * Add the given page to the specified page queue.
*
- * The given page must be locked.
+ * The page must be locked.
*/
+static void
+vm_page_enqueue(int queue, vm_page_t m)
+{
+ struct vm_pagequeue *pq;
+
+ vm_page_lock_assert(m, MA_OWNED);
+ pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+ vm_pagequeue_lock(pq);
+ m->queue = queue;
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ vm_pagequeue_unlock(pq);
+}
+
+/*
+ * vm_page_requeue:
+ *
+ * Move the given page to the tail of its current page queue.
+ *
+ * The page must be locked.
+ */
void
-vm_pageq_remove(vm_page_t m)
+vm_page_requeue(vm_page_t m)
{
- int queue;
+ struct vm_pagequeue *pq;
vm_page_lock_assert(m, MA_OWNED);
- if ((queue = m->queue) != PQ_NONE) {
- vm_page_lock_queues();
- m->queue = PQ_NONE;
- vm_page_queue_remove(queue, m);
- vm_page_unlock_queues();
- }
+ KASSERT(m->queue != PQ_NONE,
+ ("vm_page_requeue: page %p is not queued", m));
+ pq = vm_page_pagequeue(m);
+ vm_pagequeue_lock(pq);
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_unlock(pq);
}
/*
- * vm_page_enqueue:
+ * vm_page_requeue_locked:
*
- * Add the given page to the specified queue.
+ * Move the given page to the tail of its current page queue.
*
- * The page queues must be locked.
+ * The page queue must be locked.
*/
-static void
-vm_page_enqueue(int queue, vm_page_t m)
+void
+vm_page_requeue_locked(vm_page_t m)
{
- struct vpgqueues *vpq;
+ struct vm_pagequeue *pq;
- vpq = &vm_page_queues[queue];
- m->queue = queue;
- TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
- ++*vpq->cnt;
+ KASSERT(m->queue != PQ_NONE,
+ ("vm_page_requeue_locked: page %p is not queued", m));
+ pq = vm_page_pagequeue(m);
+ vm_pagequeue_assert_locked(pq);
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
}
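
With the per-domain page queues, callers of the unlocked variants only need the page lock; the routines take and drop the per-queue mutex themselves. A hypothetical sketch:

    static void
    example_mark_recent_use(vm_page_t m)
    {

    	vm_page_lock(m);
    	if (m->queue != PQ_NONE)
    		vm_page_requeue(m);	/* acquires and releases the queue mutex */
    	vm_page_unlock(m);
    }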
/*
@@ -1804,16 +2224,13 @@
int queue;
vm_page_lock_assert(m, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
if ((queue = m->queue) != PQ_ACTIVE) {
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
- vm_page_lock_queues();
if (queue != PQ_NONE)
- vm_page_queue_remove(queue, m);
+ vm_page_dequeue(m);
vm_page_enqueue(PQ_ACTIVE, m);
- vm_page_unlock_queues();
} else
KASSERT(queue == PQ_NONE,
("vm_page_activate: wired page %p is queued", m));
@@ -1858,6 +2275,28 @@
}
/*
+ * Turn a cached page into a free page, by changing its attributes.
+ * Keep the statistics up-to-date.
+ *
+ * The free page queue must be locked.
+ */
+static void
+vm_page_cache_turn_free(vm_page_t m)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ m->object = NULL;
+ m->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ m->flags ^= PG_CACHED | PG_FREE;
+ KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+ ("vm_page_cache_free: page %p has inconsistent flags", m));
+ cnt.v_cache_count--;
+ vm_phys_freecnt_adj(m, 1);
+}
+
+/*
* vm_page_free_toq:
*
* Returns the given page to the free list,
@@ -1873,12 +2312,14 @@
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
("vm_page_free_toq: freeing mapped page %p", m));
- }
+ } else
+ KASSERT(m->queue == PQ_NONE,
+ ("vm_page_free_toq: unmanaged page %p is queued", m));
PCPU_INC(cnt.v_tfree);
if (VM_PAGE_IS_FREE(m))
panic("vm_page_free: freeing free page %p", m);
- else if (m->busy != 0)
+ else if (vm_page_sbusied(m))
panic("vm_page_free: freeing busy page %p", m);
/*
@@ -1887,8 +2328,7 @@
* callback routine until after we've put the page on the
* appropriate free queue.
*/
- if ((m->oflags & VPO_UNMANAGED) == 0)
- vm_pageq_remove(m);
+ vm_page_remque(m);
vm_page_remove(m);
/*
@@ -1906,9 +2346,9 @@
panic("vm_page_free: freeing wired page %p", m);
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
- vm_page_lock_queues();
- vm_page_enqueue(PQ_HOLD, m);
- vm_page_unlock_queues();
+ KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+ ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
+ m->flags |= PG_UNHOLDFREE;
} else {
/*
* Restore the default memory attribute to the page.
@@ -1922,7 +2362,7 @@
*/
mtx_lock(&vm_page_queue_free_mtx);
m->flags |= PG_FREE;
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m))
#else
@@ -1966,8 +2406,10 @@
return;
}
if (m->wire_count == 0) {
- if ((m->oflags & VPO_UNMANAGED) == 0)
- vm_pageq_remove(m);
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
+ m->queue == PQ_NONE,
+ ("vm_page_wire: unmanaged page %p is queued", m));
+ vm_page_remque(m);
atomic_add_int(&cnt.v_wire_count, 1);
}
m->wire_count++;
@@ -1986,7 +2428,7 @@
* However, unless the page belongs to an object, it is not enqueued because
* it cannot be paged out.
*
- * If a page is fictitious, then its wire count must alway be one.
+ * If a page is fictitious, then its wire count must always be one.
*
* A managed page must be locked.
*/
@@ -2010,9 +2452,7 @@
return;
if (!activate)
m->flags &= ~PG_WINATCFLS;
- vm_page_lock_queues();
vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
- vm_page_unlock_queues();
}
} else
panic("vm_page_unwire: page %p's wire count is zero", m);
@@ -2041,29 +2481,36 @@
static inline void
_vm_page_deactivate(vm_page_t m, int athead)
{
+ struct vm_pagequeue *pq;
int queue;
- vm_page_lock_assert(m, MA_OWNED);
+ vm_page_assert_locked(m);
/*
- * Ignore if already inactive.
+ * Ignore if the page is already inactive, unless it is unlikely to be
+ * reactivated.
*/
- if ((queue = m->queue) == PQ_INACTIVE)
+ if ((queue = m->queue) == PQ_INACTIVE && !athead)
return;
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- m->flags &= ~PG_WINATCFLS;
- vm_page_lock_queues();
- if (queue != PQ_NONE)
- vm_page_queue_remove(queue, m);
+ pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
+ /* Avoid multiple acquisitions of the inactive queue lock. */
+ if (queue == PQ_INACTIVE) {
+ vm_pagequeue_lock(pq);
+ vm_page_dequeue_locked(m);
+ } else {
+ if (queue != PQ_NONE)
+ vm_page_dequeue(m);
+ m->flags &= ~PG_WINATCFLS;
+ vm_pagequeue_lock(pq);
+ }
+ m->queue = PQ_INACTIVE;
if (athead)
- TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
- pageq);
+ TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
else
- TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
- pageq);
- m->queue = PQ_INACTIVE;
- cnt.v_inactive_count++;
- vm_page_unlock_queues();
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ vm_pagequeue_unlock(pq);
}
}
@@ -2089,9 +2536,9 @@
{
vm_page_lock_assert(m, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if (m->dirty || m->hold_count || m->busy || m->wire_count ||
- (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (m->dirty || m->hold_count || m->wire_count ||
+ (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
return (0);
pmap_remove_all(m);
if (m->dirty)
@@ -2112,9 +2559,9 @@
vm_page_lock_assert(m, MA_OWNED);
if (m->object != NULL)
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if (m->dirty || m->hold_count || m->busy || m->wire_count ||
- (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (m->dirty || m->hold_count || m->wire_count ||
+ (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
return (0);
pmap_remove_all(m);
if (m->dirty)
@@ -2134,17 +2581,17 @@
vm_page_cache(vm_page_t m)
{
vm_object_t object;
- vm_page_t next, prev, root;
+ boolean_t cache_was_empty;
vm_page_lock_assert(m, MA_OWNED);
object = m->object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
m->hold_count || m->wire_count)
panic("vm_page_cache: attempting to cache busy page");
- pmap_remove_all(m);
- if (m->dirty != 0)
- panic("vm_page_cache: page %p is dirty", m);
+ KASSERT(!pmap_page_is_mapped(m),
+ ("vm_page_cache: page %p is mapped", m));
+ KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
if (m->valid == 0 || object->type == OBJT_DEFAULT ||
(object->type == OBJT_SWAP &&
!vm_pager_has_page(object, m->pindex, NULL, NULL))) {
@@ -2158,53 +2605,17 @@
}
KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_cache: page %p is already cached", m));
- PCPU_INC(cnt.v_tcached);
/*
* Remove the page from the paging queues.
*/
- vm_pageq_remove(m);
+ vm_page_remque(m);
/*
* Remove the page from the object's collection of resident
* pages.
*/
- if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
- /*
- * Since the page's successor in the list is also its parent
- * in the tree, its right subtree must be empty.
- */
- next->left = m->left;
- KASSERT(m->right == NULL,
- ("vm_page_cache: page %p has right child", m));
- } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
- prev->right == m) {
- /*
- * Since the page's predecessor in the list is also its parent
- * in the tree, its left subtree must be empty.
- */
- KASSERT(m->left == NULL,
- ("vm_page_cache: page %p has left child", m));
- prev->right = m->right;
- } else {
- if (m != object->root)
- vm_page_splay(m->pindex, object->root);
- if (m->left == NULL)
- root = m->right;
- else if (m->right == NULL)
- root = m->left;
- else {
- /*
- * Move the page's successor to the root, because
- * pages are usually removed in ascending order.
- */
- if (m->right != next)
- vm_page_splay(m->pindex, m->right);
- next->left = m->left;
- root = next;
- }
- object->root = root;
- }
+ vm_radix_remove(&object->rtree, m->pindex);
TAILQ_REMOVE(&object->memq, m, listq);
object->resident_page_count--;
@@ -2220,27 +2631,28 @@
*/
m->flags &= ~PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
+ cache_was_empty = vm_radix_is_empty(&object->cache);
+ if (vm_radix_insert(&object->cache, m)) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (object->type == OBJT_VNODE &&
+ object->resident_page_count == 0)
+ vdrop(object->handle);
+ m->object = NULL;
+ vm_page_free(m);
+ return;
+ }
+
+ /*
+ * The above call to vm_radix_insert() could reclaim the one pre-
+ * existing cached page from this object, resulting in a call to
+ * vdrop().
+ */
+ if (!cache_was_empty)
+ cache_was_empty = vm_radix_is_singleton(&object->cache);
+
m->flags |= PG_CACHED;
cnt.v_cache_count++;
- root = object->cache;
- if (root == NULL) {
- m->left = NULL;
- m->right = NULL;
- } else {
- root = vm_page_splay(m->pindex, root);
- if (m->pindex < root->pindex) {
- m->left = root->left;
- m->right = root;
- root->left = NULL;
- } else if (__predict_false(m->pindex == root->pindex))
- panic("vm_page_cache: offset already cached");
- else {
- m->right = root->right;
- m->left = root;
- root->right = NULL;
- }
- }
- object->cache = m;
+ PCPU_INC(cnt.v_tcached);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m)) {
#else
@@ -2258,87 +2670,60 @@
* the object's only resident page.
*/
if (object->type == OBJT_VNODE) {
- if (root == NULL && object->resident_page_count != 0)
+ if (cache_was_empty && object->resident_page_count != 0)
vhold(object->handle);
- else if (root != NULL && object->resident_page_count == 0)
+ else if (!cache_was_empty && object->resident_page_count == 0)
vdrop(object->handle);
}
}
/*
- * vm_page_dontneed
+ * vm_page_advise
*
- * Cache, deactivate, or do nothing as appropriate. This routine
- * is typically used by madvise() MADV_DONTNEED.
+ * Deactivate or do nothing, as appropriate. This routine is used
+ * by madvise() and vop_stdadvise().
*
- * Generally speaking we want to move the page into the cache so
- * it gets reused quickly. However, this can result in a silly syndrome
- * due to the page recycling too quickly. Small objects will not be
- * fully cached. On the otherhand, if we move the page to the inactive
- * queue we wind up with a problem whereby very large objects
- * unnecessarily blow away our inactive and cache queues.
- *
- * The solution is to move the pages based on a fixed weighting. We
- * either leave them alone, deactivate them, or move them to the cache,
- * where moving them to the cache has the highest weighting.
- * By forcing some pages into other queues we eventually force the
- * system to balance the queues, potentially recovering other unrelated
- * space from active. The idea is to not force this to happen too
- * often.
- *
* The object and page must be locked.
*/
void
-vm_page_dontneed(vm_page_t m)
+vm_page_advise(vm_page_t m, int advice)
{
- int dnw;
- int head;
- vm_page_lock_assert(m, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- dnw = PCPU_GET(dnweight);
- PCPU_INC(dnweight);
-
- /*
- * Occasionally leave the page alone.
- */
- if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
- if (m->act_count >= ACT_INIT)
- --m->act_count;
+ vm_page_assert_locked(m);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (advice == MADV_FREE)
+ /*
+ * Mark the page clean. This will allow the page to be freed
+ * up by the system. However, such pages are often reused
+ * quickly by malloc() so we do not do anything that would
+ * cause a page fault if we can help it.
+ *
+ * Specifically, we do not try to actually free the page now
+ * nor do we try to put it in the cache (which would cause a
+ * page fault on reuse).
+ *
+ * But we do make the page as freeable as we can without
+ * actually taking the step of unmapping it.
+ */
+ vm_page_undirty(m);
+ else if (advice != MADV_DONTNEED)
return;
- }
/*
* Clear any references to the page. Otherwise, the page daemon will
* immediately reactivate the page.
- *
- * Perform the pmap_clear_reference() first. Otherwise, a concurrent
- * pmap operation, such as pmap_remove(), could clear a reference in
- * the pmap and set PGA_REFERENCED on the page before the
- * pmap_clear_reference() had completed. Consequently, the page would
- * appear referenced based upon an old reference that occurred before
- * this function ran.
*/
- pmap_clear_reference(m);
vm_page_aflag_clear(m, PGA_REFERENCED);
- if (m->dirty == 0 && pmap_is_modified(m))
+ if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
vm_page_dirty(m);
- if (m->dirty || (dnw & 0x0070) == 0) {
- /*
- * Deactivate the page 3 times out of 32.
- */
- head = 0;
- } else {
- /*
- * Cache the page 28 times out of every 32. Note that
- * the page is deactivated instead of cached, but placed
- * at the head of the queue instead of the tail.
- */
- head = 1;
- }
- _vm_page_deactivate(m, head);
+ /*
+ * Place clean pages at the head of the inactive queue rather than the
+ * tail, thus defeating the queue's LRU operation and ensuring that the
+ * page will be reused quickly.
+ */
+ _vm_page_deactivate(m, m->dirty == 0);
}
/*
@@ -2347,9 +2732,6 @@
* to be in the object. If the page doesn't exist, first allocate it
* and then conditionally zero it.
*
- * The caller must always specify the VM_ALLOC_RETRY flag. This is intended
- * to facilitate its eventual removal.
- *
* This routine may sleep.
*
* The object must be locked on entry. The lock will, however, be released
@@ -2359,14 +2741,17 @@
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
vm_page_t m;
+ int sleep;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
- ("vm_page_grab: VM_ALLOC_RETRY is required"));
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+ (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+ ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
retrylookup:
if ((m = vm_page_lookup(object, pindex)) != NULL) {
- if ((m->oflags & VPO_BUSY) != 0 ||
- ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
+ sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+ vm_page_xbusied(m) : vm_page_busied(m);
+ if (sleep) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
@@ -2373,7 +2758,11 @@
* likely to reclaim it.
*/
vm_page_aflag_set(m, PGA_REFERENCED);
- vm_page_sleep(m, "pgrbwt");
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_busy_sleep(m, "pgrbwt", (allocflags &
+ VM_ALLOC_IGN_SBUSY) != 0);
+ VM_OBJECT_WLOCK(object);
goto retrylookup;
} else {
if ((allocflags & VM_ALLOC_WIRED) != 0) {
@@ -2381,17 +2770,19 @@
vm_page_wire(m);
vm_page_unlock(m);
}
- if ((allocflags & VM_ALLOC_NOBUSY) == 0)
- vm_page_busy(m);
+ if ((allocflags &
+ (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
+ vm_page_xbusy(m);
+ if ((allocflags & VM_ALLOC_SBUSY) != 0)
+ vm_page_sbusy(m);
return (m);
}
}
- m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
- VM_ALLOC_IGN_SBUSY));
+ m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
if (m == NULL) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
VM_WAIT;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
goto retrylookup;
} else if (m->valid != 0)
return (m);
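
A hypothetical use of the updated vm_page_grab() (VM_ALLOC_RETRY is no longer required); the helper name is invented, and a page that is only partially valid would need pager I/O, which is omitted here:

    static vm_page_t
    example_grab(vm_object_t object, vm_pindex_t pindex)
    {
    	vm_page_t m;

    	VM_OBJECT_WLOCK(object);
    	/* Returned exclusive busied and wired; sleeps internally as needed. */
    	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
    	if (m->valid == 0) {
    		/* Freshly allocated page: zero it and mark it valid. */
    		pmap_zero_page(m);
    		m->valid = VM_PAGE_BITS_ALL;
    	}
    	vm_page_xunbusy(m);
    	VM_OBJECT_WUNLOCK(object);
    	return (m);
    }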
@@ -2427,7 +2818,7 @@
}
/*
- * vm_page_set_valid:
+ * vm_page_set_valid_range:
*
* Sets portions of a page valid. The arguments are expected
* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
@@ -2437,11 +2828,11 @@
 * (base + size) must be less than or equal to PAGE_SIZE.
*/
void
-vm_page_set_valid(vm_page_t m, int base, int size)
+vm_page_set_valid_range(vm_page_t m, int base, int size)
{
int endoff, frag;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
if (size == 0) /* handle degenerate case */
return;
@@ -2470,7 +2861,7 @@
* is already dirty.
*/
KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
- ("vm_page_set_valid: page %p is dirty", m));
+ ("vm_page_set_valid_range: page %p is dirty", m));
/*
* Set valid bits inclusive of any overlap.
@@ -2490,12 +2881,12 @@
#endif
/*
- * If the object is locked and the page is neither VPO_BUSY nor
+ * If the object is locked and the page is neither exclusive busy nor
* write mapped, then the page's dirty field cannot possibly be
* set by a concurrent pmap operation.
*/
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->oflags & VPO_BUSY) == 0 && !pmap_page_is_write_mapped(m))
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
m->dirty &= ~pagebits;
else {
/*
@@ -2548,7 +2939,7 @@
vm_page_bits_t oldvalid, pagebits;
int endoff, frag;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
if (size == 0) /* handle degenerate case */
return;
@@ -2637,14 +3028,20 @@
vm_page_set_invalid(vm_page_t m, int base, int size)
{
vm_page_bits_t bits;
+ vm_object_t object;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT((m->oflags & VPO_BUSY) == 0,
- ("vm_page_set_invalid: page %p is busy", m));
- bits = vm_page_bits(base, size);
- if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
+ object = m->object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
+ size >= object->un_pager.vnp.vnp_size)
+ bits = VM_PAGE_BITS_ALL;
+ else
+ bits = vm_page_bits(base, size);
+ if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
+ bits != 0)
pmap_remove_all(m);
- KASSERT(!pmap_page_is_mapped(m),
+ KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
+ !pmap_page_is_mapped(m),
("vm_page_set_invalid: page %p is mapped", m));
m->valid &= ~bits;
m->dirty &= ~bits;
@@ -2667,11 +3064,11 @@
int b;
int i;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
/*
* Scan the valid bits looking for invalid sections that
- * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the
- * valid bit may be set ) have already been zerod by
+ * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the
+ * valid bit may be set ) have already been zeroed by
* vm_page_set_validclean().
*/
for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
@@ -2706,15 +3103,37 @@
{
vm_page_bits_t bits;
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(m->object);
bits = vm_page_bits(base, size);
- if (m->valid && ((m->valid & bits) == bits))
- return 1;
- else
- return 0;
+ return (m->valid != 0 && (m->valid & bits) == bits);
}
/*
+ * vm_page_ps_is_valid:
+ *
+ * Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ */
+boolean_t
+vm_page_ps_is_valid(vm_page_t m)
+{
+ int i, npages;
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ npages = atop(pagesizes[m->psind]);
+
+ /*
+ * The physically contiguous pages that make up a superpage, i.e., a
+ * page with a page size index ("psind") greater than zero, will
+ * occupy adjacent entries in vm_page_array[].
+ */
+ for (i = 0; i < npages; i++) {
+ if (m[i].valid != VM_PAGE_BITS_ALL)
+ return (FALSE);
+ }
+ return (TRUE);
+}
+
+/*
* Set the page's dirty bits if the page is modified.
*/
void
@@ -2721,7 +3140,7 @@
vm_page_test_dirty(vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
vm_page_dirty(m);
}
@@ -2749,127 +3168,51 @@
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void
-vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
+vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
{
- mtx_assert_(vm_page_lockptr(m), a, file, line);
+ vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
}
-#endif
-int so_zerocp_fullpage = 0;
-
-/*
- * Replace the given page with a copy. The copied page assumes
- * the portion of the given page's "wire_count" that is not the
- * responsibility of this copy-on-write mechanism.
- *
- * The object containing the given page must have a non-zero
- * paging-in-progress count and be locked.
- */
void
-vm_page_cowfault(vm_page_t m)
+vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
{
- vm_page_t mnew;
- vm_object_t object;
- vm_pindex_t pindex;
- mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
- vm_page_lock_assert(m, MA_OWNED);
- object = m->object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- KASSERT(object->paging_in_progress != 0,
- ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
- object));
- pindex = m->pindex;
-
- retry_alloc:
- pmap_remove_all(m);
- vm_page_remove(m);
- mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
- if (mnew == NULL) {
- vm_page_insert(m, object, pindex);
- vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- VM_WAIT;
- VM_OBJECT_LOCK(object);
- if (m == vm_page_lookup(object, pindex)) {
- vm_page_lock(m);
- goto retry_alloc;
- } else {
- /*
- * Page disappeared during the wait.
- */
- return;
- }
- }
-
- if (m->cow == 0) {
- /*
- * check to see if we raced with an xmit complete when
- * waiting to allocate a page. If so, put things back
- * the way they were
- */
- vm_page_unlock(m);
- vm_page_lock(mnew);
- vm_page_free(mnew);
- vm_page_unlock(mnew);
- vm_page_insert(m, object, pindex);
- } else { /* clear COW & copy page */
- if (!so_zerocp_fullpage)
- pmap_copy_page(m, mnew);
- mnew->valid = VM_PAGE_BITS_ALL;
- vm_page_dirty(mnew);
- mnew->wire_count = m->wire_count - m->cow;
- m->wire_count = m->cow;
- vm_page_unlock(m);
- }
+ mtx_assert_(vm_page_lockptr(m), a, file, line);
}
+#endif
-void
-vm_page_cowclear(vm_page_t m)
+#ifdef INVARIANTS
+void
+vm_page_object_lock_assert(vm_page_t m)
{
- vm_page_lock_assert(m, MA_OWNED);
- if (m->cow) {
- m->cow--;
- /*
- * let vm_fault add back write permission lazily
- */
- }
/*
- * sf_buf_free() will free the page, so we needn't do it here
- */
+ * Certain of the page's fields may only be modified by the
+ * holder of the containing object's lock or the exclusive busy
+ * holder. Unfortunately, the holder of the write busy is
+ * not recorded, and thus cannot be checked here.
+ */
+ if (m->object != NULL && !vm_page_xbusied(m))
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
}
-int
-vm_page_cowsetup(vm_page_t m)
+void
+vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
{
- vm_page_lock_assert(m, MA_OWNED);
- if ((m->flags & PG_FICTITIOUS) != 0 ||
- (m->oflags & VPO_UNMANAGED) != 0 ||
- m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
- return (EBUSY);
- m->cow++;
- pmap_remove_write(m);
- VM_OBJECT_UNLOCK(m->object);
- return (0);
-}
+ if ((bits & PGA_WRITEABLE) == 0)
+ return;
-#ifdef INVARIANTS
-void
-vm_page_object_lock_assert(vm_page_t m)
-{
-
/*
- * Certain of the page's fields may only be modified by the
- * holder of the containing object's lock or the setter of the
- * page's VPO_BUSY flag. Unfortunately, the setter of the
- * VPO_BUSY flag is not recorded, and thus cannot be checked
- * here.
+ * The PGA_WRITEABLE flag can only be set if the page is
+ * managed, is exclusively busied or the object is locked.
+ * Currently, this flag is only set by pmap_enter().
*/
- if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("PGA_WRITEABLE on unmanaged page"));
+ if (!vm_page_xbusied(m))
+ VM_OBJECT_ASSERT_LOCKED(m->object);
}
#endif
@@ -2895,18 +3238,20 @@
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
-
- db_printf("PQ_FREE:");
- db_printf(" %d", cnt.v_free_count);
- db_printf("\n");
-
- db_printf("PQ_CACHE:");
- db_printf(" %d", cnt.v_cache_count);
- db_printf("\n");
+ int dom;
- db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
- *vm_page_queues[PQ_ACTIVE].cnt,
- *vm_page_queues[PQ_INACTIVE].cnt);
+ db_printf("pq_free %d pq_cache %d\n",
+ cnt.v_free_count, cnt.v_cache_count);
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ db_printf(
+ "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+ dom,
+ vm_dom[dom].vmd_page_count,
+ vm_dom[dom].vmd_free_count,
+ vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
+ vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
+ vm_dom[dom].vmd_pass);
+ }
}
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
@@ -2926,9 +3271,9 @@
m = (vm_page_t)addr;
db_printf(
"page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
- " af 0x%x of 0x%x f 0x%x act %d busy %d valid 0x%x dirty 0x%x\n",
+ " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
- m->flags, m->act_count, m->busy, m->valid, m->dirty);
+ m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
}
#endif /* DDB */
Modified: trunk/sys/vm/vm_page.h
===================================================================
--- trunk/sys/vm/vm_page.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_page.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $
*/
/*
@@ -74,9 +75,9 @@
*
* A small structure is kept for each resident
* page, indexed by page number. Each structure
- * is an element of several lists:
+ * is an element of several collections:
*
- * A hash table bucket used to quickly
+ * A radix tree used to quickly
* perform object/offset lookups
*
* A list of all pages for a given object,
@@ -92,7 +93,7 @@
* In general, operations on this structure's mutable fields are
* synchronized using either one of or a combination of the lock on the
* object that the page belongs to (O), the pool lock for the page (P),
- * or the lock for either the free or paging queues (Q). If a field is
+ * or the lock for either the free or paging queue (Q). If a field is
* annotated below with two of these locks, then holding either lock is
* sufficient for read access, but both locks are required for write
* access.
@@ -111,8 +112,6 @@
* field is encapsulated in vm_page_clear_dirty_mask().
*/
-TAILQ_HEAD(pglist, vm_page);
-
#if PAGE_SIZE == 4096
#define VM_PAGE_BITS_ALL 0xffu
typedef uint8_t vm_page_bits_t;
@@ -128,31 +127,38 @@
#endif
struct vm_page {
- TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (Q) */
- TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
- struct vm_page *left; /* splay tree link (O) */
- struct vm_page *right; /* splay tree link (O) */
-
- vm_object_t object; /* which object am I in (O,P)*/
+ union {
+ TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */
+ struct {
+ SLIST_ENTRY(vm_page) ss; /* private slists */
+ void *pv;
+ } s;
+ struct {
+ u_long p;
+ u_long v;
+ } memguard;
+ } plinks;
+ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
+ vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
vm_paddr_t phys_addr; /* physical address of page */
 struct md_page md; /* machine dependent stuff */
+ u_int wire_count; /* wired down maps refs (P) */
+ volatile u_int busy_lock; /* busy owners lock */
+ uint16_t hold_count; /* page hold count (P) */
+ uint16_t flags; /* page PG_* flags (P) */
+ uint8_t aflags; /* access is atomic */
+ uint8_t oflags; /* page VPO_* flags (O) */
uint8_t queue; /* page queue index (P,Q) */
int8_t segind;
- short hold_count; /* page hold count (P) */
uint8_t order; /* index of the buddy queue */
uint8_t pool;
- u_short cow; /* page cow mapping count (P) */
- u_int wire_count; /* wired down maps refs (P) */
- uint8_t aflags; /* access is atomic */
- uint8_t flags; /* see below, often immutable after alloc */
- u_short oflags; /* page flags (O) */
- u_char act_count; /* page usage count (O) */
- u_char busy; /* page busy count (O) */
- /* NOTE that these must support one bit per DEV_BSIZE in a page!!! */
+ u_char act_count; /* page usage count (P) */
+ /* NOTE that these must support one bit per DEV_BSIZE in a page */
/* so, on normal X86 kernels, they must be at least 8 bits wide */
vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */
vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */
+ int8_t psind; /* pagesizes[] index (O) */
};
/*
@@ -169,33 +175,88 @@
* mappings, and such pages are also not on any PQ queue.
*
*/
-#define VPO_BUSY 0x0001 /* page is in transit */
-#define VPO_WANTED 0x0002 /* someone is waiting for page */
-#define VPO_UNMANAGED 0x0004 /* No PV management for page */
-#define VPO_SWAPINPROG 0x0200 /* swap I/O in progress on page */
-#define VPO_NOSYNC 0x0400 /* do not collect for syncer */
+#define VPO_UNUSED01 0x01 /* --available-- */
+#define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */
+#define VPO_UNMANAGED 0x04 /* no PV management for page */
+#define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */
+#define VPO_NOSYNC 0x10 /* do not collect for syncer */
+/*
+ * Busy page implementation details.
+ * The algorithm is taken mostly from the rwlock(9) and sx(9) lock
+ * implementations, although support for owner identity is removed because
+ * of size constraints. Checks on lock recursion are therefore not
+ * possible, and the effectiveness of the lock assertions is somewhat
+ * reduced.
+ */
+#define VPB_BIT_SHARED 0x01
+#define VPB_BIT_EXCLUSIVE 0x02
+#define VPB_BIT_WAITERS 0x04
+#define VPB_BIT_FLAGMASK \
+ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS)
+
+#define VPB_SHARERS_SHIFT 3
+#define VPB_SHARERS(x) \
+ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT)
+#define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED)
+#define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT)
+
+#define VPB_SINGLE_EXCLUSIVER VPB_BIT_EXCLUSIVE
+
+#define VPB_UNBUSIED VPB_SHARERS_WORD(0)
+
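
The VPB_* constants above pack the whole busy state into the 32-bit busy_lock word: bit 0 marks shared ownership, bit 1 exclusive ownership, bit 2 waiters, and the remaining bits count the sharers. A minimal stand-alone sketch (not part of the patch; the decode() helper is invented for illustration, and the constants are copied from the hunk above):

#include <stdint.h>
#include <stdio.h>

/* Copied from the VPB_* definitions above. */
#define VPB_BIT_SHARED		0x01
#define VPB_BIT_EXCLUSIVE	0x02
#define VPB_BIT_WAITERS		0x04
#define VPB_BIT_FLAGMASK	(VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS)
#define VPB_SHARERS_SHIFT	3
#define VPB_SHARERS(x)		(((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT)
#define VPB_SHARERS_WORD(x)	((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED)
#define VPB_UNBUSIED		VPB_SHARERS_WORD(0)
#define VPB_SINGLE_EXCLUSIVER	VPB_BIT_EXCLUSIVE

static void
decode(uint32_t busy_lock)
{
	if (busy_lock == VPB_UNBUSIED)
		printf("0x%x: unbusied\n", busy_lock);
	else if (busy_lock & VPB_BIT_EXCLUSIVE)
		printf("0x%x: exclusive busy%s\n", busy_lock,
		    (busy_lock & VPB_BIT_WAITERS) ? ", waiters" : "");
	else
		printf("0x%x: shared busy, %u sharer(s)%s\n", busy_lock,
		    VPB_SHARERS(busy_lock),
		    (busy_lock & VPB_BIT_WAITERS) ? ", waiters" : "");
}

int
main(void)
{
	decode(VPB_UNBUSIED);				/* 0x1 */
	decode(VPB_SINGLE_EXCLUSIVER);			/* 0x2 */
	decode(VPB_SHARERS_WORD(3));			/* 0x19: three sharers */
	decode(VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS);	/* sharer plus waiters */
	return (0);
}
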
#define PQ_NONE 255
#define PQ_INACTIVE 0
#define PQ_ACTIVE 1
-#define PQ_HOLD 2
-#define PQ_COUNT 3
+#define PQ_COUNT 2
-struct vpgqueues {
- struct pglist pl;
- int *cnt;
+TAILQ_HEAD(pglist, vm_page);
+SLIST_HEAD(spglist, vm_page);
+
+struct vm_pagequeue {
+ struct mtx pq_mutex;
+ struct pglist pq_pl;
+ int pq_cnt;
+ u_int * const pq_vcnt;
+ const char * const pq_name;
+} __aligned(CACHE_LINE_SIZE);
+
+
+struct vm_domain {
+ struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+ u_int vmd_page_count;
+ u_int vmd_free_count;
+ long vmd_segs; /* bitmask of the segments */
+ boolean_t vmd_oom;
+ int vmd_pass; /* local pagedaemon pass */
+ int vmd_oom_seq;
+ int vmd_last_active_scan;
+ struct vm_page vmd_marker; /* marker for pagedaemon private use */
};
-extern struct vpgqueues vm_page_queues[PQ_COUNT];
+extern struct vm_domain vm_dom[MAXMEMDOM];
-struct vpglocks {
- struct mtx data;
- char pad[CACHE_LINE_SIZE - sizeof(struct mtx)];
-} __aligned(CACHE_LINE_SIZE);
+#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
+#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
+#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
-extern struct vpglocks vm_page_queue_free_lock;
-extern struct vpglocks pa_lock[];
+#ifdef _KERNEL
+static __inline void
+vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
+{
+#ifdef notyet
+ vm_pagequeue_assert_locked(pq);
+#endif
+ pq->pq_cnt += addend;
+ atomic_add_int(pq->pq_vcnt, addend);
+}
+#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
+#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
+#endif /* _KERNEL */
+
+extern struct mtx_padalign vm_page_queue_free_mtx;
+extern struct mtx_padalign pa_lock[];
+
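
With vm_page_queues[] and the global queue lock gone, each queue now carries its own mutex inside struct vm_pagequeue. A hedged sketch of the resulting locking pattern (the helper name is invented; it simply mirrors what vm_page_requeue() does in this change):

/*
 * Illustrative only: move a page to the tail of its current queue,
 * taking the per-queue mutex instead of the old global queue lock.
 */
static void
example_requeue_tail(vm_page_t m)
{
	struct vm_pagequeue *pq;

	vm_page_lock_assert(m, MA_OWNED);	/* caller holds the page lock */
	pq = vm_page_pagequeue(m);
	vm_pagequeue_lock(pq);
	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
	vm_pagequeue_unlock(pq);
}
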
#if defined(__arm__)
#define PDRSHIFT PDR_SHIFT
#elif !defined(PDRSHIFT)
@@ -203,7 +264,7 @@
#endif
#define pa_index(pa) ((pa) >> PDRSHIFT)
-#define PA_LOCKPTR(pa) &pa_lock[pa_index((pa)) % PA_LOCK_COUNT].data
+#define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT]))
#define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa)))
#define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa))
#define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa))
@@ -222,35 +283,36 @@
#define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE)
#define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE)
#define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE)
+#else /* !KLD_MODULE */
+#define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m))))
+#define vm_page_lock(m) mtx_lock(vm_page_lockptr((m)))
+#define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m)))
+#define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m)))
+#endif
#if defined(INVARIANTS)
+#define vm_page_assert_locked(m) \
+ vm_page_assert_locked_KBI((m), __FILE__, __LINE__)
#define vm_page_lock_assert(m, a) \
vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__)
#else
+#define vm_page_assert_locked(m)
#define vm_page_lock_assert(m, a)
#endif
-#else /* !KLD_MODULE */
-#define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m))))
-#define vm_page_lock(m) mtx_lock(vm_page_lockptr((m)))
-#define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m)))
-#define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m)))
-#define vm_page_lock_assert(m, a) mtx_assert(vm_page_lockptr((m)), (a))
-#endif
-#define vm_page_queue_free_mtx vm_page_queue_free_lock.data
-
/*
- * These are the flags defined for vm_page.
+ * The vm_page's aflags are updated using atomic operations. To set or clear
+ * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear()
+ * must be used. Neither these flags nor these functions are part of the KBI.
*
- * aflags are updated by atomic accesses. Use the vm_page_aflag_set()
- * and vm_page_aflag_clear() functions to set and clear the flags.
+ * PGA_REFERENCED may be cleared only if the page is locked. It is set by
+ * both the MI and MD VM layers. However, kernel loadable modules should not
+ * directly set this flag. They should call vm_page_reference() instead.
*
- * PGA_REFERENCED may be cleared only if the object containing the page is
- * locked. It is set by both the MI and MD VM layers.
+ * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter().
+ * When it does so, the object must be locked, or the page must be
+ * exclusive busied. The MI VM layer must never access this flag
+ * directly. Instead, it should call pmap_page_is_write_mapped().
*
- * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). When it
- * does so, the page must be VPO_BUSY. The MI VM layer must never access this
- * flag directly. Instead, it should call pmap_page_is_write_mapped().
- *
* PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
* at least one executable mapping. It is not consumed by the MI VM layer.
*/
@@ -262,14 +324,14 @@
* Page flags. If changed at any other time than page allocation or
* freeing, the modification must be protected by the vm_page lock.
*/
-#define PG_CACHED 0x01 /* page is cached */
-#define PG_FREE 0x02 /* page is free */
-#define PG_FICTITIOUS 0x04 /* physical page doesn't exist */
-#define PG_ZERO 0x08 /* page is zeroed */
-#define PG_MARKER 0x10 /* special queue marker page */
-#define PG_SLAB 0x20 /* object pointer is actually a slab */
-#define PG_WINATCFLS 0x40 /* flush dirty page on inactive q */
-#define PG_NODUMP 0x80 /* don't include this page in a dump */
+#define PG_CACHED 0x0001 /* page is cached */
+#define PG_FREE 0x0002 /* page is free */
+#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */
+#define PG_ZERO 0x0008 /* page is zeroed */
+#define PG_MARKER 0x0010 /* special queue marker page */
+#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */
+#define PG_NODUMP 0x0080 /* don't include this page in a dump */
+#define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */
/*
* Misc constants.
@@ -281,8 +343,12 @@
#ifdef _KERNEL
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+
/*
- * Each pageable resident page falls into one of five lists:
+ * Each pageable resident page falls into one of four lists:
*
* free
* Available for allocation now.
@@ -291,10 +357,6 @@
* Almost available for allocation. Still associated with
* an object, but clean and immediately freeable.
*
- * hold
- * Will become free after a pending I/O operation
- * completes.
- *
* The following lists are LRU sorted:
*
* inactive
@@ -308,7 +370,6 @@
*
*/
-struct vnode;
extern int vm_page_zero_count;
extern vm_page_t vm_page_array; /* First resident page in table */
@@ -319,16 +380,8 @@
#define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr)
-vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
-
vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
-extern struct vpglocks vm_page_queue_lock;
-
-#define vm_page_queue_mtx vm_page_queue_lock.data
-#define vm_page_lock_queues() mtx_lock(&vm_page_queue_mtx)
-#define vm_page_unlock_queues() mtx_unlock(&vm_page_queue_mtx)
-
/* page allocation classes: */
#define VM_ALLOC_NORMAL 0
#define VM_ALLOC_INTERRUPT 1
@@ -337,7 +390,6 @@
/* page allocation flags: */
#define VM_ALLOC_WIRED 0x0020 /* non pageable */
#define VM_ALLOC_ZERO 0x0040 /* Try to obtain a zeroed page */
-#define VM_ALLOC_RETRY 0x0080 /* Mandatory with vm_page_grab() */
#define VM_ALLOC_NOOBJ 0x0100 /* No associated object */
#define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */
#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */
@@ -344,61 +396,85 @@
#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */
#define VM_ALLOC_IGN_SBUSY 0x1000 /* vm_page_grab() only */
#define VM_ALLOC_NODUMP 0x2000 /* don't include in dump */
+#define VM_ALLOC_SBUSY 0x4000 /* Shared busy the page */
#define VM_ALLOC_COUNT_SHIFT 16
#define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT)
-void vm_page_aflag_set(vm_page_t m, uint8_t bits);
-void vm_page_aflag_clear(vm_page_t m, uint8_t bits);
-void vm_page_busy(vm_page_t m);
+#ifdef M_NOWAIT
+static inline int
+malloc2vm_flags(int malloc_flags)
+{
+ int pflags;
+
+ KASSERT((malloc_flags & M_USE_RESERVE) == 0 ||
+ (malloc_flags & M_NOWAIT) != 0,
+ ("M_USE_RESERVE requires M_NOWAIT"));
+ pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT :
+ VM_ALLOC_SYSTEM;
+ if ((malloc_flags & M_ZERO) != 0)
+ pflags |= VM_ALLOC_ZERO;
+ if ((malloc_flags & M_NODUMP) != 0)
+ pflags |= VM_ALLOC_NODUMP;
+ return (pflags);
+}
+#endif
+
+void vm_page_busy_downgrade(vm_page_t m);
+void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
void vm_page_flash(vm_page_t m);
-void vm_page_io_start(vm_page_t m);
-void vm_page_io_finish(vm_page_t m);
void vm_page_hold(vm_page_t mem);
void vm_page_unhold(vm_page_t mem);
void vm_page_free(vm_page_t m);
void vm_page_free_zero(vm_page_t m);
-void vm_page_dirty(vm_page_t m);
-void vm_page_wakeup(vm_page_t m);
-void vm_pageq_remove(vm_page_t m);
-
void vm_page_activate (vm_page_t);
+void vm_page_advise(vm_page_t m, int advice);
vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
+ u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+ vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
-struct vnode *vm_page_alloc_init(vm_page_t);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
void vm_page_cache(vm_page_t);
void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_remove(vm_page_t);
void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
-void vm_page_dontneed(vm_page_t);
void vm_page_deactivate (vm_page_t);
+void vm_page_dequeue(vm_page_t m);
+void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
-void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
+struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
vm_page_t vm_page_prev(vm_page_t m);
+boolean_t vm_page_ps_is_valid(vm_page_t m);
void vm_page_putfake(vm_page_t m);
void vm_page_readahead_finish(vm_page_t m);
void vm_page_reference(vm_page_t m);
void vm_page_remove (vm_page_t);
-void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
+ vm_pindex_t pindex);
void vm_page_requeue(vm_page_t m);
-void vm_page_set_valid(vm_page_t m, int base, int size);
-void vm_page_sleep(vm_page_t m, const char *msg);
-vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
+void vm_page_requeue_locked(vm_page_t m);
+int vm_page_sbusied(vm_page_t m);
+void vm_page_set_valid_range(vm_page_t m, int base, int size);
+int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
+void vm_page_sunbusy(vm_page_t m);
+int vm_page_trysbusy(vm_page_t m);
void vm_page_unhold_pages(vm_page_t *ma, int count);
void vm_page_unwire (vm_page_t, int);
void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_wire (vm_page_t);
+void vm_page_xunbusy_hard(vm_page_t m);
void vm_page_set_validclean (vm_page_t, int, int);
void vm_page_clear_dirty (vm_page_t, int, int);
void vm_page_set_invalid (vm_page_t, int, int);
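
The new malloc2vm_flags() helper above gives allocators a direct translation from malloc(9) flags to the VM_ALLOC_* page allocation flags declared in this header. A hedged usage sketch (the wrapper function is hypothetical, and the object is assumed to be locked by the caller):

/*
 * Hypothetical caller: allocate a wired page for an object using the
 * caller's malloc(9) flags (M_ZERO -> VM_ALLOC_ZERO, M_NODUMP ->
 * VM_ALLOC_NODUMP, M_USE_RESERVE -> VM_ALLOC_INTERRUPT).
 */
static vm_page_t
example_alloc_page(vm_object_t object, vm_pindex_t pindex, int malloc_flags)
{
	int pflags;

	pflags = malloc2vm_flags(malloc_flags) | VM_ALLOC_WIRED;
	return (vm_page_alloc(object, pindex, pflags));
}
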
@@ -408,46 +484,170 @@
void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
void vm_page_free_toq(vm_page_t m);
void vm_page_zero_idle_wakeup(void);
-void vm_page_cowfault (vm_page_t);
-int vm_page_cowsetup(vm_page_t);
-void vm_page_cowclear (vm_page_t);
+void vm_page_dirty_KBI(vm_page_t m);
void vm_page_lock_KBI(vm_page_t m, const char *file, int line);
void vm_page_unlock_KBI(vm_page_t m, const char *file, int line);
int vm_page_trylock_KBI(vm_page_t m, const char *file, int line);
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
+void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line);
void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line);
#endif
+#define vm_page_assert_sbusied(m) \
+ KASSERT(vm_page_sbusied(m), \
+ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
+ (void *)m, __FILE__, __LINE__));
+
+#define vm_page_assert_unbusied(m) \
+ KASSERT(!vm_page_busied(m), \
+ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \
+ (void *)m, __FILE__, __LINE__));
+
+#define vm_page_assert_xbusied(m) \
+ KASSERT(vm_page_xbusied(m), \
+ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
+ (void *)m, __FILE__, __LINE__));
+
+#define vm_page_busied(m) \
+ ((m)->busy_lock != VPB_UNBUSIED)
+
+#define vm_page_sbusy(m) do { \
+ if (!vm_page_trysbusy(m)) \
+ panic("%s: page %p failed shared busing", __func__, m); \
+} while (0)
+
+#define vm_page_tryxbusy(m) \
+ (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, \
+ VPB_SINGLE_EXCLUSIVER))
+
+#define vm_page_xbusied(m) \
+ ((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
+
+#define vm_page_xbusy(m) do { \
+ if (!vm_page_tryxbusy(m)) \
+ panic("%s: page %p failed exclusive busing", __func__, \
+ m); \
+} while (0)
+
+#define vm_page_xunbusy(m) do { \
+ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \
+ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \
+ vm_page_xunbusy_hard(m); \
+} while (0)
+
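
The macros above replace the old VPO_BUSY flag and m->busy counter with a single sleepable, rwlock-like primitive. A hedged sketch of the exclusive-busy usage pattern (the helper is invented; pmap_zero_page() merely stands in for whatever work is done while the page is busied):

/*
 * Illustrative only: exclusively busy a page around a temporary
 * operation.  vm_page_xbusy() panics if the page is already busied,
 * so racy callers should use vm_page_tryxbusy() or sleep with
 * vm_page_busy_sleep() instead.
 */
static void
example_with_xbusy(vm_page_t m)
{
	vm_page_xbusy(m);
	pmap_zero_page(m);
	vm_page_xunbusy(m);
}
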
#ifdef INVARIANTS
void vm_page_object_lock_assert(vm_page_t m);
#define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m)
+void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits);
+#define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \
+ vm_page_assert_pga_writeable(m, bits)
#else
#define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0
+#define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0
#endif
/*
- * vm_page_sleep_if_busy:
+ * We want to use atomic updates for the aflags field, which is 8 bits wide.
+ * However, not all architectures support atomic operations on 8-bit
+ * destinations. In order that we can easily use a 32-bit operation, we
+ * require that the aflags field be 32-bit aligned.
+ */
+CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
+
+/*
+ * Clear the given bits in the specified page.
+ */
+static inline void
+vm_page_aflag_clear(vm_page_t m, uint8_t bits)
+{
+ uint32_t *addr, val;
+
+ /*
+ * The PGA_REFERENCED flag can only be cleared if the page is locked.
+ */
+ if ((bits & PGA_REFERENCED) != 0)
+ vm_page_assert_locked(m);
+
+ /*
+ * Access the whole 32-bit word containing the aflags field with an
+ * atomic update. Parallel non-atomic updates to the other fields
+ * within this word are handled properly by the atomic update.
+ */
+ addr = (void *)&m->aflags;
+ KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
+ ("vm_page_aflag_clear: aflags is misaligned"));
+ val = bits;
+#if BYTE_ORDER == BIG_ENDIAN
+ val <<= 24;
+#endif
+ atomic_clear_32(addr, val);
+}
+
+/*
+ * Set the given bits in the specified page.
+ */
+static inline void
+vm_page_aflag_set(vm_page_t m, uint8_t bits)
+{
+ uint32_t *addr, val;
+
+ VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits);
+
+ /*
+ * Access the whole 32-bit word containing the aflags field with an
+ * atomic update. Parallel non-atomic updates to the other fields
+ * within this word are handled properly by the atomic update.
+ */
+ addr = (void *)&m->aflags;
+ KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
+ ("vm_page_aflag_set: aflags is misaligned"));
+ val = bits;
+#if BYTE_ORDER == BIG_ENDIAN
+ val <<= 24;
+#endif
+ atomic_set_32(addr, val);
+}
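
To see why the big-endian branch above shifts the mask by 24, recall that the CTASSERT keeps aflags in the lowest-addressed byte of a 32-bit aligned word; on big-endian machines that byte is the most significant one. A small stand-alone sketch (not part of the patch; values are illustrative):

#include <stdint.h>
#include <stdio.h>

/*
 * Demonstrates the adjustment used by vm_page_aflag_set()/_clear():
 * the 8-bit aflags byte is the lowest-addressed byte of its 32-bit
 * word, so the mask must land in the most significant byte on
 * big-endian machines and in the least significant byte otherwise.
 */
int
main(void)
{
	uint32_t word = 0, probe = 1, val;
	uint8_t bits = 0x08;		/* stands in for one PGA_* flag */

	val = bits;
	if (*(uint8_t *)&probe == 0)	/* big-endian host */
		val <<= 24;
	word |= val;			/* stands in for atomic_set_32() */
	printf("aflags byte = 0x%02x\n", *(uint8_t *)&word);
	return (0);
}
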
+
+/*
+ * vm_page_dirty:
*
- * Sleep and release the page queues lock if VPO_BUSY is set or,
- * if also_m_busy is TRUE, busy is non-zero. Returns TRUE if the
- * thread slept and the page queues lock was released.
- * Otherwise, retains the page queues lock and returns FALSE.
+ * Set all bits in the page's dirty field.
*
- * The object containing the given page must be locked.
+ * The object containing the specified page must be locked if the
+ * call is made from the machine-independent layer.
+ *
+ * See vm_page_clear_dirty_mask().
*/
-static __inline int
-vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+static __inline void
+vm_page_dirty(vm_page_t m)
{
- if ((m->oflags & VPO_BUSY) || (also_m_busy && m->busy)) {
- vm_page_sleep(m, msg);
- return (TRUE);
- }
- return (FALSE);
+ /* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */
+#if defined(KLD_MODULE) || defined(INVARIANTS)
+ vm_page_dirty_KBI(m);
+#else
+ m->dirty = VM_PAGE_BITS_ALL;
+#endif
}
/*
+ * vm_page_remque:
+ *
+ * If the given page is in a page queue, then remove it from that page
+ * queue.
+ *
+ * The page must be locked.
+ */
+static inline void
+vm_page_remque(vm_page_t m)
+{
+
+ if (m->queue != PQ_NONE)
+ vm_page_dequeue(m);
+}
+
+/*
* vm_page_undirty:
*
* Set page to not be dirty. Note: does not clear pmap modify bits
Modified: trunk/sys/vm/vm_pageout.c
===================================================================
--- trunk/sys/vm/vm_pageout.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pageout.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
@@ -73,9 +74,10 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $");
#include "opt_vm.h"
+#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -89,9 +91,13 @@
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
+#include <sys/sdt.h>
#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
+#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
@@ -102,6 +108,7 @@
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -112,9 +119,15 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
+static void vm_pageout_init(void);
static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(int pass);
+static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+ int starting_page_shortage);
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+ NULL);
+
struct proc *pageproc;
static struct kproc_desc page_kp = {
@@ -122,9 +135,13 @@
vm_pageout,
&pageproc
};
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
&page_kp);
+SDT_PROVIDER_DEFINE(vm);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
+
#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
@@ -142,6 +159,8 @@
int vm_pages_needed; /* Event on which pageout daemon sleeps */
int vm_pageout_deficit; /* Estimated number of pages deficit */
int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */
+int vm_pageout_wakeup_thresh;
+static int vm_pageout_oom_seq = 12;
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
@@ -151,35 +170,34 @@
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
-static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
-static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_algorithm=0;
-static int defer_swap_pageouts=0;
-static int disable_swap_pageouts=0;
+static int vm_pageout_update_period;
+static int defer_swap_pageouts;
+static int disable_swap_pageouts;
+static int lowmem_period = 10;
+static time_t lowmem_uptime;
#if defined(NO_SWAPPING)
-static int vm_swap_enabled=0;
-static int vm_swap_idle_enabled=0;
+static int vm_swap_enabled = 0;
+static int vm_swap_idle_enabled = 0;
#else
-static int vm_swap_enabled=1;
-static int vm_swap_idle_enabled=0;
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
#endif
-SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
- CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
+ CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+ "free page threshold for waking up the pageout daemon");
SYSCTL_INT(_vm, OID_AUTO, max_launder,
CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
-SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
- CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
+SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
+ CTLFLAG_RW, &vm_pageout_update_period, 0,
+ "Maximum active LRU update period");
+
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+ "Low memory callback period");
-SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
- CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
-
-SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
- CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
-
#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
@@ -202,6 +220,10 @@
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
+SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
+ CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+ "back-to-back calls to oom detector to start OOM");
+
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@@ -209,18 +231,21 @@
SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
+static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
+static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
+ vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
-static void vm_pageout_page_stats(void);
+static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
/*
* Initialize a dummy page for marking the caller's place in the specified
* paging queue. In principle, this function only needs to set the flag
- * PG_MARKER. Nonetheless, it sets the flag VPO_BUSY and initializes the hold
- * count to one as safety precautions.
+ * PG_MARKER. Nonetheless, it write busies the marker and initializes the
+ * hold count to one as safety precautions.
*/
static void
vm_pageout_init_marker(vm_page_t marker, u_short queue)
@@ -228,7 +253,7 @@
bzero(marker, sizeof(*marker));
marker->flags = PG_MARKER;
- marker->oflags = VPO_BUSY;
+ marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
marker->queue = queue;
marker->hold_count = 1;
}
@@ -236,9 +261,9 @@
/*
* vm_pageout_fallback_object_lock:
*
- * Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is
+ * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
* known to have failed and page queue must be either PQ_ACTIVE or
- * PQ_INACTIVE. To avoid lock order violation, unlock the page queues
+ * PQ_INACTIVE. To avoid lock order violation, unlock the page queue
* while locking the vm object. Use marker page to detect page queue
* changes and maintain notion of next page on page queue. Return
* TRUE if no changes were detected, FALSE otherwise. vm object is
@@ -247,10 +272,11 @@
* This function depends on both the lock portion of struct vm_object
* and normal struct vm_page being type stable.
*/
-boolean_t
+static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
struct vm_page marker;
+ struct vm_pagequeue *pq;
boolean_t unchanged;
u_short queue;
vm_object_t object;
@@ -257,23 +283,32 @@
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
+ pq = vm_page_pagequeue(m);
object = m->object;
- TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
- m, &marker, pageq);
- vm_page_unlock_queues();
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
+ vm_pagequeue_unlock(pq);
vm_page_unlock(m);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_page_lock(m);
- vm_page_lock_queues();
+ vm_pagequeue_lock(pq);
- /* Page queue might have changed. */
- *next = TAILQ_NEXT(&marker, pageq);
- unchanged = (m->queue == queue &&
- m->object == object &&
- &marker == TAILQ_NEXT(m, pageq));
- TAILQ_REMOVE(&vm_page_queues[queue].pl,
- &marker, pageq);
+ /*
+ * The page's object might have changed, and/or the page might
+ * have moved from its original position in the queue. If the
+ * page's object has changed, then the caller should abandon
+ * processing the page because the wrong object lock was
+ * acquired. Use the marker's plinks.q, not the page's, to
+ * determine if the page has been moved. The state of the
+ * page's plinks.q can be indeterminate; whereas, the marker's
+ * plinks.q must be valid.
+ */
+ *next = TAILQ_NEXT(&marker, plinks.q);
+ unchanged = m->object == object &&
+ m == TAILQ_PREV(&marker, pglist, plinks.q);
+ KASSERT(!unchanged || m->queue == queue,
+ ("page %p queue %d %d", m, queue, m->queue));
+ TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
}
@@ -286,31 +321,33 @@
*
* This function depends on normal struct vm_page being type stable.
*/
-boolean_t
+static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
{
struct vm_page marker;
+ struct vm_pagequeue *pq;
boolean_t unchanged;
u_short queue;
vm_page_lock_assert(m, MA_NOTOWNED);
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-
if (vm_page_trylock(m))
return (TRUE);
queue = m->queue;
vm_pageout_init_marker(&marker, queue);
+ pq = vm_page_pagequeue(m);
- TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq);
- vm_page_unlock_queues();
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
+ vm_pagequeue_unlock(pq);
vm_page_lock(m);
- vm_page_lock_queues();
+ vm_pagequeue_lock(pq);
/* Page queue might have changed. */
- *next = TAILQ_NEXT(&marker, pageq);
- unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, pageq));
- TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq);
+ *next = TAILQ_NEXT(&marker, plinks.q);
+ unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
+ KASSERT(!unchanged || m->queue == queue,
+ ("page %p queue %d %d", m, queue, m->queue));
+ TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
return (unchanged);
}
@@ -334,7 +371,7 @@
vm_page_lock_assert(m, MA_OWNED);
object = m->object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
@@ -348,8 +385,7 @@
/*
* Can't clean the page if it's busy or held.
*/
- KASSERT(m->busy == 0 && (m->oflags & VPO_BUSY) == 0,
- ("vm_pageout_clean: page %p is busy", m));
+ vm_page_assert_unbusied(m);
KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
vm_page_unlock(m);
@@ -387,15 +423,17 @@
break;
}
- if ((p = vm_page_prev(pb)) == NULL ||
- (p->oflags & VPO_BUSY) != 0 || p->busy != 0) {
+ if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
ib = 0;
break;
}
+ vm_page_test_dirty(p);
+ if (p->dirty == 0) {
+ ib = 0;
+ break;
+ }
vm_page_lock(p);
- vm_page_test_dirty(p);
- if (p->dirty == 0 ||
- p->queue != PQ_INACTIVE ||
+ if (p->queue != PQ_INACTIVE ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
ib = 0;
@@ -417,13 +455,13 @@
pindex + is < object->size) {
vm_page_t p;
- if ((p = vm_page_next(ps)) == NULL ||
- (p->oflags & VPO_BUSY) != 0 || p->busy != 0)
+ if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
+ vm_page_test_dirty(p);
+ if (p->dirty == 0)
+ break;
vm_page_lock(p);
- vm_page_test_dirty(p);
- if (p->dirty == 0 ||
- p->queue != PQ_INACTIVE ||
+ if (p->queue != PQ_INACTIVE ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
break;
@@ -472,8 +510,7 @@
int numpagedout = 0;
int i, runlen;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* Initiate I/O. Bump the vm_page_t->busy counter and
@@ -489,7 +526,7 @@
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
("vm_pageout_flush: partially invalid page %p index %d/%d",
mc[i], i, count));
- vm_page_io_start(mc[i]);
+ vm_page_sbusy(mc[i]);
pmap_remove_write(mc[i]);
}
vm_object_pip_add(object, count);
@@ -545,7 +582,7 @@
*/
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
- vm_page_io_finish(mt);
+ vm_page_sunbusy(mt);
if (vm_page_count_severe()) {
vm_page_lock(mt);
vm_page_try_to_cache(mt);
@@ -558,6 +595,170 @@
return (numpagedout);
}
+static boolean_t
+vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
+ vm_paddr_t high)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ vm_object_t object;
+ vm_paddr_t pa;
+ vm_page_t m, m_tmp, next;
+ int lockmode;
+
+ vm_pagequeue_lock(pq);
+ TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
+ if ((m->flags & PG_MARKER) != 0)
+ continue;
+ pa = VM_PAGE_TO_PHYS(m);
+ if (pa < low || pa + PAGE_SIZE > high)
+ continue;
+ if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
+ vm_page_unlock(m);
+ continue;
+ }
+ object = m->object;
+ if ((!VM_OBJECT_TRYWLOCK(object) &&
+ (!vm_pageout_fallback_object_lock(m, &next) ||
+ m->hold_count != 0)) || vm_page_busied(m)) {
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(object);
+ continue;
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty == 0 && object->ref_count != 0)
+ pmap_remove_all(m);
+ if (m->dirty != 0) {
+ vm_page_unlock(m);
+ if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
+ VM_OBJECT_WUNLOCK(object);
+ continue;
+ }
+ if (object->type == OBJT_VNODE) {
+ vm_pagequeue_unlock(pq);
+ vp = object->handle;
+ vm_object_reference_locked(object);
+ VM_OBJECT_WUNLOCK(object);
+ (void)vn_start_write(vp, &mp, V_WAIT);
+ lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+ LK_SHARED : LK_EXCLUSIVE;
+ vn_lock(vp, lockmode | LK_RETRY);
+ VM_OBJECT_WLOCK(object);
+ vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
+ VM_OBJECT_WUNLOCK(object);
+ VOP_UNLOCK(vp, 0);
+ vm_object_deallocate(object);
+ vn_finished_write(mp);
+ return (TRUE);
+ } else if (object->type == OBJT_SWAP ||
+ object->type == OBJT_DEFAULT) {
+ vm_pagequeue_unlock(pq);
+ m_tmp = m;
+ vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
+ 0, NULL, NULL);
+ VM_OBJECT_WUNLOCK(object);
+ return (TRUE);
+ }
+ } else {
+ /*
+ * Dequeue here to prevent lock recursion in
+ * vm_page_cache().
+ */
+ vm_page_dequeue_locked(m);
+ vm_page_cache(m);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(object);
+ }
+ vm_pagequeue_unlock(pq);
+ return (FALSE);
+}
+
+/*
+ * Increase the number of cached pages. The specified value, "tries",
+ * determines which categories of pages are cached:
+ *
+ * 0: All clean, inactive pages within the specified physical address range
+ * are cached. Will not sleep.
+ * 1: The vm_lowmem handlers are called. All inactive pages within
+ * the specified physical address range are cached. May sleep.
+ * 2: The vm_lowmem handlers are called. All inactive and active pages
+ * within the specified physical address range are cached. May sleep.
+ */
+void
+vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
+{
+ int actl, actmax, inactl, inactmax, dom, initial_dom;
+ static int start_dom = 0;
+
+ if (tries > 0) {
+ /*
+ * Decrease registered cache sizes. The vm_lowmem handlers
+ * may acquire locks and/or sleep, so they can only be invoked
+ * when "tries" is greater than zero.
+ */
+ SDT_PROBE0(vm, , , vm__lowmem_cache);
+ EVENTHANDLER_INVOKE(vm_lowmem, 0);
+
+ /*
+ * We do this explicitly after the caches have been drained
+ * above.
+ */
+ uma_reclaim();
+ }
+
+ /*
+ * Make the next scan start on the next domain.
+ */
+ initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+
+ inactl = 0;
+ inactmax = cnt.v_inactive_count;
+ actl = 0;
+ actmax = tries < 2 ? 0 : cnt.v_active_count;
+ dom = initial_dom;
+
+ /*
+ * Scan domains in round-robin order, first inactive queues,
+ * then active. Since a domain usually owns a large, physically
+ * contiguous chunk of memory, it makes sense to completely
+ * exhaust one domain before switching to the next, while growing
+ * the pool of contiguous physical pages.
+ *
+ * Do not even start laundering a domain that cannot contain
+ * the specified address range, as indicated by the segments
+ * constituting the domain.
+ */
+again_inact:
+ if (inactl < inactmax) {
+ if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+ low, high) &&
+ vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
+ tries, low, high)) {
+ inactl++;
+ goto again_inact;
+ }
+ if (++dom == vm_ndomains)
+ dom = 0;
+ if (dom != initial_dom)
+ goto again_inact;
+ }
+again_act:
+ if (actl < actmax) {
+ if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+ low, high) &&
+ vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
+ tries, low, high)) {
+ actl++;
+ goto again_act;
+ }
+ if (++dom == vm_ndomains)
+ dom = 0;
+ if (dom != initial_dom)
+ goto again_act;
+ }
+}
+
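
The block comment above spells out what each "tries" level does. As a hedged sketch of the intended calling pattern for contiguous allocations (the retry wrapper below is illustrative, not the actual contigmalloc(9) code):

/*
 * Illustrative caller: retry a contiguous allocation, asking the page
 * daemon to cache progressively more aggressively between attempts.
 */
static vm_page_t
example_alloc_contig_retry(u_long npages, vm_paddr_t low, vm_paddr_t high)
{
	vm_page_t m;
	int tries;

	for (tries = 0; tries < 3; tries++) {
		m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
		    VM_ALLOC_NOOBJ, npages, low, high, PAGE_SIZE, 0,
		    VM_MEMATTR_DEFAULT);
		if (m != NULL)
			return (m);
		/* 0: clean inactive pages; 1: + vm_lowmem; 2: + active queue */
		vm_pageout_grow_cache(tries, low, high);
	}
	return (NULL);
}
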
#if !defined(NO_SWAPPING)
/*
* vm_pageout_object_deactivate_pages
@@ -573,17 +774,17 @@
{
vm_object_t backing_object, object;
vm_page_t p;
- int actcount, remove_mode;
+ int act_delta, remove_mode;
- VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
- if (first_object->type == OBJT_DEVICE ||
- first_object->type == OBJT_SG)
+ VM_OBJECT_ASSERT_LOCKED(first_object);
+ if ((first_object->flags & OBJ_FICTITIOUS) != 0)
return;
for (object = first_object;; object = backing_object) {
if (pmap_resident_count(pmap) <= desired)
goto unlock_return;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (object->type == OBJT_PHYS || object->paging_in_progress)
+ VM_OBJECT_ASSERT_LOCKED(object);
+ if ((object->flags & OBJ_UNMANAGED) != 0 ||
+ object->paging_in_progress != 0)
goto unlock_return;
remove_mode = 0;
@@ -595,7 +796,7 @@
TAILQ_FOREACH(p, &object->memq, listq) {
if (pmap_resident_count(pmap) <= desired)
goto unlock_return;
- if ((p->oflags & VPO_BUSY) != 0 || p->busy != 0)
+ if (vm_page_busied(p))
continue;
PCPU_INC(cnt.v_pdpages);
vm_page_lock(p);
@@ -604,37 +805,30 @@
vm_page_unlock(p);
continue;
}
- actcount = pmap_ts_referenced(p);
+ act_delta = pmap_ts_referenced(p);
if ((p->aflags & PGA_REFERENCED) != 0) {
- if (actcount == 0)
- actcount = 1;
+ if (act_delta == 0)
+ act_delta = 1;
vm_page_aflag_clear(p, PGA_REFERENCED);
}
- if (p->queue != PQ_ACTIVE && actcount != 0) {
+ if (p->queue != PQ_ACTIVE && act_delta != 0) {
vm_page_activate(p);
- p->act_count += actcount;
+ p->act_count += act_delta;
} else if (p->queue == PQ_ACTIVE) {
- if (actcount == 0) {
+ if (act_delta == 0) {
p->act_count -= min(p->act_count,
ACT_DECLINE);
- if (!remove_mode &&
- (vm_pageout_algorithm ||
- p->act_count == 0)) {
+ if (!remove_mode && p->act_count == 0) {
pmap_remove_all(p);
vm_page_deactivate(p);
- } else {
- vm_page_lock_queues();
+ } else
vm_page_requeue(p);
- vm_page_unlock_queues();
- }
} else {
vm_page_activate(p);
if (p->act_count < ACT_MAX -
ACT_ADVANCE)
p->act_count += ACT_ADVANCE;
- vm_page_lock_queues();
vm_page_requeue(p);
- vm_page_unlock_queues();
}
} else if (p->queue == PQ_INACTIVE)
pmap_remove_all(p);
@@ -642,13 +836,13 @@
}
if ((backing_object = object->backing_object) == NULL)
goto unlock_return;
- VM_OBJECT_LOCK(backing_object);
+ VM_OBJECT_RLOCK(backing_object);
if (object != first_object)
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_RUNLOCK(object);
}
unlock_return:
if (object != first_object)
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_RUNLOCK(object);
}
/*
@@ -678,15 +872,15 @@
while (tmpe != &map->header) {
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
- if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
+ if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
if (obj->shadow_count <= 1 &&
(bigobj == NULL ||
bigobj->resident_page_count < obj->resident_page_count)) {
if (bigobj != NULL)
- VM_OBJECT_UNLOCK(bigobj);
+ VM_OBJECT_RUNLOCK(bigobj);
bigobj = obj;
} else
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_RUNLOCK(obj);
}
}
if (tmpe->wired_count > 0)
@@ -696,7 +890,7 @@
if (bigobj != NULL) {
vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
- VM_OBJECT_UNLOCK(bigobj);
+ VM_OBJECT_RUNLOCK(bigobj);
}
/*
* Next, hunt around for other pages to deactivate. We actually
@@ -709,15 +903,23 @@
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if (obj != NULL) {
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_RLOCK(obj);
vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_RUNLOCK(obj);
}
}
tmpe = tmpe->next;
}
+#ifdef __ia64__
/*
+ * Remove all non-wired, managed mappings if a process is swapped out.
+ * This will free page table pages.
+ */
+ if (desired == 0)
+ pmap_remove_pages(map->pmap);
+#else
+ /*
* Remove all mappings if a process is swapped out, this will free page
* table pages.
*/
@@ -725,6 +927,8 @@
pmap_remove(vm_map_pmap(map), vm_map_min(map),
vm_map_max(map));
}
+#endif
+
vm_map_unlock(map);
}
#endif /* !defined(NO_SWAPPING) */
@@ -731,52 +935,63 @@
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
+ *
+ * pass 0 - Update active LRU/deactivate pages
+ * pass 1 - Move inactive to cache or free
+ * pass 2 - Launder dirty pages
*/
static void
-vm_pageout_scan(int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass)
{
vm_page_t m, next;
- struct vm_page marker;
- int page_shortage, maxscan, pcount;
- int addl_page_shortage;
+ struct vm_pagequeue *pq;
vm_object_t object;
- int actcount;
+ long min_scan;
+ int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
int vnodes_skipped = 0;
- int maxlaunder;
- boolean_t queues_locked;
+ int maxlaunder, scan_tick, scanned, starting_page_shortage;
+ int lockmode;
+ boolean_t queue_locked;
/*
- * Decrease registered cache sizes.
+ * If we need to reclaim memory, ask kernel caches to return
+ * some. We rate-limit to avoid thrashing.
*/
- EVENTHANDLER_INVOKE(vm_lowmem, 0);
- /*
- * We do this explicitly after the caches have been drained above.
- */
- uma_reclaim();
+ if (vmd == &vm_dom[0] && pass > 0 &&
+ (time_uptime - lowmem_uptime) >= lowmem_period) {
+ /*
+ * Decrease registered cache sizes.
+ */
+ SDT_PROBE0(vm, , , vm__lowmem_scan);
+ EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
+ /*
+ * We do this explicitly after the caches have been
+ * drained above.
+ */
+ uma_reclaim();
+ lowmem_uptime = time_uptime;
+ }
/*
* The addl_page_shortage is the number of temporarily
* stuck pages in the inactive queue. In other words, the
- * number of pages from cnt.v_inactive_count that should be
+ * number of pages from the inactive count that should be
* discounted in setting the target for the active queue scan.
*/
- addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
+ addl_page_shortage = 0;
/*
* Calculate the number of pages we want to either free or move
* to the cache.
*/
- page_shortage = vm_paging_target() + addl_page_shortage;
+ if (pass > 0) {
+ deficit = atomic_readandclear_int(&vm_pageout_deficit);
+ page_shortage = vm_paging_target() + deficit;
+ } else
+ page_shortage = deficit = 0;
+ starting_page_shortage = page_shortage;
- vm_pageout_init_marker(&marker, PQ_INACTIVE);
-
/*
- * Start scanning the inactive queue for pages we can move to the
- * cache or free. The scan will stop when the target is reached or
- * we have scanned the entire inactive queue. Note that m->act_count
- * is not used to form decisions for the inactive queue, only for the
- * active queue.
- *
* maxlaunder limits the number of dirty pages we flush per scan.
* For most systems a smaller value (16 or 32) is more robust under
* extreme memory and disk pressure because any unnecessary writes
@@ -788,21 +1003,29 @@
*/
if ((maxlaunder = vm_max_launder) <= 1)
maxlaunder = 1;
- if (pass)
+ if (pass > 1)
maxlaunder = 10000;
- vm_page_lock_queues();
- queues_locked = TRUE;
- maxscan = cnt.v_inactive_count;
- for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
+ /*
+ * Start scanning the inactive queue for pages we can move to the
+ * cache or free. The scan will stop when the target is reached or
+ * we have scanned the entire inactive queue. Note that m->act_count
+ * is not used to form decisions for the inactive queue, only for the
+ * active queue.
+ */
+ pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+ maxscan = pq->pq_cnt;
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
+ for (m = TAILQ_FIRST(&pq->pq_pl);
m != NULL && maxscan-- > 0 && page_shortage > 0;
m = next) {
- KASSERT(queues_locked, ("unlocked queues"));
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ vm_pagequeue_assert_locked(pq);
+ KASSERT(queue_locked, ("unlocked inactive queue"));
KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
- cnt.v_pdpages++;
- next = TAILQ_NEXT(m, pageq);
+ PCPU_INC(cnt.v_pdpages);
+ next = TAILQ_NEXT(m, plinks.q);
/*
* skip marker pages
@@ -826,10 +1049,10 @@
continue;
}
object = m->object;
- if (!VM_OBJECT_TRYLOCK(object) &&
+ if (!VM_OBJECT_TRYWLOCK(object) &&
!vm_pageout_fallback_object_lock(m, &next)) {
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
continue;
}
@@ -840,105 +1063,87 @@
* pages, because they may leave the inactive queue
* shortly after page scan is finished.
*/
- if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0) {
+ if (vm_page_busied(m)) {
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
addl_page_shortage++;
continue;
}
/*
- * We unlock vm_page_queue_mtx, invalidating the
+ * We unlock the inactive page queue, invalidating the
* 'next' pointer. Use our marker to remember our
* place.
*/
- TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl,
- m, &marker, pageq);
- vm_page_unlock_queues();
- queues_locked = FALSE;
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+ vm_pagequeue_unlock(pq);
+ queue_locked = FALSE;
/*
- * If the object is not being used, we ignore previous
+ * We bump the activation count if the page has been
+ * referenced while in the inactive queue. This makes
+ * it less likely that the page will be prematurely added
+ * back to the inactive queue. Here we check the page
+ * tables (or emulated bits, if any), since the upper-level
+ * VM system knows nothing about existing references.
*/
- if (object->ref_count == 0) {
+ act_delta = 0;
+ if ((m->aflags & PGA_REFERENCED) != 0) {
vm_page_aflag_clear(m, PGA_REFERENCED);
+ act_delta = 1;
+ }
+ if (object->ref_count != 0) {
+ act_delta += pmap_ts_referenced(m);
+ } else {
KASSERT(!pmap_page_is_mapped(m),
("vm_pageout_scan: page %p is mapped", m));
-
- /*
- * Otherwise, if the page has been referenced while in the
- * inactive queue, we bump the "activation count" upwards,
- * making it less likely that the page will be added back to
- * the inactive queue prematurely again. Here we check the
- * page tables (or emulated bits, if any), given the upper
- * level VM system not knowing anything about existing
- * references.
- */
- } else if ((m->aflags & PGA_REFERENCED) == 0 &&
- (actcount = pmap_ts_referenced(m)) != 0) {
- vm_page_activate(m);
- vm_page_unlock(m);
- m->act_count += actcount + ACT_ADVANCE;
- VM_OBJECT_UNLOCK(object);
- goto relock_queues;
}
/*
* If the upper level VM system knows about any page
- * references, we activate the page. We also set the
- * "activation count" higher than normal so that we will less
- * likely place pages back onto the inactive queue again.
+ * references, we reactivate the page or requeue it.
*/
- if ((m->aflags & PGA_REFERENCED) != 0) {
- vm_page_aflag_clear(m, PGA_REFERENCED);
- actcount = pmap_ts_referenced(m);
- vm_page_activate(m);
+ if (act_delta != 0) {
+ if (object->ref_count) {
+ vm_page_activate(m);
+ m->act_count += act_delta + ACT_ADVANCE;
+ } else {
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
+ vm_page_requeue_locked(m);
+ }
+ VM_OBJECT_WUNLOCK(object);
vm_page_unlock(m);
- m->act_count += actcount + ACT_ADVANCE + 1;
- VM_OBJECT_UNLOCK(object);
- goto relock_queues;
+ goto relock_queue;
}
if (m->hold_count != 0) {
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* Held pages are essentially stuck in the
* queue. So, they ought to be discounted
- * from cnt.v_inactive_count. See the
+ * from the inactive count. See the
* calculation of the page_shortage for the
* loop over the active queue below.
*/
addl_page_shortage++;
- goto relock_queues;
+ goto relock_queue;
}
/*
- * If the upper level VM system does not believe that the page
- * is fully dirty, but it is mapped for write access, then we
- * consult the pmap to see if the page's dirty status should
- * be updated.
+ * If the page appears to be clean at the machine-independent
+ * layer, then remove all of its mappings from the pmap in
+ * anticipation of placing it onto the cache queue. If,
+ * however, any of the page's mappings allow write access,
+ * then the page may still be modified until the last of those
+ * mappings are removed.
*/
- if (m->dirty != VM_PAGE_BITS_ALL &&
- pmap_page_is_write_mapped(m)) {
- /*
- * Avoid a race condition: Unless write access is
- * removed from the page, another processor could
- * modify it before all access is removed by the call
- * to vm_page_cache() below. If vm_page_cache() finds
- * that the page has been modified when it removes all
- * access, it panics because it cannot cache dirty
- * pages. In principle, we could eliminate just write
- * access here rather than all access. In the expected
- * case, when there are no last instant modifications
- * to the page, removing all access will be cheaper
- * overall.
- */
- if (pmap_is_modified(m))
- vm_page_dirty(m);
- else if (m->dirty == 0)
+ if (object->ref_count != 0) {
+ vm_page_test_dirty(m);
+ if (m->dirty == 0)
pmap_remove_all(m);
}
@@ -956,7 +1161,7 @@
*/
vm_page_cache(m);
--page_shortage;
- } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+ } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
/*
* Dirty pages need to be paged out, but flushing
 * a page is extremely expensive versus freeing
@@ -970,9 +1175,9 @@
* the thrash point for a heavily loaded machine.
*/
m->flags |= PG_WINATCFLS;
- vm_page_lock_queues();
- queues_locked = TRUE;
- vm_page_requeue(m);
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
+ vm_page_requeue_locked(m);
} else if (maxlaunder > 0) {
/*
* We always want to try to flush some dirty pages if
@@ -981,7 +1186,7 @@
* pressure where there are insufficient clean pages
* on the inactive queue, we may have to go all out.
*/
- int swap_pageouts_ok, vfslocked = 0;
+ int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp = NULL;
@@ -999,12 +1204,12 @@
* Those objects are in a "rundown" state.
*/
if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
- vm_page_lock_queues();
+ vm_pagequeue_lock(pq);
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- queues_locked = TRUE;
- vm_page_requeue(m);
- goto relock_queues;
+ VM_OBJECT_WUNLOCK(object);
+ queue_locked = TRUE;
+ vm_page_requeue_locked(m);
+ goto relock_queue;
}
/*
@@ -1044,11 +1249,12 @@
KASSERT(mp != NULL,
("vp %p with NULL v_mount", vp));
vm_object_reference_locked(object);
- VM_OBJECT_UNLOCK(object);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
+ VM_OBJECT_WUNLOCK(object);
+ lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+ LK_SHARED : LK_EXCLUSIVE;
+ if (vget(vp, lockmode | LK_TIMELOCK,
curthread)) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
++pageout_lock_miss;
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
@@ -1055,10 +1261,10 @@
vp = NULL;
goto unlock_and_continue;
}
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
vm_page_lock(m);
- vm_page_lock_queues();
- queues_locked = TRUE;
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
/*
* The page might have been moved to another
* queue during potential blocking in vget()
@@ -1067,7 +1273,7 @@
*/
if (m->queue != PQ_INACTIVE ||
m->object != object ||
- TAILQ_NEXT(m, pageq) != &marker) {
+ TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
vm_page_unlock(m);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
@@ -1080,8 +1286,9 @@
* page back onto the end of the queue so that
* statistics are more correct if we don't.
*/
- if (m->busy || (m->oflags & VPO_BUSY)) {
+ if (vm_page_busied(m)) {
vm_page_unlock(m);
+ addl_page_shortage++;
goto unlock_and_continue;
}
@@ -1089,15 +1296,15 @@
* If the page has become held it might
* be undergoing I/O, so skip it
*/
- if (m->hold_count) {
+ if (m->hold_count != 0) {
vm_page_unlock(m);
- vm_page_requeue(m);
+ addl_page_shortage++;
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
goto unlock_and_continue;
}
- vm_page_unlock_queues();
- queues_locked = FALSE;
+ vm_pagequeue_unlock(pq);
+ queue_locked = FALSE;
}
/*
@@ -1116,60 +1323,93 @@
}
unlock_and_continue:
vm_page_lock_assert(m, MA_NOTOWNED);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
if (mp != NULL) {
- if (queues_locked) {
- vm_page_unlock_queues();
- queues_locked = FALSE;
+ if (queue_locked) {
+ vm_pagequeue_unlock(pq);
+ queue_locked = FALSE;
}
if (vp != NULL)
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
vm_object_deallocate(object);
vn_finished_write(mp);
}
vm_page_lock_assert(m, MA_NOTOWNED);
- goto relock_queues;
+ goto relock_queue;
}
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
-relock_queues:
- if (!queues_locked) {
- vm_page_lock_queues();
- queues_locked = TRUE;
+ VM_OBJECT_WUNLOCK(object);
+relock_queue:
+ if (!queue_locked) {
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
}
- next = TAILQ_NEXT(&marker, pageq);
- TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
- &marker, pageq);
+ next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
}
+ vm_pagequeue_unlock(pq);
+#if !defined(NO_SWAPPING)
/*
+ * Wakeup the swapout daemon if we didn't cache or free the targeted
+ * number of pages.
+ */
+ if (vm_swap_enabled && page_shortage > 0)
+ vm_req_vmdaemon(VM_SWAP_NORMAL);
+#endif
+
+ /*
+ * Wakeup the sync daemon if we skipped a vnode in a writeable object
+ * and we didn't cache or free enough pages.
+ */
+ if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
+ cnt.v_free_min)
+ (void)speedup_syncer();
+
+ /*
+ * If the inactive queue scan fails repeatedly to meet its
+ * target, kill the largest process.
+ */
+ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
+
+ /*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
- page_shortage = vm_paging_target() +
- cnt.v_inactive_target - cnt.v_inactive_count;
- page_shortage += addl_page_shortage;
+ page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
+ vm_paging_target() + deficit + addl_page_shortage;
+ pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+ vm_pagequeue_lock(pq);
+ maxscan = pq->pq_cnt;
+
/*
- * Scan the active queue for things we can deactivate. We nominally
- * track the per-page activity counter and use it to locate
- * deactivation candidates.
+ * If we're just idle polling attempt to visit every
+ * active page within 'update_period' seconds.
*/
- pcount = cnt.v_active_count;
- m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ scan_tick = ticks;
+ if (vm_pageout_update_period != 0) {
+ min_scan = pq->pq_cnt;
+ min_scan *= scan_tick - vmd->vmd_last_active_scan;
+ min_scan /= hz * vm_pageout_update_period;
+ } else
+ min_scan = 0;
+ if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
+ vmd->vmd_last_active_scan = scan_tick;
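
The pacing above aims to visit every active page roughly once per vm_pageout_update_period seconds when there is no shortage. A small stand-alone worked example (the numbers, including the 600-second period, are illustrative assumptions, not measurements):

#include <stdio.h>

/*
 * Worked example of the active-queue pacing:
 * min_scan = pq_cnt * (ticks since last scan) / (hz * update_period).
 */
int
main(void)
{
	long pq_cnt = 100000;		/* pages on the active queue */
	long hz = 1000;			/* ticks per second */
	long update_period = 600;	/* assumed vm.pageout_update_period */
	long elapsed_ticks = hz;	/* one second since the last scan */
	long min_scan;

	min_scan = pq_cnt * elapsed_ticks / (hz * update_period);
	printf("scan at least %ld active pages this pass\n", min_scan);
	return (0);
}
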
- while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
-
+ /*
+ * Scan the active queue for pages that can be deactivated. Update
+ * the per-page activity counter and use it to identify deactivation
+ * candidates. Held pages may be deactivated.
+ */
+ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
+ min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
+ scanned++) {
KASSERT(m->queue == PQ_ACTIVE,
("vm_pageout_scan: page %p isn't active", m));
-
- next = TAILQ_NEXT(m, pageq);
- if ((m->flags & PG_MARKER) != 0) {
- m = next;
+ next = TAILQ_NEXT(m, plinks.q);
+ if ((m->flags & PG_MARKER) != 0)
continue;
- }
KASSERT((m->flags & PG_FICTITIOUS) == 0,
("Fictitious page %p cannot be in active queue", m));
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
@@ -1176,89 +1416,65 @@
("Unmanaged page %p cannot be in active queue", m));
if (!vm_pageout_page_lock(m, &next)) {
vm_page_unlock(m);
- m = next;
continue;
}
- object = m->object;
- if (!VM_OBJECT_TRYLOCK(object) &&
- !vm_pageout_fallback_object_lock(m, &next)) {
- VM_OBJECT_UNLOCK(object);
- vm_page_unlock(m);
- m = next;
- continue;
- }
/*
- * Don't deactivate pages that are busy.
+ * The count for page daemon pages is updated after checking
+ * the page for eligibility.
*/
- if ((m->busy != 0) ||
- (m->oflags & VPO_BUSY) ||
- (m->hold_count != 0)) {
- vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- vm_page_requeue(m);
- m = next;
- continue;
- }
+ PCPU_INC(cnt.v_pdpages);
/*
- * The count for pagedaemon pages is done after checking the
- * page for eligibility...
- */
- cnt.v_pdpages++;
-
- /*
* Check to see "how much" the page has been used.
*/
- actcount = 0;
- if (object->ref_count != 0) {
- if (m->aflags & PGA_REFERENCED) {
- actcount += 1;
- }
- actcount += pmap_ts_referenced(m);
- if (actcount) {
- m->act_count += ACT_ADVANCE + actcount;
- if (m->act_count > ACT_MAX)
- m->act_count = ACT_MAX;
- }
+ act_delta = 0;
+ if (m->aflags & PGA_REFERENCED) {
+ vm_page_aflag_clear(m, PGA_REFERENCED);
+ act_delta += 1;
}
-
/*
- * Since we have "tested" this bit, we need to clear it now.
+ * Perform an unsynchronized object ref count check. While
+ * the page lock ensures that the page is not reallocated to
+ * another object, in particular, one with unmanaged mappings
+ * that cannot support pmap_ts_referenced(), two races are,
+ * nonetheless, possible:
+ * 1) The count was transitioning to zero, but we saw a non-
+ * zero value. pmap_ts_referenced() will return zero
+ * because the page is not mapped.
+ * 2) The count was transitioning to one, but we saw zero.
+ * This race delays the detection of a new reference. At
+ * worst, we will deactivate and reactivate the page.
*/
- vm_page_aflag_clear(m, PGA_REFERENCED);
+ if (m->object->ref_count != 0)
+ act_delta += pmap_ts_referenced(m);
/*
- * Only if an object is currently being used, do we use the
- * page activation count stats.
+ * Advance or decay the act_count based on recent usage.
*/
- if (actcount && (object->ref_count != 0)) {
- vm_page_requeue(m);
+ if (act_delta) {
+ m->act_count += ACT_ADVANCE + act_delta;
+ if (m->act_count > ACT_MAX)
+ m->act_count = ACT_MAX;
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
- if (vm_pageout_algorithm ||
- object->ref_count == 0 ||
- m->act_count == 0) {
- page_shortage--;
- if (object->ref_count == 0) {
- KASSERT(!pmap_page_is_mapped(m),
- ("vm_pageout_scan: page %p is mapped", m));
- if (m->dirty == 0)
- vm_page_cache(m);
- else
- vm_page_deactivate(m);
- } else {
- vm_page_deactivate(m);
- }
- } else {
- vm_page_requeue(m);
- }
+ act_delta = m->act_count;
}
+
+ /*
+ * Move this page to the tail of the active or inactive
+ * queue depending on usage.
+ */
+ if (act_delta == 0) {
+ /* Dequeue to avoid later lock recursion. */
+ vm_page_dequeue_locked(m);
+ vm_page_deactivate(m);
+ page_shortage--;
+ } else
+ vm_page_requeue_locked(m);
vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- m = next;
}
- vm_page_unlock_queues();
+ vm_pagequeue_unlock(pq);
#if !defined(NO_SWAPPING)
/*
* Idle process swapout -- run once per second.
@@ -1271,35 +1487,124 @@
}
}
#endif
-
+}
+
+static int vm_pageout_oom_vote;
+
+/*
+ * The pagedaemon threads randomly select one to perform the
+ * OOM.  Trying to kill processes before all pagedaemons have
+ * failed to reach the free target is premature.
+ */
+static void
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+ int starting_page_shortage)
+{
+ int old_vote;
+
+ if (starting_page_shortage <= 0 || starting_page_shortage !=
+ page_shortage)
+ vmd->vmd_oom_seq = 0;
+ else
+ vmd->vmd_oom_seq++;
+ if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
+ if (vmd->vmd_oom) {
+ vmd->vmd_oom = FALSE;
+ atomic_subtract_int(&vm_pageout_oom_vote, 1);
+ }
+ return;
+ }
+
/*
- * If we didn't get enough free pages, and we have skipped a vnode
- * in a writeable object, wakeup the sync daemon. And kick swapout
- * if we did not get enough free pages.
+ * Do not follow the call sequence until OOM condition is
+ * cleared.
*/
- if (vm_paging_target() > 0) {
- if (vnodes_skipped && vm_page_count_min())
- (void) speedup_syncer();
-#if !defined(NO_SWAPPING)
- if (vm_swap_enabled && vm_page_count_target())
- vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
- }
+ vmd->vmd_oom_seq = 0;
+ if (vmd->vmd_oom)
+ return;
+
+ vmd->vmd_oom = TRUE;
+ old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
+ if (old_vote != vm_ndomains - 1)
+ return;
+
/*
- * If we are critically low on one of RAM or swap and low on
- * the other, kill the largest process. However, we avoid
- * doing this on the first pass in order to give ourselves a
- * chance to flush out dirty vnode-backed pages and to allow
- * active pages to be moved to the inactive queue and reclaimed.
+ * The current pagedaemon thread is the last in the quorum to
+ * start OOM. Initiate the selection and signaling of the
+ * victim.
*/
- if (pass != 0 &&
- ((swap_pager_avail < 64 && vm_page_count_min()) ||
- (swap_pager_full && vm_paging_target() > 0)))
- vm_pageout_oom(VM_OOM_MEM);
+ vm_pageout_oom(VM_OOM_MEM);
+
+ /*
+ * After one round of OOM terror, recall our vote. On the
+ * next pass, the current pagedaemon will vote again if the low
+ * memory condition is still there, due to vmd_oom being
+ * false.
+ */
+ vmd->vmd_oom = FALSE;
+ atomic_subtract_int(&vm_pageout_oom_vote, 1);
}
+/*
+ * The OOM killer is the page daemon's action of last resort when
+ * memory allocation requests have been stalled for a prolonged period
+ * of time because it cannot reclaim memory. This function computes
+ * the approximate number of physical pages that could be reclaimed if
+ * the specified address space is destroyed.
+ *
+ * Private, anonymous memory owned by the address space is the
+ * principal resource that we expect to recover after an OOM kill.
+ * Since the physical pages mapped by the address space's COW entries
+ * are typically shared pages, they are unlikely to be released and so
+ * they are not counted.
+ *
+ * To get to the point where the page daemon runs the OOM killer, its
+ * efforts to write-back vnode-backed pages may have stalled. This
+ * could be caused by a memory allocation deadlock in the write path
+ * that might be resolved by an OOM kill. Therefore, physical pages
+ * belonging to vnode-backed objects are counted, because they might
+ * be freed without being written out first if the address space holds
+ * the last reference to an unlinked vnode.
+ *
+ * Similarly, physical pages belonging to OBJT_PHYS objects are
+ * counted because the address space might hold the last reference to
+ * the object.
+ */
+static long
+vm_pageout_oom_pagecount(struct vmspace *vmspace)
+{
+ vm_map_t map;
+ vm_map_entry_t entry;
+ vm_object_t obj;
+ long res;
+ map = &vmspace->vm_map;
+ KASSERT(!map->system_map, ("system map"));
+ sx_assert(&map->lock, SA_LOCKED);
+ res = 0;
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+ continue;
+ obj = entry->object.vm_object;
+ if (obj == NULL)
+ continue;
+ if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
+ obj->ref_count != 1)
+ continue;
+ switch (obj->type) {
+ case OBJT_DEFAULT:
+ case OBJT_SWAP:
+ case OBJT_PHYS:
+ case OBJT_VNODE:
+ res += obj->resident_page_count;
+ break;
+ }
+ }
+ return (res);
+}
+
void
vm_pageout_oom(int shortage)
{
@@ -1307,6 +1612,7 @@
vm_offset_t size, bigsize;
struct thread *td;
struct vmspace *vm;
+ bool breakout;
/*
* We keep the process bigproc locked once we find it to keep anyone
@@ -1320,17 +1626,15 @@
bigsize = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- int breakout;
+ PROC_LOCK(p);
- if (PROC_TRYLOCK(p) == 0)
- continue;
/*
* If this is a system, protected or killed process, skip it.
*/
- if (p->p_state != PRS_NORMAL ||
- (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) ||
- (p->p_pid == 1) || P_KILLED(p) ||
- ((p->p_pid < 48) && (swap_pager_avail != 0))) {
+ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
+ P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
+ p->p_pid == 1 || P_KILLED(p) ||
+ (p->p_pid < 48 && swap_pager_avail != 0)) {
PROC_UNLOCK(p);
continue;
}
@@ -1338,15 +1642,16 @@
* If the process is in a non-running type state,
* don't touch it. Check all the threads individually.
*/
- breakout = 0;
+ breakout = false;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td)) {
+ !TD_IS_SUSPENDED(td) &&
+ !TD_IS_SWAPPED(td)) {
thread_unlock(td);
- breakout = 1;
+ breakout = true;
break;
}
thread_unlock(td);
@@ -1363,156 +1668,119 @@
PROC_UNLOCK(p);
continue;
}
+ _PHOLD(p);
if (!vm_map_trylock_read(&vm->vm_map)) {
+ _PRELE(p);
+ PROC_UNLOCK(p);
vmspace_free(vm);
- PROC_UNLOCK(p);
continue;
}
+ PROC_UNLOCK(p);
size = vmspace_swap_count(vm);
+ if (shortage == VM_OOM_MEM)
+ size += vm_pageout_oom_pagecount(vm);
vm_map_unlock_read(&vm->vm_map);
- if (shortage == VM_OOM_MEM)
- size += vmspace_resident_count(vm);
vmspace_free(vm);
+
/*
- * if the this process is bigger than the biggest one
+ * If this process is bigger than the biggest one,
* remember it.
*/
if (size > bigsize) {
if (bigproc != NULL)
- PROC_UNLOCK(bigproc);
+ PRELE(bigproc);
bigproc = p;
bigsize = size;
- } else
- PROC_UNLOCK(p);
+ } else {
+ PRELE(p);
+ }
}
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
+ PROC_LOCK(bigproc);
killproc(bigproc, "out of swap space");
sched_nice(bigproc, PRIO_MIN);
+ _PRELE(bigproc);
PROC_UNLOCK(bigproc);
wakeup(&cnt.v_free_count);
}
}
-/*
- * This routine tries to maintain the pseudo LRU active queue,
- * so that during long periods of time where there is no paging,
- * that some statistic accumulation still occurs. This code
- * helps the situation where paging just starts to occur.
- */
static void
-vm_pageout_page_stats()
+vm_pageout_worker(void *arg)
{
- vm_object_t object;
- vm_page_t m,next;
- int pcount,tpcount; /* Number of pages to check */
- static int fullintervalcount = 0;
- int page_shortage;
+ struct vm_domain *domain;
+ int domidx;
- page_shortage =
- (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
- (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+ domidx = (uintptr_t)arg;
+ domain = &vm_dom[domidx];
- if (page_shortage <= 0)
- return;
+ /*
+ * XXXKIB It could be useful to bind pageout daemon threads to
+ * the cores belonging to the domain from which vm_page_array
+ * is allocated.
+ */
- vm_page_lock_queues();
- pcount = cnt.v_active_count;
- fullintervalcount += vm_pageout_stats_interval;
- if (fullintervalcount < vm_pageout_full_stats_interval) {
- tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
- cnt.v_page_count;
- if (pcount > tpcount)
- pcount = tpcount;
- } else {
- fullintervalcount = 0;
- }
+ KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+ domain->vmd_last_active_scan = ticks;
+ vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
- m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
- while ((m != NULL) && (pcount-- > 0)) {
- int actcount;
-
- KASSERT(m->queue == PQ_ACTIVE,
- ("vm_pageout_page_stats: page %p isn't active", m));
-
- next = TAILQ_NEXT(m, pageq);
- if ((m->flags & PG_MARKER) != 0) {
- m = next;
- continue;
- }
- vm_page_lock_assert(m, MA_NOTOWNED);
- if (!vm_pageout_page_lock(m, &next)) {
- vm_page_unlock(m);
- m = next;
- continue;
- }
- object = m->object;
- if (!VM_OBJECT_TRYLOCK(object) &&
- !vm_pageout_fallback_object_lock(m, &next)) {
- VM_OBJECT_UNLOCK(object);
- vm_page_unlock(m);
- m = next;
- continue;
- }
-
+ /*
+ * The pageout daemon worker is never done, so loop forever.
+ */
+ while (TRUE) {
/*
- * Don't deactivate pages that are busy.
+ * If we have enough free memory, wakeup waiters. Do
+ * not clear vm_pages_needed until we reach our target,
+ * otherwise we may be woken up over and over again and
+ * waste a lot of cpu.
*/
- if ((m->busy != 0) ||
- (m->oflags & VPO_BUSY) ||
- (m->hold_count != 0)) {
- vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- vm_page_requeue(m);
- m = next;
- continue;
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (vm_pages_needed && !vm_page_count_min()) {
+ if (!vm_paging_needed())
+ vm_pages_needed = 0;
+ wakeup(&cnt.v_free_count);
}
-
- actcount = 0;
- if (m->aflags & PGA_REFERENCED) {
- vm_page_aflag_clear(m, PGA_REFERENCED);
- actcount += 1;
- }
-
- actcount += pmap_ts_referenced(m);
- if (actcount) {
- m->act_count += ACT_ADVANCE + actcount;
- if (m->act_count > ACT_MAX)
- m->act_count = ACT_MAX;
- vm_page_requeue(m);
+ if (vm_pages_needed) {
+ /*
+ * We're still not done. Either vm_pages_needed was
+ * set by another thread during the previous scan
+ * (typically, this happens during a level 0 scan) or
+ * vm_pages_needed was already set and the scan failed
+ * to free enough pages. If we haven't yet performed
+ * a level >= 2 scan (unlimited dirty cleaning), then
+ * upgrade the level and scan again now. Otherwise,
+ * sleep a bit and try again later. While sleeping,
+ * vm_pages_needed can be cleared.
+ */
+ if (domain->vmd_pass > 1)
+ msleep(&vm_pages_needed,
+ &vm_page_queue_free_mtx, PVM, "psleep",
+ hz / 2);
} else {
- if (m->act_count == 0) {
- /*
- * We turn off page access, so that we have
- * more accurate RSS stats. We don't do this
- * in the normal page deactivation when the
- * system is loaded VM wise, because the
- * cost of the large number of page protect
- * operations would be higher than the value
- * of doing the operation.
- */
- pmap_remove_all(m);
- vm_page_deactivate(m);
- } else {
- m->act_count -= min(m->act_count, ACT_DECLINE);
- vm_page_requeue(m);
- }
+ /*
+ * Good enough, sleep until required to refresh
+ * stats.
+ */
+ msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
+ PVM, "psleep", hz);
}
- vm_page_unlock(m);
- VM_OBJECT_UNLOCK(object);
- m = next;
+ if (vm_pages_needed) {
+ cnt.v_pdwakeups++;
+ domain->vmd_pass++;
+ } else
+ domain->vmd_pass = 0;
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vm_pageout_scan(domain, domain->vmd_pass);
}
- vm_page_unlock_queues();
}
/*
- * vm_pageout is the high level pageout daemon.
+ * vm_pageout_init initialises basic pageout daemon settings.
*/
static void
-vm_pageout()
+vm_pageout_init(void)
{
- int error, pass;
-
/*
* Initialize some paging parameters.
*/
@@ -1534,105 +1802,59 @@
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (cnt.v_page_count / 768);
cnt.v_free_severe = cnt.v_free_min / 2;
+ cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
cnt.v_free_min += cnt.v_free_reserved;
cnt.v_free_severe += cnt.v_free_reserved;
+ cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
+ if (cnt.v_inactive_target > cnt.v_free_count / 3)
+ cnt.v_inactive_target = cnt.v_free_count / 3;
/*
- * v_free_target and v_cache_min control pageout hysteresis. Note
- * that these are more a measure of the VM cache queue hysteresis
- * then the VM free queue. Specifically, v_free_target is the
- * high water mark (free+cache pages).
- *
- * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
- * low water mark, while v_free_min is the stop. v_cache_min must
- * be big enough to handle memory needs while the pageout daemon
- * is signalled and run to free more pages.
+ * Set the default wakeup threshold to be 10% above the minimum
+ * page limit. This keeps the steady state out of shortfall.
*/
- if (cnt.v_free_count > 6144)
- cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
- else
- cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
+ vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11;
- if (cnt.v_free_count > 2048) {
- cnt.v_cache_min = cnt.v_free_target;
- cnt.v_cache_max = 2 * cnt.v_cache_min;
- cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
- } else {
- cnt.v_cache_min = 0;
- cnt.v_cache_max = 0;
- cnt.v_inactive_target = cnt.v_free_count / 4;
- }
- if (cnt.v_inactive_target > cnt.v_free_count / 3)
- cnt.v_inactive_target = cnt.v_free_count / 3;
+ /*
+ * Set interval in seconds for active scan. We want to visit each
+ * page at least once every ten minutes. This is to prevent worst
+ * case paging behaviors with stale active LRU.
+ */
+ if (vm_pageout_update_period == 0)
+ vm_pageout_update_period = 600;
/* XXX does not really belong here */
if (vm_page_max_wired == 0)
vm_page_max_wired = cnt.v_free_count / 3;
+}
- if (vm_pageout_stats_max == 0)
- vm_pageout_stats_max = cnt.v_free_target;
+/*
+ * vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+ int error;
+#if MAXMEMDOM > 1
+ int i;
+#endif
- /*
- * Set interval in seconds for stats scan.
- */
- if (vm_pageout_stats_interval == 0)
- vm_pageout_stats_interval = 5;
- if (vm_pageout_full_stats_interval == 0)
- vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
-
swap_pager_swap_init();
- pass = 0;
- /*
- * The pageout daemon is never done, so loop forever.
- */
- while (TRUE) {
- /*
- * If we have enough free memory, wakeup waiters. Do
- * not clear vm_pages_needed until we reach our target,
- * otherwise we may be woken up over and over again and
- * waste a lot of cpu.
- */
- mtx_lock(&vm_page_queue_free_mtx);
- if (vm_pages_needed && !vm_page_count_min()) {
- if (!vm_paging_needed())
- vm_pages_needed = 0;
- wakeup(&cnt.v_free_count);
+#if MAXMEMDOM > 1
+ for (i = 1; i < vm_ndomains; i++) {
+ error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
+ curproc, NULL, 0, 0, "dom%d", i);
+ if (error != 0) {
+ panic("starting pageout for domain %d, error %d\n",
+ i, error);
}
- if (vm_pages_needed) {
- /*
- * Still not done, take a second pass without waiting
- * (unlimited dirty cleaning), otherwise sleep a bit
- * and try again.
- */
- ++pass;
- if (pass > 1)
- msleep(&vm_pages_needed,
- &vm_page_queue_free_mtx, PVM, "psleep",
- hz / 2);
- } else {
- /*
- * Good enough, sleep & handle stats. Prime the pass
- * for the next run.
- */
- if (pass > 1)
- pass = 1;
- else
- pass = 0;
- error = msleep(&vm_pages_needed,
- &vm_page_queue_free_mtx, PVM, "psleep",
- vm_pageout_stats_interval * hz);
- if (error && !vm_pages_needed) {
- mtx_unlock(&vm_page_queue_free_mtx);
- pass = 0;
- vm_pageout_page_stats();
- continue;
- }
- }
- if (vm_pages_needed)
- cnt.v_pdwakeups++;
- mtx_unlock(&vm_page_queue_free_mtx);
- vm_pageout_scan(pass);
}
+#endif
+ error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
+ 0, 0, "uma");
+ if (error != 0)
+ panic("starting uma_reclaim helper, error %d\n", error);
+ vm_pageout_worker((void *)(uintptr_t)0);
}
/*
@@ -1642,7 +1864,7 @@
* the free page queue lock is held until the msleep() is performed.
*/
void
-pagedaemon_wakeup()
+pagedaemon_wakeup(void)
{
if (!vm_pages_needed && curthread->td_proc != pageproc) {
@@ -1667,7 +1889,7 @@
}
static void
-vm_daemon()
+vm_daemon(void)
{
struct rlimit rsslim;
struct proc *p;
@@ -1680,11 +1902,13 @@
while (TRUE) {
mtx_lock(&vm_daemon_mtx);
+ msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
#ifdef RACCT
- msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz);
+ racct_enable ? hz : 0
#else
- msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
+ 0
#endif
+ );
swapout_flags = vm_pageout_req_swapout;
vm_pageout_req_swapout = 0;
mtx_unlock(&vm_daemon_mtx);
@@ -1754,38 +1978,48 @@
continue;
size = vmspace_resident_count(vm);
- if (limit >= 0 && size >= limit) {
+ if (size >= limit) {
vm_pageout_map_deactivate_pages(
&vm->vm_map, limit);
+ size = vmspace_resident_count(vm);
}
#ifdef RACCT
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- racct_set(p, RACCT_RSS, rsize);
- ravailable = racct_get_available(p, RACCT_RSS);
- PROC_UNLOCK(p);
- if (rsize > ravailable) {
- /*
- * Don't be overly aggressive; this might be
- * an innocent process, and the limit could've
- * been exceeded by some memory hog. Don't
- * try to deactivate more than 1/4th of process'
- * resident set size.
- */
- if (attempts <= 8) {
- if (ravailable < rsize - (rsize / 4))
- ravailable = rsize - (rsize / 4);
- }
- vm_pageout_map_deactivate_pages(
- &vm->vm_map, OFF_TO_IDX(ravailable));
- /* Update RSS usage after paging out. */
- size = vmspace_resident_count(vm);
+ if (racct_enable) {
rsize = IDX_TO_OFF(size);
PROC_LOCK(p);
- racct_set(p, RACCT_RSS, rsize);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ ravailable = racct_get_available(p, RACCT_RSS);
PROC_UNLOCK(p);
- if (rsize > ravailable)
- tryagain = 1;
+ if (rsize > ravailable) {
+ /*
+ * Don't be overly aggressive; this
+ * might be an innocent process,
+ * and the limit could've been exceeded
+ * by some memory hog. Don't try
+ * to deactivate more than 1/4th
+ * of process' resident set size.
+ */
+ if (attempts <= 8) {
+ if (ravailable < rsize -
+ (rsize / 4)) {
+ ravailable = rsize -
+ (rsize / 4);
+ }
+ }
+ vm_pageout_map_deactivate_pages(
+ &vm->vm_map,
+ OFF_TO_IDX(ravailable));
+ /* Update RSS usage after paging out. */
+ size = vmspace_resident_count(vm);
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS, rsize);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable)
+ tryagain = 1;
+ }
}
#endif
vmspace_free(vm);
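The active-queue changes above pace the scan so that, when only polling, every active page is visited about once per vm_pageout_update_period seconds: the quota is the queue length scaled by the ticks elapsed since the last scan. A standalone sketch of just that arithmetic, with made-up sample numbers (not kernel API):

/* Minimal sketch of the min_scan pacing computation shown above. */
#include <stdio.h>

static long
min_scan_pages(long queue_len, long ticks_elapsed, long hz, long update_period)
{
	long min_scan;

	if (update_period == 0)
		return (0);
	min_scan = queue_len;
	min_scan *= ticks_elapsed;
	min_scan /= hz * update_period;
	return (min_scan);
}

int
main(void)
{
	/* 100000 active pages, 2 seconds since the last scan, hz = 1000. */
	printf("scan at least %ld pages this pass\n",
	    min_scan_pages(100000, 2 * 1000, 1000, 600));
	return (0);
}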
Modified: trunk/sys/vm/vm_pageout.h
===================================================================
--- trunk/sys/vm/vm_pageout.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pageout.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $
*/
#ifndef _VM_VM_PAGEOUT_H_
@@ -87,6 +88,12 @@
#define VM_OOM_SWAPZ 2
/*
+ * vm_lowmem flags.
+ */
+#define VM_LOW_KMEM 0x01
+#define VM_LOW_PAGES 0x02
+
+/*
* Exported routines.
*/
@@ -101,10 +108,8 @@
extern void vm_waitpfault(void);
#ifdef _KERNEL
-boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
+void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
void vm_pageout_oom(int shortage);
-boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
-void vm_contig_grow_cache(int, vm_paddr_t, vm_paddr_t);
#endif
#endif /* _VM_VM_PAGEOUT_H_ */
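The new VM_LOW_KMEM and VM_LOW_PAGES bits let a low-memory consumer tell kernel-memory pressure apart from a plain shortage of free pages. A hedged userspace sketch of branching on them; the handler and its policy are hypothetical, only the flag values come from the header above:

#include <stdio.h>

#define VM_LOW_KMEM	0x01	/* pressure on kernel memory */
#define VM_LOW_PAGES	0x02	/* shortage of free physical pages */

/* Hypothetical consumer: pick what to trim based on the flag bits. */
static void
lowmem_handler(int flags)
{
	if (flags & VM_LOW_KMEM)
		printf("kmem pressure: trim caches holding kernel memory\n");
	if (flags & VM_LOW_PAGES)
		printf("page shortage: release cached physical pages\n");
}

int
main(void)
{
	lowmem_handler(VM_LOW_KMEM | VM_LOW_PAGES);
	return (0);
}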
Modified: trunk/sys/vm/vm_pager.c
===================================================================
--- trunk/sys/vm/vm_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -64,7 +65,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -74,9 +75,11 @@
#include <sys/buf.h>
#include <sys/ucred.h>
#include <sys/malloc.h>
+#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
@@ -105,43 +108,35 @@
dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t off, struct ucred *cred)
{
- return NULL;
+
+ return (NULL);
}
static void
-dead_pager_putpages(object, m, count, flags, rtvals)
- vm_object_t object;
- vm_page_t *m;
- int count;
- int flags;
- int *rtvals;
+dead_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+ int flags, int *rtvals)
{
int i;
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
rtvals[i] = VM_PAGER_AGAIN;
- }
}
static int
-dead_pager_haspage(object, pindex, prev, next)
- vm_object_t object;
- vm_pindex_t pindex;
- int *prev;
- int *next;
+dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next)
{
- if (prev)
+
+ if (prev != NULL)
*prev = 0;
- if (next)
+ if (next != NULL)
*next = 0;
- return FALSE;
+ return (FALSE);
}
static void
-dead_pager_dealloc(object)
- vm_object_t object;
+dead_pager_dealloc(vm_object_t object)
{
- return;
+
}
static struct pagerops deadpagerops = {
@@ -173,14 +168,13 @@
* cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
* (MAXPHYS == 64k) if you want to get the most efficiency.
*/
-vm_map_t pager_map;
+struct mtx_padalign pbuf_mtx;
+static TAILQ_HEAD(swqueue, buf) bswlist;
static int bswneeded;
-static vm_offset_t swapbkva; /* swap buffers kva */
-struct mtx pbuf_mtx;
-static TAILQ_HEAD(swqueue, buf) bswlist;
+vm_offset_t swapbkva; /* swap buffers kva */
void
-vm_pager_init()
+vm_pager_init(void)
{
struct pagerops **pgops;
@@ -190,11 +184,11 @@
*/
for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
if ((*pgops)->pgo_init != NULL)
- (*(*pgops)->pgo_init) ();
+ (*(*pgops)->pgo_init)();
}
void
-vm_pager_bufferinit()
+vm_pager_bufferinit(void)
{
struct buf *bp;
int i;
@@ -214,10 +208,6 @@
cluster_pbuf_freecnt = nswbuf / 2;
vnode_pbuf_freecnt = nswbuf / 2 + 1;
-
- swapbkva = kmem_alloc_nofault(pager_map, nswbuf * MAXPHYS);
- if (!swapbkva)
- panic("Not enough pager_map VM space for physical buffers");
}
/*
@@ -234,7 +224,7 @@
ops = pagertab[type];
if (ops)
- ret = (*ops->pgo_alloc) (handle, size, prot, off, cred);
+ ret = (*ops->pgo_alloc)(handle, size, prot, off, cred);
else
ret = NULL;
return (ret);
@@ -244,11 +234,10 @@
* The object must be locked.
*/
void
-vm_pager_deallocate(object)
- vm_object_t object;
+vm_pager_deallocate(vm_object_t object)
{
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
(*pagertab[object->type]->pgo_dealloc) (object);
}
@@ -272,13 +261,13 @@
TAILQ_FOREACH(object, pg_list, pager_object_list) {
if (object->handle == handle) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if ((object->flags & OBJ_DEAD) == 0) {
vm_object_reference_locked(object);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
break;
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
}
return (object);
@@ -294,12 +283,13 @@
static void
initpbuf(struct buf *bp)
{
+
KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj"));
KASSERT(bp->b_vp == NULL, ("initpbuf with vp"));
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */
- bp->b_saveaddr = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva;
+ bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
bp->b_data = bp->b_saveaddr;
bp->b_kvabase = bp->b_saveaddr;
bp->b_kvasize = MAXPHYS;
@@ -332,9 +322,8 @@
struct buf *bp;
mtx_lock(&pbuf_mtx);
-
for (;;) {
- if (pfreecnt) {
+ if (pfreecnt != NULL) {
while (*pfreecnt == 0) {
msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0);
}
@@ -352,9 +341,8 @@
if (pfreecnt)
--*pfreecnt;
mtx_unlock(&pbuf_mtx);
-
initpbuf(bp);
- return bp;
+ return (bp);
}
/*
@@ -374,14 +362,10 @@
return NULL;
}
TAILQ_REMOVE(&bswlist, bp, b_freelist);
-
--*pfreecnt;
-
mtx_unlock(&pbuf_mtx);
-
initpbuf(bp);
-
- return bp;
+ return (bp);
}
/*
@@ -468,17 +452,9 @@
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj"));
+ KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
+ ("pbrelvp: pager buf on vnode list."));
- /* XXX REMOVE ME */
- BO_LOCK(bp->b_bufobj);
- if (TAILQ_NEXT(bp, b_bobufs) != NULL) {
- panic(
- "relpbuf(): b_vp was probably reassignbuf()d %p %x",
- bp,
- (int)bp->b_flags
- );
- }
- BO_UNLOCK(bp->b_bufobj);
bp->b_vp = NULL;
bp->b_bufobj = NULL;
bp->b_flags &= ~B_PAGING;
@@ -493,17 +469,9 @@
KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode"));
KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj"));
+ KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
+ ("pbrelbo: pager buf on vnode list."));
- /* XXX REMOVE ME */
- BO_LOCK(bp->b_bufobj);
- if (TAILQ_NEXT(bp, b_bobufs) != NULL) {
- panic(
- "relpbuf(): b_vp was probably reassignbuf()d %p %x",
- bp,
- (int)bp->b_flags
- );
- }
- BO_UNLOCK(bp->b_bufobj);
bp->b_bufobj = NULL;
bp->b_flags &= ~B_PAGING;
}
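Much of the vm_pager.c churn above is a mechanical conversion from old-style (K&R) function definitions to ANSI C prototypes, together with the style rule of parenthesizing return values. A self-contained illustration of the two forms with made-up names (the K&R form is valid C89, though later language revisions have removed it):

/* Old-style (K&R) definition, in the form removed above. */
static int
old_haspage(object, pindex)
	void *object;
	int pindex;
{
	return (pindex >= 0);
}

/* Equivalent ANSI C definition, in the form added above. */
static int
new_haspage(void *object, int pindex)
{
	return (pindex >= 0);
}

int
main(void)
{
	return (old_haspage(0, 1) == new_haspage(0, 1) ? 0 : 1);
}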
Modified: trunk/sys/vm/vm_pager.h
===================================================================
--- trunk/sys/vm/vm_pager.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pager.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
@@ -32,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vm_pager.h 8.4 (Berkeley) 1/12/94
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $
*/
/*
@@ -95,9 +96,8 @@
#ifdef _KERNEL
-extern vm_map_t pager_map;
extern struct pagerops *pagertab[];
-extern struct mtx pbuf_mtx;
+extern struct mtx_padalign pbuf_mtx;
vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *);
@@ -104,7 +104,6 @@
void vm_pager_bufferinit(void);
void vm_pager_deallocate(vm_object_t);
static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
-static __inline boolean_t vm_pager_has_page(vm_object_t, vm_pindex_t, int *, int *);
void vm_pager_init(void);
vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
@@ -124,7 +123,7 @@
) {
int r;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
vm_page_zero_invalid(m[reqpage], TRUE);
@@ -141,7 +140,7 @@
int *rtvals
) {
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
(*pagertab[object->type]->pgo_putpages)
(object, m, count, flags, rtvals);
}
@@ -165,7 +164,7 @@
) {
boolean_t ret;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
ret = (*pagertab[object->type]->pgo_haspage)
(object, offset, before, after);
return (ret);
@@ -188,7 +187,7 @@
vm_pager_page_unswapped(vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(m->object);
if (pagertab[m->object->type]->pgo_pageunswapped)
(*pagertab[m->object->type]->pgo_pageunswapped)(m);
}
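The assertion changes above follow the switch of the VM object lock from a mutex to a read/write lock; the dispatch itself is unchanged: each pager supplies a table of function pointers and the inline wrappers route every call through pagertab. A reduced sketch of that ops-table pattern, with illustrative names:

#include <stdio.h>

/* Miniature of the pager ops-table dispatch; names are illustrative. */
struct mini_pagerops {
	int	(*pgo_haspage)(int pindex);
};

static int
mini_haspage(int pindex)
{
	return (pindex < 8);
}

static const struct mini_pagerops mini_ops = {
	.pgo_haspage = mini_haspage,
};

static const struct mini_pagerops *mini_pagertab[] = { &mini_ops };

int
main(void)
{
	/* Route the call through the table, as the inline wrappers do. */
	printf("has page 3: %d\n", (*mini_pagertab[0]->pgo_haspage)(3));
	return (0);
}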
Modified: trunk/sys/vm/vm_param.h
===================================================================
--- trunk/sys/vm/vm_param.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_param.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -57,7 +58,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $
*/
/*
@@ -82,25 +83,10 @@
#define VM_V_CACHE_MIN 7 /* cnt.v_cache_min */
#define VM_V_CACHE_MAX 8 /* cnt.v_cache_max */
#define VM_V_PAGEOUT_FREE_MIN 9 /* cnt.v_pageout_free_min */
-#define VM_PAGEOUT_ALGORITHM 10 /* pageout algorithm */
+#define VM_OBSOLETE_10 10 /* pageout algorithm */
#define VM_SWAPPING_ENABLED 11 /* swapping enabled */
#define VM_MAXID 12 /* number of valid vm ids */
-#define CTL_VM_NAMES { \
- { 0, 0 }, \
- { "vmtotal", CTLTYPE_STRUCT }, \
- { "loadavg", CTLTYPE_STRUCT }, \
- { "v_free_min", CTLTYPE_UINT }, \
- { "v_free_target", CTLTYPE_UINT }, \
- { "v_free_reserved", CTLTYPE_UINT }, \
- { "v_inactive_target", CTLTYPE_UINT }, \
- { "v_cache_min", CTLTYPE_UINT }, \
- { "v_cache_max", CTLTYPE_UINT }, \
- { "v_pageout_free_min", CTLTYPE_UINT}, \
- { "pageout_algorithm", CTLTYPE_INT}, \
- { "swap_enabled", CTLTYPE_INT},\
-}
-
/*
* Structure for swap device statistics
*/
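The CTL_VM_NAMES table removed above only served the old name-to-MIB translation; the numeric VM_* identifiers that remain are still reachable through sysctl(3). A small FreeBSD userland sketch, assuming vm.swap_enabled is still exported under VM_SWAPPING_ENABLED:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_VM, VM_SWAPPING_ENABLED };
	int enabled;
	size_t len = sizeof(enabled);

	if (sysctl(mib, 2, &enabled, &len, NULL, 0) == -1) {
		perror("sysctl");
		return (1);
	}
	printf("vm.swap_enabled: %d\n", enabled);
	return (0);
}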
Modified: trunk/sys/vm/vm_phys.c
===================================================================
--- trunk/sys/vm/vm_phys.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_phys.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002-2006 Rice University
* Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
@@ -29,8 +30,15 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
+/*
+ * Physical memory system implementation
+ *
+ * Any external functions defined by this module are only to be used by the
+ * virtual memory system.
+ */
+
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $");
#include "opt_ddb.h"
#include "opt_vm.h"
@@ -41,11 +49,13 @@
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#if MAXMEMDOM > 1
+#include <sys/proc.h>
+#endif
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
-#include <sys/vnode.h>
#include <ddb/ddb.h>
@@ -55,33 +65,16 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
-#include <vm/vm_reserv.h>
-/*
- * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
- * domain. These extra lists are stored at the end of the regular
- * free lists starting with VM_NFREELIST.
- */
-#define VM_RAW_NFREELIST (VM_NFREELIST + VM_NDOMAIN - 1)
+_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
+ "Too many physsegs.");
-struct vm_freelist {
- struct pglist pl;
- int lcnt;
-};
-
-struct vm_phys_seg {
- vm_paddr_t start;
- vm_paddr_t end;
- vm_page_t first_page;
- int domain;
- struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
-};
-
struct mem_affinity *mem_affinity;
-static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_ndomains = 1;
-static int vm_phys_nsegs;
+struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_phys_nsegs;
#define VM_PHYS_FICTITIOUS_NSEGS 8
static struct vm_phys_fictitious_seg {
@@ -90,15 +83,38 @@
vm_page_t first_page;
} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
static struct mtx vm_phys_fictitious_reg_mtx;
-MALLOC_DEFINE(M_FICT_PAGES, "", "");
+MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
static struct vm_freelist
- vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
-static struct vm_freelist
-(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
+ vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
-static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
+static int vm_nfreelists;
+/*
+ * Provides the mapping from VM_FREELIST_* to free list indices (flind).
+ */
+static int vm_freelist_to_flind[VM_NFREELIST];
+
+CTASSERT(VM_FREELIST_DEFAULT == 0);
+
+#ifdef VM_FREELIST_ISADMA
+#define VM_ISADMA_BOUNDARY 16777216
+#endif
+#ifdef VM_FREELIST_DMA32
+#define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32)
+#endif
+
+/*
+ * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
+ * the ordering of the free list boundaries.
+ */
+#if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
+CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
+#endif
+#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
+CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
+#endif
+
static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
&cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
@@ -111,21 +127,49 @@
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
-#if VM_NDOMAIN > 1
-static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
-SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
- NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
-#endif
+SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
+ &vm_ndomains, 0, "Number of physical memory domains available.");
static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
int order);
-static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
- int domain);
-static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
+static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
+static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order);
+static __inline int
+vm_rr_selectdomain(void)
+{
+#if MAXMEMDOM > 1
+ struct thread *td;
+
+ td = curthread;
+
+ td->td_dom_rr_idx++;
+ td->td_dom_rr_idx %= vm_ndomains;
+ return (td->td_dom_rr_idx);
+#else
+ return (0);
+#endif
+}
+
+boolean_t
+vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
+{
+ struct vm_phys_seg *s;
+ int idx;
+
+ while ((idx = ffsl(mask)) != 0) {
+ idx--; /* ffsl counts from 1 */
+ mask &= ~(1UL << idx);
+ s = &vm_phys_segs[idx];
+ if (low < s->end && high > s->start)
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
/*
* Outputs the state of the physical memory allocator, specifically,
* the amount of physical memory in each free list.
@@ -135,30 +179,34 @@
{
struct sbuf sbuf;
struct vm_freelist *fl;
- int error, flind, oind, pind;
+ int dom, error, flind, oind, pind;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
- sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
- for (flind = 0; flind < vm_nfreelists; flind++) {
- sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
- "\n ORDER (SIZE) | NUMBER"
- "\n ", flind);
- for (pind = 0; pind < VM_NFREEPOOL; pind++)
- sbuf_printf(&sbuf, " | POOL %d", pind);
- sbuf_printf(&sbuf, "\n-- ");
- for (pind = 0; pind < VM_NFREEPOOL; pind++)
- sbuf_printf(&sbuf, "-- -- ");
- sbuf_printf(&sbuf, "--\n");
- for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
- sbuf_printf(&sbuf, " %2d (%6dK)", oind,
- 1 << (PAGE_SHIFT - 10 + oind));
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = vm_phys_free_queues[flind][pind];
- sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
+ "\n ORDER (SIZE) | NUMBER"
+ "\n ", flind);
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ sbuf_printf(&sbuf, " | POOL %d", pind);
+ sbuf_printf(&sbuf, "\n-- ");
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ sbuf_printf(&sbuf, "-- -- ");
+ sbuf_printf(&sbuf, "--\n");
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ sbuf_printf(&sbuf, " %2d (%6dK)", oind,
+ 1 << (PAGE_SHIFT - 10 + oind));
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[dom][flind][pind];
+ sbuf_printf(&sbuf, " | %6d",
+ fl[oind].lcnt);
+ }
+ sbuf_printf(&sbuf, "\n");
}
- sbuf_printf(&sbuf, "\n");
}
}
error = sbuf_finish(&sbuf);
@@ -195,78 +243,56 @@
return (error);
}
-#if VM_NDOMAIN > 1
-/*
- * Outputs the set of free list lookup lists.
- */
-static int
-sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
+static void
+vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
{
- struct sbuf sbuf;
- int domain, error, flind, ndomains;
- error = sysctl_wire_old_buffer(req, 0);
- if (error != 0)
- return (error);
- sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
- ndomains = vm_nfreelists - VM_NFREELIST + 1;
- for (domain = 0; domain < ndomains; domain++) {
- sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
- for (flind = 0; flind < vm_nfreelists; flind++)
- sbuf_printf(&sbuf, " [%d]:\t%p\n", flind,
- vm_phys_lookup_lists[domain][flind]);
- }
- error = sbuf_finish(&sbuf);
- sbuf_delete(&sbuf);
- return (error);
+ m->order = order;
+ if (tail)
+ TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
+ else
+ TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
+ fl[order].lcnt++;
}
-#endif
-
+
+static void
+vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
+{
+
+ TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
+ fl[order].lcnt--;
+ m->order = VM_NFREEORDER;
+}
+
/*
* Create a physical memory segment.
*/
static void
-_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
+_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
{
struct vm_phys_seg *seg;
-#ifdef VM_PHYSSEG_SPARSE
- long pages;
- int segind;
- pages = 0;
- for (segind = 0; segind < vm_phys_nsegs; segind++) {
- seg = &vm_phys_segs[segind];
- pages += atop(seg->end - seg->start);
- }
-#endif
KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
+ KASSERT(domain < vm_ndomains,
+ ("vm_phys_create_seg: invalid domain provided"));
seg = &vm_phys_segs[vm_phys_nsegs++];
+ while (seg > vm_phys_segs && (seg - 1)->start >= end) {
+ *seg = *(seg - 1);
+ seg--;
+ }
seg->start = start;
seg->end = end;
seg->domain = domain;
-#ifdef VM_PHYSSEG_SPARSE
- seg->first_page = &vm_page_array[pages];
-#else
- seg->first_page = PHYS_TO_VM_PAGE(start);
-#endif
-#if VM_NDOMAIN > 1
- if (flind == VM_FREELIST_DEFAULT && domain != 0) {
- flind = VM_NFREELIST + (domain - 1);
- if (flind >= vm_nfreelists)
- vm_nfreelists = flind + 1;
- }
-#endif
- seg->free_queues = &vm_phys_free_queues[flind];
}
static void
-vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
{
int i;
if (mem_affinity == NULL) {
- _vm_phys_create_seg(start, end, flind, 0);
+ _vm_phys_create_seg(start, end, 0);
return;
}
@@ -279,11 +305,11 @@
panic("No affinity info for start %jx",
(uintmax_t)start);
if (mem_affinity[i].end >= end) {
- _vm_phys_create_seg(start, end, flind,
+ _vm_phys_create_seg(start, end,
mem_affinity[i].domain);
break;
}
- _vm_phys_create_seg(start, mem_affinity[i].end, flind,
+ _vm_phys_create_seg(start, mem_affinity[i].end,
mem_affinity[i].domain);
start = mem_affinity[i].end;
}
@@ -290,90 +316,163 @@
}
/*
+ * Add a physical memory segment.
+ */
+void
+vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
+{
+ vm_paddr_t paddr;
+
+ KASSERT((start & PAGE_MASK) == 0,
+ ("vm_phys_define_seg: start is not page aligned"));
+ KASSERT((end & PAGE_MASK) == 0,
+ ("vm_phys_define_seg: end is not page aligned"));
+
+ /*
+ * Split the physical memory segment if it spans two or more free
+ * list boundaries.
+ */
+ paddr = start;
+#ifdef VM_FREELIST_ISADMA
+ if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
+ vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
+ paddr = VM_ISADMA_BOUNDARY;
+ }
+#endif
+#ifdef VM_FREELIST_LOWMEM
+ if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
+ vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
+ paddr = VM_LOWMEM_BOUNDARY;
+ }
+#endif
+#ifdef VM_FREELIST_DMA32
+ if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
+ vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
+ paddr = VM_DMA32_BOUNDARY;
+ }
+#endif
+ vm_phys_create_seg(paddr, end);
+}
+
+/*
* Initialize the physical memory allocator.
+ *
+ * Requires that vm_page_array is initialized!
*/
void
vm_phys_init(void)
{
struct vm_freelist *fl;
- int flind, i, oind, pind;
-#if VM_NDOMAIN > 1
- int ndomains, j;
+ struct vm_phys_seg *seg;
+ u_long npages;
+ int dom, flind, freelist, oind, pind, segind;
+
+ /*
+ * Compute the number of free lists, and generate the mapping from the
+ * manifest constants VM_FREELIST_* to the free list indices.
+ *
+ * Initially, the entries of vm_freelist_to_flind[] are set to either
+ * 0 or 1 to indicate which free lists should be created.
+ */
+ npages = 0;
+ for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
+ seg = &vm_phys_segs[segind];
+#ifdef VM_FREELIST_ISADMA
+ if (seg->end <= VM_ISADMA_BOUNDARY)
+ vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
+ else
#endif
+#ifdef VM_FREELIST_LOWMEM
+ if (seg->end <= VM_LOWMEM_BOUNDARY)
+ vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
+ else
+#endif
+#ifdef VM_FREELIST_DMA32
+ if (
+#ifdef VM_DMA32_NPAGES_THRESHOLD
+ /*
+ * Create the DMA32 free list only if the amount of
+ * physical memory above physical address 4G exceeds the
+ * given threshold.
+ */
+ npages > VM_DMA32_NPAGES_THRESHOLD &&
+#endif
+ seg->end <= VM_DMA32_BOUNDARY)
+ vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
+ else
+#endif
+ {
+ npages += atop(seg->end - seg->start);
+ vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
+ }
+ }
+ /* Change each entry into a running total of the free lists. */
+ for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
+ vm_freelist_to_flind[freelist] +=
+ vm_freelist_to_flind[freelist - 1];
+ }
+ vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
+ KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
+ /* Change each entry into a free list index. */
+ for (freelist = 0; freelist < VM_NFREELIST; freelist++)
+ vm_freelist_to_flind[freelist]--;
- for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ /*
+ * Initialize the first_page and free_queues fields of each physical
+ * memory segment.
+ */
+#ifdef VM_PHYSSEG_SPARSE
+ npages = 0;
+#endif
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+#ifdef VM_PHYSSEG_SPARSE
+ seg->first_page = &vm_page_array[npages];
+ npages += atop(seg->end - seg->start);
+#else
+ seg->first_page = PHYS_TO_VM_PAGE(seg->start);
+#endif
#ifdef VM_FREELIST_ISADMA
- if (phys_avail[i] < 16777216) {
- if (phys_avail[i + 1] > 16777216) {
- vm_phys_create_seg(phys_avail[i], 16777216,
- VM_FREELIST_ISADMA);
- vm_phys_create_seg(16777216, phys_avail[i + 1],
- VM_FREELIST_DEFAULT);
- } else {
- vm_phys_create_seg(phys_avail[i],
- phys_avail[i + 1], VM_FREELIST_ISADMA);
- }
- if (VM_FREELIST_ISADMA >= vm_nfreelists)
- vm_nfreelists = VM_FREELIST_ISADMA + 1;
+ if (seg->end <= VM_ISADMA_BOUNDARY) {
+ flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
+ KASSERT(flind >= 0,
+ ("vm_phys_init: ISADMA flind < 0"));
} else
#endif
-#ifdef VM_FREELIST_HIGHMEM
- if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
- if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
- vm_phys_create_seg(phys_avail[i],
- VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
- vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
- phys_avail[i + 1], VM_FREELIST_HIGHMEM);
- } else {
- vm_phys_create_seg(phys_avail[i],
- phys_avail[i + 1], VM_FREELIST_HIGHMEM);
- }
- if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
- vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
+#ifdef VM_FREELIST_LOWMEM
+ if (seg->end <= VM_LOWMEM_BOUNDARY) {
+ flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
+ KASSERT(flind >= 0,
+ ("vm_phys_init: LOWMEM flind < 0"));
} else
#endif
- vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
- VM_FREELIST_DEFAULT);
- }
- for (flind = 0; flind < vm_nfreelists; flind++) {
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = vm_phys_free_queues[flind][pind];
- for (oind = 0; oind < VM_NFREEORDER; oind++)
- TAILQ_INIT(&fl[oind].pl);
+#ifdef VM_FREELIST_DMA32
+ if (seg->end <= VM_DMA32_BOUNDARY) {
+ flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
+ KASSERT(flind >= 0,
+ ("vm_phys_init: DMA32 flind < 0"));
+ } else
+#endif
+ {
+ flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
+ KASSERT(flind >= 0,
+ ("vm_phys_init: DEFAULT flind < 0"));
}
+ seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
}
-#if VM_NDOMAIN > 1
+
/*
- * Build a free list lookup list for each domain. All of the
- * memory domain lists are inserted at the VM_FREELIST_DEFAULT
- * index in a round-robin order starting with the current
- * domain.
+ * Initialize the free queues.
*/
- ndomains = vm_nfreelists - VM_NFREELIST + 1;
- for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
- for (i = 0; i < ndomains; i++)
- vm_phys_lookup_lists[i][flind] =
- &vm_phys_free_queues[flind];
- for (i = 0; i < ndomains; i++)
- for (j = 0; j < ndomains; j++) {
- flind = (i + j) % ndomains;
- if (flind == 0)
- flind = VM_FREELIST_DEFAULT;
- else
- flind += VM_NFREELIST - 1;
- vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
- &vm_phys_free_queues[flind];
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[dom][flind][pind];
+ for (oind = 0; oind < VM_NFREEORDER; oind++)
+ TAILQ_INIT(&fl[oind].pl);
+ }
}
- for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
- flind++)
- for (i = 0; i < ndomains; i++)
- vm_phys_lookup_lists[i][flind + ndomains - 1] =
- &vm_phys_free_queues[flind];
-#else
- for (flind = 0; flind < vm_nfreelists; flind++)
- vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
-#endif
-
+ }
mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
}
@@ -391,9 +490,7 @@
KASSERT(m_buddy->order == VM_NFREEORDER,
("vm_phys_split_pages: page %p has unexpected order %d",
m_buddy, m_buddy->order));
- m_buddy->order = oind;
- TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
- fl[oind].lcnt++;
+ vm_freelist_add(fl, m_buddy, oind, 0);
}
}
@@ -404,12 +501,17 @@
vm_phys_add_page(vm_paddr_t pa)
{
vm_page_t m;
+ struct vm_domain *vmd;
cnt.v_page_count++;
m = vm_phys_paddr_to_vm_page(pa);
+ m->busy_lock = VPB_UNBUSIED;
m->phys_addr = pa;
m->queue = PQ_NONE;
m->segind = vm_phys_paddr_to_segind(pa);
+ vmd = vm_phys_domain(m);
+ vmd->vmd_page_count++;
+ vmd->vmd_segs |= 1UL << m->segind;
m->flags = PG_FREE;
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_add_page: page %p has unexpected order %d",
@@ -417,7 +519,7 @@
m->pool = VM_FREEPOOL_DEFAULT;
pmap_page_init(m);
mtx_lock(&vm_page_queue_free_mtx);
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m, 0);
mtx_unlock(&vm_page_queue_free_mtx);
}
@@ -432,7 +534,7 @@
vm_phys_alloc_pages(int pool, int order)
{
vm_page_t m;
- int domain, flind;
+ int dom, domain, flind;
KASSERT(pool < VM_NFREEPOOL,
("vm_phys_alloc_pages: pool %d is out of range", pool));
@@ -439,63 +541,46 @@
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_pages: order %d is out of range", order));
-#if VM_NDOMAIN > 1
- domain = PCPU_GET(domain);
-#else
- domain = 0;
-#endif
- for (flind = 0; flind < vm_nfreelists; flind++) {
- m = vm_phys_alloc_domain_pages(domain, flind, pool, order);
- if (m != NULL)
- return (m);
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ domain = vm_rr_selectdomain();
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ m = vm_phys_alloc_domain_pages(domain, flind, pool,
+ order);
+ if (m != NULL)
+ return (m);
+ }
}
return (NULL);
}
/*
- * Find and dequeue a free page on the given free list, with the
- * specified pool and order
+ * Allocate a contiguous, power of two-sized set of physical pages from the
+ * specified free list. The free list must be specified using one of the
+ * manifest constants VM_FREELIST_*.
+ *
+ * The free page queues must be locked.
*/
vm_page_t
-vm_phys_alloc_freelist_pages(int flind, int pool, int order)
+vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
{
-#if VM_NDOMAIN > 1
vm_page_t m;
- int i, ndomains;
-#endif
- int domain;
+ int dom, domain;
- KASSERT(flind < VM_NFREELIST,
- ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
+ KASSERT(freelist < VM_NFREELIST,
+ ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
+ freelist));
KASSERT(pool < VM_NFREEPOOL,
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
-
-#if VM_NDOMAIN > 1
- /*
- * This routine expects to be called with a VM_FREELIST_* constant.
- * On a system with multiple domains we need to adjust the flind
- * appropriately. If it is for VM_FREELIST_DEFAULT we need to
- * iterate over the per-domain lists.
- */
- domain = PCPU_GET(domain);
- ndomains = vm_nfreelists - VM_NFREELIST + 1;
- if (flind == VM_FREELIST_DEFAULT) {
- m = NULL;
- for (i = 0; i < ndomains; i++, flind++) {
- m = vm_phys_alloc_domain_pages(domain, flind, pool,
- order);
- if (m != NULL)
- break;
- }
- return (m);
- } else if (flind > VM_FREELIST_DEFAULT)
- flind += ndomains - 1;
-#else
- domain = 0;
-#endif
- return (vm_phys_alloc_domain_pages(domain, flind, pool, order));
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ domain = vm_rr_selectdomain();
+ m = vm_phys_alloc_domain_pages(domain,
+ vm_freelist_to_flind[freelist], pool, order);
+ if (m != NULL)
+ return (m);
+ }
+ return (NULL);
}
static vm_page_t
@@ -507,13 +592,11 @@
vm_page_t m;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- fl = (*vm_phys_lookup_lists[domain][flind])[pool];
+ fl = &vm_phys_free_queues[domain][flind][pool][0];
for (oind = order; oind < VM_NFREEORDER; oind++) {
m = TAILQ_FIRST(&fl[oind].pl);
if (m != NULL) {
- TAILQ_REMOVE(&fl[oind].pl, m, pageq);
- fl[oind].lcnt--;
- m->order = VM_NFREEORDER;
+ vm_freelist_rem(fl, m, oind);
vm_phys_split_pages(m, oind, fl, order);
return (m);
}
@@ -527,12 +610,10 @@
*/
for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- alt = (*vm_phys_lookup_lists[domain][flind])[pind];
+ alt = &vm_phys_free_queues[domain][flind][pind][0];
m = TAILQ_FIRST(&alt[oind].pl);
if (m != NULL) {
- TAILQ_REMOVE(&alt[oind].pl, m, pageq);
- alt[oind].lcnt--;
- m->order = VM_NFREEORDER;
+ vm_freelist_rem(alt, m, oind);
vm_phys_set_pool(pool, m, oind);
vm_phys_split_pages(m, oind, fl, order);
return (m);
@@ -543,26 +624,6 @@
}
/*
- * Allocate physical memory from phys_avail[].
- */
-vm_paddr_t
-vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
-{
- vm_paddr_t pa;
- int i;
-
- size = round_page(size);
- for (i = 0; phys_avail[i + 1] != 0; i += 2) {
- if (phys_avail[i + 1] - phys_avail[i] < size)
- continue;
- pa = phys_avail[i];
- phys_avail[i] += size;
- return (pa);
- }
- panic("vm_phys_bootstrap_alloc");
-}
-
-/*
* Find the vm_page corresponding to the given physical address.
*/
vm_page_t
@@ -616,7 +677,9 @@
#ifdef VM_PHYSSEG_DENSE
pi = atop(start);
- if (pi >= first_page && atop(end) < vm_page_array_size) {
+ if (pi >= first_page && pi < vm_page_array_size + first_page) {
+ if (atop(end) >= vm_page_array_size + first_page)
+ return (EINVAL);
fp = &vm_page_array[pi - first_page];
malloced = FALSE;
} else
@@ -630,8 +693,8 @@
}
for (i = 0; i < page_count; i++) {
vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
- pmap_page_init(&fp[i]);
- fp[i].oflags &= ~(VPO_BUSY | VPO_UNMANAGED);
+ fp[i].oflags &= ~VPO_UNMANAGED;
+ fp[i].busy_lock = VPB_UNBUSIED;
}
mtx_lock(&vm_phys_fictitious_reg_mtx);
for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
@@ -713,7 +776,7 @@
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
- vm_paddr_t pa, pa_buddy;
+ vm_paddr_t pa;
vm_page_t m_buddy;
KASSERT(m->order == VM_NFREEORDER,
@@ -725,33 +788,71 @@
KASSERT(order < VM_NFREEORDER,
("vm_phys_free_pages: order %d is out of range", order));
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- pa = VM_PAGE_TO_PHYS(m);
seg = &vm_phys_segs[m->segind];
- while (order < VM_NFREEORDER - 1) {
- pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
- if (pa_buddy < seg->start ||
- pa_buddy >= seg->end)
- break;
- m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
- if (m_buddy->order != order)
- break;
- fl = (*seg->free_queues)[m_buddy->pool];
- TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
- fl[m_buddy->order].lcnt--;
- m_buddy->order = VM_NFREEORDER;
- if (m_buddy->pool != m->pool)
- vm_phys_set_pool(m->pool, m_buddy, order);
- order++;
- pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
- m = &seg->first_page[atop(pa - seg->start)];
+ if (order < VM_NFREEORDER - 1) {
+ pa = VM_PAGE_TO_PHYS(m);
+ do {
+ pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
+ if (pa < seg->start || pa >= seg->end)
+ break;
+ m_buddy = &seg->first_page[atop(pa - seg->start)];
+ if (m_buddy->order != order)
+ break;
+ fl = (*seg->free_queues)[m_buddy->pool];
+ vm_freelist_rem(fl, m_buddy, order);
+ if (m_buddy->pool != m->pool)
+ vm_phys_set_pool(m->pool, m_buddy, order);
+ order++;
+ pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
+ m = &seg->first_page[atop(pa - seg->start)];
+ } while (order < VM_NFREEORDER - 1);
}
- m->order = order;
fl = (*seg->free_queues)[m->pool];
- TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
- fl[order].lcnt++;
+ vm_freelist_add(fl, m, order, 1);
}
/*
+ * Free a contiguous, arbitrarily sized set of physical pages.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_free_contig(vm_page_t m, u_long npages)
+{
+ u_int n;
+ int order;
+
+ /*
+ * Avoid unnecessary coalescing by freeing the pages in the largest
+ * possible power-of-two-sized subsets.
+ */
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ for (;; npages -= n) {
+ /*
+ * Unsigned "min" is used here so that "order" is assigned
+ * "VM_NFREEORDER - 1" when "m"'s physical address is zero
+ * or the low-order bits of its physical address are zero
+ * because the size of a physical address exceeds the size of
+ * a long.
+ */
+ order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
+ VM_NFREEORDER - 1);
+ n = 1 << order;
+ if (npages < n)
+ break;
+ vm_phys_free_pages(m, order);
+ m += n;
+ }
+ /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
+ for (; npages > 0; npages -= n) {
+ order = flsl(npages) - 1;
+ n = 1 << order;
+ vm_phys_free_pages(m, order);
+ m += n;
+ }
+}
+
+/*
* Set the pool for a contiguous, power of two-sized set of physical pages.
*/
void
@@ -812,9 +913,7 @@
*/
fl = (*seg->free_queues)[m_set->pool];
order = m_set->order;
- TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
- fl[order].lcnt--;
- m_set->order = VM_NFREEORDER;
+ vm_freelist_rem(fl, m_set, order);
while (order > 0) {
order--;
pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
@@ -824,9 +923,7 @@
m_tmp = m_set;
m_set = &seg->first_page[atop(pa_half - seg->start)];
}
- m_tmp->order = order;
- TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
- fl[order].lcnt++;
+ vm_freelist_add(fl, m_tmp, order, 0);
}
KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
return (TRUE);
@@ -838,22 +935,25 @@
boolean_t
vm_phys_zero_pages_idle(void)
{
- static struct vm_freelist *fl = vm_phys_free_queues[0][0];
+ static struct vm_freelist *fl;
static int flind, oind, pind;
vm_page_t m, m_tmp;
+ int domain;
+ domain = vm_rr_selectdomain();
+ fl = vm_phys_free_queues[domain][0][0];
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
for (;;) {
- TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
+ TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
vm_phys_unfree_page(m_tmp);
- cnt.v_free_count--;
+ vm_phys_freecnt_adj(m, -1);
mtx_unlock(&vm_page_queue_free_mtx);
pmap_zero_page_idle(m_tmp);
m_tmp->flags |= PG_ZERO;
mtx_lock(&vm_page_queue_free_mtx);
- cnt.v_free_count++;
+ vm_phys_freecnt_adj(m, 1);
vm_phys_free_pages(m_tmp, 0);
vm_page_zero_count++;
cnt_prezero++;
@@ -871,7 +971,7 @@
if (flind == vm_nfreelists)
flind = 0;
}
- fl = vm_phys_free_queues[flind][pind];
+ fl = vm_phys_free_queues[domain][flind][pind];
}
}
}
@@ -887,21 +987,17 @@
* "alignment" and "boundary" must be a power of two.
*/
vm_page_t
-vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
- unsigned long alignment, unsigned long boundary)
+vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary)
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
- struct vnode *vp;
vm_paddr_t pa, pa_last, size;
- vm_page_t deferred_vdrop_list, m, m_ret;
- int domain, flind, i, oind, order, pind;
+ vm_page_t m, m_ret;
+ u_long npages_end;
+ int dom, domain, flind, oind, order, pind;
-#if VM_NDOMAIN > 1
- domain = PCPU_GET(domain);
-#else
- domain = 0;
-#endif
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
size = npages << PAGE_SHIFT;
KASSERT(size != 0,
("vm_phys_alloc_contig: size must not be 0"));
@@ -909,19 +1005,16 @@
("vm_phys_alloc_contig: alignment must be a power of 2"));
KASSERT((boundary & (boundary - 1)) == 0,
("vm_phys_alloc_contig: boundary must be a power of 2"));
- deferred_vdrop_list = NULL;
/* Compute the queue that is the best fit for npages. */
for (order = 0; (1 << order) < npages; order++);
- mtx_lock(&vm_page_queue_free_mtx);
-#if VM_NRESERVLEVEL > 0
-retry:
-#endif
+ dom = 0;
+restartdom:
+ domain = vm_rr_selectdomain();
for (flind = 0; flind < vm_nfreelists; flind++) {
for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = (*vm_phys_lookup_lists[domain][flind])
- [pind];
- TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
+ fl = &vm_phys_free_queues[domain][flind][pind][0];
+ TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
/*
* A free list may contain physical pages
* from one or more segments.
@@ -974,51 +1067,22 @@
}
}
}
-#if VM_NRESERVLEVEL > 0
- if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
- goto retry;
-#endif
- mtx_unlock(&vm_page_queue_free_mtx);
+ if (++dom < vm_ndomains)
+ goto restartdom;
return (NULL);
done:
for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
fl = (*seg->free_queues)[m->pool];
- TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
- fl[m->order].lcnt--;
- m->order = VM_NFREEORDER;
+ vm_freelist_rem(fl, m, m->order);
}
if (m_ret->pool != VM_FREEPOOL_DEFAULT)
vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
fl = (*seg->free_queues)[m_ret->pool];
vm_phys_split_pages(m_ret, oind, fl, order);
- for (i = 0; i < npages; i++) {
- m = &m_ret[i];
- vp = vm_page_alloc_init(m);
- if (vp != NULL) {
- /*
- * Enqueue the vnode for deferred vdrop().
- *
- * Unmanaged pages don't use "pageq", so it
- * can be safely abused to construct a short-
- * lived queue of vnodes.
- */
- m->pageq.tqe_prev = (void *)vp;
- m->pageq.tqe_next = deferred_vdrop_list;
- deferred_vdrop_list = m;
- }
- }
- for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
- m = &m_ret[i];
- KASSERT(m->order == VM_NFREEORDER,
- ("vm_phys_alloc_contig: page %p has unexpected order %d",
- m, m->order));
- vm_phys_free_pages(m, 0);
- }
- mtx_unlock(&vm_page_queue_free_mtx);
- while (deferred_vdrop_list != NULL) {
- vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
- deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
- }
+ /* Return excess pages to the free lists. */
+ npages_end = roundup2(npages, 1 << imin(oind, order));
+ if (npages < npages_end)
+ vm_phys_free_contig(&m_ret[npages], npages_end - npages);
return (m_ret);
}
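
As a small illustration (not part of the patch) of the "return excess pages" step above: the buddy allocator satisfies the request with a power-of-two block of 1 << oind pages, and everything past the requested npages goes back through vm_phys_free_contig(). The values below are made up, and the imin(oind, order) clamp from the real code is folded into the single oind constant:

#include <stdio.h>

#define roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))	/* y must be a power of 2 */

int
main(void)
{
	unsigned long npages = 21;
	int oind = 5;			/* allocated block: 1 << 5 = 32 pages */
	unsigned long npages_end = roundup2(npages, 1UL << oind);

	printf("requested %lu, block rounds to %lu, freeing %lu excess pages\n",
	    npages, npages_end, npages_end - npages);
	return (0);
}
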
@@ -1029,24 +1093,28 @@
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
struct vm_freelist *fl;
- int flind, oind, pind;
+ int flind, oind, pind, dom;
- for (flind = 0; flind < vm_nfreelists; flind++) {
- db_printf("FREE LIST %d:\n"
- "\n ORDER (SIZE) | NUMBER"
- "\n ", flind);
- for (pind = 0; pind < VM_NFREEPOOL; pind++)
- db_printf(" | POOL %d", pind);
- db_printf("\n-- ");
- for (pind = 0; pind < VM_NFREEPOOL; pind++)
- db_printf("-- -- ");
- db_printf("--\n");
- for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
- db_printf(" %2.2d (%6.6dK)", oind,
- 1 << (PAGE_SHIFT - 10 + oind));
- for (pind = 0; pind < VM_NFREEPOOL; pind++) {
- fl = vm_phys_free_queues[flind][pind];
- db_printf(" | %6.6d", fl[oind].lcnt);
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ db_printf("DOMAIN: %d\n", dom);
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ db_printf("FREE LIST %d:\n"
+ "\n ORDER (SIZE) | NUMBER"
+ "\n ", flind);
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ db_printf(" | POOL %d", pind);
+ db_printf("\n-- ");
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ db_printf("-- -- ");
+ db_printf("--\n");
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ db_printf(" %2.2d (%6.6dK)", oind,
+ 1 << (PAGE_SHIFT - 10 + oind));
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[dom][flind][pind];
+ db_printf(" | %6.6d", fl[oind].lcnt);
+ }
+ db_printf("\n");
}
db_printf("\n");
}
Modified: trunk/sys/vm/vm_phys.h
===================================================================
--- trunk/sys/vm/vm_phys.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_phys.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002-2006 Rice University
* Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
@@ -28,7 +29,7 @@
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $
*/
/*
@@ -47,24 +48,76 @@
int domain;
};
+struct vm_freelist {
+ struct pglist pl;
+ int lcnt;
+};
+
+struct vm_phys_seg {
+ vm_paddr_t start;
+ vm_paddr_t end;
+ vm_page_t first_page;
+ int domain;
+ struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
+};
+
extern struct mem_affinity *mem_affinity;
+extern int vm_ndomains;
+extern struct vm_phys_seg vm_phys_segs[];
+extern int vm_phys_nsegs;
+/*
+ * The following functions are only to be used by the virtual memory system.
+ */
void vm_phys_add_page(vm_paddr_t pa);
-vm_page_t vm_phys_alloc_contig(unsigned long npages,
- vm_paddr_t low, vm_paddr_t high,
- unsigned long alignment, unsigned long boundary);
-vm_page_t vm_phys_alloc_freelist_pages(int flind, int pool, int order);
+void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
+vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary);
+vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order);
vm_page_t vm_phys_alloc_pages(int pool, int order);
-vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment);
+boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high);
int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr);
void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa);
+void vm_phys_free_contig(vm_page_t m, u_long npages);
void vm_phys_free_pages(vm_page_t m, int order);
void vm_phys_init(void);
+vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
void vm_phys_set_pool(int pool, vm_page_t m, int order);
boolean_t vm_phys_unfree_page(vm_page_t m);
boolean_t vm_phys_zero_pages_idle(void);
+/*
+ * vm_phys_domain:
+ *
+ * Return the memory domain the page belongs to.
+ */
+static inline struct vm_domain *
+vm_phys_domain(vm_page_t m)
+{
+#if MAXMEMDOM > 1
+ int domn, segind;
+
+ /* XXXKIB try to assert that the page is managed */
+ segind = m->segind;
+ KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m));
+ domn = vm_phys_segs[segind].domain;
+ KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m));
+ return (&vm_dom[domn]);
+#else
+ return (&vm_dom[0]);
+#endif
+}
+
+static inline void
+vm_phys_freecnt_adj(vm_page_t m, int adj)
+{
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ cnt.v_free_count += adj;
+ vm_phys_domain(m)->vmd_free_count += adj;
+}
+
#endif /* _KERNEL */
#endif /* !_VM_PHYS_H_ */
Added: trunk/sys/vm/vm_radix.c
===================================================================
--- trunk/sys/vm/vm_radix.c (rev 0)
+++ trunk/sys/vm/vm_radix.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,857 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ * The following code is not generalized into a general purpose library
+ * because there are way too many parameters embedded that should really
+ * be decided by the library consumers. At the same time, consumers
+ * of this code must achieve the highest possible performance.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ * to avoid a large maximum depth for the trie. This is a balance
+ * between the necessity to not wire too much physical memory for the nodes
+ * and the necessity to avoid too much cache pollution during the trie
+ * operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ * the number of insert and remove operations. This basically implies
+ *   that optimizations which help one operation but hurt the
+ *   other must be carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ * level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vmmeter.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line. The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define VM_RADIX_WIDTH 4
+#else
+#define VM_RADIX_WIDTH 3
+#endif
+
+#define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH)
+#define VM_RADIX_MASK (VM_RADIX_COUNT - 1)
+#define VM_RADIX_LIMIT \
+ (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define VM_RADIX_ISLEAF 0x1
+#define VM_RADIX_FLAGS 0x1
+#define VM_RADIX_PAD VM_RADIX_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define VM_RADIX_UNITLEVEL(lev) \
+ ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
+
+struct vm_radix_node {
+ vm_pindex_t rn_owner; /* Owner of record. */
+ uint16_t rn_count; /* Valid children. */
+ uint16_t rn_clev; /* Current level. */
+ void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */
+};
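
A quick way to see what these macros work out to is the following standalone snippet (not part of the patch). It assumes a 64-bit vm_pindex_t and the __LP64__ width of 4, so the numbers are illustrative rather than authoritative for every configuration:

#include <stdio.h>
#include <stdint.h>

#define NBBY		8
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

#define RADIX_WIDTH	4			/* __LP64__ value from above */
#define RADIX_COUNT	(1 << RADIX_WIDTH)
#define RADIX_MASK	(RADIX_COUNT - 1)
#define RADIX_LIMIT	(howmany(sizeof(uint64_t) * NBBY, RADIX_WIDTH) - 1)

int
main(void)
{
	printf("children per node: %d (slot mask 0x%x)\n", RADIX_COUNT, RADIX_MASK);
	printf("deepest level: %d, so at most %d levels in a walk\n",
	    (int)RADIX_LIMIT, (int)RADIX_LIMIT + 1);
	printf("child pointer array: %zu bytes per node\n",
	    (size_t)RADIX_COUNT * sizeof(void *));
	return (0);
}
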
+
+static uma_zone_t vm_radix_node_zone;
+
+/*
+ * Allocate a radix node.
+ */
+static __inline struct vm_radix_node *
+vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
+{
+ struct vm_radix_node *rnode;
+
+ rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
+ if (rnode == NULL)
+ return (NULL);
+ rnode->rn_owner = owner;
+ rnode->rn_count = count;
+ rnode->rn_clev = clevel;
+ return (rnode);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+vm_radix_node_put(struct vm_radix_node *rnode)
+{
+
+ uma_zfree(vm_radix_node_zone, rnode);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+vm_radix_slot(vm_pindex_t index, uint16_t level)
+{
+
+ return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline vm_pindex_t
+vm_radix_trimkey(vm_pindex_t index, uint16_t level)
+{
+ vm_pindex_t ret;
+
+ ret = index;
+ if (level > 0) {
+ ret >>= level * VM_RADIX_WIDTH;
+ ret <<= level * VM_RADIX_WIDTH;
+ }
+ return (ret);
+}
+
+/*
+ * Get the root node for a radix tree.
+ */
+static __inline struct vm_radix_node *
+vm_radix_getroot(struct vm_radix *rtree)
+{
+
+ return ((struct vm_radix_node *)rtree->rt_root);
+}
+
+/*
+ * Set the root node for a radix tree.
+ */
+static __inline void
+vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
+{
+
+ rtree->rt_root = (uintptr_t)rnode;
+}
+
+/*
+ * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+vm_radix_isleaf(struct vm_radix_node *rnode)
+{
+
+ return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
+}
+
+/*
+ * Returns the associated page extracted from rnode.
+ */
+static __inline vm_page_t
+vm_radix_topage(struct vm_radix_node *rnode)
+{
+
+ return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
+}
+
+/*
+ * Adds the page as a child of the provided node.
+ */
+static __inline void
+vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
+ vm_page_t page)
+{
+ int slot;
+
+ slot = vm_radix_slot(index, clev);
+ rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
+}
+
+/*
+ * Returns the slot where two keys differ.
+ * It cannot accept 2 equal keys.
+ */
+static __inline uint16_t
+vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
+{
+ uint16_t clev;
+
+ KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+ __func__, (uintmax_t)index1));
+
+ index1 ^= index2;
+ for (clev = VM_RADIX_LIMIT;; clev--)
+ if (vm_radix_slot(index1, clev) != 0)
+ return (clev);
+}
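
The slot, trim and key-difference helpers above are plain bit arithmetic on the index. The following standalone sketch (not part of the patch) reproduces them with hypothetical 64-bit keys and the 4-bit level width so the values can be checked by hand; as in vm_radix_keydiff(), the two keys passed to keydiff() must differ:

#include <stdio.h>
#include <stdint.h>

#define WIDTH	4
#define COUNT	(1 << WIDTH)
#define MASK	(COUNT - 1)
#define LIMIT	(64 / WIDTH - 1)		/* 15 for a 64-bit key */

static int
slot(uint64_t index, unsigned level)		/* nibble of "index" at "level" */
{
	return ((index >> (level * WIDTH)) & MASK);
}

static uint64_t
trimkey(uint64_t index, unsigned level)		/* clear the bits below "level" */
{
	return (level > 0 ? (index >> (level * WIDTH)) << (level * WIDTH) : index);
}

static unsigned
keydiff(uint64_t index1, uint64_t index2)	/* highest level where keys differ */
{
	unsigned clev;

	index1 ^= index2;
	for (clev = LIMIT;; clev--)
		if (slot(index1, clev) != 0)
			return (clev);
}

int
main(void)
{
	uint64_t a = 0x12345, b = 0x12705;

	printf("slot(a, 0)=%d slot(a, 3)=%d\n", slot(a, 0), slot(a, 3));
	printf("trimkey(a, 2)=0x%llx\n", (unsigned long long)trimkey(a, 2));
	printf("keydiff(a, b)=%u\n", keydiff(a, b));	/* differ first at level 2 */
	return (0);
}
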
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified rnode. Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
+{
+
+ if (rnode->rn_clev < VM_RADIX_LIMIT) {
+ idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
+ return (idx != rnode->rn_owner);
+ }
+ return (FALSE);
+}
+
+/*
+ * Internal helper for vm_radix_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
+{
+ int slot;
+
+ KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
+ ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
+ for (slot = 0; rnode->rn_count != 0; slot++) {
+ if (rnode->rn_child[slot] == NULL)
+ continue;
+ if (!vm_radix_isleaf(rnode->rn_child[slot]))
+ vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
+ rnode->rn_child[slot] = NULL;
+ rnode->rn_count--;
+ }
+ vm_radix_node_put(rnode);
+}
+
+#ifdef INVARIANTS
+/*
+ * Radix node zone destructor.
+ */
+static void
+vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
+{
+ struct vm_radix_node *rnode;
+ int slot;
+
+ rnode = mem;
+ KASSERT(rnode->rn_count == 0,
+ ("vm_radix_node_put: rnode %p has %d children", rnode,
+ rnode->rn_count));
+ for (slot = 0; slot < VM_RADIX_COUNT; slot++)
+ KASSERT(rnode->rn_child[slot] == NULL,
+ ("vm_radix_node_put: rnode %p has a child", rnode));
+}
+#endif
+
+#ifndef UMA_MD_SMALL_ALLOC
+/*
+ * Reserve the KVA necessary to satisfy the node allocation.
+ * This is mandatory on architectures that do not support a direct
+ * map; otherwise every node allocation would have to carve into the
+ * kernel maps, resulting in deadlocks for consumers that are already
+ * working with kernel maps.
+ */
+static void
+vm_radix_reserve_kva(void *arg __unused)
+{
+
+ /*
+ * Calculate the number of reserved nodes, discounting the pages that
+ * are needed to store them.
+ */
+ if (!uma_zone_reserve_kva(vm_radix_node_zone,
+ ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+ sizeof(struct vm_radix_node))))
+ panic("%s: unable to reserve KVA", __func__);
+}
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+ vm_radix_reserve_kva, NULL);
+#endif
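
The reservation count in vm_radix_reserve_kva() solves for a node count N such that the N nodes plus the pages needed to back them roughly account for the managed memory. A worked example (not part of the patch, with an assumed page count and an approximate LP64 node size) follows:

#include <stdio.h>

int
main(void)
{
	const unsigned long long page_size = 4096;
	const unsigned long long page_count = 1ULL << 20;	/* 4 GB of 4 KB pages */
	const unsigned long long node_size = 144;	/* approx. node size, illustrative */
	unsigned long long nodes, node_pages;

	/* N = (page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(node)) */
	nodes = page_count * page_size / (page_size + node_size);
	node_pages = nodes * node_size / page_size;
	printf("reserve KVA for %llu nodes\n", nodes);
	printf("%llu nodes + %llu pages holding them ~= %llu managed pages\n",
	    nodes, node_pages, page_count);
	return (0);
}
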
+
+/*
+ * Initialize the UMA slab zone.
+ * Until the KVA reservation in vm_radix_reserve_kva() takes effect, the
+ * zone will be served by the UMA boot-time pre-allocated pool of pages.
+ */
+void
+vm_radix_init(void)
+{
+
+ vm_radix_node_zone = uma_zcreate("RADIX NODE",
+ sizeof(struct vm_radix_node), NULL,
+#ifdef INVARIANTS
+ vm_radix_node_zone_dtor,
+#else
+ NULL,
+#endif
+ NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
+{
+ vm_pindex_t index, newind;
+ void **parentp;
+ struct vm_radix_node *rnode, *tmp;
+ vm_page_t m;
+ int slot;
+ uint16_t clev;
+
+ index = page->pindex;
+
+restart:
+
+ /*
+ * The owner of record for root is not really important because it
+ * will never be used.
+ */
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL) {
+ rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
+ return (0);
+ }
+ parentp = (void **)&rtree->rt_root;
+ for (;;) {
+ if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex == index)
+ panic("%s: key %jx is already present",
+ __func__, (uintmax_t)index);
+ clev = vm_radix_keydiff(m->pindex, index);
+
+ /*
+ * During node allocation the trie that is being
+ * walked can be modified because of recursing radix
+ * trie operations.
+ * If this is the case, the recursing functions signal the
+ * situation and the insert operation must start from scratch
+ * again. The node freed here will then most likely sit in the
+ * UMA caches, making it unlikely that the same situation
+ * happens again.
+ */
+ rtree->rt_flags |= RT_INSERT_INPROG;
+ tmp = vm_radix_node_get(vm_radix_trimkey(index,
+ clev + 1), 2, clev);
+ rtree->rt_flags &= ~RT_INSERT_INPROG;
+ if (tmp == NULL) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ return (ENOMEM);
+ }
+ if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ tmp->rn_count = 0;
+ vm_radix_node_put(tmp);
+ goto restart;
+ }
+ *parentp = tmp;
+ vm_radix_addpage(tmp, index, clev, page);
+ vm_radix_addpage(tmp, m->pindex, clev, m);
+ return (0);
+ } else if (vm_radix_keybarr(rnode, index))
+ break;
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ if (rnode->rn_child[slot] == NULL) {
+ rnode->rn_count++;
+ vm_radix_addpage(rnode, index, rnode->rn_clev, page);
+ return (0);
+ }
+ parentp = &rnode->rn_child[slot];
+ rnode = rnode->rn_child[slot];
+ }
+
+ /*
+ * A new node is needed because the right insertion level is reached.
+ * Setup the new intermediate node and add the 2 children: the
+ * new object and the older edge.
+ */
+ newind = rnode->rn_owner;
+ clev = vm_radix_keydiff(newind, index);
+
+ /* See the comments above. */
+ rtree->rt_flags |= RT_INSERT_INPROG;
+ tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
+ rtree->rt_flags &= ~RT_INSERT_INPROG;
+ if (tmp == NULL) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ return (ENOMEM);
+ }
+ if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+ rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+ tmp->rn_count = 0;
+ vm_radix_node_put(tmp);
+ goto restart;
+ }
+ *parentp = tmp;
+ vm_radix_addpage(tmp, index, clev, page);
+ slot = vm_radix_slot(newind, clev);
+ tmp->rn_child[slot] = rnode;
+ return (0);
+}
+
+/*
+ * Returns TRUE if the specified radix tree contains a single leaf and FALSE
+ * otherwise.
+ */
+boolean_t
+vm_radix_is_singleton(struct vm_radix *rtree)
+{
+ struct vm_radix_node *rnode;
+
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL)
+ return (FALSE);
+ return (vm_radix_isleaf(rnode));
+}
+
+/*
+ * Returns the value stored at the index. If the index is not present,
+ * NULL is returned.
+ */
+vm_page_t
+vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
+{
+ struct vm_radix_node *rnode;
+ vm_page_t m;
+ int slot;
+
+ rnode = vm_radix_getroot(rtree);
+ while (rnode != NULL) {
+ if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex == index)
+ return (m);
+ else
+ break;
+ } else if (vm_radix_keybarr(rnode, index))
+ break;
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ rnode = rnode->rn_child[slot];
+ }
+ return (NULL);
+}
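
For illustration only, a self-contained userland sketch of the lookup walk: child slots hold either internal nodes or leaf pointers tagged in the low bit, which is the distinction vm_radix_isleaf()/vm_radix_topage() make above. The sketch hand-builds a single-node trie and omits the path-compression barrier check (vm_radix_keybarr()) for brevity; types and indices are made up:

#include <stdio.h>
#include <stdint.h>

#define WIDTH	4
#define COUNT	(1 << WIDTH)
#define MASK	(COUNT - 1)
#define ISLEAF	0x1

struct page { uint64_t pindex; };

struct node {
	uint64_t	owner;		/* common key prefix */
	uint16_t	count;
	uint16_t	clev;		/* level this node discriminates on */
	void		*child[COUNT];
};

static int
isleaf(void *child)
{

	return (((uintptr_t)child & ISLEAF) != 0);
}

static struct page *
topage(void *child)
{

	return ((struct page *)((uintptr_t)child & ~(uintptr_t)ISLEAF));
}

static struct page *
lookup(void *root, uint64_t index)
{
	struct node *n;
	void *child = root;

	while (child != NULL) {
		if (isleaf(child))
			return (topage(child)->pindex == index ?
			    topage(child) : NULL);
		n = child;
		child = n->child[(index >> (n->clev * WIDTH)) & MASK];
	}
	return (NULL);
}

int
main(void)
{
	static struct page pa = { 0x120 }, pb = { 0x127 };
	static struct node root = { .owner = 0x120, .count = 2, .clev = 0 };

	root.child[0x0] = (void *)((uintptr_t)&pa | ISLEAF);	/* tag the leaves */
	root.child[0x7] = (void *)((uintptr_t)&pb | ISLEAF);
	printf("lookup(0x127) -> %p (pb is %p)\n",
	    (void *)lookup(&root, 0x127), (void *)&pb);
	printf("lookup(0x125) -> %p\n", (void *)lookup(&root, 0x125));
	return (0);
}
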
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+vm_page_t
+vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
+{
+ struct vm_radix_node *stack[VM_RADIX_LIMIT];
+ vm_pindex_t inc;
+ vm_page_t m;
+ struct vm_radix_node *child, *rnode;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL)
+ return (NULL);
+ else if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex >= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the smallest key
+ * in the current node (if the owner is bigger than the
+ * search key).
+ */
+ if (vm_radix_keybarr(rnode, index)) {
+ if (index > rnode->rn_owner) {
+ascend:
+ KASSERT(++loops < 1000,
+ ("vm_radix_lookup_ge: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ rnode = stack[--tos];
+ } while (vm_radix_slot(index,
+ rnode->rn_clev) == (VM_RADIX_COUNT - 1));
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is less than VM_RADIX_COUNT - 1.
+ */
+ index = vm_radix_trimkey(index,
+ rnode->rn_clev);
+ index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
+ } else
+ index = rnode->rn_owner;
+ KASSERT(!vm_radix_keybarr(rnode, index),
+ ("vm_radix_lookup_ge: keybarr failed"));
+ }
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ child = rnode->rn_child[slot];
+ if (vm_radix_isleaf(child)) {
+ m = vm_radix_topage(child);
+ if (m->pindex >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or page within the current
+ * bisection node.
+ */
+ if (slot < (VM_RADIX_COUNT - 1)) {
+ inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
+ index = vm_radix_trimkey(index, rnode->rn_clev);
+ do {
+ index += inc;
+ slot++;
+ child = rnode->rn_child[slot];
+ if (vm_radix_isleaf(child)) {
+ m = vm_radix_topage(child);
+ if (m->pindex >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot < (VM_RADIX_COUNT - 1));
+ }
+ KASSERT(child == NULL || vm_radix_isleaf(child),
+ ("vm_radix_lookup_ge: child is radix node"));
+
+ /*
+ * If a page or edge bigger than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(rnode->rn_clev > 0,
+ ("vm_radix_lookup_ge: pushing leaf's parent"));
+ KASSERT(tos < VM_RADIX_LIMIT,
+ ("vm_radix_lookup_ge: stack overflow"));
+ stack[tos++] = rnode;
+ rnode = child;
+ }
+}
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+vm_page_t
+vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
+{
+ struct vm_radix_node *stack[VM_RADIX_LIMIT];
+ vm_pindex_t inc;
+ vm_page_t m;
+ struct vm_radix_node *child, *rnode;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL)
+ return (NULL);
+ else if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex <= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the largest key
+ * in the current node (if the owner is smaller than the
+ * search key).
+ */
+ if (vm_radix_keybarr(rnode, index)) {
+ if (index > rnode->rn_owner) {
+ index = rnode->rn_owner + VM_RADIX_COUNT *
+ VM_RADIX_UNITLEVEL(rnode->rn_clev);
+ } else {
+ascend:
+ KASSERT(++loops < 1000,
+ ("vm_radix_lookup_le: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ rnode = stack[--tos];
+ } while (vm_radix_slot(index,
+ rnode->rn_clev) == 0);
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is greater than 0.
+ */
+ index = vm_radix_trimkey(index,
+ rnode->rn_clev);
+ }
+ index--;
+ KASSERT(!vm_radix_keybarr(rnode, index),
+ ("vm_radix_lookup_le: keybarr failed"));
+ }
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ child = rnode->rn_child[slot];
+ if (vm_radix_isleaf(child)) {
+ m = vm_radix_topage(child);
+ if (m->pindex <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or page within the current
+ * bisection node.
+ */
+ if (slot > 0) {
+ inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
+ index |= inc - 1;
+ do {
+ index -= inc;
+ slot--;
+ child = rnode->rn_child[slot];
+ if (vm_radix_isleaf(child)) {
+ m = vm_radix_topage(child);
+ if (m->pindex <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot > 0);
+ }
+ KASSERT(child == NULL || vm_radix_isleaf(child),
+ ("vm_radix_lookup_le: child is radix node"));
+
+ /*
+ * If a page or edge smaller than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(rnode->rn_clev > 0,
+ ("vm_radix_lookup_le: pushing leaf's parent"));
+ KASSERT(tos < VM_RADIX_LIMIT,
+ ("vm_radix_lookup_le: stack overflow"));
+ stack[tos++] = rnode;
+ rnode = child;
+ }
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
+{
+ struct vm_radix_node *rnode, *parent;
+ vm_page_t m;
+ int i, slot;
+
+ /*
+ * Detect if a page is going to be removed from a trie which is
+ * already undergoing another trie operation.
+ * Right now this is only possible for vm_radix_remove() recursing
+ * into vm_radix_insert().
+ * If this is the case, the caller must be notified about this
+ * situation. It will also take care of updating the RT_TRIE_MODIFIED
+ * flag accordingly.
+ * The RT_TRIE_MODIFIED bit is set here because the remove operation
+ * will always succeed.
+ */
+ if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
+ rtree->rt_flags |= RT_TRIE_MODIFIED;
+
+ rnode = vm_radix_getroot(rtree);
+ if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex != index)
+ panic("%s: invalid key found", __func__);
+ vm_radix_setroot(rtree, NULL);
+ return;
+ }
+ parent = NULL;
+ for (;;) {
+ if (rnode == NULL)
+ panic("vm_radix_remove: impossible to locate the key");
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ if (vm_radix_isleaf(rnode->rn_child[slot])) {
+ m = vm_radix_topage(rnode->rn_child[slot]);
+ if (m->pindex != index)
+ panic("%s: invalid key found", __func__);
+ rnode->rn_child[slot] = NULL;
+ rnode->rn_count--;
+ if (rnode->rn_count > 1)
+ break;
+ for (i = 0; i < VM_RADIX_COUNT; i++)
+ if (rnode->rn_child[i] != NULL)
+ break;
+ KASSERT(i != VM_RADIX_COUNT,
+ ("%s: invalid node configuration", __func__));
+ if (parent == NULL)
+ vm_radix_setroot(rtree, rnode->rn_child[i]);
+ else {
+ slot = vm_radix_slot(index, parent->rn_clev);
+ KASSERT(parent->rn_child[slot] == rnode,
+ ("%s: invalid child value", __func__));
+ parent->rn_child[slot] = rnode->rn_child[i];
+ }
+ rnode->rn_count--;
+ rnode->rn_child[i] = NULL;
+ vm_radix_node_put(rnode);
+ break;
+ }
+ parent = rnode;
+ rnode = rnode->rn_child[slot];
+ }
+}
+
+/*
+ * Remove and free all the nodes from the radix tree.
+ * This function is recursive but there is a tight control on it as the
+ * maximum depth of the tree is fixed.
+ */
+void
+vm_radix_reclaim_allnodes(struct vm_radix *rtree)
+{
+ struct vm_radix_node *root;
+
+ KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
+ ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
+
+ root = vm_radix_getroot(rtree);
+ if (root == NULL)
+ return;
+ vm_radix_setroot(rtree, NULL);
+ if (!vm_radix_isleaf(root))
+ vm_radix_reclaim_allnodes_int(root);
+}
+
+/*
+ * Replace an existing page in the trie with another one.
+ * Panics if there is not an old page in the trie at the new page's index.
+ */
+vm_page_t
+vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
+{
+ struct vm_radix_node *rnode;
+ vm_page_t m;
+ vm_pindex_t index;
+ int slot;
+
+ index = newpage->pindex;
+ rnode = vm_radix_getroot(rtree);
+ if (rnode == NULL)
+ panic("%s: replacing page on an empty trie", __func__);
+ if (vm_radix_isleaf(rnode)) {
+ m = vm_radix_topage(rnode);
+ if (m->pindex != index)
+ panic("%s: original replacing root key not found",
+ __func__);
+ rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
+ return (m);
+ }
+ for (;;) {
+ slot = vm_radix_slot(index, rnode->rn_clev);
+ if (vm_radix_isleaf(rnode->rn_child[slot])) {
+ m = vm_radix_topage(rnode->rn_child[slot]);
+ if (m->pindex == index) {
+ rnode->rn_child[slot] =
+ (void *)((uintptr_t)newpage |
+ VM_RADIX_ISLEAF);
+ return (m);
+ } else
+ break;
+ } else if (rnode->rn_child[slot] == NULL ||
+ vm_radix_keybarr(rnode->rn_child[slot], index))
+ break;
+ rnode = rnode->rn_child[slot];
+ }
+ panic("%s: original replacing page not found", __func__);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given radix node.
+ */
+DB_SHOW_COMMAND(radixnode, db_show_radixnode)
+{
+ struct vm_radix_node *rnode;
+ int i;
+
+ if (!have_addr)
+ return;
+ rnode = (struct vm_radix_node *)addr;
+ db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
+ (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
+ rnode->rn_clev);
+ for (i = 0; i < VM_RADIX_COUNT; i++)
+ if (rnode->rn_child[i] != NULL)
+ db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
+ i, (void *)rnode->rn_child[i],
+ vm_radix_isleaf(rnode->rn_child[i]) ?
+ vm_radix_topage(rnode->rn_child[i]) : NULL,
+ rnode->rn_clev);
+}
+#endif /* DDB */
Property changes on: trunk/sys/vm/vm_radix.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/vm/vm_radix.h
===================================================================
--- trunk/sys/vm/vm_radix.h (rev 0)
+++ trunk/sys/vm/vm_radix.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,50 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $
+ */
+
+#ifndef _VM_RADIX_H_
+#define _VM_RADIX_H_
+
+#include <vm/_vm_radix.h>
+
+#ifdef _KERNEL
+
+void vm_radix_init(void);
+int vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+boolean_t vm_radix_is_singleton(struct vm_radix *rtree);
+vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
+void vm_radix_reclaim_allnodes(struct vm_radix *rtree);
+void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
+
+#endif /* _KERNEL */
+#endif /* !_VM_RADIX_H_ */
Property changes on: trunk/sys/vm/vm_radix.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/vm/vm_reserv.c
===================================================================
--- trunk/sys/vm/vm_reserv.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_reserv.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002-2006 Rice University
* Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
@@ -31,10 +32,13 @@
/*
* Superpage reservation management module
+ *
+ * Any external functions defined by this module are only to be used by the
+ * virtual memory system.
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $");
#include "opt_vm.h"
@@ -44,6 +48,7 @@
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
+#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
@@ -53,6 +58,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
/*
@@ -224,6 +230,11 @@
if (rv->inpartpopq) {
TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
rv->inpartpopq = FALSE;
+ } else {
+ KASSERT(rv->pages->psind == 1,
+ ("vm_reserv_depopulate: reserv %p is already demoted",
+ rv));
+ rv->pages->psind = 0;
}
rv->popcnt--;
if (rv->popcnt == 0) {
@@ -273,6 +284,8 @@
("vm_reserv_populate: reserv %p is free", rv));
KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
("vm_reserv_populate: reserv %p is already full", rv));
+ KASSERT(rv->pages->psind == 0,
+ ("vm_reserv_populate: reserv %p is already promoted", rv));
if (rv->inpartpopq) {
TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
rv->inpartpopq = FALSE;
@@ -281,106 +294,281 @@
if (rv->popcnt < VM_LEVEL_0_NPAGES) {
rv->inpartpopq = TRUE;
TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
- }
+ } else
+ rv->pages->psind = 1;
}
/*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a contiguous set of physical pages of the given size "npages"
+ * from existing or newly created reservations. All of the physical pages
+ * must be at or above the given physical address "low" and below the given
+ * physical address "high". The given value "alignment" determines the
+ * alignment of the first physical page in the set. If the given value
+ * "boundary" is non-zero, then the set of physical pages cannot cross any
+ * physical address boundary that is a multiple of that value. Both
+ * "alignment" and "boundary" must be a power of two.
*
* The object and free page queue must be locked.
*/
vm_page_t
-vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
+vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
+ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
- vm_page_t m, mpred, msucc;
+ vm_paddr_t pa, size;
+ vm_page_t m, m_ret, mpred, msucc;
vm_pindex_t first, leftcap, rightcap;
vm_reserv_t rv;
+ u_long allocpages, maxpages, minpages;
+ int i, index, n;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
/*
- * Is a reservation fundamentally not possible?
+ * Is a reservation fundamentally impossible?
*/
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (pindex < VM_RESERV_INDEX(object, pindex) ||
- pindex >= object->size)
+ pindex + npages > object->size)
return (NULL);
/*
+ * All reservations of a particular size have the same alignment.
+ * Assuming that the first page is allocated from a reservation, the
+ * least significant bits of its physical address can be determined
+ * from its offset from the beginning of the reservation and the size
+ * of the reservation.
+ *
+ * Could the specified index within a reservation of the smallest
+ * possible size satisfy the alignment and boundary requirements?
+ */
+ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
+ if ((pa & (alignment - 1)) != 0)
+ return (NULL);
+ size = npages << PAGE_SHIFT;
+ if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+ return (NULL);
+
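
To make the alignment and boundary test above concrete, here is a small standalone sketch (not part of the patch). PAGE_SHIFT and the sample indices are illustrative; the boundary test flags a run whose first and last byte differ in any bit at or above log2(boundary), i.e. a run that crosses a boundary multiple:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12

static int
fits(uint64_t index, uint64_t npages, uint64_t alignment, uint64_t boundary)
{
	uint64_t pa, size;

	pa = index << PAGE_SHIFT;		/* offset of the first page */
	if ((pa & (alignment - 1)) != 0)	/* misaligned start */
		return (0);
	size = npages << PAGE_SHIFT;
	/* boundary == 0 means no boundary constraint. */
	if (boundary != 0 && ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (0);
	return (1);
}

int
main(void)
{
	/* index 4, 4 pages, 16 KB alignment, 64 KB boundary: fits */
	printf("index 4:  %s\n", fits(4, 4, 1 << 14, 1 << 16) ? "ok" : "no");
	/* index 15, 4 pages: the run crosses the 64 KB boundary */
	printf("index 15: %s\n", fits(15, 4, 1 << 12, 1 << 16) ? "ok" : "no");
	return (0);
}
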
+ /*
* Look for an existing reservation.
*/
- msucc = NULL;
- mpred = object->root;
- while (mpred != NULL) {
- KASSERT(mpred->pindex != pindex,
- ("vm_reserv_alloc_page: pindex already allocated"));
+ mpred = vm_radix_lookup_le(&object->rtree, pindex);
+ if (mpred != NULL) {
+ KASSERT(mpred->pindex < pindex,
+ ("vm_reserv_alloc_contig: pindex already allocated"));
rv = vm_reserv_from_page(mpred);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) {
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
- /* Handle vm_page_rename(m, new_object, ...). */
- if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ msucc = TAILQ_NEXT(mpred, listq);
+ } else
+ msucc = TAILQ_FIRST(&object->memq);
+ if (msucc != NULL) {
+ KASSERT(msucc->pindex > pindex,
+ ("vm_reserv_alloc_contig: pindex already allocated"));
+ rv = vm_reserv_from_page(msucc);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ }
+
+ /*
+ * Could at least one reservation fit between the first index to the
+ * left that can be used ("leftcap") and the first index to the right
+ * that cannot be used ("rightcap")?
+ */
+ first = pindex - VM_RESERV_INDEX(object, pindex);
+ if (mpred != NULL) {
+ if ((rv = vm_reserv_from_page(mpred))->object != object)
+ leftcap = mpred->pindex + 1;
+ else
+ leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+ if (leftcap > first)
+ return (NULL);
+ }
+ minpages = VM_RESERV_INDEX(object, pindex) + npages;
+ maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
+ allocpages = maxpages;
+ if (msucc != NULL) {
+ if ((rv = vm_reserv_from_page(msucc))->object != object)
+ rightcap = msucc->pindex;
+ else
+ rightcap = rv->pindex;
+ if (first + maxpages > rightcap) {
+ if (maxpages == VM_LEVEL_0_NPAGES)
return (NULL);
- vm_reserv_populate(rv);
- return (m);
- } else if (mpred->pindex < pindex) {
- if (msucc != NULL ||
- (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
- break;
- KASSERT(msucc->pindex != pindex,
- ("vm_reserv_alloc_page: pindex already allocated"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object &&
- vm_reserv_has_pindex(rv, pindex)) {
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
- /* Handle vm_page_rename(m, new_object, ...). */
- if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
- return (NULL);
- vm_reserv_populate(rv);
- return (m);
- } else if (pindex < msucc->pindex)
- break;
- } else if (msucc == NULL) {
- msucc = mpred;
- mpred = TAILQ_PREV(msucc, pglist, listq);
- continue;
+
+ /*
+ * At least one reservation will fit between "leftcap"
+ * and "rightcap". However, a reservation for the
+ * last of the requested pages will not fit. Reduce
+ * the size of the upcoming allocation accordingly.
+ */
+ allocpages = minpages;
}
- msucc = NULL;
- mpred = object->root = vm_page_splay(pindex, object->root);
}
/*
- * Determine the first index to the left that can be used.
+ * Would the last new reservation extend past the end of the object?
*/
- if (mpred == NULL)
- leftcap = 0;
- else if ((rv = vm_reserv_from_page(mpred))->object != object)
- leftcap = mpred->pindex + 1;
- else
- leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+ if (first + maxpages > object->size) {
+ /*
+ * Don't allocate the last new reservation if the object is a
+ * vnode or backed by another object that is a vnode.
+ */
+ if (object->type == OBJT_VNODE ||
+ (object->backing_object != NULL &&
+ object->backing_object->type == OBJT_VNODE)) {
+ if (maxpages == VM_LEVEL_0_NPAGES)
+ return (NULL);
+ allocpages = minpages;
+ }
+ /* Speculate that the object may grow. */
+ }
/*
- * Determine the first index to the right that cannot be used.
+ * Allocate the physical pages. The alignment and boundary specified
+ * for this allocation may be different from the alignment and
+ * boundary specified for the requested pages. For instance, the
+ * specified index may not be the first page within the first new
+ * reservation.
*/
- if (msucc == NULL)
- rightcap = pindex + VM_LEVEL_0_NPAGES;
- else if ((rv = vm_reserv_from_page(msucc))->object != object)
- rightcap = msucc->pindex;
- else
- rightcap = rv->pindex;
+ m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
+ VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
+ if (m == NULL)
+ return (NULL);
/*
- * Determine if a reservation fits between the first index to
- * the left that can be used and the first index to the right
- * that cannot be used.
+ * The allocated physical pages always begin at a reservation
+ * boundary, but they do not always end at a reservation boundary.
+ * Initialize every reservation that is completely covered by the
+ * allocated physical pages.
*/
- first = pindex - VM_RESERV_INDEX(object, pindex);
- if (first < leftcap || first + VM_LEVEL_0_NPAGES > rightcap)
+ m_ret = NULL;
+ index = VM_RESERV_INDEX(object, pindex);
+ do {
+ rv = vm_reserv_from_page(m);
+ KASSERT(rv->pages == m,
+ ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
+ rv));
+ KASSERT(rv->object == NULL,
+ ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
+ LIST_INSERT_HEAD(&object->rvq, rv, objq);
+ rv->object = object;
+ rv->pindex = first;
+ KASSERT(rv->popcnt == 0,
+ ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
+ rv));
+ KASSERT(!rv->inpartpopq,
+ ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
+ rv));
+ n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
+ for (i = 0; i < n; i++)
+ vm_reserv_populate(rv);
+ npages -= n;
+ if (m_ret == NULL) {
+ m_ret = &rv->pages[index];
+ index = 0;
+ }
+ m += VM_LEVEL_0_NPAGES;
+ first += VM_LEVEL_0_NPAGES;
+ allocpages -= VM_LEVEL_0_NPAGES;
+ } while (allocpages >= VM_LEVEL_0_NPAGES);
+ return (m_ret);
+
+ /*
+ * Found a matching reservation.
+ */
+found:
+ index = VM_RESERV_INDEX(object, pindex);
+ /* Does the allocation fit within the reservation? */
+ if (index + npages > VM_LEVEL_0_NPAGES)
return (NULL);
+ m = &rv->pages[index];
+ pa = VM_PAGE_TO_PHYS(m);
+ if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
+ ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+ return (NULL);
+ /* Handle vm_page_rename(m, new_object, ...). */
+ for (i = 0; i < npages; i++)
+ if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
+ return (NULL);
+ for (i = 0; i < npages; i++)
+ vm_reserv_populate(rv);
+ return (m);
+}
+/*
+ * Allocates a page from an existing or newly-created reservation.
+ *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
+ * The object and free page queue must be locked.
+ */
+vm_page_t
+vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred)
+{
+ vm_page_t m, msucc;
+ vm_pindex_t first, leftcap, rightcap;
+ vm_reserv_t rv;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
/*
- * Would a new reservation extend past the end of the given object?
+ * Is a reservation fundamentally impossible?
*/
- if (object->size < first + VM_LEVEL_0_NPAGES) {
+ if (pindex < VM_RESERV_INDEX(object, pindex) ||
+ pindex >= object->size)
+ return (NULL);
+
+ /*
+ * Look for an existing reservation.
+ */
+ if (mpred != NULL) {
+ KASSERT(mpred->object == object,
+ ("vm_reserv_alloc_page: object doesn't contain mpred"));
+ KASSERT(mpred->pindex < pindex,
+ ("vm_reserv_alloc_page: mpred doesn't precede pindex"));
+ rv = vm_reserv_from_page(mpred);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ msucc = TAILQ_NEXT(mpred, listq);
+ } else
+ msucc = TAILQ_FIRST(&object->memq);
+ if (msucc != NULL) {
+ KASSERT(msucc->pindex > pindex,
+ ("vm_reserv_alloc_page: msucc doesn't succeed pindex"));
+ rv = vm_reserv_from_page(msucc);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ }
+
+ /*
+ * Could a reservation fit between the first index to the left that
+ * can be used and the first index to the right that cannot be used?
+ */
+ first = pindex - VM_RESERV_INDEX(object, pindex);
+ if (mpred != NULL) {
+ if ((rv = vm_reserv_from_page(mpred))->object != object)
+ leftcap = mpred->pindex + 1;
+ else
+ leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+ if (leftcap > first)
+ return (NULL);
+ }
+ if (msucc != NULL) {
+ if ((rv = vm_reserv_from_page(msucc))->object != object)
+ rightcap = msucc->pindex;
+ else
+ rightcap = rv->pindex;
+ if (first + VM_LEVEL_0_NPAGES > rightcap)
+ return (NULL);
+ }
+
+ /*
+ * Would a new reservation extend past the end of the object?
+ */
+ if (first + VM_LEVEL_0_NPAGES > object->size) {
/*
* Don't allocate a new reservation if the object is a vnode or
* backed by another object that is a vnode.
@@ -393,28 +581,35 @@
}
/*
- * Allocate a new reservation.
+ * Allocate and populate the new reservation.
*/
m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
- if (m != NULL) {
- rv = vm_reserv_from_page(m);
- KASSERT(rv->pages == m,
- ("vm_reserv_alloc_page: reserv %p's pages is corrupted",
- rv));
- KASSERT(rv->object == NULL,
- ("vm_reserv_alloc_page: reserv %p isn't free", rv));
- LIST_INSERT_HEAD(&object->rvq, rv, objq);
- rv->object = object;
- rv->pindex = first;
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted",
- rv));
- KASSERT(!rv->inpartpopq,
- ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE",
- rv));
- vm_reserv_populate(rv);
- m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
- }
+ if (m == NULL)
+ return (NULL);
+ rv = vm_reserv_from_page(m);
+ KASSERT(rv->pages == m,
+ ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
+ KASSERT(rv->object == NULL,
+ ("vm_reserv_alloc_page: reserv %p isn't free", rv));
+ LIST_INSERT_HEAD(&object->rvq, rv, objq);
+ rv->object = object;
+ rv->pindex = first;
+ KASSERT(rv->popcnt == 0,
+ ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
+ KASSERT(!rv->inpartpopq,
+ ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
+ vm_reserv_populate(rv);
+ return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
+
+ /*
+ * Found a matching reservation.
+ */
+found:
+ m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
+ /* Handle vm_page_rename(m, new_object, ...). */
+ if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+ return (NULL);
+ vm_reserv_populate(rv);
return (m);
}
@@ -629,16 +824,17 @@
* The free page queue lock must be held.
*/
boolean_t
-vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low, vm_paddr_t high,
- unsigned long alignment, unsigned long boundary)
+vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary)
{
- vm_paddr_t pa, pa_length;
+ vm_paddr_t pa, pa_length, size;
vm_reserv_t rv;
int i;
mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- if (size > VM_LEVEL_0_SIZE - PAGE_SIZE)
+ if (npages > VM_LEVEL_0_NPAGES - 1)
return (FALSE);
+ size = npages << PAGE_SHIFT;
TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
if (pa + PAGE_SIZE - size < low) {
@@ -681,7 +877,7 @@
{
vm_reserv_t rv;
- VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(new_object);
rv = vm_reserv_from_page(m);
if (rv->object == old_object) {
mtx_lock(&vm_page_queue_free_mtx);
Modified: trunk/sys/vm/vm_reserv.h
===================================================================
--- trunk/sys/vm/vm_reserv.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_reserv.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2002-2006 Rice University
* Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
@@ -28,7 +29,7 @@
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $
*/
/*
@@ -42,15 +43,21 @@
#if VM_NRESERVLEVEL > 0
-vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex);
+/*
+ * The following functions are only to be used by the virtual memory system.
+ */
+vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
+ u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary);
+vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
+ vm_page_t mpred);
void vm_reserv_break_all(vm_object_t object);
boolean_t vm_reserv_free_page(vm_page_t m);
void vm_reserv_init(void);
int vm_reserv_level_iffullpop(vm_page_t m);
boolean_t vm_reserv_reactivate_page(vm_page_t m);
-boolean_t vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low,
- vm_paddr_t high, unsigned long alignment,
- unsigned long boundary);
+boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
+ vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
boolean_t vm_reserv_reclaim_inactive(void);
void vm_reserv_rename(vm_page_t m, vm_object_t new_object,
vm_object_t old_object, vm_pindex_t old_object_offset);
Modified: trunk/sys/vm/vm_unix.c
===================================================================
--- trunk/sys/vm/vm_unix.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_unix.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
@@ -43,7 +44,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $");
#include <sys/param.h>
#include <sys/lock.h>
@@ -130,35 +131,39 @@
goto done;
}
#ifdef RACCT
- PROC_LOCK(td->td_proc);
- error = racct_set(td->td_proc, RACCT_DATA, new - base);
- if (error != 0) {
- PROC_UNLOCK(td->td_proc);
- error = ENOMEM;
- goto done;
- }
- error = racct_set(td->td_proc, RACCT_VMEM,
- map->size + (new - old));
- if (error != 0) {
- racct_set_force(td->td_proc, RACCT_DATA, old - base);
- PROC_UNLOCK(td->td_proc);
- error = ENOMEM;
- goto done;
- }
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- error = racct_set(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)) + (new - old));
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ error = racct_set(td->td_proc, RACCT_DATA, new - base);
if (error != 0) {
+ PROC_UNLOCK(td->td_proc);
+ error = ENOMEM;
+ goto done;
+ }
+ error = racct_set(td->td_proc, RACCT_VMEM,
+ map->size + (new - old));
+ if (error != 0) {
racct_set_force(td->td_proc, RACCT_DATA,
old - base);
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
+ if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+ error = racct_set(td->td_proc, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)) +
+ (new - old));
+ if (error != 0) {
+ racct_set_force(td->td_proc, RACCT_DATA,
+ old - base);
+ racct_set_force(td->td_proc, RACCT_VMEM,
+ map->size);
+ PROC_UNLOCK(td->td_proc);
+ error = ENOMEM;
+ goto done;
+ }
+ }
+ PROC_UNLOCK(td->td_proc);
}
- PROC_UNLOCK(td->td_proc);
#endif
prot = VM_PROT_RW;
#ifdef COMPAT_FREEBSD32
@@ -170,14 +175,19 @@
rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
#ifdef RACCT
- PROC_LOCK(td->td_proc);
- racct_set_force(td->td_proc, RACCT_DATA, old - base);
- racct_set_force(td->td_proc, RACCT_VMEM, map->size);
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- racct_set_force(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)));
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_set_force(td->td_proc,
+ RACCT_DATA, old - base);
+ racct_set_force(td->td_proc,
+ RACCT_VMEM, map->size);
+ if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+ racct_set_force(td->td_proc,
+ RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)));
+ }
+ PROC_UNLOCK(td->td_proc);
}
- PROC_UNLOCK(td->td_proc);
#endif
error = ENOMEM;
goto done;
@@ -205,14 +215,16 @@
}
vm->vm_dsize -= btoc(old - new);
#ifdef RACCT
- PROC_LOCK(td->td_proc);
- racct_set_force(td->td_proc, RACCT_DATA, new - base);
- racct_set_force(td->td_proc, RACCT_VMEM, map->size);
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- racct_set_force(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)));
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_set_force(td->td_proc, RACCT_DATA, new - base);
+ racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+ if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+ racct_set_force(td->td_proc, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)));
+ }
+ PROC_UNLOCK(td->td_proc);
}
- PROC_UNLOCK(td->td_proc);
#endif
}
done:
Modified: trunk/sys/vm/vm_zeroidle.c
===================================================================
--- trunk/sys/vm/vm_zeroidle.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_zeroidle.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1994 John Dyson
* Copyright (c) 2001 Matt Dillon
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $");
#include <opt_sched.h>
@@ -50,6 +51,7 @@
#include <sys/unistd.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
Modified: trunk/sys/vm/vnode_pager.c
===================================================================
--- trunk/sys/vm/vnode_pager.c 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vnode_pager.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991 The Regents of the University of California.
@@ -51,7 +52,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -63,6 +64,7 @@
#include <sys/vmmeter.h>
#include <sys/limits.h>
#include <sys/conf.h>
+#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <machine/atomic.h>
@@ -82,7 +84,7 @@
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
+static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
vm_ooffset_t, struct ucred *cred);
@@ -109,14 +111,14 @@
return (0);
while ((object = vp->v_object) != NULL) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (!(object->flags & OBJ_DEAD)) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (0);
}
VOP_UNLOCK(vp, 0);
vm_object_set_flag(object, OBJ_DISCONNECTWNT);
- msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
+ VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
@@ -135,9 +137,9 @@
* Dereference the reference we just created. This assumes
* that the object is associated with the vp.
*/
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
object->ref_count--;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
vrele(vp);
KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
@@ -154,26 +156,21 @@
if (obj == NULL)
return;
ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
if (obj->ref_count == 0) {
/*
- * vclean() may be called twice. The first time
- * removes the primary reference to the object,
- * the second time goes one further and is a
- * special-case to terminate the object.
- *
* don't double-terminate the object
*/
if ((obj->flags & OBJ_DEAD) == 0)
vm_object_terminate(obj);
else
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
} else {
/*
* Woe to the process that tries to page now :-).
*/
vm_pager_deallocate(obj);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
vp->v_object = NULL;
}
@@ -206,11 +203,11 @@
*/
retry:
while ((object = vp->v_object) != NULL) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if ((object->flags & OBJ_DEAD) == 0)
break;
vm_object_set_flag(object, OBJ_DISCONNECTWNT);
- msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
+ VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
}
KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
@@ -231,6 +228,12 @@
* Object has been created while we were sleeping
*/
VI_UNLOCK(vp);
+ VM_OBJECT_WLOCK(object);
+ KASSERT(object->ref_count == 1,
+ ("leaked ref %p %d", object, object->ref_count));
+ object->type = OBJT_DEAD;
+ object->ref_count = 0;
+ VM_OBJECT_WUNLOCK(object);
vm_object_destroy(object);
goto retry;
}
@@ -238,7 +241,7 @@
VI_UNLOCK(vp);
} else {
object->ref_count++;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
vref(vp);
return (object);
@@ -258,7 +261,7 @@
if (vp == NULL)
panic("vnode_pager_dealloc: pager already dealloced");
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
vm_object_pip_wait(object, "vnpdea");
refs = object->ref_count;
@@ -272,13 +275,15 @@
if (object->un_pager.vnp.writemappings > 0) {
object->un_pager.vnp.writemappings = 0;
VOP_ADD_WRITECOUNT(vp, -1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+ __func__, vp, vp->v_writecount);
}
vp->v_object = NULL;
VOP_UNSET_TEXT(vp);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
while (refs-- > 0)
vunref(vp);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
static boolean_t
@@ -295,9 +300,8 @@
int poff;
int bsize;
int pagesperblock, blocksperpage;
- int vfslocked;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* If no vp or vp is doomed or marked transparent to VM, we do not
* have the page.
@@ -320,11 +324,9 @@
blocksperpage = (PAGE_SIZE / bsize);
reqblock = pindex * blocksperpage;
}
- VM_OBJECT_UNLOCK(object);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ VM_OBJECT_WUNLOCK(object);
err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
- VFS_UNLOCK_GIANT(vfslocked);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (err)
return TRUE;
if (bn == -1)
@@ -379,9 +381,9 @@
if ((object = vp->v_object) == NULL)
return;
/* ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->type == OBJT_DEAD) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
KASSERT(object->type == OBJT_VNODE,
@@ -390,7 +392,7 @@
/*
* Hasn't changed size
*/
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
@@ -426,7 +428,7 @@
* have been zeroed. Some of these valid bits may
* have already been set.
*/
- vm_page_set_valid(m, base, size);
+ vm_page_set_valid_range(m, base, size);
/*
* Round "base" to the next block boundary so that the
@@ -444,7 +446,7 @@
*/
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
} else if ((nsize & PAGE_MASK) &&
- __predict_false(object->cache != NULL)) {
+ vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
vm_page_cache_free(object, OFF_TO_IDX(nsize),
nobjsize);
}
@@ -451,7 +453,7 @@
}
object->un_pager.vnp.vnp_size = nsize;
object->size = nobjsize;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
/*
@@ -574,9 +576,9 @@
bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
KASSERT((m->dirty & bits) == 0,
("vnode_pager_input_smlfs: page %p is dirty", m));
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
m->valid |= bits;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
sf_buf_free(sf);
if (error) {
@@ -600,7 +602,7 @@
struct sf_buf *sf;
struct vnode *vp;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
error = 0;
/*
@@ -613,7 +615,7 @@
if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
vp = object->handle;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
* Allocate a kernel virtual address and initialize so that
@@ -643,7 +645,7 @@
}
sf_buf_free(sf);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
if (!error)
@@ -673,16 +675,13 @@
int rtval;
struct vnode *vp;
int bytes = count * PAGE_SIZE;
- int vfslocked;
vp = object->handle;
- VM_OBJECT_UNLOCK(object);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ VM_OBJECT_WUNLOCK(object);
rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: FS getpages not implemented\n"));
- VFS_UNLOCK_GIANT(vfslocked);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
return rtval;
}
@@ -698,25 +697,22 @@
int reqpage;
{
vm_object_t object;
+ struct bufobj *bo;
+ struct buf *bp;
+ struct mount *mp;
vm_offset_t kva;
- off_t foff, tfoff, nextoff;
- int i, j, size, bsize, first;
daddr_t firstaddr, reqblock;
- struct bufobj *bo;
- int runpg;
- int runend;
- struct buf *bp;
- int count;
- int error;
+ off_t foff, nextoff, tfoff, pib;
+ int pbefore, pafter, i, size, bsize, first, last;
+ int count, error, before, after, secmask;
- object = vp->v_object;
- count = bytecount / PAGE_SIZE;
-
KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
("vnode_pager_generic_getpages does not support devices"));
if (vp->v_iflag & VI_DOOMED)
- return VM_PAGER_BAD;
+ return (VM_PAGER_BAD);
+ object = vp->v_object;
+ count = bytecount / PAGE_SIZE;
bsize = vp->v_mount->mnt_stat.f_iosize;
/* get the UNDERLYING device for the file with VOP_BMAP() */
@@ -730,9 +726,10 @@
/*
* if we can't bmap, use old VOP code
*/
- error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
+ error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
+ &reqblock, &after, &before);
if (error == EOPNOTSUPP) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
for (i = 0; i < count; i++)
if (i != reqpage) {
@@ -743,10 +740,10 @@
PCPU_INC(cnt.v_vnodein);
PCPU_INC(cnt.v_vnodepgsin);
error = vnode_pager_input_old(object, m[reqpage]);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (error);
} else if (error != 0) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
for (i = 0; i < count; i++)
if (i != reqpage) {
vm_page_lock(m[i]);
@@ -753,7 +750,7 @@
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (VM_PAGER_ERROR);
/*
@@ -763,7 +760,7 @@
*/
} else if ((PAGE_SIZE / bsize) > 1 &&
(vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
for (i = 0; i < count; i++)
if (i != reqpage) {
vm_page_lock(m[i]);
@@ -770,10 +767,10 @@
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
PCPU_INC(cnt.v_vnodein);
PCPU_INC(cnt.v_vnodepgsin);
- return vnode_pager_input_smlfs(object, m[reqpage]);
+ return (vnode_pager_input_smlfs(object, m[reqpage]));
}
/*
@@ -781,7 +778,7 @@
* clean up and return. Otherwise we have to re-read the
* media.
*/
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
for (i = 0; i < count; i++)
if (i != reqpage) {
@@ -789,7 +786,7 @@
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return VM_PAGER_OK;
} else if (reqblock == -1) {
pmap_zero_page(m[reqpage]);
@@ -802,87 +799,48 @@
vm_page_free(m[i]);
vm_page_unlock(m[i]);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (VM_PAGER_OK);
}
m[reqpage]->valid = 0;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
- /*
- * here on direct device I/O
- */
- firstaddr = -1;
-
- /*
- * calculate the run that includes the required page
- */
- for (first = 0, i = 0; i < count; i = runend) {
- if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
- &runpg) != 0) {
- VM_OBJECT_LOCK(object);
- for (; i < count; i++)
- if (i != reqpage) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- VM_OBJECT_UNLOCK(object);
- return (VM_PAGER_ERROR);
+ pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
+ pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
+ pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
+ first = reqpage < pbefore ? 0 : reqpage - pbefore;
+ last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
+ if (first > 0 || last + 1 < count) {
+ VM_OBJECT_WLOCK(object);
+ for (i = 0; i < first; i++) {
+ vm_page_lock(m[i]);
+ vm_page_free(m[i]);
+ vm_page_unlock(m[i]);
}
- if (firstaddr == -1) {
- VM_OBJECT_LOCK(object);
- if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
- panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
- (intmax_t)firstaddr, (uintmax_t)(foff >> 32),
- (uintmax_t)foff,
- (uintmax_t)
- (object->un_pager.vnp.vnp_size >> 32),
- (uintmax_t)object->un_pager.vnp.vnp_size);
- }
+ for (i = last + 1; i < count; i++) {
vm_page_lock(m[i]);
vm_page_free(m[i]);
vm_page_unlock(m[i]);
- VM_OBJECT_UNLOCK(object);
- runend = i + 1;
- first = runend;
- continue;
}
- runend = i + runpg;
- if (runend <= reqpage) {
- VM_OBJECT_LOCK(object);
- for (j = i; j < runend; j++) {
- vm_page_lock(m[j]);
- vm_page_free(m[j]);
- vm_page_unlock(m[j]);
- }
- VM_OBJECT_UNLOCK(object);
- } else {
- if (runpg < (count - first)) {
- VM_OBJECT_LOCK(object);
- for (i = first + runpg; i < count; i++) {
- vm_page_lock(m[i]);
- vm_page_free(m[i]);
- vm_page_unlock(m[i]);
- }
- VM_OBJECT_UNLOCK(object);
- count = first + runpg;
- }
- break;
- }
- first = runend;
+ VM_OBJECT_WUNLOCK(object);
}
/*
- * the first and last page have been calculated now, move input pages
- * to be zero based...
+ * here on direct device I/O
*/
- if (first != 0) {
- m += first;
- count -= first;
- reqpage -= first;
- }
+ firstaddr = reqblock;
+ firstaddr += pib / DEV_BSIZE;
+ firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;
/*
+ * The first and last page have been calculated now, move
+ * input pages to be zero based, and adjust the count.
+ */
+ m += first;
+ reqpage -= first;
+ count = last - first + 1;
+
+ /*
* calculate the file virtual address for the transfer
*/
foff = IDX_TO_OFF(m[0]->pindex);
@@ -899,21 +857,31 @@
/*
* round up physical size for real devices.
*/
- if (1) {
- int secmask = bo->bo_bsize - 1;
- KASSERT(secmask < PAGE_SIZE && secmask > 0,
- ("vnode_pager_generic_getpages: sector size %d too large",
- secmask + 1));
- size = (size + secmask) & ~secmask;
- }
+ secmask = bo->bo_bsize - 1;
+ KASSERT(secmask < PAGE_SIZE && secmask > 0,
+ ("vnode_pager_generic_getpages: sector size %d too large",
+ secmask + 1));
+ size = (size + secmask) & ~secmask;
bp = getpbuf(&vnode_pbuf_freecnt);
kva = (vm_offset_t)bp->b_data;
/*
- * and map the pages to be read into the kva
+ * and map the pages to be read into the kva, if the filesystem
+ * requires mapped buffers.
*/
- pmap_qenter(kva, m, count);
+ mp = vp->v_mount;
+ if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
+ unmapped_buf_allowed) {
+ bp->b_data = unmapped_buf;
+ bp->b_kvabase = unmapped_buf;
+ bp->b_offset = 0;
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_npages = count;
+ for (i = 0; i < count; i++)
+ bp->b_pages[i] = m[i];
+ } else
+ pmap_qenter(kva, m, count);
/* build a minimal buffer header */
bp->b_iocmd = BIO_READ;
@@ -942,11 +910,22 @@
if ((bp->b_ioflags & BIO_ERROR) != 0)
error = EIO;
- if (!error) {
- if (size != count * PAGE_SIZE)
- bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
+ if (error == 0 && size != count * PAGE_SIZE) {
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ bp->b_flags &= ~B_UNMAPPED;
+ pmap_qenter(kva, m, count);
+ }
+ bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
}
- pmap_qremove(kva, count);
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ pmap_qremove(kva, count);
+ if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
+ bp->b_data = (caddr_t)kva;
+ bp->b_kvabase = (caddr_t)kva;
+ bp->b_flags &= ~B_UNMAPPED;
+ for (i = 0; i < count; i++)
+ bp->b_pages[i] = NULL;
+ }
/*
* free the buffer header back to the swap buffer pool
@@ -955,7 +934,7 @@
pbrelbo(bp);
relpbuf(bp, &vnode_pbuf_freecnt);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
vm_page_t mt;
@@ -981,7 +960,7 @@
* we just try to clear the piece that we couldn't
* read.
*/
- vm_page_set_valid(mt, 0,
+ vm_page_set_valid_range(mt, 0,
object->un_pager.vnp.vnp_size - tfoff);
KASSERT((mt->dirty & vm_page_bits(0,
object->un_pager.vnp.vnp_size - tfoff)) == 0,
@@ -992,7 +971,7 @@
if (i != reqpage)
vm_page_readahead_finish(mt);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
if (error) {
printf("vnode_pager_getpages: I/O read error\n");
}
@@ -1008,12 +987,8 @@
* backing vp's VOP_PUTPAGES.
*/
static void
-vnode_pager_putpages(object, m, count, sync, rtvals)
- vm_object_t object;
- vm_page_t *m;
- int count;
- boolean_t sync;
- int *rtvals;
+vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+ int flags, int *rtvals)
{
int rtval;
struct vnode *vp;
@@ -1022,7 +997,7 @@
/*
* Force synchronous operation if we are extremely low on memory
* to prevent a low-memory deadlock. VOP operations often need to
- * allocate more memory to initiate the I/O ( i.e. do a BMAP
+ * allocate more memory to initiate the I/O ( i.e. do a BMAP
* operation ). The swapper handles the case by limiting the amount
* of asynchronous I/O, but that sort of solution doesn't scale well
* for the vnode pager without a lot of work.
@@ -1031,18 +1006,18 @@
* daemon up. This should be probably be addressed XXX.
*/
- if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
- sync |= OBJPC_SYNC;
+ if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min)
+ flags |= VM_PAGER_PUT_SYNC;
/*
* Call device-specific putpages function
*/
vp = object->handle;
- VM_OBJECT_UNLOCK(object);
- rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
+ VM_OBJECT_WUNLOCK(object);
+ rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0);
KASSERT(rtval != EOPNOTSUPP,
("vnode_pager: stale FS putpages\n"));
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
}
@@ -1104,7 +1079,7 @@
* We do not under any circumstances truncate the valid bits, as
* this will screw up bogus page replacement.
*/
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
if (object->un_pager.vnp.vnp_size > poffset) {
int pgoff;
@@ -1119,8 +1094,7 @@
* pmap operation.
*/
m = ma[ncount - 1];
- KASSERT(m->busy > 0,
- ("vnode_pager_generic_putpages: page %p is not busy", m));
+ vm_page_assert_sbusied(m);
KASSERT(!pmap_page_is_write_mapped(m),
("vnode_pager_generic_putpages: page %p is not read-only", m));
vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
@@ -1136,10 +1110,10 @@
}
}
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
/*
- * pageouts are already clustered, use IO_ASYNC t o force a bawrite()
+ * pageouts are already clustered, use IO_ASYNC to force a bawrite()
* rather then a bdwrite() to prevent paging I/O from saturating
* the buffer cache. Dummy-up the sequential heuristic to cause
* large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set,
@@ -1190,7 +1164,7 @@
if (written == 0)
return;
obj = ma[0]->object;
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
if (pos < trunc_page(written)) {
rtvals[i] = VM_PAGER_OK;
@@ -1201,7 +1175,7 @@
vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
}
}
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
void
@@ -1211,9 +1185,9 @@
struct vnode *vp;
vm_ooffset_t old_wm;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
if (object->type != OBJT_VNODE) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
old_wm = object->un_pager.vnp.writemappings;
@@ -1222,11 +1196,15 @@
if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
ASSERT_VOP_ELOCKED(vp, "v_writecount inc");
VOP_ADD_WRITECOUNT(vp, 1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+ __func__, vp, vp->v_writecount);
} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
ASSERT_VOP_ELOCKED(vp, "v_writecount dec");
VOP_ADD_WRITECOUNT(vp, -1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+ __func__, vp, vp->v_writecount);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
}
void
@@ -1236,9 +1214,8 @@
struct vnode *vp;
struct mount *mp;
vm_offset_t inc;
- int vfslocked;
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
/*
* First, recheck the object type to account for the race when
@@ -1245,7 +1222,7 @@
* the vnode is reclaimed.
*/
if (object->type != OBJT_VNODE) {
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
@@ -1256,14 +1233,13 @@
inc = end - start;
if (object->un_pager.vnp.writemappings != inc) {
object->un_pager.vnp.writemappings -= inc;
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return;
}
vp = object->handle;
vhold(vp);
- VM_OBJECT_UNLOCK(object);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ VM_OBJECT_WUNLOCK(object);
mp = NULL;
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
@@ -1279,5 +1255,4 @@
vdrop(vp);
if (mp != NULL)
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
}
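
The rewritten vnode_pager_generic_getpages() above no longer probes each page with vnode_pager_addr(); it derives one contiguous read window straight from the before/after contiguous-block counts that VOP_BMAP reports for the requested page, then frees every page outside [first, last]. The sketch below repeats only that window arithmetic in standalone C; the block size, page indexes and helper names are made up for illustration and are not kernel code.

/*
 * Standalone sketch of the read-window arithmetic from the getpages hunk:
 * given the requested page's index, the filesystem block size, and the
 * contiguous block counts before/after the block VOP_BMAP reported,
 * compute which entries of the passed-in page array are kept.
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define IDX_TO_OFF(i)	((long long)(i) * PAGE_SIZE)

static void
read_window(long long pindex, int bsize, int before, int after,
    int reqpage, int count, int *firstp, int *lastp)
{
	long long pib;		/* offset of the page inside its fs block */
	int pbefore, pafter;

	pib = IDX_TO_OFF(pindex) % bsize;
	pbefore = (int)(((long long)before * bsize + pib) / PAGE_SIZE);
	pafter = (int)(((long long)(after + 1) * bsize - pib) / PAGE_SIZE - 1);
	*firstp = reqpage < pbefore ? 0 : reqpage - pbefore;
	*lastp = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
}

int
main(void)
{
	int first, last;

	/* 16 KB blocks, no contiguity beyond the block holding the request. */
	read_window(/*pindex*/ 100, /*bsize*/ 16384, /*before*/ 0,
	    /*after*/ 0, /*reqpage*/ 5, /*count*/ 16, &first, &last);
	printf("keep pages [%d, %d] of 0..15\n", first, last);	/* [5, 8] */
	return (0);
}

With before == after == 0 the window collapses to the four pages backed by the single 16 KB block that contains the request; larger counts from VOP_BMAP widen it symmetrically, clamped to the array bounds.
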
Modified: trunk/sys/vm/vnode_pager.h
===================================================================
--- trunk/sys/vm/vnode_pager.h 2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vnode_pager.h 2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1991, 1993
@@ -32,7 +33,7 @@
* SUCH DAMAGE.
*
* @(#)vnode_pager.h 8.1 (Berkeley) 6/11/93
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $
*/
#ifndef _VNODE_PAGER_
Added: trunk/sys/x86/acpica/acpi_wakeup.c
===================================================================
--- trunk/sys/x86/acpica/acpi_wakeup.c (rev 0)
+++ trunk/sys/x86/acpica/acpi_wakeup.c 2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,409 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
+ * Copyright (c) 2001-2012 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_wakeup.c 331910 2018-04-03 07:52:06Z avg $");
+
+#ifdef __i386__
+#include "opt_npx.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/mca.h>
+#include <machine/pcb.h>
+#include <machine/pmap.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+
+#ifdef SMP
+#include <x86/apicreg.h>
+#include <machine/smp.h>
+#include <machine/vmparam.h>
+#endif
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include "acpi_wakecode.h"
+#include "acpi_wakedata.h"
+
+/* Make sure the code is less than a page and leave room for the stack. */
+CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
+
+extern int acpi_resume_beep;
+extern int acpi_reset_video;
+
+#ifdef SMP
+extern struct susppcb **susppcbs;
+static cpuset_t suspcpus;
+#else
+static struct susppcb **susppcbs;
+#endif
+
+static void *acpi_alloc_wakeup_handler(void);
+static void acpi_stop_beep(void *);
+
+#ifdef SMP
+static int acpi_wakeup_ap(struct acpi_softc *, int);
+static void acpi_wakeup_cpus(struct acpi_softc *);
+#endif
+
+#ifdef __amd64__
+#define ACPI_PAGETABLES 3
+#else
+#define ACPI_PAGETABLES 0
+#endif
+
+#define WAKECODE_VADDR(sc) \
+ ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE))
+#define WAKECODE_PADDR(sc) \
+ ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE))
+#define WAKECODE_FIXUP(offset, type, val) do { \
+ type *addr; \
+ addr = (type *)(WAKECODE_VADDR(sc) + offset); \
+ *addr = val; \
+} while (0)
+
+static void
+acpi_stop_beep(void *arg)
+{
+
+ if (acpi_resume_beep != 0)
+ timer_spkr_release();
+}
+
+#ifdef SMP
+static int
+acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
+{
+ struct pcb *pcb;
+ int vector = (WAKECODE_PADDR(sc) >> 12) & 0xff;
+ int apic_id = cpu_apic_ids[cpu];
+ int ms;
+
+ pcb = &susppcbs[cpu]->sp_pcb;
+ WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb);
+ WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit);
+ WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base);
+
+ ipi_startup(apic_id, vector);
+
+ /* Wait up to 5 seconds for it to resume. */
+ for (ms = 0; ms < 5000; ms++) {
+ if (!CPU_ISSET(cpu, &suspended_cpus))
+ return (1); /* return SUCCESS */
+ DELAY(1000);
+ }
+ return (0); /* return FAILURE */
+}
+
+#define WARMBOOT_TARGET 0
+#define WARMBOOT_OFF (KERNBASE + 0x0467)
+#define WARMBOOT_SEG (KERNBASE + 0x0469)
+
+#define CMOS_REG (0x70)
+#define CMOS_DATA (0x71)
+#define BIOS_RESET (0x0f)
+#define BIOS_WARM (0x0a)
+
+static void
+acpi_wakeup_cpus(struct acpi_softc *sc)
+{
+ uint32_t mpbioswarmvec;
+ int cpu;
+ u_char mpbiosreason;
+
+ /* save the current value of the warm-start vector */
+ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF);
+ outb(CMOS_REG, BIOS_RESET);
+ mpbiosreason = inb(CMOS_DATA);
+
+ /* setup a vector to our boot code */
+ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
+ *((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4;
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
+
+ /* Wake up each AP. */
+ for (cpu = 1; cpu < mp_ncpus; cpu++) {
+ if (!CPU_ISSET(cpu, &suspcpus))
+ continue;
+ if (acpi_wakeup_ap(sc, cpu) == 0) {
+ /* restore the warmstart vector */
+ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
+ panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)",
+ cpu, cpu_apic_ids[cpu]);
+ }
+ }
+
+ /* restore the warmstart vector */
+ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
+
+ outb(CMOS_REG, BIOS_RESET);
+ outb(CMOS_DATA, mpbiosreason);
+}
+#endif
+
+int
+acpi_sleep_machdep(struct acpi_softc *sc, int state)
+{
+ ACPI_STATUS status;
+ struct pcb *pcb;
+
+ if (sc->acpi_wakeaddr == 0ul)
+ return (-1); /* couldn't alloc wake memory */
+
+#ifdef SMP
+ suspcpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &suspcpus);
+#endif
+
+ if (acpi_resume_beep != 0)
+ timer_spkr_acquire();
+
+ AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc), 0);
+
+ intr_suspend();
+
+ pcb = &susppcbs[0]->sp_pcb;
+ if (savectx(pcb)) {
+#ifdef __amd64__
+ fpususpend(susppcbs[0]->sp_fpususpend);
+#elif defined(DEV_NPX)
+ npxsuspend(susppcbs[0]->sp_fpususpend);
+#endif
+#ifdef SMP
+ if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
+ device_printf(sc->acpi_dev, "Failed to suspend APs\n");
+ return (0); /* couldn't sleep */
+ }
+#endif
+
+ WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0));
+ WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0));
+
+#ifndef __amd64__
+ WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4);
+#endif
+ WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb);
+ WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit);
+ WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base);
+
+ /* Call ACPICA to enter the desired sleep state */
+ if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
+ status = AcpiEnterSleepStateS4bios();
+ else
+ status = AcpiEnterSleepState(state);
+ if (ACPI_FAILURE(status)) {
+ device_printf(sc->acpi_dev,
+ "AcpiEnterSleepState failed - %s\n",
+ AcpiFormatException(status));
+ return (0); /* couldn't sleep */
+ }
+
+ for (;;)
+ ia32_pause();
+ } else {
+#ifdef __amd64__
+ fpuresume(susppcbs[0]->sp_fpususpend);
+#elif defined(DEV_NPX)
+ npxresume(susppcbs[0]->sp_fpususpend);
+#endif
+ }
+
+ return (1); /* wakeup successfully */
+}
+
+int
+acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
+ int intr_enabled)
+{
+
+ if (sleep_result == -1)
+ return (sleep_result);
+
+ if (!intr_enabled) {
+ /* Wakeup MD procedures in interrupt disabled context */
+ if (sleep_result == 1) {
+ pmap_init_pat();
+ initializecpu();
+ PCPU_SET(switchtime, 0);
+ PCPU_SET(switchticks, ticks);
+#ifdef SMP
+ if (!CPU_EMPTY(&suspcpus))
+ acpi_wakeup_cpus(sc);
+#endif
+ }
+
+#ifdef SMP
+ if (!CPU_EMPTY(&suspcpus))
+ resume_cpus(suspcpus);
+#endif
+ mca_resume();
+#ifdef __amd64__
+ if (vmm_resume_p != NULL)
+ vmm_resume_p();
+#endif
+ intr_resume(/*suspend_cancelled*/false);
+
+ AcpiSetFirmwareWakingVector(0, 0);
+ } else {
+ /* Wakeup MD procedures in interrupt enabled context */
+ if (sleep_result == 1 && mem_range_softc.mr_op != NULL &&
+ mem_range_softc.mr_op->reinit != NULL)
+ mem_range_softc.mr_op->reinit(&mem_range_softc);
+ }
+
+ return (sleep_result);
+}
+
+static void *
+acpi_alloc_wakeup_handler(void)
+{
+ void *wakeaddr;
+ int i;
+
+ /*
+ * Specify the region for our wakeup code. We want it in the low 1 MB
+ * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA
+ * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT),
+ * and ROM area (0xa0000 and above). The temporary page tables must be
+ * page-aligned.
+ */
+ wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF,
+ M_WAITOK, 0x500, 0xa0000, PAGE_SIZE, 0ul);
+ if (wakeaddr == NULL) {
+ printf("%s: can't alloc wake memory\n", __func__);
+ return (NULL);
+ }
+ if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL,
+ EVENTHANDLER_PRI_LAST) == NULL) {
+ printf("%s: can't register event handler\n", __func__);
+ contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE,
+ M_DEVBUF);
+ return (NULL);
+ }
+ susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK);
+ for (i = 0; i < mp_ncpus; i++) {
+ susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK);
+ susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK);
+ }
+
+ return (wakeaddr);
+}
+
+void
+acpi_install_wakeup_handler(struct acpi_softc *sc)
+{
+ static void *wakeaddr = NULL;
+#ifdef __amd64__
+ uint64_t *pt4, *pt3, *pt2;
+ int i;
+#endif
+
+ if (wakeaddr != NULL)
+ return;
+
+ wakeaddr = acpi_alloc_wakeup_handler();
+ if (wakeaddr == NULL)
+ return;
+
+ sc->acpi_wakeaddr = (vm_offset_t)wakeaddr;
+ sc->acpi_wakephys = vtophys(wakeaddr);
+
+ bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode));
+
+ /* Patch GDT base address, ljmp targets. */
+ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t,
+ WAKECODE_PADDR(sc) + bootgdt);
+ WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t,
+ WAKECODE_PADDR(sc) + wakeup_32);
+#ifdef __amd64__
+ WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
+ WAKECODE_PADDR(sc) + wakeup_64);
+ WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys);
+#endif
+
+ /* Save pointers to some global data. */
+ WAKECODE_FIXUP(wakeup_ret, void *, resumectx);
+#ifndef __amd64__
+#if defined(PAE) || defined(PAE_TABLES)
+ WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdpt));
+#else
+ WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir));
+#endif
+
+#else
+ /* Build temporary page tables below realmode code. */
+ pt4 = wakeaddr;
+ pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t);
+ pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t);
+
+ /* Create the initial 1GB replicated page tables */
+ for (i = 0; i < 512; i++) {
+ /*
+ * Each slot of the level 4 pages points
+ * to the same level 3 page
+ */
+ pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE);
+ pt4[i] |= PG_V | PG_RW | PG_U;
+
+ /*
+ * Each slot of the level 3 pages points
+ * to the same level 2 page
+ */
+ pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE));
+ pt3[i] |= PG_V | PG_RW | PG_U;
+
+ /* The level 2 page slots are mapped with 2MB pages for 1GB. */
+ pt2[i] = i * (2 * 1024 * 1024);
+ pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+ }
+#endif
+
+ if (bootverbose)
+ device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n",
+ (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys);
+}
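
The temporary page tables built at the end of acpi_install_wakeup_handler() above are deliberately replicated: every PML4 slot points at the one PDP page, every PDP slot points at the one PD page, and the PD maps the low 1 GB with 2 MB pages. The consequence is that any virtual address resolves to its low-30-bit alias while the wakeup trampoline runs. The standalone sketch below just walks that translation to make the aliasing visible; it is an illustration of the mapping's effect, not pmap code.

/*
 * Sketch of the translation implied by the replicated wakeup page tables
 * above: since the PML4 and PDP slots are all aliases, only the PD slot
 * (2 MB pages) and the in-page offset matter, so any VA maps to
 * (va & (1 GB - 1)).
 */
#include <stdio.h>
#include <stdint.h>

#define PD_SHIFT	21			/* 2 MB pages */
#define GB		(1ULL << 30)

static uint64_t
wakeup_translate(uint64_t va)
{
	uint64_t pd_slot, page_off;

	pd_slot = (va >> PD_SHIFT) & 0x1ff;	/* bits 29..21 */
	page_off = va & ((1ULL << PD_SHIFT) - 1);
	return ((pd_slot << PD_SHIFT) + page_off);	/* == va & (GB - 1) */
}

int
main(void)
{
	uint64_t va = 0xffffffff80200123ULL;	/* arbitrary kernel VA */

	printf("va %#llx -> pa %#llx (va & (1GB-1) = %#llx)\n",
	    (unsigned long long)va,
	    (unsigned long long)wakeup_translate(va),
	    (unsigned long long)(va & (GB - 1)));
	return (0);
}
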
Property changes on: trunk/sys/x86/acpica/acpi_wakeup.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property