[Midnightbsd-cvs] src: sys/vm: Bring in ufs and vm changes from FreeBSD.
laffer1 at midnightbsd.org
Fri Sep 12 15:40:17 EDT 2008
Log Message:
-----------
Bring in ufs and vm changes from FreeBSD.
Modified Files:
--------------
src/sys/vm:
device_pager.c (r1.1.1.1 -> r1.2)
memguard.c (r1.1.1.1 -> r1.2)
memguard.h (r1.1.1.1 -> r1.2)
phys_pager.c (r1.1.1.1 -> r1.2)
pmap.h (r1.1.1.1 -> r1.2)
swap_pager.c (r1.1.1.1 -> r1.2)
swap_pager.h (r1.1.1.1 -> r1.2)
uma.h (r1.1.1.1 -> r1.2)
uma_core.c (r1.1.1.2 -> r1.2)
uma_dbg.c (r1.1.1.1 -> r1.2)
uma_dbg.h (r1.1.1.1 -> r1.2)
uma_int.h (r1.1.1.1 -> r1.2)
vm.h (r1.1.1.1 -> r1.2)
vm_contig.c (r1.2 -> r1.3)
vm_extern.h (r1.2 -> r1.3)
vm_fault.c (r1.2 -> r1.3)
vm_glue.c (r1.2 -> r1.3)
vm_kern.c (r1.1.1.1 -> r1.2)
vm_kern.h (r1.1.1.1 -> r1.2)
vm_map.c (r1.1.1.1 -> r1.2)
vm_map.h (r1.1.1.1 -> r1.2)
vm_meter.c (r1.1.1.1 -> r1.2)
vm_mmap.c (r1.1.1.2 -> r1.2)
vm_object.c (r1.2 -> r1.3)
vm_object.h (r1.1.1.1 -> r1.2)
vm_page.c (r1.1.1.1 -> r1.2)
vm_page.h (r1.1.1.1 -> r1.2)
vm_pageout.c (r1.2 -> r1.3)
vm_pageq.c (r1.1.1.1 -> r1.2)
vm_pager.c (r1.1.1.1 -> r1.2)
vm_param.h (r1.1.1.1 -> r1.2)
vm_zeroidle.c (r1.2 -> r1.3)
vnode_pager.c (r1.2 -> r1.3)
src/sys/ufs/ffs:
ffs_alloc.c (r1.2 -> r1.3)
ffs_balloc.c (r1.1.1.1 -> r1.2)
ffs_extern.h (r1.2 -> r1.3)
ffs_inode.c (r1.1.1.1 -> r1.2)
ffs_rawread.c (r1.2 -> r1.3)
ffs_snapshot.c (r1.2 -> r1.3)
ffs_softdep.c (r1.2 -> r1.3)
ffs_vfsops.c (r1.2 -> r1.3)
ffs_vnops.c (r1.1.1.1 -> r1.2)
fs.h (r1.1.1.1 -> r1.2)
softdep.h (r1.2 -> r1.3)
src/sys/ufs/ufs:
dinode.h (r1.1.1.1 -> r1.2)
dir.h (r1.1.1.1 -> r1.2)
extattr.h (r1.1.1.1 -> r1.2)
inode.h (r1.1.1.1 -> r1.2)
quota.h (r1.1.1.1 -> r1.2)
ufs_acl.c (r1.1.1.1 -> r1.2)
ufs_bmap.c (r1.1.1.1 -> r1.2)
ufs_dirhash.c (r1.1.1.1 -> r1.2)
ufs_extattr.c (r1.2 -> r1.3)
ufs_inode.c (r1.2 -> r1.3)
ufs_lookup.c (r1.2 -> r1.3)
ufs_quota.c (r1.2 -> r1.3)
ufs_vfsops.c (r1.1.1.2 -> r1.2)
ufs_vnops.c (r1.2 -> r1.3)
ufsmount.h (r1.2 -> r1.3)
Added Files:
-----------
src/sys/vm:
redzone.c (r1.1)
redzone.h (r1.1)
vm_phys.c (r1.1)
vm_phys.h (r1.1)
src/sys/ufs/ufs:
gjournal.h (r1.1)
ufs_gjournal.c (r1.1)
Removed Files:
-------------
src/sys/ufs/ffs:
README.softupdates
-------------- next part --------------
Index: vm_pageout.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pageout.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_pageout.c -L sys/vm/vm_pageout.c -u -r1.2 -r1.3
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -73,7 +73,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pageout.c,v 1.268.2.3 2006/03/09 00:02:51 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pageout.c,v 1.292 2007/09/25 06:25:06 alc Exp $");
#include "opt_vm.h"
#include <sys/param.h>
@@ -85,6 +85,7 @@
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
+#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
@@ -113,7 +114,6 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_pmap_collect(void);
static void vm_pageout_scan(int pass);
struct proc *pageproc;
@@ -146,6 +146,9 @@
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
@@ -207,7 +210,7 @@
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(void);
+static void vm_req_vmdaemon(int req);
#endif
static void vm_pageout_page_stats(void);
@@ -237,7 +240,8 @@
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
- marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+ marker.flags = PG_FICTITIOUS | PG_MARKER;
+ marker.oflags = VPO_BUSY;
marker.queue = m->queue;
marker.wire_count = 1;
@@ -292,10 +296,10 @@
*/
/*
- * Don't mess with the page if it's busy, held, or special
+ * Can't clean the page if it's busy or held.
*/
if ((m->hold_count != 0) ||
- ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
+ ((m->busy != 0) || (m->oflags & VPO_BUSY))) {
return 0;
}
@@ -338,8 +342,7 @@
ib = 0;
break;
}
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
+ if ((p->oflags & VPO_BUSY) || p->busy) {
ib = 0;
break;
}
@@ -368,8 +371,7 @@
if ((p = vm_page_lookup(object, pindex + is)) == NULL)
break;
- if (((p->queue - p->pc) == PQ_CACHE) ||
- (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
+ if ((p->oflags & VPO_BUSY) || p->busy) {
break;
}
vm_page_test_dirty(p);
@@ -432,20 +434,19 @@
("vm_pageout_flush: partially invalid page %p index %d/%d",
mc[i], i, count));
vm_page_io_start(mc[i]);
- pmap_page_protect(mc[i], VM_PROT_READ);
+ pmap_remove_write(mc[i]);
}
vm_page_unlock_queues();
vm_object_pip_add(object, count);
- vm_pager_put_pages(object, mc, count,
- (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
- pageout_status);
+ vm_pager_put_pages(object, mc, count, flags, pageout_status);
vm_page_lock_queues();
for (i = 0; i < count; i++) {
vm_page_t mt = mc[i];
- KASSERT((mt->flags & PG_WRITEABLE) == 0,
+ KASSERT(pageout_status[i] == VM_PAGER_PEND ||
+ (mt->flags & PG_WRITEABLE) == 0,
("vm_pageout_flush: page %p is not write protected", mt));
switch (pageout_status[i]) {
case VM_PAGER_OK:
@@ -539,7 +540,8 @@
if (p->wire_count != 0 ||
p->hold_count != 0 ||
p->busy != 0 ||
- (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
+ (p->oflags & VPO_BUSY) ||
+ (p->flags & PG_UNMANAGED) ||
!pmap_page_exists_quick(pmap, p)) {
p = next;
continue;
@@ -667,35 +669,6 @@
#endif /* !defined(NO_SWAPPING) */
/*
- * This routine is very drastic, but can save the system
- * in a pinch.
- */
-static void
-vm_pageout_pmap_collect(void)
-{
- int i;
- vm_page_t m;
- static int warningdone;
-
- if (pmap_pagedaemon_waken == 0)
- return;
- if (warningdone < 5) {
- printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
- warningdone++;
- }
- vm_page_lock_queues();
- for (i = 0; i < vm_page_array_size; i++) {
- m = &vm_page_array[i];
- if (m->wire_count || m->hold_count || m->busy ||
- (m->flags & (PG_BUSY | PG_UNMANAGED)))
- continue;
- pmap_remove_all(m);
- }
- vm_page_unlock_queues();
- pmap_pagedaemon_waken = 0;
-}
-
-/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
static void
@@ -709,12 +682,10 @@
struct thread *td;
vm_offset_t size, bigsize;
vm_object_t object;
- int actcount, cache_cur, cache_first_failure;
- static int cache_last_free;
+ int actcount;
int vnodes_skipped = 0;
int maxlaunder;
- mtx_lock(&Giant);
/*
* Decrease registered cache sizes.
*/
@@ -723,10 +694,6 @@
* We do this explicitly after the caches have been drained above.
*/
uma_reclaim();
- /*
- * Do whatever cleanup that the pmap code can.
- */
- vm_pageout_pmap_collect();
addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);
@@ -740,7 +707,8 @@
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
- marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+ marker.flags = PG_FICTITIOUS | PG_MARKER;
+ marker.oflags = VPO_BUSY;
marker.queue = PQ_INACTIVE;
marker.wire_count = 1;
@@ -775,7 +743,7 @@
cnt.v_pdpages++;
- if (m->queue != PQ_INACTIVE) {
+ if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE) {
goto rescan0;
}
@@ -807,7 +775,7 @@
addl_page_shortage++;
continue;
}
- if (m->busy || (m->flags & PG_BUSY)) {
+ if (m->busy || (m->oflags & VPO_BUSY)) {
VM_OBJECT_UNLOCK(object);
addl_page_shortage++;
continue;
@@ -883,7 +851,6 @@
/*
* Invalid pages can be easily freed
*/
- pmap_remove_all(m);
vm_page_free(m);
cnt.v_dfree++;
--page_shortage;
@@ -917,9 +884,9 @@
* pressure where there are insufficient clean pages
* on the inactive queue, we may have to go all out.
*/
- int swap_pageouts_ok;
+ int swap_pageouts_ok, vfslocked = 0;
struct vnode *vp = NULL;
- struct mount *mp;
+ struct mount *mp = NULL;
if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
swap_pageouts_ok = 1;
@@ -975,24 +942,24 @@
*/
if (object->type == OBJT_VNODE) {
vp = object->handle;
- mp = NULL;
if (vp->v_type == VREG &&
vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ KASSERT(mp == NULL,
+ ("vm_pageout_scan: mp != NULL"));
++pageout_lock_miss;
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
- vp = NULL;
goto unlock_and_continue;
}
vm_page_unlock_queues();
- VI_LOCK(vp);
+ vm_object_reference_locked(object);
VM_OBJECT_UNLOCK(object);
- if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
- LK_TIMELOCK, curthread)) {
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
+ curthread)) {
VM_OBJECT_LOCK(object);
vm_page_lock_queues();
++pageout_lock_miss;
- vn_finished_write(mp);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vp = NULL;
@@ -1004,12 +971,10 @@
* The page might have been moved to another
* queue during potential blocking in vget()
* above. The page might have been freed and
- * reused for another vnode. The object might
- * have been reused for another vnode.
+ * reused for another vnode.
*/
- if (m->queue != PQ_INACTIVE ||
+ if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE ||
m->object != object ||
- object->handle != vp ||
TAILQ_NEXT(m, pageq) != &marker) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
@@ -1018,11 +983,11 @@
/*
* The page may have been busied during the
- * blocking in vput(); We don't move the
+ * blocking in vget(). We don't move the
* page back onto the end of the queue so that
* statistics are more correct if we don't.
*/
- if (m->busy || (m->flags & PG_BUSY)) {
+ if (m->busy || (m->oflags & VPO_BUSY)) {
goto unlock_and_continue;
}
@@ -1054,9 +1019,12 @@
}
unlock_and_continue:
VM_OBJECT_UNLOCK(object);
- if (vp) {
+ if (mp != NULL) {
vm_page_unlock_queues();
- vput(vp);
+ if (vp != NULL)
+ vput(vp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ vm_object_deallocate(object);
vn_finished_write(mp);
vm_page_lock_queues();
}
@@ -1086,7 +1054,7 @@
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
- KASSERT(m->queue == PQ_ACTIVE,
+ KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
("vm_pageout_scan: page %p isn't active", m));
next = TAILQ_NEXT(m, pageq);
@@ -1106,7 +1074,7 @@
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
- (m->flags & PG_BUSY) ||
+ (m->oflags & VPO_BUSY) ||
(m->hold_count != 0)) {
VM_OBJECT_UNLOCK(object);
vm_pageq_requeue(m);
@@ -1169,43 +1137,6 @@
VM_OBJECT_UNLOCK(object);
m = next;
}
-
- /*
- * We try to maintain some *really* free pages, this allows interrupt
- * code to be guaranteed space. Since both cache and free queues
- * are considered basically 'free', moving pages from cache to free
- * does not effect other calculations.
- */
- cache_cur = cache_last_free;
- cache_first_failure = -1;
- while (cnt.v_free_count < cnt.v_free_reserved && (cache_cur =
- (cache_cur + PQ_PRIME2) & PQ_L2_MASK) != cache_first_failure) {
- TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE + cache_cur].pl,
- pageq) {
- KASSERT(m->dirty == 0,
- ("Found dirty cache page %p", m));
- KASSERT(!pmap_page_is_mapped(m),
- ("Found mapped cache page %p", m));
- KASSERT((m->flags & PG_UNMANAGED) == 0,
- ("Found unmanaged cache page %p", m));
- KASSERT(m->wire_count == 0,
- ("Found wired cache page %p", m));
- if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object =
- m->object)) {
- KASSERT((m->flags & PG_BUSY) == 0 &&
- m->busy == 0, ("Found busy cache page %p",
- m));
- vm_page_free(m);
- VM_OBJECT_UNLOCK(object);
- cnt.v_dfree++;
- cache_last_free = cache_cur;
- cache_first_failure = -1;
- break;
- }
- }
- if (m == NULL && cache_first_failure == -1)
- cache_first_failure = cache_cur;
- }
vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
/*
@@ -1214,8 +1145,7 @@
if (vm_swap_idle_enabled) {
static long lsec;
if (time_second != lsec) {
- vm_pageout_req_swapout |= VM_SWAP_IDLE;
- vm_req_vmdaemon();
+ vm_req_vmdaemon(VM_SWAP_IDLE);
lsec = time_second;
}
}
@@ -1230,10 +1160,8 @@
if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
#if !defined(NO_SWAPPING)
- if (vm_swap_enabled && vm_page_count_target()) {
- vm_req_vmdaemon();
- vm_pageout_req_swapout |= VM_SWAP_NORMAL;
- }
+ if (vm_swap_enabled && vm_page_count_target())
+ vm_req_vmdaemon(VM_SWAP_NORMAL);
#endif
}
@@ -1275,22 +1203,24 @@
* If the process is in a non-running type state,
* don't touch it. Check all the threads individually.
*/
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
+ thread_unlock(td);
breakout = 1;
break;
}
+ thread_unlock(td);
}
+ PROC_SUNLOCK(p);
if (breakout) {
- mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
continue;
}
- mtx_unlock_spin(&sched_lock);
/*
* get the process size
*/
@@ -1316,14 +1246,13 @@
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
killproc(bigproc, "out of swap space");
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(bigproc);
sched_nice(bigproc, PRIO_MIN);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(bigproc);
PROC_UNLOCK(bigproc);
wakeup(&cnt.v_free_count);
}
}
- mtx_unlock(&Giant);
}
/*
@@ -1363,7 +1292,7 @@
while ((m != NULL) && (pcount-- > 0)) {
int actcount;
- KASSERT(m->queue == PQ_ACTIVE,
+ KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
("vm_pageout_page_stats: page %p isn't active", m));
next = TAILQ_NEXT(m, pageq);
@@ -1384,7 +1313,7 @@
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
- (m->flags & PG_BUSY) ||
+ (m->oflags & VPO_BUSY) ||
(m->hold_count != 0)) {
VM_OBJECT_UNLOCK(object);
vm_pageq_requeue(m);
@@ -1454,7 +1383,7 @@
cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
- cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE;
+ cnt.v_pageout_free_min + (cnt.v_page_count / 768);
cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
cnt.v_free_severe += cnt.v_free_reserved;
@@ -1508,13 +1437,13 @@
* The pageout daemon is never done, so loop forever.
*/
while (TRUE) {
- vm_page_lock_queues();
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
+ mtx_lock(&vm_page_queue_free_mtx);
if (vm_pages_needed && !vm_page_count_min()) {
if (!vm_paging_needed())
vm_pages_needed = 0;
@@ -1528,8 +1457,9 @@
*/
++pass;
if (pass > 1)
- msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
- "psleep", hz/2);
+ msleep(&vm_pages_needed,
+ &vm_page_queue_free_mtx, PVM, "psleep",
+ hz / 2);
} else {
/*
* Good enough, sleep & handle stats. Prime the pass
@@ -1539,10 +1469,13 @@
pass = 1;
else
pass = 0;
- error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
- "psleep", vm_pageout_stats_interval * hz);
+ error = msleep(&vm_pages_needed,
+ &vm_page_queue_free_mtx, PVM, "psleep",
+ vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
+ mtx_unlock(&vm_page_queue_free_mtx);
pass = 0;
+ vm_page_lock_queues();
vm_pageout_page_stats();
vm_page_unlock_queues();
continue;
@@ -1550,16 +1483,16 @@
}
if (vm_pages_needed)
cnt.v_pdwakeups++;
- vm_page_unlock_queues();
+ mtx_unlock(&vm_page_queue_free_mtx);
vm_pageout_scan(pass);
}
}
/*
- * Unless the page queue lock is held by the caller, this function
+ * Unless the free page queue lock is held by the caller, this function
* should be regarded as advisory. Specifically, the caller should
* not msleep() on &cnt.v_free_count following this function unless
- * the page queue lock is held until the msleep() is performed.
+ * the free page queue lock is held until the msleep() is performed.
*/
void
pagedaemon_wakeup()
@@ -1573,14 +1506,17 @@
#if !defined(NO_SWAPPING)
static void
-vm_req_vmdaemon()
+vm_req_vmdaemon(int req)
{
static int lastrun = 0;
+ mtx_lock(&vm_daemon_mtx);
+ vm_pageout_req_swapout |= req;
if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
wakeup(&vm_daemon_needed);
lastrun = ticks;
}
+ mtx_unlock(&vm_daemon_mtx);
}
static void
@@ -1589,21 +1525,23 @@
struct rlimit rsslim;
struct proc *p;
struct thread *td;
- int breakout;
+ int breakout, swapout_flags;
- mtx_lock(&Giant);
while (TRUE) {
- tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
- if (vm_pageout_req_swapout) {
- swapout_procs(vm_pageout_req_swapout);
- vm_pageout_req_swapout = 0;
- }
+ mtx_lock(&vm_daemon_mtx);
+ msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
+ swapout_flags = vm_pageout_req_swapout;
+ vm_pageout_req_swapout = 0;
+ mtx_unlock(&vm_daemon_mtx);
+ if (swapout_flags)
+ swapout_procs(swapout_flags);
+
/*
* scan the processes for exceeding their rlimits or if
* process is swapped out -- deactivate pages
*/
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
vm_pindex_t limit, size;
/*
@@ -1619,17 +1557,20 @@
* if the process is in a non-running type state,
* don't touch it.
*/
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
+ thread_unlock(td);
breakout = 1;
break;
}
+ thread_unlock(td);
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
if (breakout) {
PROC_UNLOCK(p);
continue;
@@ -1646,7 +1587,7 @@
* swapped out set the limit to nothing (will force a
* swap-out.)
*/
- if ((p->p_sflag & PS_INMEM) == 0)
+ if ((p->p_flag & P_INMEM) == 0)
limit = 0; /* XXX */
PROC_UNLOCK(p);
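A note on the recurring change in this file: tests on the page-queues-
protected PG_BUSY flag become tests on the new object-locked VPO_BUSY
field in m->oflags. A minimal sketch of the resulting idiom (the helper
name is illustrative, not part of the patch; the caller is assumed to
hold VM_OBJECT_LOCK(m->object), which protects oflags):

    /*
     * Sketch: a page cannot be laundered or deactivated while busied.
     * m->busy is the I/O busy count; VPO_BUSY marks a page in transit.
     */
    static __inline int
    page_is_busy(vm_page_t m)
    {

            return (m->busy != 0 || (m->oflags & VPO_BUSY) != 0);
    }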
Index: memguard.c
===================================================================
RCS file: /home/cvs/src/sys/vm/memguard.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/memguard.c -L sys/vm/memguard.c -u -r1.1.1.1 -r1.2
--- sys/vm/memguard.c
+++ sys/vm/memguard.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/memguard.c,v 1.5 2005/02/16 21:45:59 bmilekic Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/memguard.c,v 1.6 2005/12/30 11:45:07 pjd Exp $");
/*
* MemGuard is a simple replacement allocator for debugging only
@@ -44,6 +44,7 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
+#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -59,6 +60,67 @@
*/
#define MAX_PAGES_PER_ITEM 64
+SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data");
+/*
+ * The vm_memguard_divisor variable controls how much of kmem_map should be
+ * reserved for MemGuard.
+ */
+u_int vm_memguard_divisor;
+SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RD, &vm_memguard_divisor,
+ 0, "(kmem_size/memguard_divisor) == memguard submap size");
+
+/*
+ * Short description (ks_shortdesc) of memory type to monitor.
+ */
+static char vm_memguard_desc[128] = "";
+static struct malloc_type *vm_memguard_mtype = NULL;
+TUNABLE_STR("vm.memguard.desc", vm_memguard_desc, sizeof(vm_memguard_desc));
+static int
+memguard_sysctl_desc(SYSCTL_HANDLER_ARGS)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+ struct malloc_type *mtp;
+ char desc[128];
+ long bytes;
+ int error, i;
+
+ strlcpy(desc, vm_memguard_desc, sizeof(desc));
+ error = sysctl_handle_string(oidp, desc, sizeof(desc), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * We can change the memory type when no memory has been allocated for
+ * it, or when there is no such memory type yet (i.e. it will be loaded
+ * with a kernel module).
+ */
+ bytes = 0;
+ mtx_lock(&malloc_mtx);
+ mtp = malloc_desc2type(desc);
+ if (mtp != NULL) {
+ mtip = mtp->ks_handle;
+ for (i = 0; i < MAXCPU; i++) {
+ mtsp = &mtip->mti_stats[i];
+ bytes += mtsp->mts_memalloced;
+ bytes -= mtsp->mts_memfreed;
+ }
+ }
+ if (bytes > 0)
+ error = EBUSY;
+ else {
+ /*
+ * If mtp is NULL, it will be initialized in memguard_cmp().
+ */
+ vm_memguard_mtype = mtp;
+ strlcpy(vm_memguard_desc, desc, sizeof(vm_memguard_desc));
+ }
+ mtx_unlock(&malloc_mtx);
+ return (error);
+}
+SYSCTL_PROC(_vm_memguard, OID_AUTO, desc, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+ memguard_sysctl_desc, "A", "Short description of memory type to monitor");
+
/*
* Global MemGuard data.
*/
@@ -239,6 +301,34 @@
MEMGUARD_CRIT_SECTION_EXIT;
}
+int
+memguard_cmp(struct malloc_type *mtp)
+{
+
+#if 1
+ /*
+ * The safest comparison is always to compare the short description
+ * string of the memory type, but it is also the slowest.
+ */
+ return (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0);
+#else
+ /*
+ * If we compare pointers, there are two possible problems:
+ * 1. Memory type was unloaded and new memory type was allocated at the
+ * same address.
+ * 2. Memory type was unloaded and loaded again, but allocated at a
+ * different address.
+ */
+ if (vm_memguard_mtype != NULL)
+ return (mtp == vm_memguard_mtype);
+ if (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0) {
+ vm_memguard_mtype = mtp;
+ return (1);
+ }
+ return (0);
+#endif
+}
+
/*
* Guard a page containing specified object (make it read-only so that
* future writes to it fail).
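The new memguard_cmp() is the hook by which malloc(9) can divert the
single memory type named in vm.memguard.desc into the guarded map. A
hedged sketch of a call site (the wrapper name and NULL fallback are
assumptions; memguard_alloc() is MemGuard's existing allocation entry
point):

    void *
    debug_malloc(unsigned long size, struct malloc_type *mtp, int flags)
    {

            /* Divert only the administrator-selected memory type. */
            if (memguard_cmp(mtp))
                    return (memguard_alloc(size, flags));
            return (NULL);  /* caller falls through to the normal path */
    }

At runtime the monitored type can be switched through the
vm.memguard.desc sysctl; the handler added above refuses the change
with EBUSY while the old type still has memory outstanding.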
Index: device_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/device_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/device_pager.c -L sys/vm/device_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/device_pager.c
+++ sys/vm/device_pager.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/device_pager.c,v 1.78 2005/06/10 17:27:54 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/device_pager.c,v 1.84 2007/08/18 16:41:31 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -64,8 +64,6 @@
/* list of device pager objects */
static struct pagerlst dev_pager_object_list;
-/* protect against object creation */
-static struct sx dev_pager_sx;
/* protect list manipulation */
static struct mtx dev_pager_mtx;
@@ -89,7 +87,6 @@
dev_pager_init()
{
TAILQ_INIT(&dev_pager_object_list);
- sx_init(&dev_pager_sx, "dev_pager create");
mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF);
fakepg_zone = uma_zcreate("DP fakepg", sizeof(struct vm_page),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
@@ -103,7 +100,7 @@
dev_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff)
{
struct cdev *dev;
- vm_object_t object;
+ vm_object_t object, object1;
vm_pindex_t pindex;
unsigned int npages;
vm_paddr_t paddr;
@@ -126,7 +123,6 @@
csw = dev_refthread(dev);
if (csw == NULL)
return (NULL);
- mtx_lock(&Giant);
/*
* Check that the specified range of the device allows the desired
@@ -137,42 +133,46 @@
npages = OFF_TO_IDX(size);
for (off = foff; npages--; off += PAGE_SIZE)
if ((*csw->d_mmap)(dev, off, &paddr, (int)prot) != 0) {
- mtx_unlock(&Giant);
dev_relthread(dev);
return (NULL);
}
- /*
- * Lock to prevent object creation race condition.
- */
- sx_xlock(&dev_pager_sx);
+ mtx_lock(&dev_pager_mtx);
/*
* Look up pager, creating as necessary.
*/
+ object1 = NULL;
object = vm_pager_object_lookup(&dev_pager_object_list, handle);
if (object == NULL) {
/*
* Allocate object and associate it with the pager.
*/
- object = vm_object_allocate(OBJT_DEVICE, pindex);
- object->handle = handle;
- TAILQ_INIT(&object->un_pager.devp.devp_pglist);
- mtx_lock(&dev_pager_mtx);
- TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list);
mtx_unlock(&dev_pager_mtx);
+ object1 = vm_object_allocate(OBJT_DEVICE, pindex);
+ mtx_lock(&dev_pager_mtx);
+ object = vm_pager_object_lookup(&dev_pager_object_list, handle);
+ if (object != NULL) {
+ /*
+ * We raced with another thread while allocating the object.
+ */
+ if (pindex > object->size)
+ object->size = pindex;
+ } else {
+ object = object1;
+ object1 = NULL;
+ object->handle = handle;
+ TAILQ_INIT(&object->un_pager.devp.devp_pglist);
+ TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
+ pager_object_list);
+ }
} else {
- /*
- * Gain a reference to the object.
- */
- vm_object_reference(object);
if (pindex > object->size)
object->size = pindex;
}
-
- sx_xunlock(&dev_pager_sx);
- mtx_unlock(&Giant);
+ mtx_unlock(&dev_pager_mtx);
dev_relthread(dev);
+ vm_object_deallocate(object1);
return (object);
}
@@ -182,9 +182,11 @@
{
vm_page_t m;
+ VM_OBJECT_UNLOCK(object);
mtx_lock(&dev_pager_mtx);
TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list);
mtx_unlock(&dev_pager_mtx);
+ VM_OBJECT_LOCK(object);
/*
* Free up our fake pages.
*/
@@ -216,12 +218,10 @@
csw = dev_refthread(dev);
if (csw == NULL)
panic("dev_pager_getpage: no cdevsw");
- mtx_lock(&Giant);
prot = PROT_READ; /* XXX should pass in? */
ret = (*csw->d_mmap)(dev, (vm_offset_t)offset << PAGE_SHIFT, &paddr, prot);
KASSERT(ret == 0, ("dev_pager_getpage: map function returns error"));
- mtx_unlock(&Giant);
dev_relthread(dev);
if ((m[reqpage]->flags & PG_FICTITIOUS) != 0) {
@@ -295,7 +295,8 @@
m = uma_zalloc(fakepg_zone, M_WAITOK);
- m->flags = PG_BUSY | PG_FICTITIOUS;
+ m->flags = PG_FICTITIOUS;
+ m->oflags = VPO_BUSY;
m->valid = VM_PAGE_BITS_ALL;
m->dirty = 0;
m->busy = 0;
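dev_pager_alloc() now avoids Giant and the dev_pager_sx lock by
allocating a candidate object with dev_pager_mtx dropped and then
re-running the lookup to detect a race. The shape of the pattern,
reduced to its essentials (lookup/insert are placeholders for the
pager-list operations, not real functions):

    mtx_lock(&dev_pager_mtx);
    object = lookup(handle);
    if (object == NULL) {
            mtx_unlock(&dev_pager_mtx);     /* can't allocate under a mutex */
            object1 = vm_object_allocate(OBJT_DEVICE, pindex);
            mtx_lock(&dev_pager_mtx);
            object = lookup(handle);        /* did another thread win? */
            if (object == NULL) {
                    object = object1;
                    object1 = NULL;
                    insert(object);
            }
    }
    mtx_unlock(&dev_pager_mtx);
    vm_object_deallocate(object1);          /* NULL-safe; discards the loser */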
Index: vm_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_pager.c -L sys/vm/vm_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_pager.c
+++ sys/vm/vm_pager.c
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pager.c,v 1.105.2.1 2005/08/15 14:04:47 kan Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pager.c,v 1.108 2007/08/05 21:04:32 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -82,7 +82,7 @@
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
-MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data");
+MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "XXX: VM pager private data");
int cluster_pbuf_freecnt = -1; /* unlimited to begin with */
@@ -261,17 +261,29 @@
* vm_pager_has_page() - inline, see vm/vm_pager.h
*/
+/*
+ * Search the specified pager object list for an object with the
+ * specified handle. If an object with the specified handle is found,
+ * increase its reference count and return it. Otherwise, return NULL.
+ *
+ * The pager object list must be locked.
+ */
vm_object_t
-vm_pager_object_lookup(pg_list, handle)
- struct pagerlst *pg_list;
- void *handle;
+vm_pager_object_lookup(struct pagerlst *pg_list, void *handle)
{
vm_object_t object;
- TAILQ_FOREACH(object, pg_list, pager_object_list)
- if (object->handle == handle)
- return (object);
- return (NULL);
+ TAILQ_FOREACH(object, pg_list, pager_object_list) {
+ VM_OBJECT_LOCK(object);
+ if (object->handle == handle &&
+ (object->flags & OBJ_DEAD) == 0) {
+ vm_object_reference_locked(object);
+ VM_OBJECT_UNLOCK(object);
+ break;
+ }
+ VM_OBJECT_UNLOCK(object);
+ }
+ return (object);
}
/*
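vm_pager_object_lookup() now both skips OBJ_DEAD objects and returns
the object with an extra reference, so every caller owns (and must
eventually drop) that reference. A hypothetical caller:

    mtx_lock(&dev_pager_mtx);
    object = vm_pager_object_lookup(&dev_pager_object_list, handle);
    mtx_unlock(&dev_pager_mtx);
    if (object != NULL) {
            /* use the referenced object here */
            vm_object_deallocate(object);   /* drop the lookup reference */
    }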
Index: vm_extern.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_extern.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_extern.h -L sys/vm/vm_extern.h -u -r1.2 -r1.3
--- sys/vm/vm_extern.h
+++ sys/vm/vm_extern.h
@@ -27,7 +27,7 @@
* SUCH DAMAGE.
*
* @(#)vm_extern.h 8.2 (Berkeley) 1/12/94
- * $FreeBSD: src/sys/vm/vm_extern.h,v 1.76.2.1 2006/03/16 00:25:32 alc Exp $
+ * $FreeBSD: src/sys/vm/vm_extern.h,v 1.78.4.1 2008/01/19 18:15:07 kib Exp $
*/
#ifndef _VM_EXTERN_H_
@@ -70,14 +70,16 @@
void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
-void vm_forkproc(struct thread *, struct proc *, struct thread *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
void vm_waitproc(struct proc *);
int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
void vm_set_page_size(void);
struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
struct vmspace *vmspace_fork(struct vmspace *);
-void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
-void vmspace_unshare(struct proc *);
+int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
+int vmspace_unshare(struct proc *);
+void vmspace_exit(struct thread *);
+struct vmspace *vmspace_acquire_ref(struct proc *);
void vmspace_free(struct vmspace *);
void vmspace_exitfree(struct proc *);
void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
@@ -90,8 +92,8 @@
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
void vm_thread_dispose_altkstack(struct thread *td);
-void vm_thread_new(struct thread *td, int pages);
-void vm_thread_new_altkstack(struct thread *td, int pages);
+int vm_thread_new(struct thread *td, int pages);
+int vm_thread_new_altkstack(struct thread *td, int pages);
void vm_thread_swapin(struct thread *td);
void vm_thread_swapout(struct thread *td);
#endif /* _KERNEL */
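These prototype changes turn formerly panicking paths into recoverable
failures: vm_forkproc(), vmspace_exec(), and vmspace_unshare() now
return errno values, and vm_thread_new() reports kstack allocation
failure by returning 0 (1 on success, per the vm_glue.c hunk below). A
hypothetical caller (mapping the failure to ENOMEM is an assumption):

    if (!vm_thread_new(td, pages))
            return (ENOMEM);        /* kstack allocation failed */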
Index: vm_page.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_page.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_page.h -L sys/vm/vm_page.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm_page.h,v 1.136.2.1 2005/08/15 09:02:01 rwatson Exp $
+ * $FreeBSD: src/sys/vm/vm_page.h,v 1.152 2007/09/27 04:21:59 alc Exp $
*/
/*
@@ -67,10 +67,6 @@
#ifndef _VM_PAGE_
#define _VM_PAGE_
-#if !defined(KLD_MODULE) && !defined(LIBMEMSTAT)
-#include "opt_vmpage.h"
-#endif
-
#include <vm/pmap.h>
/*
@@ -114,12 +110,15 @@
vm_pindex_t pindex; /* offset into object (O,P) */
vm_paddr_t phys_addr; /* physical address of page */
struct md_page md; /* machine dependant stuff */
- u_short queue; /* page queue index */
- u_short flags, /* see below */
- pc; /* page color */
+ uint8_t queue; /* page queue index */
+ int8_t segind;
+ u_short flags; /* see below */
+ uint8_t order; /* index of the buddy queue */
+ uint8_t pool;
u_short wire_count; /* wired down maps refs (P) */
u_int cow; /* page cow mapping count */
short hold_count; /* page hold count */
+ u_short oflags; /* page flags (O) */
u_char act_count; /* page usage count */
u_char busy; /* page busy count (O) */
/* NOTE that these must support one bit per DEV_BSIZE in a page!!! */
@@ -139,6 +138,18 @@
#endif
};
+/*
+ * Page flags stored in oflags:
+ *
+ * Access to these page flags is synchronized by the lock on the object
+ * containing the page (O).
+ */
+#define VPO_BUSY 0x0001 /* page is in transit */
+#define VPO_WANTED 0x0002 /* someone is waiting for page */
+#define VPO_CLEANCHK 0x0100 /* page will be checked for cleaning */
+#define VPO_SWAPINPROG 0x0200 /* swap I/O in progress on page */
+#define VPO_NOSYNC 0x0400 /* do not collect for syncer */
+
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
@@ -146,82 +157,33 @@
#endif
#endif
-#if !defined(KLD_MODULE)
-/*
- * Page coloring parameters
- */
+#define PQ_NONE 0
+#define PQ_INACTIVE 1
+#define PQ_ACTIVE 2
+#define PQ_HOLD 3
+#define PQ_COUNT 4
+#define PQ_MAXCOUNT 4
+
+/* Returns the real queue a page is on. */
+#define VM_PAGE_GETQUEUE(m) ((m)->queue)
-/* Backward compatibility for existing PQ_*CACHE config options. */
-#if !defined(PQ_CACHESIZE)
-#if defined(PQ_HUGECACHE)
-#define PQ_CACHESIZE 1024
-#elif defined(PQ_LARGECACHE)
-#define PQ_CACHESIZE 512
-#elif defined(PQ_MEDIUMCACHE)
-#define PQ_CACHESIZE 256
-#elif defined(PQ_NORMALCACHE)
-#define PQ_CACHESIZE 64
-#elif defined(PQ_NOOPT)
-#define PQ_CACHESIZE 0
-#else
-#define PQ_CACHESIZE 128
-#endif
-#endif /* !defined(PQ_CACHESIZE) */
+/* Returns the well known queue a page is on. */
+#define VM_PAGE_GETKNOWNQUEUE2(m) VM_PAGE_GETQUEUE(m)
-#if PQ_CACHESIZE >= 1024
-#define PQ_PRIME1 31 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 23 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 256 /* A number of colors opt for 1M cache */
-
-#elif PQ_CACHESIZE >= 512
-#define PQ_PRIME1 31 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 23 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 128 /* A number of colors opt for 512K cache */
-
-#elif PQ_CACHESIZE >= 256
-#define PQ_PRIME1 13 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 7 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 64 /* A number of colors opt for 256K cache */
-
-#elif PQ_CACHESIZE >= 128
-#define PQ_PRIME1 9 /* Produces a good PQ_L2_SIZE/3 + PQ_PRIME1 */
-#define PQ_PRIME2 5 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 32 /* A number of colors opt for 128k cache */
-
-#elif PQ_CACHESIZE >= 64
-#define PQ_PRIME1 5 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 3 /* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 16 /* A reasonable number of colors (opt for 64K cache) */
+/* Returns true if the page is in the named well known queue. */
+#define VM_PAGE_INQUEUE2(m, q) (VM_PAGE_GETKNOWNQUEUE2(m) == (q))
-#else
-#define PQ_PRIME1 1 /* Disable page coloring. */
-#define PQ_PRIME2 1
-#define PQ_L2_SIZE 1
-
-#endif
-
-#define PQ_L2_MASK (PQ_L2_SIZE - 1)
-
-/* PQ_CACHE and PQ_FREE represent PQ_L2_SIZE consecutive queues. */
-#define PQ_NONE 0
-#define PQ_FREE 1
-#define PQ_INACTIVE (1 + 1*PQ_L2_SIZE)
-#define PQ_ACTIVE (2 + 1*PQ_L2_SIZE)
-#define PQ_CACHE (3 + 1*PQ_L2_SIZE)
-#define PQ_HOLD (3 + 2*PQ_L2_SIZE)
-#define PQ_COUNT (4 + 2*PQ_L2_SIZE)
+/* Sets the queue a page is on. */
+#define VM_PAGE_SETQUEUE2(m, q) (VM_PAGE_GETQUEUE(m) = (q))
struct vpgqueues {
struct pglist pl;
int *cnt;
- int lcnt;
};
-extern struct vpgqueues vm_page_queues[PQ_COUNT];
+extern struct vpgqueues vm_page_queues[PQ_MAXCOUNT];
extern struct mtx vm_page_queue_free_mtx;
-#endif /* !defined(KLD_MODULE) */
-
/*
* These are the flags defined for vm_page.
*
@@ -232,16 +194,13 @@
* pte mappings, nor can they be removed from their objects via
* the object, and such pages are also not on any PQ queue.
*/
-#define PG_BUSY 0x0001 /* page is in transit (O) */
-#define PG_WANTED 0x0002 /* someone is waiting for page (O) */
+#define PG_CACHED 0x0001 /* page is cached */
+#define PG_FREE 0x0002 /* page is free */
#define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */
#define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */
#define PG_WRITEABLE 0x0010 /* page is mapped writeable */
#define PG_ZERO 0x0040 /* page is zeroed */
#define PG_REFERENCED 0x0080 /* page has been referenced */
-#define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */
-#define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */
-#define PG_NOSYNC 0x0400 /* do not collect for syncer */
#define PG_UNMANAGED 0x0800 /* No PV management for page */
#define PG_MARKER 0x1000 /* special queue marker page */
#define PG_SLAB 0x2000 /* object pointer is actually a slab */
@@ -255,18 +214,24 @@
#define ACT_MAX 64
#ifdef _KERNEL
+
+#include <vm/vm_param.h>
+
/*
- * Each pageable resident page falls into one of four lists:
+ * Each pageable resident page falls into one of five lists:
*
* free
* Available for allocation now.
*
- * The following are all LRU sorted:
- *
* cache
- * Almost available for allocation. Still in an
- * object, but clean and immediately freeable at
- * non-interrupt times.
+ * Almost available for allocation. Still associated with
+ * an object, but clean and immediately freeable.
+ *
+ * hold
+ * Will become free after a pending I/O operation
+ * completes.
+ *
+ * The following lists are LRU sorted:
*
* inactive
* Low activity, candidates for reclamation.
@@ -277,9 +242,6 @@
* Pages that are "active" i.e. they have been
* recently referenced.
*
- * zero
- * Pages that are really free and have been pre-zeroed
- *
*/
extern int vm_page_zero_count;
@@ -288,10 +250,25 @@
extern int vm_page_array_size; /* number of vm_page_t's */
extern long first_page; /* first physical page number */
+#define VM_PAGE_IS_FREE(m) (((m)->flags & PG_FREE) != 0)
+
#define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr)
-#define PHYS_TO_VM_PAGE(pa) \
- (&vm_page_array[atop(pa) - first_page ])
+vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+
+static __inline vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
+
+static __inline vm_page_t
+PHYS_TO_VM_PAGE(vm_paddr_t pa)
+{
+#ifdef VM_PHYSSEG_SPARSE
+ return (vm_phys_paddr_to_vm_page(pa));
+#elif defined(VM_PHYSSEG_DENSE)
+ return (&vm_page_array[atop(pa) - first_page]);
+#else
+#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
+#endif
+}
extern struct mtx vm_page_queue_mtx;
#define vm_page_lock_queues() mtx_lock(&vm_page_queue_mtx)
@@ -318,6 +295,8 @@
#define VM_ALLOC_RETRY 0x0080 /* vm_page_grab() only */
#define VM_ALLOC_NOOBJ 0x0100 /* No associated object */
#define VM_ALLOC_NOBUSY 0x0200 /* Do not busy the page */
+#define VM_ALLOC_IFCACHED 0x0400 /* Fail if the page is not cached */
+#define VM_ALLOC_IFNOTCACHED 0x0800 /* Fail if the page is cached */
void vm_page_flag_set(vm_page_t m, unsigned short bits);
void vm_page_flag_clear(vm_page_t m, unsigned short bits);
@@ -329,25 +308,21 @@
void vm_page_unhold(vm_page_t mem);
void vm_page_free(vm_page_t m);
void vm_page_free_zero(vm_page_t m);
-int vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg);
void vm_page_dirty(vm_page_t m);
void vm_page_wakeup(vm_page_t m);
void vm_pageq_init(void);
-vm_page_t vm_pageq_add_new_page(vm_paddr_t pa);
void vm_pageq_enqueue(int queue, vm_page_t m);
-void vm_pageq_remove_nowakeup(vm_page_t m);
void vm_pageq_remove(vm_page_t m);
-vm_page_t vm_pageq_find(int basequeue, int index, boolean_t prefer_zero);
void vm_pageq_requeue(vm_page_t m);
void vm_page_activate (vm_page_t);
vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
-vm_page_t vm_page_alloc_contig (vm_pindex_t, vm_paddr_t, vm_paddr_t,
- vm_offset_t, vm_offset_t);
-void vm_page_release_contig (vm_page_t, vm_pindex_t);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
void vm_page_cache (register vm_page_t);
+void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
+void vm_page_cache_remove(vm_page_t);
+void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_dontneed (register vm_page_t);
@@ -356,10 +331,9 @@
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
void vm_page_remove (vm_page_t);
void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
-vm_page_t vm_page_select_cache(int);
+void vm_page_sleep(vm_page_t m, const char *msg);
vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
vm_offset_t vm_page_startup(vm_offset_t vaddr);
-void vm_page_unmanage (vm_page_t);
void vm_page_unwire (vm_page_t, int);
void vm_page_wire (vm_page_t);
void vm_page_set_validclean (vm_page_t, int, int);
@@ -376,6 +350,27 @@
void vm_page_cowclear (vm_page_t);
/*
+ * vm_page_sleep_if_busy:
+ *
+ * Sleep and release the page queues lock if VPO_BUSY is set or,
+ * if also_m_busy is TRUE, busy is non-zero. Returns TRUE if the
+ * thread slept and the page queues lock was released.
+ * Otherwise, retains the page queues lock and returns FALSE.
+ *
+ * The object containing the given page must be locked.
+ */
+static __inline int
+vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+{
+
+ if ((m->oflags & VPO_BUSY) || (also_m_busy && m->busy)) {
+ vm_page_sleep(m, msg);
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+/*
* vm_page_undirty:
*
* Set page to not be dirty. Note: does not clear pmap modify bits
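vm_page_sleep_if_busy() is now a header inline built on the new
vm_page_sleep(). The canonical caller is a retry loop: if the helper
slept, the page queues lock (when held) was released and the object
lock was dropped and reacquired during the sleep, so the page must be
looked up again. A hedged sketch, with the object lock held on entry
(the wait-message string is arbitrary):

    retry:
            m = vm_page_lookup(object, pindex);
            if (m != NULL && vm_page_sleep_if_busy(m, TRUE, "pbusy"))
                    goto retry;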
Index: vm_glue.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_glue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_glue.c -L sys/vm/vm_glue.c -u -r1.2 -r1.3
--- sys/vm/vm_glue.c
+++ sys/vm/vm_glue.c
@@ -57,7 +57,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_glue.c,v 1.213.2.1 2006/03/16 00:25:32 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_glue.c,v 1.225.4.1 2008/01/19 18:15:07 kib Exp $");
#include "opt_vm.h"
#include "opt_kstack_pages.h"
@@ -112,7 +112,8 @@
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)
#ifndef NO_SWAPPING
-static void swapout(struct proc *);
+static int swapout(struct proc *);
+static void swapclear(struct proc *);
#endif
@@ -272,8 +273,8 @@
}
vm_page_lock_queues();
vm_page_hold(m);
- vm_page_wakeup(m);
vm_page_unlock_queues();
+ vm_page_wakeup(m);
out:
VM_OBJECT_UNLOCK(object);
return (m);
@@ -320,7 +321,7 @@
* This routine directly affects the fork perf for a process and
* create performance for a thread.
*/
-void
+int
vm_thread_new(struct thread *td, int pages)
{
vm_object_t ksobj;
@@ -337,18 +338,22 @@
* Allocate an object for the kstack.
*/
ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
- td->td_kstack_obj = ksobj;
/*
* Get a kernel virtual address for this thread's kstack.
*/
ks = kmem_alloc_nofault(kernel_map,
(pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
- if (ks == 0)
- panic("vm_thread_new: kstack allocation failed");
+ if (ks == 0) {
+ printf("vm_thread_new: kstack allocation failed\n");
+ vm_object_deallocate(ksobj);
+ return (0);
+ }
+
if (KSTACK_GUARD_PAGES != 0) {
pmap_qremove(ks, KSTACK_GUARD_PAGES);
ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
}
+ td->td_kstack_obj = ksobj;
td->td_kstack = ks;
/*
* Knowing the number of pages allocated is useful when you
@@ -371,6 +376,7 @@
}
VM_OBJECT_UNLOCK(ksobj);
pmap_qenter(ks, ma, pages);
+ return (1);
}
/*
@@ -402,6 +408,7 @@
vm_object_deallocate(ksobj);
kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
(pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
+ td->td_kstack = 0;
}
/*
@@ -456,8 +463,8 @@
ma[i] = m;
vm_page_lock_queues();
vm_page_wire(m);
- vm_page_wakeup(m);
vm_page_unlock_queues();
+ vm_page_wakeup(m);
}
VM_OBJECT_UNLOCK(ksobj);
pmap_qenter(td->td_kstack, ma, pages);
@@ -467,7 +474,7 @@
/*
* Set up a variable-sized alternate kstack.
*/
-void
+int
vm_thread_new_altkstack(struct thread *td, int pages)
{
@@ -475,7 +482,7 @@
td->td_altkstack_obj = td->td_kstack_obj;
td->td_altkstack_pages = td->td_kstack_pages;
- vm_thread_new(td, pages);
+ return (vm_thread_new(td, pages));
}
/*
@@ -503,14 +510,16 @@
* ready to run. The new process is set up so that it returns directly
* to user mode to avoid stack copying and relocation problems.
*/
-void
-vm_forkproc(td, p2, td2, flags)
+int
+vm_forkproc(td, p2, td2, vm2, flags)
struct thread *td;
struct proc *p2;
struct thread *td2;
+ struct vmspace *vm2;
int flags;
{
struct proc *p1 = td->td_proc;
+ int error;
if ((flags & RFPROC) == 0) {
/*
@@ -520,11 +529,13 @@
*/
if ((flags & RFMEM) == 0) {
if (p1->p_vmspace->vm_refcnt > 1) {
- vmspace_unshare(p1);
+ error = vmspace_unshare(p1);
+ if (error)
+ return (error);
}
}
cpu_fork(td, p2, td2, flags);
- return;
+ return (0);
}
if (flags & RFMEM) {
@@ -537,7 +548,7 @@
}
if ((flags & RFMEM) == 0) {
- p2->p_vmspace = vmspace_fork(p1->p_vmspace);
+ p2->p_vmspace = vm2;
if (p1->p_vmspace->vm_shm)
shmfork(p1, p2);
}
@@ -547,6 +558,7 @@
* and make the child ready to run.
*/
cpu_fork(td, p2, td2, flags);
+ return (0);
}
/*
@@ -601,7 +613,7 @@
#ifdef NO_SWAPPING
PROC_LOCK_ASSERT(p, MA_OWNED);
- if ((p->p_sflag & PS_INMEM) == 0)
+ if ((p->p_flag & P_INMEM) == 0)
panic("faultin: proc swapped out with NO_SWAPPING!");
#else /* !NO_SWAPPING */
struct thread *td;
@@ -611,34 +623,34 @@
* If another process is swapping in this process,
* just wait until it finishes.
*/
- if (p->p_sflag & PS_SWAPPINGIN)
- msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
- else if ((p->p_sflag & PS_INMEM) == 0) {
+ if (p->p_flag & P_SWAPPINGIN) {
+ while (p->p_flag & P_SWAPPINGIN)
+ msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+ return;
+ }
+ if ((p->p_flag & P_INMEM) == 0) {
/*
* Don't let another thread swap process p out while we are
* busy swapping it in.
*/
++p->p_lock;
- mtx_lock_spin(&sched_lock);
- p->p_sflag |= PS_SWAPPINGIN;
- mtx_unlock_spin(&sched_lock);
+ p->p_flag |= P_SWAPPINGIN;
PROC_UNLOCK(p);
+ /*
+ * We hold no lock here because the list of threads
+ * can not change while all threads in the process are
+ * swapped out.
+ */
FOREACH_THREAD_IN_PROC(p, td)
vm_thread_swapin(td);
-
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
- p->p_sflag &= ~PS_SWAPPINGIN;
- p->p_sflag |= PS_INMEM;
- FOREACH_THREAD_IN_PROC(p, td) {
- TD_CLR_SWAPPED(td);
- if (TD_CAN_RUN(td))
- setrunnable(td);
- }
- mtx_unlock_spin(&sched_lock);
+ PROC_SLOCK(p);
+ swapclear(p);
+ p->p_swtick = ticks;
+ PROC_SUNLOCK(p);
- wakeup(&p->p_sflag);
+ wakeup(&p->p_flag);
/* Allow other threads to swap p out now. */
--p->p_lock;
@@ -662,9 +674,11 @@
{
struct proc *p;
struct thread *td;
- int pri;
struct proc *pp;
+ int slptime;
+ int swtime;
int ppri;
+ int pri;
mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
mtx_unlock(&Giant);
@@ -672,9 +686,9 @@
loop:
if (vm_page_count_min()) {
VM_WAIT;
- mtx_lock_spin(&sched_lock);
+ thread_lock(&thread0);
proc0_rescan = 0;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(&thread0);
goto loop;
}
@@ -682,26 +696,27 @@
ppri = INT_MIN;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- struct ksegrp *kg;
- if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
+ PROC_LOCK(p);
+ if (p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
+ PROC_UNLOCK(p);
continue;
}
- mtx_lock_spin(&sched_lock);
+ swtime = (ticks - p->p_swtick) / hz;
+ PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
/*
* An otherwise runnable thread of a process
* swapped out has only the TDI_SWAPPED bit set.
*
*/
+ thread_lock(td);
if (td->td_inhibitors == TDI_SWAPPED) {
- kg = td->td_ksegrp;
- pri = p->p_swtime + kg->kg_slptime;
- if ((p->p_sflag & PS_SWAPINREQ) == 0) {
+ slptime = (ticks - td->td_slptick) / hz;
+ pri = swtime + slptime;
+ if ((td->td_flags & TDF_SWAPINREQ) == 0)
pri -= p->p_nice * 8;
- }
-
/*
- * if this ksegrp is higher priority
+ * if this thread is higher priority
* and there is enough space, then select
* this process instead of the previous
* selection.
@@ -711,8 +726,10 @@
ppri = pri;
}
}
+ thread_unlock(td);
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
@@ -720,13 +737,13 @@
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(&thread0);
if (!proc0_rescan) {
TD_SET_IWAIT(&thread0);
mi_switch(SW_VOL, NULL);
}
proc0_rescan = 0;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(&thread0);
goto loop;
}
PROC_LOCK(p);
@@ -736,28 +753,23 @@
* brought this process in while we traverse all threads.
* Or, this process may even be being swapped out again.
*/
- if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
+ if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
PROC_UNLOCK(p);
- mtx_lock_spin(&sched_lock);
+ thread_lock(&thread0);
proc0_rescan = 0;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(&thread0);
goto loop;
}
- mtx_lock_spin(&sched_lock);
- p->p_sflag &= ~PS_SWAPINREQ;
- mtx_unlock_spin(&sched_lock);
-
/*
* We would like to bring someone in. (only if there is space).
* [What checks the space? ]
*/
faultin(p);
PROC_UNLOCK(p);
- mtx_lock_spin(&sched_lock);
- p->p_swtime = 0;
+ thread_lock(&thread0);
proc0_rescan = 0;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(&thread0);
goto loop;
}
@@ -765,16 +777,18 @@
{
struct thread *td = &thread0;
-
+ /* XXX This will probably cause a LOR in some cases */
+ thread_lock(td);
if (TD_AWAITING_INTR(td)) {
- CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, 0);
+ CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0);
TD_CLR_IWAIT(td);
- setrunqueue(td, SRQ_INTR);
+ sched_add(td, SRQ_INTR);
} else {
proc0_rescan = 1;
CTR2(KTR_INTR, "%s: state %d",
__func__, td->td_state);
}
+ thread_unlock(td);
}
@@ -798,7 +812,7 @@
/*
* Swapout is driven by the pageout daemon. Very simple, we find eligible
- * procs and unwire their u-areas. We try to always "swap" at least one
+ * procs and swap out their stacks. We try to always "swap" at least one
* process in case we need the room for a swapin.
* If any procs have been sleeping/stopped for at least maxslp seconds,
* they are swapped. Else, we swap the longest-sleeping or stopped process,
@@ -810,7 +824,6 @@
{
struct proc *p;
struct thread *td;
- struct ksegrp *kg;
int didswap = 0;
retry:
@@ -818,19 +831,15 @@
FOREACH_PROC_IN_SYSTEM(p) {
struct vmspace *vm;
int minslptime = 100000;
+ int slptime;
/*
* Watch out for a process in
* creation. It may have no
* address space or lock yet.
*/
- mtx_lock_spin(&sched_lock);
- if (p->p_state == PRS_NEW) {
- mtx_unlock_spin(&sched_lock);
+ if (p->p_state == PRS_NEW)
continue;
- }
- mtx_unlock_spin(&sched_lock);
-
/*
* An aio daemon switches its
* address space while running.
@@ -839,7 +848,6 @@
*/
if ((p->p_flag & P_SYSTEM) != 0)
continue;
-
/*
* Do not swapout a process that
* is waiting for VM data
@@ -852,12 +860,9 @@
* process may attempt to alter
* the map.
*/
- PROC_LOCK(p);
- vm = p->p_vmspace;
- KASSERT(vm != NULL,
- ("swapout_procs: a process has no address space"));
- atomic_add_int(&vm->vm_refcnt, 1);
- PROC_UNLOCK(p);
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL)
+ continue;
if (!vm_map_trylock(&vm->vm_map))
goto nextproc1;
@@ -872,7 +877,7 @@
* skipped because of the if statement above checking
* for P_SYSTEM
*/
- if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
+ if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
goto nextproc2;
switch (p->p_state) {
@@ -882,21 +887,26 @@
break;
case PRS_NORMAL:
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
/*
* do not swapout a realtime process
* Check all the thread groups..
*/
- FOREACH_KSEGRP_IN_PROC(p, kg) {
- if (PRI_IS_REALTIME(kg->kg_pri_class))
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (PRI_IS_REALTIME(td->td_pri_class)) {
+ thread_unlock(td);
goto nextproc;
-
+ }
+ slptime = (ticks - td->td_slptick) / hz;
/*
* Guarantee swap_idle_threshold1
* time in memory.
*/
- if (kg->kg_slptime < swap_idle_threshold1)
+ if (slptime < swap_idle_threshold1) {
+ thread_unlock(td);
goto nextproc;
+ }
/*
* Do not swapout a process if it is
@@ -907,10 +917,10 @@
* This could be refined to support
* swapping out a thread.
*/
- FOREACH_THREAD_IN_GROUP(kg, td) {
- if ((td->td_priority) < PSOCK ||
- !thread_safetoswapout(td))
- goto nextproc;
+ if ((td->td_priority) < PSOCK ||
+ !thread_safetoswapout(td)) {
+ thread_unlock(td);
+ goto nextproc;
}
/*
* If the system is under memory stress,
@@ -920,11 +930,14 @@
*/
if (((action & VM_SWAP_NORMAL) == 0) &&
(((action & VM_SWAP_IDLE) == 0) ||
- (kg->kg_slptime < swap_idle_threshold2)))
+ (slptime < swap_idle_threshold2))) {
+ thread_unlock(td);
goto nextproc;
+ }
- if (minslptime > kg->kg_slptime)
- minslptime = kg->kg_slptime;
+ if (minslptime > slptime)
+ minslptime = slptime;
+ thread_unlock(td);
}
/*
@@ -935,9 +948,9 @@
if ((action & VM_SWAP_NORMAL) ||
((action & VM_SWAP_IDLE) &&
(minslptime > swap_idle_threshold2))) {
- swapout(p);
- didswap++;
- mtx_unlock_spin(&sched_lock);
+ if (swapout(p) == 0)
+ didswap++;
+ PROC_SUNLOCK(p);
PROC_UNLOCK(p);
vm_map_unlock(&vm->vm_map);
vmspace_free(vm);
@@ -945,7 +958,7 @@
goto retry;
}
nextproc:
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
}
nextproc2:
PROC_UNLOCK(p);
@@ -964,13 +977,35 @@
}
static void
+swapclear(p)
+ struct proc *p;
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_flags |= TDF_INMEM;
+ td->td_flags &= ~TDF_SWAPINREQ;
+ TD_CLR_SWAPPED(td);
+ if (TD_CAN_RUN(td))
+ setrunnable(td);
+ thread_unlock(td);
+ }
+ p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
+ p->p_flag |= P_INMEM;
+}
+
+static int
swapout(p)
struct proc *p;
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
printf("swapping out %d\n", p->p_pid);
#endif
@@ -980,40 +1015,46 @@
* by now. Assuming that there is only one pageout daemon thread,
* this process should still be in memory.
*/
- KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
+ KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
("swapout: lost a swapout race?"));
-#if defined(INVARIANTS)
/*
- * Make sure that all threads are safe to be swapped out.
- *
- * Alternatively, we could swap out only safe threads.
+ * remember the process resident count
+ */
+ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+ /*
+ * Check and mark all threads before we proceed.
*/
+ p->p_flag &= ~P_INMEM;
+ p->p_flag |= P_SWAPPINGOUT;
FOREACH_THREAD_IN_PROC(p, td) {
- KASSERT(thread_safetoswapout(td),
- ("swapout: there is a thread not safe for swapout"));
+ thread_lock(td);
+ if (!thread_safetoswapout(td)) {
+ thread_unlock(td);
+ swapclear(p);
+ return (EBUSY);
+ }
+ td->td_flags &= ~TDF_INMEM;
+ TD_SET_SWAPPED(td);
+ thread_unlock(td);
}
-#endif /* INVARIANTS */
+ td = FIRST_THREAD_IN_PROC(p);
+ ++td->td_ru.ru_nswap;
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
- ++p->p_stats->p_ru.ru_nswap;
/*
- * remember the process resident count
+ * This list is stable because all threads are now prevented from
+ * running. The list is only modified in the context of a running
+ * thread in this process.
*/
- p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
-
- p->p_sflag &= ~PS_INMEM;
- p->p_sflag |= PS_SWAPPINGOUT;
- PROC_UNLOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- TD_SET_SWAPPED(td);
- mtx_unlock_spin(&sched_lock);
-
FOREACH_THREAD_IN_PROC(p, td)
vm_thread_swapout(td);
PROC_LOCK(p);
- mtx_lock_spin(&sched_lock);
- p->p_sflag &= ~PS_SWAPPINGOUT;
- p->p_swtime = 0;
+ p->p_flag &= ~P_SWAPPINGOUT;
+ PROC_SLOCK(p);
+ p->p_swtick = ticks;
+ return (0);
}
#endif /* !NO_SWAPPING */
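Alongside the swapout() rework, the process residency state migrates
from p_sflag under sched_lock to p_flag under the proc lock (P_INMEM,
P_SWAPPINGIN, P_SWAPPINGOUT). A residency test now reads (sketch; the
local variable is illustrative):

    PROC_LOCK(p);
    in_mem = (p->p_flag & P_INMEM) != 0;
    PROC_UNLOCK(p);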
Index: uma_dbg.c
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_dbg.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_dbg.c -L sys/vm/uma_dbg.c -u -r1.1.1.1 -r1.2
--- sys/vm/uma_dbg.c
+++ sys/vm/uma_dbg.c
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/uma_dbg.c,v 1.20.2.1 2005/08/20 13:31:05 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/uma_dbg.c,v 1.21 2005/07/16 09:51:52 rwatson Exp $");
#include <sys/param.h>
#include <sys/systm.h>
Index: swap_pager.h
===================================================================
RCS file: /home/cvs/src/sys/vm/swap_pager.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/swap_pager.h -L sys/vm/swap_pager.h -u -r1.1.1.1 -r1.2
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -32,12 +32,44 @@
* SUCH DAMAGE.
*
* from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90
- * $FreeBSD: src/sys/vm/swap_pager.h,v 1.50 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/swap_pager.h,v 1.52 2007/02/07 17:43:11 jhb Exp $
*/
#ifndef _VM_SWAP_PAGER_H_
#define _VM_SWAP_PAGER_H_ 1
+typedef int32_t swblk_t; /*
+ * swap offset. This is the type used to
+ * address the "virtual swap device" and
+ * therefore the maximum swap space is
+ * 2^32 pages.
+ */
+
+struct buf;
+struct swdevt;
+typedef void sw_strategy_t(struct buf *, struct swdevt *);
+typedef void sw_close_t(struct thread *, struct swdevt *);
+
+/*
+ * Swap device table
+ */
+struct swdevt {
+ int sw_flags;
+ int sw_nblks;
+ int sw_used;
+ dev_t sw_dev;
+ struct vnode *sw_vp;
+ void *sw_id;
+ swblk_t sw_first;
+ swblk_t sw_end;
+ struct blist *sw_blist;
+ TAILQ_ENTRY(swdevt) sw_list;
+ sw_strategy_t *sw_strategy;
+ sw_close_t *sw_close;
+};
+
+#define SW_CLOSING 0x04
+
#ifdef _KERNEL
extern int swap_pager_full;
@@ -50,6 +82,7 @@
int swap_pager_isswapped(vm_object_t, struct swdevt *);
int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
void swap_pager_status(int *total, int *used);
+void swapoff_all(void);
#endif /* _KERNEL */
#endif /* _VM_SWAP_PAGER_H_ */
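With struct swdevt and its callback typedefs now public, a swap backend
outside swap_pager.c can supply its own strategy and close hooks. Stub
sketch (the backend name and bodies are hypothetical):

    static void
    myswap_strategy(struct buf *bp, struct swdevt *sp)
    {

            /* issue I/O for a swap block in [sp->sw_first, sp->sw_end) */
    }

    static void
    myswap_close(struct thread *td, struct swdevt *sp)
    {

            /* tear down the backend identified by sp->sw_id */
    }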
Index: vm_page.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_page.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_page.c -L sys/vm/vm_page.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -97,7 +97,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_page.c,v 1.304.2.2 2005/11/13 08:44:25 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_page.c,v 1.357.2.3 2007/11/28 22:23:35 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -117,10 +117,13 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
+#include <machine/md_var.h>
+
/*
* Associated with page of user-allocatable memory is a
* page structure.
@@ -156,6 +159,36 @@
}
/*
+ * vm_page_blacklist_lookup:
+ *
+ * See if a physical address in this page has been listed
+ * in the blacklist tunable. Entries in the tunable are
+ * separated by spaces or commas. If an invalid integer is
+ * encountered then the rest of the string is skipped.
+ */
+static int
+vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
+{
+ vm_paddr_t bad;
+ char *cp, *pos;
+
+ for (pos = list; *pos != '\0'; pos = cp) {
+ bad = strtoq(pos, &cp, 0);
+ if (*cp != '\0') {
+ if (*cp == ' ' || *cp == ',') {
+ cp++;
+ if (cp == pos)
+ continue;
+ } else
+ break;
+ }
+ if (pa == trunc_page(bad))
+ return (1);
+ }
+ return (0);
+}
+
+/*
* vm_page_startup:
*
* Initializes the resident memory module.
@@ -175,10 +208,12 @@
vm_paddr_t pa;
int nblocks;
vm_paddr_t last_pa;
+ char *list;
/* the biggest memory array is the second group of pages */
vm_paddr_t end;
vm_paddr_t biggestsize;
+ vm_paddr_t low_water, high_water;
int biggestone;
vm_paddr_t total;
@@ -194,6 +229,9 @@
phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
}
+ low_water = phys_avail[0];
+ high_water = phys_avail[1];
+
for (i = 0; phys_avail[i + 1]; i += 2) {
vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
@@ -201,6 +239,10 @@
biggestone = i;
biggestsize = size;
}
+ if (phys_avail[i] < low_water)
+ low_water = phys_avail[i];
+ if (phys_avail[i + 1] > high_water)
+ high_water = phys_avail[i + 1];
++nblocks;
total += size;
}
@@ -213,7 +255,7 @@
mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
MTX_RECURSE);
mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
- MTX_SPIN);
+ MTX_DEF);
/*
* Initialize the queue headers for the free queue, the active queue
@@ -232,13 +274,40 @@
bzero((void *)mapped, end - new_end);
uma_startup((void *)mapped, boot_pages);
+#if defined(__amd64__) || defined(__i386__)
+ /*
+ * Allocate a bitmap to indicate that a random physical page
+ * needs to be included in a minidump.
+ *
+ * The amd64 port needs this to indicate which direct map pages
+ * need to be dumped, via calls to dump_add_page()/dump_drop_page().
+ *
+ * However, i386 still needs this workspace internally within the
+ * minidump code. In theory, they are not needed on i386, but are
+ * included should the sf_buf code decide to use them.
+ */
+ page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
+ vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
+ new_end -= vm_page_dump_size;
+ vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
+ new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
+ bzero((void *)vm_page_dump, vm_page_dump_size);
+#endif
/*
* Compute the number of pages of memory that will be available for
* use (taking into account the overhead of a page structure per
* page).
*/
- first_page = phys_avail[0] / PAGE_SIZE;
- page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
+ first_page = low_water / PAGE_SIZE;
+#ifdef VM_PHYSSEG_SPARSE
+ page_range = 0;
+ for (i = 0; phys_avail[i + 1] != 0; i += 2)
+ page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+#elif defined(VM_PHYSSEG_DENSE)
+ page_range = high_water / PAGE_SIZE - first_page;
+#else
+#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
+#endif
npages = (total - (page_range * sizeof(struct vm_page)) -
(end - new_end)) / PAGE_SIZE;
end = new_end;
@@ -256,30 +325,61 @@
mapped = pmap_map(&vaddr, new_end, end,
VM_PROT_READ | VM_PROT_WRITE);
vm_page_array = (vm_page_t) mapped;
+#ifdef __amd64__
+ /*
+ * pmap_map on amd64 comes out of the direct-map, not kvm like i386,
+ * so the pages must be tracked for a crashdump to include this data.
+ * This includes the vm_page_array and the early UMA bootstrap pages.
+ */
+ for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
+ dump_add_page(pa);
+#endif
phys_avail[biggestone + 1] = new_end;
/*
* Clear all of the page structures
*/
bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
+ for (i = 0; i < page_range; i++)
+ vm_page_array[i].order = VM_NFREEORDER;
vm_page_array_size = page_range;
/*
- * Construct the free queue(s) in descending order (by physical
- * address) so that the first 16MB of physical memory is allocated
- * last rather than first. On large-memory machines, this avoids
- * the exhaustion of low physical memory before isa_dma_init has run.
+ * This assertion tests the hypothesis that npages and total are
+ * redundant. XXX
+ */
+ page_range = 0;
+ for (i = 0; phys_avail[i + 1] != 0; i += 2)
+ page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+ KASSERT(page_range == npages,
+ ("vm_page_startup: inconsistent page counts"));
+
+ /*
+ * Initialize the physical memory allocator.
+ */
+ vm_phys_init();
+
+ /*
+ * Add every available physical page that is not blacklisted to
+ * the free lists.
*/
cnt.v_page_count = 0;
cnt.v_free_count = 0;
- for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
+ list = getenv("vm.blacklist");
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
pa = phys_avail[i];
last_pa = phys_avail[i + 1];
- while (pa < last_pa && npages-- > 0) {
- vm_pageq_add_new_page(pa);
+ while (pa < last_pa) {
+ if (list != NULL &&
+ vm_page_blacklist_lookup(list, pa))
+ printf("Skipping page with pa 0x%jx\n",
+ (uintmax_t)pa);
+ else
+ vm_phys_add_page(pa);
pa += PAGE_SIZE;
}
}
+ freeenv(list);
return (vaddr);
}
@@ -304,9 +404,9 @@
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT((m->flags & PG_BUSY) == 0,
+ KASSERT((m->oflags & VPO_BUSY) == 0,
("vm_page_busy: page already busy!!!"));
- vm_page_flag_set(m, PG_BUSY);
+ m->oflags |= VPO_BUSY;
}
/*
@@ -319,8 +419,8 @@
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if (m->flags & PG_WANTED) {
- vm_page_flag_clear(m, PG_WANTED);
+ if (m->oflags & VPO_WANTED) {
+ m->oflags &= ~VPO_WANTED;
wakeup(m);
}
}
@@ -328,7 +428,7 @@
/*
* vm_page_wakeup:
*
- * clear the PG_BUSY flag and wakeup anyone waiting for the
+ * clear the VPO_BUSY flag and wakeup anyone waiting for the
* page.
*
*/
@@ -337,8 +437,8 @@
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
- vm_page_flag_clear(m, PG_BUSY);
+ KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
+ m->oflags &= ~VPO_BUSY;
vm_page_flash(m);
}
@@ -355,7 +455,6 @@
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
m->busy--;
if (m->busy == 0)
vm_page_flash(m);
@@ -382,26 +481,21 @@
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
--mem->hold_count;
KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
- if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
+ if (mem->hold_count == 0 && VM_PAGE_INQUEUE2(mem, PQ_HOLD))
vm_page_free_toq(mem);
}
/*
* vm_page_free:
*
- * Free a page
- *
- * The clearing of PG_ZERO is a temporary safety until the code can be
- * reviewed to determine that PG_ZERO is being properly cleared on
- * write faults or maps. PG_ZERO was previously cleared in
- * vm_page_alloc().
+ * Free a page.
*/
void
vm_page_free(vm_page_t m)
{
- vm_page_flag_clear(m, PG_ZERO);
+
+ m->flags &= ~PG_ZERO;
vm_page_free_toq(m);
- vm_page_zero_idle_wakeup();
}
/*
@@ -412,41 +506,37 @@
void
vm_page_free_zero(vm_page_t m)
{
- vm_page_flag_set(m, PG_ZERO);
+
+ m->flags |= PG_ZERO;
vm_page_free_toq(m);
}
/*
- * vm_page_sleep_if_busy:
+ * vm_page_sleep:
+ *
+ * Sleep and release the page queues lock.
*
- * Sleep and release the page queues lock if PG_BUSY is set or,
- * if also_m_busy is TRUE, busy is non-zero. Returns TRUE if the
- * thread slept and the page queues lock was released.
- * Otherwise, retains the page queues lock and returns FALSE.
+ * The object containing the given page must be locked.
*/
-int
-vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+void
+vm_page_sleep(vm_page_t m, const char *msg)
{
- vm_object_t object;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
- vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
- /*
- * It's possible that while we sleep, the page will get
- * unbusied and freed. If we are holding the object
- * lock, we will assume we hold a reference to the object
- * such that even if m->object changes, we can re-lock
- * it.
- */
- object = m->object;
- VM_OBJECT_UNLOCK(object);
- msleep(m, &vm_page_queue_mtx, PDROP | PVM, msg, 0);
- VM_OBJECT_LOCK(object);
- return (TRUE);
- }
- return (FALSE);
+ if (!mtx_owned(&vm_page_queue_mtx))
+ vm_page_lock_queues();
+ vm_page_flag_set(m, PG_REFERENCED);
+ vm_page_unlock_queues();
+
+ /*
+ * It's possible that while we sleep, the page will get
+ * unbusied and freed. If we are holding the object
+ * lock, we will assume we hold a reference to the object
+ * such that even if m->object changes, we can re-lock
+ * it.
+ */
+ m->oflags |= VPO_WANTED;
+ msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
}
/*
@@ -457,9 +547,9 @@
void
vm_page_dirty(vm_page_t m)
{
- KASSERT(m->queue - m->pc != PQ_CACHE,
+ KASSERT((m->flags & PG_CACHED) == 0,
("vm_page_dirty: page in cache!"));
- KASSERT(m->queue - m->pc != PQ_FREE,
+ KASSERT(!VM_PAGE_IS_FREE(m),
("vm_page_dirty: page is free!"));
m->dirty = VM_PAGE_BITS_ALL;
}
@@ -588,7 +678,7 @@
/*
* Since we are inserting a new and possibly dirty page,
- * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
+ * update the object's OBJ_MIGHTBEDIRTY flag.
*/
if (m->flags & PG_WRITEABLE)
vm_object_set_writeable_dirty(object);
@@ -612,14 +702,14 @@
vm_object_t object;
vm_page_t root;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
if ((object = m->object) == NULL)
return;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if (m->flags & PG_BUSY) {
- vm_page_flag_clear(m, PG_BUSY);
+ if (m->oflags & VPO_BUSY) {
+ m->oflags &= ~VPO_BUSY;
vm_page_flash(m);
}
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
/*
* Now remove from the object's list of backed pages.
@@ -700,48 +790,201 @@
vm_page_remove(m);
vm_page_insert(m, new_object, new_pindex);
- if (m->queue - m->pc == PQ_CACHE)
- vm_page_deactivate(m);
vm_page_dirty(m);
}
/*
- * vm_page_select_cache:
+ * Convert all of the given object's cached pages that have a
+ * pindex within the given range into free pages. If the value
+ * zero is given for "end", then the range's upper bound is
+ * infinity. If the given object is backed by a vnode and it
+ * transitions from having one or more cached pages to none, the
+ * vnode's hold count is reduced.
+ */
+void
+vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+ vm_page_t m, m_next;
+ boolean_t empty;
+
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (__predict_false(object->cache == NULL)) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ return;
+ }
+ m = object->cache = vm_page_splay(start, object->cache);
+ if (m->pindex < start) {
+ if (m->right == NULL)
+ m = NULL;
+ else {
+ m_next = vm_page_splay(start, m->right);
+ m_next->left = m;
+ m->right = NULL;
+ m = object->cache = m_next;
+ }
+ }
+
+ /*
+ * At this point, "m" is either (1) a reference to the page
+ * with the least pindex that is greater than or equal to
+ * "start" or (2) NULL.
+ */
+ for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
+ /*
+ * Find "m"'s successor and remove "m" from the
+ * object's cache.
+ */
+ if (m->right == NULL) {
+ object->cache = m->left;
+ m_next = NULL;
+ } else {
+ m_next = vm_page_splay(start, m->right);
+ m_next->left = m->left;
+ object->cache = m_next;
+ }
+ /* Convert "m" to a free page. */
+ m->object = NULL;
+ m->valid = 0;
+ /* Clear PG_CACHED and set PG_FREE. */
+ m->flags ^= PG_CACHED | PG_FREE;
+ KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+ ("vm_page_cache_free: page %p has inconsistent flags", m));
+ cnt.v_cache_count--;
+ cnt.v_free_count++;
+ }
+ empty = object->cache == NULL;
+ mtx_unlock(&vm_page_queue_free_mtx);
+ if (object->type == OBJT_VNODE && empty)
+ vdrop(object->handle);
+}
+
+/*
+ * Returns the cached page that is associated with the given
+ * object and offset. If, however, none exists, returns NULL.
*
- * Move a page of the given color from the cache queue to the free
- * queue. As pages might be found, but are not applicable, they are
- * deactivated.
+ * The free page queue must be locked.
+ */
+static inline vm_page_t
+vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
+{
+ vm_page_t m;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ if ((m = object->cache) != NULL && m->pindex != pindex) {
+ m = vm_page_splay(pindex, m);
+ if ((object->cache = m)->pindex != pindex)
+ m = NULL;
+ }
+ return (m);
+}
+
+/*
+ * Remove the given cached page from its containing object's
+ * collection of cached pages.
*
- * This routine may not block.
+ * The free page queue must be locked.
*/
-vm_page_t
-vm_page_select_cache(int color)
+void
+vm_page_cache_remove(vm_page_t m)
{
vm_object_t object;
- vm_page_t m;
- boolean_t was_trylocked;
+ vm_page_t root;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) {
- KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
- KASSERT(!pmap_page_is_mapped(m),
- ("Found mapped cache page %p", m));
- KASSERT((m->flags & PG_UNMANAGED) == 0,
- ("Found unmanaged cache page %p", m));
- KASSERT(m->wire_count == 0, ("Found wired cache page %p", m));
- if (m->hold_count == 0 && (object = m->object,
- (was_trylocked = VM_OBJECT_TRYLOCK(object)) ||
- VM_OBJECT_LOCKED(object))) {
- KASSERT((m->flags & PG_BUSY) == 0 && m->busy == 0,
- ("Found busy cache page %p", m));
- vm_page_free(m);
- if (was_trylocked)
- VM_OBJECT_UNLOCK(object);
- break;
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ KASSERT((m->flags & PG_CACHED) != 0,
+ ("vm_page_cache_remove: page %p is not cached", m));
+ object = m->object;
+ if (m != object->cache) {
+ root = vm_page_splay(m->pindex, object->cache);
+ KASSERT(root == m,
+ ("vm_page_cache_remove: page %p is not cached in object %p",
+ m, object));
+ }
+ if (m->left == NULL)
+ root = m->right;
+ else if (m->right == NULL)
+ root = m->left;
+ else {
+ root = vm_page_splay(m->pindex, m->left);
+ root->right = m->right;
+ }
+ object->cache = root;
+ m->object = NULL;
+ cnt.v_cache_count--;
+}
+
+/*
+ * Transfer all of the cached pages with offset greater than or
+ * equal to 'offidxstart' from the original object's cache to the
+ * new object's cache. However, any cached pages with offset
+ * greater than or equal to the new object's size are kept in the
+ * original object. Initially, the new object's cache must be
+ * empty. Offset 'offidxstart' in the original object must
+ * correspond to offset zero in the new object.
+ *
+ * The new object must be locked.
+ */
+void
+vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
+ vm_object_t new_object)
+{
+ vm_page_t m, m_next;
+
+ /*
+ * Insertion into an object's collection of cached pages
+ * requires the object to be locked. In contrast, removal does
+ * not.
+ */
+ VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
+ KASSERT(new_object->cache == NULL,
+ ("vm_page_cache_transfer: object %p has cached pages",
+ new_object));
+ mtx_lock(&vm_page_queue_free_mtx);
+ if ((m = orig_object->cache) != NULL) {
+ /*
+ * Transfer all of the pages with offset greater than or
+ * equal to 'offidxstart' from the original object's
+ * cache to the new object's cache.
+ */
+ m = vm_page_splay(offidxstart, m);
+ if (m->pindex < offidxstart) {
+ orig_object->cache = m;
+ new_object->cache = m->right;
+ m->right = NULL;
+ } else {
+ orig_object->cache = m->left;
+ new_object->cache = m;
+ m->left = NULL;
}
- vm_page_deactivate(m);
+ while ((m = new_object->cache) != NULL) {
+ if ((m->pindex - offidxstart) >= new_object->size) {
+ /*
+ * Return all of the cached pages with
+ * offset greater than or equal to the
+ * new object's size to the original
+ * object's cache.
+ */
+ new_object->cache = m->left;
+ m->left = orig_object->cache;
+ orig_object->cache = m;
+ break;
+ }
+ m_next = vm_page_splay(m->pindex, m->right);
+ /* Update the page's object and offset. */
+ m->object = new_object;
+ m->pindex -= offidxstart;
+ if (m_next == NULL)
+ break;
+ m->right = NULL;
+ m_next->left = m;
+ new_object->cache = m_next;
+ }
+ KASSERT(new_object->cache == NULL ||
+ new_object->type == OBJT_SWAP,
+ ("vm_page_cache_transfer: object %p's type is incompatible"
+ " with cached pages", new_object));
}
- return (m);
+ mtx_unlock(&vm_page_queue_free_mtx);
}
/*
@@ -757,16 +1000,14 @@
* VM_ALLOC_ZERO zero page
*
* This routine may not block.
- *
- * Additional special handling is required when called from an
- * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with
- * the page cache in this case.
*/
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
- vm_page_t m = NULL;
- int color, flags, page_req;
+ struct vnode *vp = NULL;
+ vm_object_t m_object;
+ vm_page_t m;
+ int flags, page_req;
page_req = req & VM_ALLOC_CLASS_MASK;
KASSERT(curthread->td_intr_nesting_level == 0 ||
@@ -777,9 +1018,7 @@
KASSERT(object != NULL,
("vm_page_alloc: NULL object."));
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- color = (pindex + object->pg_color) & PQ_L2_MASK;
- } else
- color = pindex & PQ_L2_MASK;
+ }
/*
* The pager is allowed to eat deeper into the free page list.
@@ -788,43 +1027,35 @@
page_req = VM_ALLOC_SYSTEM;
};
-loop:
- mtx_lock_spin(&vm_page_queue_free_mtx);
- if (cnt.v_free_count > cnt.v_free_reserved ||
+ mtx_lock(&vm_page_queue_free_mtx);
+ if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
(page_req == VM_ALLOC_SYSTEM &&
- cnt.v_cache_count == 0 &&
- cnt.v_free_count > cnt.v_interrupt_free_min) ||
- (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) {
+ cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+ (page_req == VM_ALLOC_INTERRUPT &&
+ cnt.v_free_count + cnt.v_cache_count > 0)) {
/*
* Allocate from the free queue if the number of free pages
* exceeds the minimum for the request class.
*/
- m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0);
- } else if (page_req != VM_ALLOC_INTERRUPT) {
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- /*
- * Allocatable from cache (non-interrupt only). On success,
- * we must free the page and try again, thus ensuring that
- * cnt.v_*_free_min counters are replenished.
- */
- vm_page_lock_queues();
- if ((m = vm_page_select_cache(color)) == NULL) {
-#if defined(DIAGNOSTIC)
- if (cnt.v_cache_count > 0)
- printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
-#endif
- vm_page_unlock_queues();
- atomic_add_int(&vm_pageout_deficit, 1);
- pagedaemon_wakeup();
+ if (object != NULL &&
+ (m = vm_page_cache_lookup(object, pindex)) != NULL) {
+ if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ return (NULL);
+ }
+ vm_phys_unfree_page(m);
+ vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
+ } else if ((req & VM_ALLOC_IFCACHED) != 0) {
+ mtx_unlock(&vm_page_queue_free_mtx);
return (NULL);
- }
- vm_page_unlock_queues();
- goto loop;
+ } else
+ m = vm_phys_alloc_pages(object != NULL ?
+ VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
} else {
/*
- * Not allocatable from cache from interrupt, give up.
+ * Not allocatable, give up.
*/
- mtx_unlock_spin(&vm_page_queue_free_mtx);
+ mtx_unlock(&vm_page_queue_free_mtx);
atomic_add_int(&vm_pageout_deficit, 1);
pagedaemon_wakeup();
return (NULL);
@@ -838,24 +1069,41 @@
m != NULL,
("vm_page_alloc(): missing page on free queue")
);
-
- /*
- * Remove from free queue
- */
- vm_pageq_remove_nowakeup(m);
+ if ((m->flags & PG_CACHED) != 0) {
+ KASSERT(m->valid != 0,
+ ("vm_page_alloc: cached page %p is invalid", m));
+ if (m->object == object && m->pindex == pindex)
+ cnt.v_reactivated++;
+ else
+ m->valid = 0;
+ m_object = m->object;
+ vm_page_cache_remove(m);
+ if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+ vp = m_object->handle;
+ } else {
+ KASSERT(VM_PAGE_IS_FREE(m),
+ ("vm_page_alloc: page %p is not free", m));
+ KASSERT(m->valid == 0,
+ ("vm_page_alloc: free page %p is valid", m));
+ cnt.v_free_count--;
+ }
/*
* Initialize structure. Only the PG_ZERO flag is inherited.
*/
- flags = PG_BUSY;
+ flags = 0;
if (m->flags & PG_ZERO) {
vm_page_zero_count--;
if (req & VM_ALLOC_ZERO)
- flags = PG_ZERO | PG_BUSY;
+ flags = PG_ZERO;
}
- if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
- flags &= ~PG_BUSY;
+ if (object == NULL || object->type == OBJT_PHYS)
+ flags |= PG_UNMANAGED;
m->flags = flags;
+ if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
+ m->oflags = 0;
+ else
+ m->oflags = VPO_BUSY;
if (req & VM_ALLOC_WIRED) {
atomic_add_int(&cnt.v_wire_count, 1);
m->wire_count = 1;
@@ -864,9 +1112,8 @@
m->hold_count = 0;
m->act_count = 0;
m->busy = 0;
- m->valid = 0;
KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
- mtx_unlock_spin(&vm_page_queue_free_mtx);
+ mtx_unlock(&vm_page_queue_free_mtx);
if ((req & VM_ALLOC_NOOBJ) == 0)
vm_page_insert(m, object, pindex);
@@ -874,6 +1121,15 @@
m->pindex = pindex;
/*
+ * The following call to vdrop() must come after the above call
+ * to vm_page_insert() in case both affect the same object and
+ * vnode. Otherwise, the affected vnode's hold count could
+ * temporarily become zero.
+ */
+ if (vp != NULL)
+ vdrop(vp);
+
+ /*
* Don't wakeup too often - wakeup the pageout daemon when
* we would be nearly out of memory.
*/
@@ -893,17 +1149,17 @@
vm_wait(void)
{
- vm_page_lock_queues();
+ mtx_lock(&vm_page_queue_free_mtx);
if (curproc == pageproc) {
vm_pageout_pages_needed = 1;
- msleep(&vm_pageout_pages_needed, &vm_page_queue_mtx,
+ msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
PDROP | PSWP, "VMWait", 0);
} else {
if (!vm_pages_needed) {
vm_pages_needed = 1;
wakeup(&vm_pages_needed);
}
- msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PVM,
+ msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
"vmwait", 0);
}
}
@@ -922,12 +1178,12 @@
vm_waitpfault(void)
{
- vm_page_lock_queues();
+ mtx_lock(&vm_page_queue_free_mtx);
if (!vm_pages_needed) {
vm_pages_needed = 1;
wakeup(&vm_pages_needed);
}
- msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PUSER,
+ msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
"pfault", 0);
}
@@ -946,9 +1202,7 @@
{
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- if (m->queue != PQ_ACTIVE) {
- if ((m->queue - m->pc) == PQ_CACHE)
- cnt.v_reactivated++;
+ if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
vm_pageq_remove(m);
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
if (m->act_count < ACT_INIT)
@@ -971,11 +1225,11 @@
* The page queues must be locked.
* This routine may not block.
*/
-static __inline void
+static inline void
vm_page_free_wakeup(void)
{
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
/*
* if pageout daemon needs pages, then tell it that there are
* some free.
@@ -999,7 +1253,7 @@
/*
* vm_page_free_toq:
*
- * Returns the given page to the PQ_FREE list,
+ * Returns the given page to the free list,
* disassociating it with any VM object.
*
* Object and page must be locked prior to entry.
@@ -1009,17 +1263,19 @@
void
vm_page_free_toq(vm_page_t m)
{
- struct vpgqueues *pq;
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- cnt.v_tfree++;
+ if (VM_PAGE_GETQUEUE(m) != PQ_NONE)
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ KASSERT(!pmap_page_is_mapped(m),
+ ("vm_page_free_toq: freeing mapped page %p", m));
+ PCPU_INC(cnt.v_tfree);
- if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
+ if (m->busy || VM_PAGE_IS_FREE(m)) {
printf(
- "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
- (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
+ "vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n",
+ (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0,
m->hold_count);
- if ((m->queue - m->pc) == PQ_FREE)
+ if (VM_PAGE_IS_FREE(m))
panic("vm_page_free: freeing free page");
else
panic("vm_page_free: freeing busy page");
@@ -1031,7 +1287,7 @@
* callback routine until after we've put the page on the
* appropriate free queue.
*/
- vm_pageq_remove_nowakeup(m);
+ vm_pageq_remove(m);
vm_page_remove(m);
/*
@@ -1052,66 +1308,23 @@
}
panic("vm_page_free: freeing wired page");
}
-
- /*
- * Clear the UNMANAGED flag when freeing an unmanaged page.
- */
- if (m->flags & PG_UNMANAGED) {
- m->flags &= ~PG_UNMANAGED;
- }
-
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
- m->queue = PQ_HOLD;
- } else
- m->queue = PQ_FREE + m->pc;
- pq = &vm_page_queues[m->queue];
- mtx_lock_spin(&vm_page_queue_free_mtx);
- pq->lcnt++;
- ++(*pq->cnt);
-
- /*
- * Put zero'd pages on the end ( where we look for zero'd pages
- * first ) and non-zerod pages at the head.
- */
- if (m->flags & PG_ZERO) {
- TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
- ++vm_page_zero_count;
+ vm_pageq_enqueue(PQ_HOLD, m);
} else {
- TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
- }
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- vm_page_free_wakeup();
-}
-
-/*
- * vm_page_unmanage:
- *
- * Prevent PV management from being done on the page. The page is
- * removed from the paging queues as if it were wired, and as a
- * consequence of no longer being managed the pageout daemon will not
- * touch it (since there is no way to locate the pte mappings for the
- * page). madvise() calls that mess with the pmap will also no longer
- * operate on the page.
- *
- * Beyond that the page is still reasonably 'normal'. Freeing the page
- * will clear the flag.
- *
- * This routine is used by OBJT_PHYS objects - objects using unswappable
- * physical memory as backing store rather then swap-backed memory and
- * will eventually be extended to support 4MB unmanaged physical
- * mappings.
- */
-void
-vm_page_unmanage(vm_page_t m)
-{
-
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- if ((m->flags & PG_UNMANAGED) == 0) {
- if (m->wire_count == 0)
- vm_pageq_remove(m);
+ m->flags |= PG_FREE;
+ mtx_lock(&vm_page_queue_free_mtx);
+ cnt.v_free_count++;
+ if ((m->flags & PG_ZERO) != 0) {
+ vm_phys_free_pages(m, 0);
+ ++vm_page_zero_count;
+ } else {
+ vm_phys_free_pages(m, 0);
+ vm_page_zero_idle_wakeup();
+ }
+ vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
}
- vm_page_flag_set(m, PG_UNMANAGED);
}
/*
@@ -1209,7 +1422,7 @@
*
* This routine may not block.
*/
-static __inline void
+static inline void
_vm_page_deactivate(vm_page_t m, int athead)
{
@@ -1218,19 +1431,16 @@
/*
* Ignore if already inactive.
*/
- if (m->queue == PQ_INACTIVE)
+ if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
return;
if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
- if ((m->queue - m->pc) == PQ_CACHE)
- cnt.v_reactivated++;
vm_page_flag_clear(m, PG_WINATCFLS);
vm_pageq_remove(m);
if (athead)
TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
else
TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
- m->queue = PQ_INACTIVE;
- vm_page_queues[PQ_INACTIVE].lcnt++;
+ VM_PAGE_SETQUEUE2(m, PQ_INACTIVE);
cnt.v_inactive_count++;
}
}
@@ -1253,7 +1463,7 @@
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
- (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+ (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
return (0);
}
pmap_remove_all(m);
@@ -1277,7 +1487,7 @@
if (m->object != NULL)
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
if (m->dirty || m->hold_count || m->busy || m->wire_count ||
- (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+ (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
return (0);
}
pmap_remove_all(m);
@@ -1297,29 +1507,99 @@
void
vm_page_cache(vm_page_t m)
{
+ vm_object_t object;
+ vm_page_t root;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
+ object = m->object;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
m->hold_count || m->wire_count) {
- printf("vm_page_cache: attempting to cache busy page\n");
- return;
+ panic("vm_page_cache: attempting to cache busy page");
}
- if ((m->queue - m->pc) == PQ_CACHE)
+ pmap_remove_all(m);
+ if (m->dirty != 0)
+ panic("vm_page_cache: page %p is dirty", m);
+ if (m->valid == 0 || object->type == OBJT_DEFAULT ||
+ (object->type == OBJT_SWAP &&
+ !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
+ /*
+	 * Hypothesis: A cache-eligible page belonging to a
+ * default object or swap object but without a backing
+ * store must be zero filled.
+ */
+ vm_page_free(m);
return;
+ }
+ KASSERT((m->flags & PG_CACHED) == 0,
+ ("vm_page_cache: page %p is already cached", m));
+ cnt.v_tcached++;
/*
- * Remove all pmaps and indicate that the page is not
- * writeable or mapped.
+ * Remove the page from the paging queues.
*/
- pmap_remove_all(m);
- if (m->dirty != 0) {
- panic("vm_page_cache: caching a dirty page, pindex: %ld",
- (long)m->pindex);
+ vm_pageq_remove(m);
+
+ /*
+ * Remove the page from the object's collection of resident
+ * pages.
+ */
+ if (m != object->root)
+ vm_page_splay(m->pindex, object->root);
+ if (m->left == NULL)
+ root = m->right;
+ else {
+ root = vm_page_splay(m->pindex, m->left);
+ root->right = m->right;
+ }
+ object->root = root;
+ TAILQ_REMOVE(&object->memq, m, listq);
+ object->resident_page_count--;
+ object->generation++;
+
+ /*
+ * Insert the page into the object's collection of cached pages
+ * and the physical memory allocator's cache/free page queues.
+ */
+ vm_page_flag_set(m, PG_CACHED);
+ vm_page_flag_clear(m, PG_ZERO);
+ mtx_lock(&vm_page_queue_free_mtx);
+ vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
+ cnt.v_cache_count++;
+ root = object->cache;
+ if (root == NULL) {
+ m->left = NULL;
+ m->right = NULL;
+ } else {
+ root = vm_page_splay(m->pindex, root);
+ if (m->pindex < root->pindex) {
+ m->left = root->left;
+ m->right = root;
+ root->left = NULL;
+ } else if (__predict_false(m->pindex == root->pindex))
+ panic("vm_page_cache: offset already cached");
+ else {
+ m->right = root->right;
+ m->left = root;
+ root->right = NULL;
+ }
}
- vm_pageq_remove_nowakeup(m);
- vm_pageq_enqueue(PQ_CACHE + m->pc, m);
+ object->cache = m;
+ vm_phys_free_pages(m, 0);
vm_page_free_wakeup();
+ mtx_unlock(&vm_page_queue_free_mtx);
+
+ /*
+ * Increment the vnode's hold count if this is the object's only
+ * cached page. Decrement the vnode's hold count if this was
+ * the object's only resident page.
+ */
+ if (object->type == OBJT_VNODE) {
+ if (root == NULL && object->resident_page_count != 0)
+ vhold(object->handle);
+ else if (root != NULL && object->resident_page_count == 0)
+ vdrop(object->handle);
+ }
}
/*
@@ -1357,9 +1637,7 @@
 * occasionally leave the page alone
*/
if ((dnw & 0x01F0) == 0 ||
- m->queue == PQ_INACTIVE ||
- m->queue - m->pc == PQ_CACHE
- ) {
+ VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) {
if (m->act_count >= ACT_INIT)
--m->act_count;
return;
@@ -1400,21 +1678,18 @@
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
retrylookup:
if ((m = vm_page_lookup(object, pindex)) != NULL) {
- vm_page_lock_queues();
- if (m->busy || (m->flags & PG_BUSY)) {
- vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
- VM_OBJECT_UNLOCK(object);
- msleep(m, &vm_page_queue_mtx, PDROP | PVM, "pgrbwt", 0);
- VM_OBJECT_LOCK(object);
+ if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) {
if ((allocflags & VM_ALLOC_RETRY) == 0)
return (NULL);
goto retrylookup;
} else {
- if (allocflags & VM_ALLOC_WIRED)
+ if ((allocflags & VM_ALLOC_WIRED) != 0) {
+ vm_page_lock_queues();
vm_page_wire(m);
+ vm_page_unlock_queues();
+ }
if ((allocflags & VM_ALLOC_NOBUSY) == 0)
vm_page_busy(m);
- vm_page_unlock_queues();
return (m);
}
}
@@ -1426,7 +1701,8 @@
if ((allocflags & VM_ALLOC_RETRY) == 0)
return (NULL);
goto retrylookup;
- }
+ } else if (m->valid != 0)
+ return (m);
if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
return (m);
@@ -1438,7 +1714,7 @@
*
* Inputs are required to range within a page.
*/
-__inline int
+int
vm_page_bits(int base, int size)
{
int first_bit;
@@ -1505,7 +1781,7 @@
/*
* Set valid, clear dirty bits. If validating the entire
* page we can safely clear the pmap modify bit. We also
- * use this opportunity to clear the PG_NOSYNC flag. If a process
+ * use this opportunity to clear the VPO_NOSYNC flag. If a process
* takes a write fault on a MAP_NOSYNC memory area the flag will
* be set again.
*
@@ -1528,7 +1804,7 @@
m->dirty &= ~pagebits;
if (base == 0 && size == PAGE_SIZE) {
pmap_clear_modify(m);
- vm_page_flag_clear(m, PG_NOSYNC);
+ m->oflags &= ~VPO_NOSYNC;
}
}
@@ -1556,6 +1832,8 @@
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
bits = vm_page_bits(base, size);
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
+ pmap_remove_all(m);
m->valid &= ~bits;
m->dirty &= ~bits;
m->object->generation++;
@@ -1640,6 +1918,14 @@
int so_zerocp_fullpage = 0;
+/*
+ * Replace the given page with a copy. The copied page assumes
+ * the portion of the given page's "wire_count" that is not the
+ * responsibility of this copy-on-write mechanism.
+ *
+ * The object containing the given page must have a non-zero
+ * paging-in-progress count and be locked.
+ */
void
vm_page_cowfault(vm_page_t m)
{
@@ -1648,20 +1934,32 @@
vm_pindex_t pindex;
object = m->object;
+ VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ KASSERT(object->paging_in_progress != 0,
+ ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
+ object));
pindex = m->pindex;
retry_alloc:
pmap_remove_all(m);
vm_page_remove(m);
- mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
+ mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
if (mnew == NULL) {
vm_page_insert(m, object, pindex);
vm_page_unlock_queues();
VM_OBJECT_UNLOCK(object);
VM_WAIT;
VM_OBJECT_LOCK(object);
- vm_page_lock_queues();
- goto retry_alloc;
+ if (m == vm_page_lookup(object, pindex)) {
+ vm_page_lock_queues();
+ goto retry_alloc;
+ } else {
+ /*
+ * Page disappeared during the wait.
+ */
+ vm_page_lock_queues();
+ return;
+ }
}
if (m->cow == 0) {
@@ -1677,7 +1975,6 @@
pmap_copy_page(m, mnew);
mnew->valid = VM_PAGE_BITS_ALL;
vm_page_dirty(mnew);
- vm_page_flag_clear(mnew, PG_BUSY);
mnew->wire_count = m->wire_count - m->cow;
m->wire_count = m->cow;
}
@@ -1705,7 +2002,7 @@
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
m->cow++;
- pmap_page_protect(m, VM_PROT_READ);
+ pmap_remove_write(m);
}
#include "opt_ddb.h"
@@ -1730,21 +2027,17 @@
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
- int i;
+
db_printf("PQ_FREE:");
- for (i = 0; i < PQ_L2_SIZE; i++) {
- db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
- }
+ db_printf(" %d", cnt.v_free_count);
db_printf("\n");
db_printf("PQ_CACHE:");
- for (i = 0; i < PQ_L2_SIZE; i++) {
- db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
- }
+ db_printf(" %d", cnt.v_cache_count);
db_printf("\n");
db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
- vm_page_queues[PQ_ACTIVE].lcnt,
- vm_page_queues[PQ_INACTIVE].lcnt);
+ *vm_page_queues[PQ_ACTIVE].cnt,
+ *vm_page_queues[PQ_INACTIVE].cnt);
}
#endif /* DDB */
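Two user-visible pieces of the vm_page.c rework deserve a note. First, the vm.blacklist tunable parsed by vm_page_blacklist_lookup() is a space- or comma-separated list of physical addresses, e.g. a hypothetical loader.conf entry vm.blacklist="0x7f63000,0x12a4000". Second, the new VM_ALLOC_IFCACHED/VM_ALLOC_IFNOTCACHED classes let a caller express a preference about reusing a cached copy of the page. A sketch (not from the patch) of the fallback pattern:

static vm_page_t
example_alloc(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	/* Succeeds only if a cached copy of the page already exists. */
	m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_IFCACHED);
	if (m == NULL)
		m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
	VM_OBJECT_UNLOCK(object);
	return (m);
}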
Index: swap_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/swap_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/swap_pager.c -L sys/vm/swap_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.273.2.1 2005/08/20 06:07:55 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.295 2007/08/05 21:04:32 alc Exp $");
#include "opt_mac.h"
#include "opt_swap.h"
@@ -77,6 +77,7 @@
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
@@ -85,7 +86,6 @@
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
-#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
@@ -94,6 +94,8 @@
#include <sys/sx.h>
#include <sys/vmmeter.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
@@ -136,37 +138,6 @@
#define SWAP_META_PAGES (SWB_NPAGES * 2)
#define SWAP_META_MASK (SWAP_META_PAGES - 1)
-typedef int32_t swblk_t; /*
- * swap offset. This is the type used to
- * address the "virtual swap device" and
- * therefore the maximum swap space is
- * 2^32 pages.
- */
-
-struct swdevt;
-typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw);
-typedef void sw_close_t(struct thread *td, struct swdevt *sw);
-
-/*
- * Swap device table
- */
-struct swdevt {
- int sw_flags;
- int sw_nblks;
- int sw_used;
- dev_t sw_dev;
- struct vnode *sw_vp;
- void *sw_id;
- swblk_t sw_first;
- swblk_t sw_end;
- struct blist *sw_blist;
- TAILQ_ENTRY(swdevt) sw_list;
- sw_strategy_t *sw_strategy;
- sw_close_t *sw_close;
-};
-
-#define SW_CLOSING 0x04
-
struct swblock {
struct swblock *swb_hnext;
vm_object_t swb_object;
@@ -266,6 +237,7 @@
static void swp_pager_async_iodone(struct buf *bp);
static int swapongeom(struct thread *, struct vnode *);
static int swaponvp(struct thread *, struct vnode *, u_long);
+static int swapoff_one(struct swdevt *sp, struct thread *td);
/*
* Swap bitmap functions
@@ -487,9 +459,7 @@
sx_xlock(&sw_alloc_sx);
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
- if (object != NULL) {
- vm_object_reference(object);
- } else {
+ if (object == NULL) {
object = vm_object_allocate(OBJT_DEFAULT, pindex);
object->handle = handle;
@@ -1055,20 +1025,18 @@
bp->b_pager.pg_reqpage = reqpage - i;
VM_OBJECT_LOCK(object);
- vm_page_lock_queues();
{
int k;
for (k = i; k < j; ++k) {
bp->b_pages[k - i] = m[k];
- vm_page_flag_set(m[k], PG_SWAPINPROG);
+ m[k]->oflags |= VPO_SWAPINPROG;
}
}
- vm_page_unlock_queues();
bp->b_npages = j - i;
- cnt.v_swapin++;
- cnt.v_swappgsin += bp->b_npages;
+ PCPU_INC(cnt.v_swapin);
+ PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
/*
* We still hold the lock on mreq, and our automatic completion routine
@@ -1092,23 +1060,24 @@
swp_pager_strategy(bp);
/*
- * wait for the page we want to complete. PG_SWAPINPROG is always
+ * wait for the page we want to complete. VPO_SWAPINPROG is always
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
* is set in the meta-data.
*/
- vm_page_lock_queues();
- while ((mreq->flags & PG_SWAPINPROG) != 0) {
- vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
- cnt.v_intrans++;
- if (msleep(mreq, &vm_page_queue_mtx, PSWP, "swread", hz*20)) {
+ VM_OBJECT_LOCK(object);
+ while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
+ mreq->oflags |= VPO_WANTED;
+ vm_page_lock_queues();
+ vm_page_flag_set(mreq, PG_REFERENCED);
+ vm_page_unlock_queues();
+ PCPU_INC(cnt.v_intrans);
+ if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
}
}
- vm_page_unlock_queues();
- VM_OBJECT_LOCK(object);
/*
* mreq is left busied after completion, but all the other pages
* are freed. If we had an unrecoverable read error the page will
@@ -1157,7 +1126,6 @@
int i;
int n = 0;
- GIANT_REQUIRED;
if (count && m[0]->object != object) {
panic("swap_pager_getpages: object mismatch %p/%p",
object,
@@ -1281,9 +1249,7 @@
vm_page_dirty(mreq);
rtvals[i+j] = VM_PAGER_OK;
- vm_page_lock_queues();
- vm_page_flag_set(mreq, PG_SWAPINPROG);
- vm_page_unlock_queues();
+ mreq->oflags |= VPO_SWAPINPROG;
bp->b_pages[j] = mreq;
}
VM_OBJECT_UNLOCK(object);
@@ -1294,8 +1260,8 @@
bp->b_dirtyoff = 0;
bp->b_dirtyend = bp->b_bcount;
- cnt.v_swapout++;
- cnt.v_swappgsout += bp->b_npages;
+ PCPU_INC(cnt.v_swapout);
+ PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
/*
* asynchronous
@@ -1398,7 +1364,7 @@
for (i = 0; i < bp->b_npages; ++i) {
vm_page_t m = bp->b_pages[i];
- vm_page_flag_clear(m, PG_SWAPINPROG);
+ m->oflags &= ~VPO_SWAPINPROG;
if (bp->b_ioflags & BIO_ERROR) {
/*
@@ -1417,17 +1383,12 @@
* not match anything ).
*
* We have to wake specifically requested pages
- * up too because we cleared PG_SWAPINPROG and
+ * up too because we cleared VPO_SWAPINPROG and
* someone may be waiting for that.
*
* NOTE: for reads, m->dirty will probably
* be overridden by the original caller of
* getpages so don't play cute tricks here.
- *
- * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
- * AS THIS MESSES WITH object->memq, and it is
- * not legal to mess with object->memq from an
- * interrupt.
*/
m->valid = 0;
if (i != bp->b_pager.pg_reqpage)
@@ -1476,7 +1437,7 @@
/*
* We have to wake specifically requested pages
- * up too because we cleared PG_SWAPINPROG and
+ * up too because we cleared VPO_SWAPINPROG and
* could be waiting for it in getpages. However,
* be sure to not unbusy getpages specifically
* requested page - getpages expects it to be
@@ -1512,6 +1473,15 @@
VM_OBJECT_UNLOCK(object);
}
+ /*
+ * swapdev_strategy() manually sets b_vp and b_bufobj before calling
+ * bstrategy(). Set them back to NULL now we're done with it, or we'll
+ * trigger a KASSERT in relpbuf().
+ */
+ if (bp->b_vp) {
+ bp->b_vp = NULL;
+ bp->b_bufobj = NULL;
+ }
/*
* release the physical I/O buffer
*/
@@ -1579,7 +1549,7 @@
* XXX - The code to page the whole block in doesn't work, so we
* revert to the one-by-one behavior for now. Sigh.
*/
-static __inline void
+static inline void
swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
@@ -1591,8 +1561,8 @@
vm_page_lock_queues();
vm_page_activate(m);
vm_page_dirty(m);
- vm_page_wakeup(m);
vm_page_unlock_queues();
+ vm_page_wakeup(m);
vm_pager_page_unswapped(m);
return;
}
@@ -1603,8 +1573,8 @@
vm_page_lock_queues();
vm_page_dirty(m);
vm_page_dontneed(m);
- vm_page_wakeup(m);
vm_page_unlock_queues();
+ vm_page_wakeup(m);
vm_pager_page_unswapped(m);
}
@@ -1653,7 +1623,6 @@
}
mtx_unlock(&swhash_mtx);
if (sp->sw_used) {
- int dummy;
/*
* Objects may be locked or paging to the device being
* removed, so we will miss their pages and need to
@@ -1665,7 +1634,7 @@
panic("swapoff: failed to locate %d swap blocks",
sp->sw_used);
}
- tsleep(&dummy, PVM, "swpoff", hz / 20);
+ pause("swpoff", hz / 20);
goto full_rescan;
}
}
@@ -1742,6 +1711,8 @@
if (swap == NULL) {
mtx_unlock(&swhash_mtx);
VM_OBJECT_UNLOCK(object);
+ if (uma_zone_exhausted(swap_zone))
+ printf("swap zone exhausted, increase kern.maxswzone\n");
VM_WAIT;
VM_OBJECT_LOCK(object);
goto retry;
@@ -1963,11 +1934,11 @@
struct nameidata nd;
int error;
- mtx_lock(&Giant);
- error = suser(td);
+ error = priv_check(td, PRIV_SWAPON);
if (error)
- goto done2;
+ return (error);
+ mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
swdev_syscall_active = 1;
@@ -1981,7 +1952,8 @@
goto done;
}
- NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW, UIO_USERSPACE, uap->name, td);
+ NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->name, td);
error = namei(&nd);
if (error)
goto done;
@@ -2006,7 +1978,6 @@
done:
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
-done2:
mtx_unlock(&Giant);
return (error);
}
@@ -2100,20 +2071,19 @@
struct vnode *vp;
struct nameidata nd;
struct swdevt *sp;
- u_long nblks, dvbase;
int error;
- mtx_lock(&Giant);
-
- error = suser(td);
+ error = priv_check(td, PRIV_SWAPOFF);
if (error)
- goto done2;
+ return (error);
+ mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
+ td);
error = namei(&nd);
if (error)
goto done;
@@ -2123,21 +2093,37 @@
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_vp == vp)
- goto found;
+ break;
}
mtx_unlock(&sw_dev_mtx);
- error = EINVAL;
- goto done;
-found:
- mtx_unlock(&sw_dev_mtx);
+ if (sp == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ error = swapoff_one(sp, td);
+done:
+ swdev_syscall_active = 0;
+ wakeup_one(&swdev_syscall_active);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+swapoff_one(struct swdevt *sp, struct thread *td)
+{
+ u_long nblks, dvbase;
#ifdef MAC
- (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- error = mac_check_system_swapoff(td->td_ucred, vp);
- (void) VOP_UNLOCK(vp, 0, td);
+ int error;
+#endif
+
+ mtx_assert(&Giant, MA_OWNED);
+#ifdef MAC
+ (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = mac_check_system_swapoff(td->td_ucred, sp->sw_vp);
+ (void) VOP_UNLOCK(sp->sw_vp, 0, td);
if (error != 0)
- goto done;
+ return (error);
#endif
-
nblks = sp->sw_nblks;
/*
@@ -2148,8 +2134,7 @@
*/
if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
nblks + nswap_lowat) {
- error = ENOMEM;
- goto done;
+ return (ENOMEM);
}
/*
@@ -2182,13 +2167,42 @@
mtx_unlock(&sw_dev_mtx);
blist_destroy(sp->sw_blist);
free(sp, M_VMPGDATA);
+ return (0);
+}
-done:
+void
+swapoff_all(void)
+{
+ struct swdevt *sp, *spt;
+ const char *devname;
+ int error;
+
+ mtx_lock(&Giant);
+ while (swdev_syscall_active)
+ tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+ swdev_syscall_active = 1;
+
+ mtx_lock(&sw_dev_mtx);
+ TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
+ mtx_unlock(&sw_dev_mtx);
+ if (vn_isdisk(sp->sw_vp, NULL))
+ devname = sp->sw_vp->v_rdev->si_name;
+ else
+ devname = "[file]";
+ error = swapoff_one(sp, &thread0);
+ if (error != 0) {
+ printf("Cannot remove swap device %s (error=%d), "
+ "skipping.\n", devname, error);
+ } else if (bootverbose) {
+ printf("Swap device %s removed.\n", devname);
+ }
+ mtx_lock(&sw_dev_mtx);
+ }
+ mtx_unlock(&sw_dev_mtx);
+
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
-done2:
mtx_unlock(&Giant);
- return (error);
}
void
@@ -2475,10 +2489,12 @@
vp2 = sp->sw_id;
vhold(vp2);
if (bp->b_iocmd == BIO_WRITE) {
- if (bp->b_bufobj) /* XXX: should always be true /phk */
+ if (bp->b_bufobj)
bufobj_wdrop(bp->b_bufobj);
bufobj_wref(&vp2->v_bufobj);
}
+ if (bp->b_bufobj != &vp2->v_bufobj)
+ bp->b_bufobj = &vp2->v_bufobj;
bp->b_vp = vp2;
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
@@ -2516,7 +2532,7 @@
error = mac_check_system_swapon(td->td_ucred, vp);
if (error == 0)
#endif
- error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1);
+ error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
(void) VOP_UNLOCK(vp, 0, td);
if (error)
return (error);
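swapoff_all() walks the device list with TAILQ_FOREACH_SAFE because swapoff_one() ultimately free()s the swdevt, invalidating the iterator. A minimal sketch of the idiom, using a hypothetical item list and malloc type:

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/queue.h>

MALLOC_DEFINE(M_EXAMPLE, "example", "example list items");

struct item {
	TAILQ_ENTRY(item) links;
};
static TAILQ_HEAD(, item) head = TAILQ_HEAD_INITIALIZER(head);

static void
drain(void)
{
	struct item *it, *tmp;

	/* "tmp" captures the successor before the body frees "it". */
	TAILQ_FOREACH_SAFE(it, &head, links, tmp) {
		TAILQ_REMOVE(&head, it, links);
		free(it, M_EXAMPLE);
	}
}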
Index: vm_map.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_map.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_map.h -L sys/vm/vm_map.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm_map.h,v 1.117 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_map.h,v 1.120 2007/08/20 12:05:45 kib Exp $
*/
/*
@@ -242,7 +242,6 @@
caddr_t vm_taddr; /* (c) user virtual address of text */
caddr_t vm_daddr; /* (c) user virtual address of data */
caddr_t vm_maxsaddr; /* user VA at max stack growth */
- int vm_exitingcnt; /* several processes zombied in exit1 */
int vm_refcnt; /* number of references */
};
@@ -296,7 +295,6 @@
/* XXX: number of kernel maps and entries to statically allocate */
#define MAX_KMAP 10
#define MAX_KMAPENT 128
-#define MAX_MAPENT 128
/*
* Copy-on-write flags for vm_map operations
@@ -335,6 +333,7 @@
vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t);
int vm_map_find (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int);
+int vm_map_fixed (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int);
int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *);
int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t);
void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t);
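vm_map_fixed() is new here; judging from the prototype it mirrors vm_map_find() but takes the mapping address as a fixed input rather than searching for free space (the MAP_FIXED path). A sketch under that assumption, with a hypothetical wrapper name:

static int
example_map_fixed(vm_map_t map, vm_object_t obj, vm_ooffset_t off,
    vm_offset_t va, vm_size_t len)
{

	/* "va" is used as-is; no free-space search is performed. */
	return (vm_map_fixed(map, obj, off, &va, len,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}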
--- /dev/null
+++ sys/vm/redzone.c
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/vm/redzone.c,v 1.1 2006/01/31 11:09:20 pjd Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+
+#include <vm/redzone.h>
+
+
+SYSCTL_NODE(_vm, OID_AUTO, redzone, CTLFLAG_RW, NULL, "RedZone data");
+static u_long redzone_extra_mem = 0;
+SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem,
+ 0, "Extra memory allocated by redzone");
+static int redzone_panic = 0;
+TUNABLE_INT("vm.redzone.panic", &redzone_panic);
+SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0,
+ "Panic when buffer corruption is detected");
+
+#define REDZONE_CHSIZE (16)
+#define REDZONE_CFSIZE (16)
+#define REDZONE_HSIZE (sizeof(struct stack) + sizeof(u_long) + REDZONE_CHSIZE)
+#define REDZONE_FSIZE (REDZONE_CFSIZE)
+
+static u_long
+redzone_roundup(u_long n)
+{
+
+ if (n <= 128)
+ return (128);
+ else if (n <= 256)
+ return (256);
+ else if (n <= 512)
+ return (512);
+ else if (n <= 1024)
+ return (1024);
+ else if (n <= 2048)
+ return (2048);
+ return (PAGE_SIZE);
+}
+
+u_long
+redzone_get_size(caddr_t naddr)
+{
+ u_long nsize;
+
+ bcopy(naddr - REDZONE_CHSIZE - sizeof(u_long), &nsize, sizeof(nsize));
+ return (nsize);
+}
+
+u_long
+redzone_size_ntor(u_long nsize)
+{
+
+ return (nsize + redzone_roundup(nsize) + REDZONE_FSIZE);
+}
+
+void *
+redzone_addr_ntor(caddr_t naddr)
+{
+
+ return (naddr - redzone_roundup(redzone_get_size(naddr)));
+}
+
+/*
+ * Set redzones and remember allocation backtrace.
+ */
+void *
+redzone_setup(caddr_t raddr, u_long nsize)
+{
+ struct stack st;
+ caddr_t haddr, faddr;
+
+ atomic_add_long(&redzone_extra_mem, redzone_size_ntor(nsize) - nsize);
+
+ haddr = raddr + redzone_roundup(nsize) - REDZONE_HSIZE;
+ faddr = haddr + REDZONE_HSIZE + nsize;
+
+ /* Redzone header. */
+ stack_save(&st);
+ bcopy(&st, haddr, sizeof(st));
+ haddr += sizeof(st);
+ bcopy(&nsize, haddr, sizeof(nsize));
+ haddr += sizeof(nsize);
+ memset(haddr, 0x42, REDZONE_CHSIZE);
+ haddr += REDZONE_CHSIZE;
+
+ /* Redzone footer. */
+ memset(faddr, 0x42, REDZONE_CFSIZE);
+
+ return (haddr);
+}
+
+/*
+ * Verify redzones.
+ * This function is called on free() and realloc().
+ */
+void
+redzone_check(caddr_t naddr)
+{
+ struct stack ast, fst;
+ caddr_t haddr, faddr;
+ u_int ncorruptions;
+ u_long nsize;
+ int i;
+
+ haddr = naddr - REDZONE_HSIZE;
+ bcopy(haddr, &ast, sizeof(ast));
+ haddr += sizeof(ast);
+ bcopy(haddr, &nsize, sizeof(nsize));
+ haddr += sizeof(nsize);
+
+ atomic_subtract_long(&redzone_extra_mem,
+ redzone_size_ntor(nsize) - nsize);
+
+ /* Look for buffer underflow. */
+ ncorruptions = 0;
+ for (i = 0; i < REDZONE_CHSIZE; i++, haddr++) {
+ if (*(u_char *)haddr != 0x42)
+ ncorruptions++;
+ }
+ if (ncorruptions > 0) {
+ printf("REDZONE: Buffer underflow detected. %u byte%s "
+ "corrupted before %p (%lu bytes allocated).\n",
+ ncorruptions, ncorruptions == 1 ? "" : "s", naddr, nsize);
+ printf("Allocation backtrace:\n");
+ stack_print(&ast);
+ printf("Free backtrace:\n");
+ stack_save(&fst);
+ stack_print(&fst);
+ if (redzone_panic)
+ panic("Stopping here.");
+ }
+ faddr = naddr + nsize;
+ /* Look for buffer overflow. */
+ ncorruptions = 0;
+ for (i = 0; i < REDZONE_CFSIZE; i++, faddr++) {
+ if (*(u_char *)faddr != 0x42)
+ ncorruptions++;
+ }
+ if (ncorruptions > 0) {
+ printf("REDZONE: Buffer overflow detected. %u byte%s corrupted "
+ "after %p (%lu bytes allocated).\n", ncorruptions,
+ ncorruptions == 1 ? "" : "s", naddr + nsize, nsize);
+ printf("Allocation backtrace:\n");
+ stack_print(&ast);
+ printf("Free backtrace:\n");
+ stack_save(&fst);
+ stack_print(&fst);
+ if (redzone_panic)
+ panic("Stopping here.");
+ }
+}
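The redzone API is meant to wrap an allocator: inflate the request with redzone_size_ntor(), stamp the guard patterns with redzone_setup(), and on release verify them with redzone_check() before recovering the raw pointer via redzone_addr_ntor(). A sketch of that calling pattern; real_alloc()/real_free() stand in for the underlying allocator and are hypothetical:

void *
rz_malloc(u_long nsize)
{
	caddr_t raddr;

	/* Ask the backing allocator for the inflated size. */
	raddr = real_alloc(redzone_size_ntor(nsize));
	if (raddr == NULL)
		return (NULL);
	/* Writes header/footer patterns; returns the user pointer. */
	return (redzone_setup(raddr, nsize));
}

void
rz_free(void *naddr)
{

	/* Verify both guard zones, then recover the raw pointer. */
	redzone_check(naddr);
	real_free(redzone_addr_ntor(naddr));
}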
--- /dev/null
+++ sys/vm/vm_phys.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2002-2006 Rice University
+ * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Alan L. Cox,
+ * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/vm/vm_phys.h,v 1.3 2007/09/25 06:25:06 alc Exp $
+ */
+
+/*
+ * Physical memory system definitions
+ */
+
+#ifndef _VM_PHYS_H_
+#define _VM_PHYS_H_
+
+void vm_phys_add_page(vm_paddr_t pa);
+vm_page_t vm_phys_alloc_contig(unsigned long npages,
+ vm_paddr_t low, vm_paddr_t high,
+ unsigned long alignment, unsigned long boundary);
+vm_page_t vm_phys_alloc_pages(int pool, int order);
+vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment);
+void vm_phys_free_pages(vm_page_t m, int order);
+void vm_phys_init(void);
+void vm_phys_set_pool(int pool, vm_page_t m, int order);
+void vm_phys_unfree_page(vm_page_t m);
+boolean_t vm_phys_zero_pages_idle(void);
+
+#endif /* !_VM_PHYS_H_ */
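vm_phys_alloc_pages() and vm_phys_free_pages() take a buddy-style order, so a request is for 2^order physically contiguous pages; the vm_page.c hunks above always pass order 0 and hold vm_page_queue_free_mtx across the calls. A sketch under those assumptions:

static vm_page_t
example_grab_run(void)
{
	vm_page_t m;

	mtx_lock(&vm_page_queue_free_mtx);
	/* order 2 => 2^2 = 4 contiguous pages from the default pool */
	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, 2);
	mtx_unlock(&vm_page_queue_free_mtx);
	return (m);
}

static void
example_release_run(vm_page_t m)
{

	mtx_lock(&vm_page_queue_free_mtx);
	vm_phys_free_pages(m, 2);	/* give the whole run back */
	mtx_unlock(&vm_page_queue_free_mtx);
}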
--- /dev/null
+++ sys/vm/redzone.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/vm/redzone.h,v 1.1 2006/01/31 11:09:20 pjd Exp $
+ */
+
+#ifndef _VM_REDZONE_H_
+#define _VM_REDZONE_H_
+
+u_long redzone_get_size(caddr_t naddr);
+u_long redzone_size_ntor(u_long nsize);
+void *redzone_addr_ntor(caddr_t naddr);
+void *redzone_setup(caddr_t raddr, u_long nsize);
+void redzone_check(caddr_t naddr);
+
+#endif /* _VM_REDZONE_H_ */
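
For orientation, the redzone functions above bracket an allocation with
guard bytes: redzone_size_ntor() inflates the requested ("n") size to
the raw ("r") size, redzone_setup() seeds the guards and returns the
user-visible pointer, and redzone_check() verifies them on free. A
sketch of the calling sequence (raw_alloc()/raw_free() stand in for
whatever backing allocator is used; both are invented names):

    #include <sys/types.h>
    #include <vm/redzone.h>

    static void *
    guarded_alloc(u_long nsize)
    {
            caddr_t raddr;

            /* Ask the backing allocator for the inflated size. */
            raddr = raw_alloc(redzone_size_ntor(nsize));
            if (raddr == NULL)
                    return (NULL);
            /* Seed guard bytes; returns the caller-visible pointer. */
            return (redzone_setup(raddr, nsize));
    }

    static void
    guarded_free(caddr_t naddr)
    {

            redzone_check(naddr);                   /* complain on overflow */
            raw_free(redzone_addr_ntor(naddr));     /* back to raw address */
    }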
Index: vnode_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vnode_pager.c -L sys/vm/vnode_pager.c -u -r1.2 -r1.3
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -51,7 +51,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vnode_pager.c,v 1.221.2.6 2006/03/13 03:08:26 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vnode_pager.c,v 1.236.2.1 2007/10/26 00:12:23 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -75,8 +75,8 @@
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>
-static daddr_t vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
- int *run);
+static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
+ daddr_t *rtaddress, int *run);
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
@@ -95,30 +95,9 @@
int vnode_pbuf_freecnt;
-/*
- * Compatibility function for RELENG_6, in which vnode_create_vobject()
- * takes file size as size_t due to an oversight. The type may not just
- * change to off_t because the ABI to 3rd party modules must be preserved
- * for RELENG_6 lifetime.
- */
+/* Create the VM system backing object for this vnode */
int
-vnode_create_vobject(struct vnode *vp, size_t isize __unused, struct thread *td)
-{
-
- /*
- * Size of 0 will indicate to vnode_create_vobject_off()
- * VOP_GETATTR() is to be called to get the actual size.
- */
- return (vnode_create_vobject_off(vp, 0, td));
-}
-
-/*
- * Create the VM system backing object for this vnode -- for RELENG_6 only.
- * In HEAD, vnode_create_vobject() has been fixed to take file size as off_t
- * and so it can be used as is.
- */
-int
-vnode_create_vobject_off(struct vnode *vp, off_t isize, struct thread *td)
+vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
{
vm_object_t object;
vm_ooffset_t size = isize;
@@ -172,7 +151,7 @@
obj = vp->v_object;
if (obj == NULL)
return;
- ASSERT_VOP_LOCKED(vp, "vnode_destroy_vobject");
+ ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
VM_OBJECT_LOCK(obj);
if (obj->ref_count == 0) {
/*
@@ -219,7 +198,7 @@
vp = (struct vnode *) handle;
- ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
+ ASSERT_VOP_ELOCKED(vp, "vnode_pager_alloc");
/*
* If the object is being terminated, wait for it to
@@ -277,7 +256,7 @@
vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
wakeup(object);
}
- ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc");
+ ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
vp->v_object = NULL;
vp->v_vflag &= ~VV_TEXT;
}
@@ -447,6 +426,10 @@
if (m->dirty != 0)
m->dirty = VM_PAGE_BITS_ALL;
vm_page_unlock_queues();
+ } else if ((nsize & PAGE_MASK) &&
+ __predict_false(object->cache != NULL)) {
+ vm_page_cache_free(object, OFF_TO_IDX(nsize),
+ nobjsize);
}
}
object->un_pager.vnp.vnp_size = nsize;
@@ -458,15 +441,11 @@
* calculate the linear (byte) disk address of specified virtual
* file address
*/
-static daddr_t
-vnode_pager_addr(vp, address, run)
- struct vnode *vp;
- vm_ooffset_t address;
- int *run;
+static int
+vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
+ int *run)
{
- daddr_t rtaddress;
int bsize;
- daddr_t block;
int err;
daddr_t vblock;
daddr_t voffset;
@@ -481,12 +460,10 @@
vblock = address / bsize;
voffset = address % bsize;
- err = VOP_BMAP(vp, vblock, NULL, &block, run, NULL);
-
- if (err || (block == -1))
- rtaddress = -1;
- else {
- rtaddress = block + voffset / DEV_BSIZE;
+ err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
+ if (err == 0) {
+ if (*rtaddress != -1)
+ *rtaddress += voffset / DEV_BSIZE;
if (run) {
*run += 1;
*run *= bsize/PAGE_SIZE;
@@ -494,7 +471,7 @@
}
}
- return rtaddress;
+ return (err);
}
/*
@@ -534,7 +511,9 @@
if (address >= object->un_pager.vnp.vnp_size) {
fileaddr = -1;
} else {
- fileaddr = vnode_pager_addr(vp, address, NULL);
+ error = vnode_pager_addr(vp, address, &fileaddr, NULL);
+ if (error)
+ break;
}
if (fileaddr != -1) {
bp = getpbuf(&vnode_pbuf_freecnt);
@@ -716,13 +695,13 @@
vm_offset_t kva;
off_t foff, tfoff, nextoff;
int i, j, size, bsize, first;
- daddr_t firstaddr;
+ daddr_t firstaddr, reqblock;
struct bufobj *bo;
int runpg;
int runend;
struct buf *bp;
int count;
- int error = 0;
+ int error;
object = vp->v_object;
count = bytecount / PAGE_SIZE;
@@ -745,18 +724,28 @@
/*
* if we can't bmap, use old VOP code
*/
- if (VOP_BMAP(vp, 0, &bo, 0, NULL, NULL)) {
+ error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
+ if (error == EOPNOTSUPP) {
VM_OBJECT_LOCK(object);
vm_page_lock_queues();
for (i = 0; i < count; i++)
if (i != reqpage)
vm_page_free(m[i]);
vm_page_unlock_queues();
- cnt.v_vnodein++;
- cnt.v_vnodepgsin++;
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_INC(cnt.v_vnodepgsin);
error = vnode_pager_input_old(object, m[reqpage]);
VM_OBJECT_UNLOCK(object);
return (error);
+ } else if (error != 0) {
+ VM_OBJECT_LOCK(object);
+ vm_page_lock_queues();
+ for (i = 0; i < count; i++)
+ if (i != reqpage)
+ vm_page_free(m[i]);
+ vm_page_unlock_queues();
+ VM_OBJECT_UNLOCK(object);
+ return (VM_PAGER_ERROR);
+ }
/*
* if the blocksize is smaller than a page size, then use
@@ -772,8 +761,8 @@
vm_page_free(m[i]);
vm_page_unlock_queues();
VM_OBJECT_UNLOCK(object);
- cnt.v_vnodein++;
- cnt.v_vnodepgsin++;
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_INC(cnt.v_vnodepgsin);
return vnode_pager_input_smlfs(object, m[reqpage]);
}
@@ -791,6 +780,17 @@
vm_page_unlock_queues();
VM_OBJECT_UNLOCK(object);
return VM_PAGER_OK;
+ } else if (reqblock == -1) {
+ pmap_zero_page(m[reqpage]);
+ vm_page_undirty(m[reqpage]);
+ m[reqpage]->valid = VM_PAGE_BITS_ALL;
+ vm_page_lock_queues();
+ for (i = 0; i < count; i++)
+ if (i != reqpage)
+ vm_page_free(m[i]);
+ vm_page_unlock_queues();
+ VM_OBJECT_UNLOCK(object);
+ return (VM_PAGER_OK);
}
m[reqpage]->valid = 0;
VM_OBJECT_UNLOCK(object);
@@ -804,8 +804,17 @@
* calculate the run that includes the required page
*/
for (first = 0, i = 0; i < count; i = runend) {
- firstaddr = vnode_pager_addr(vp,
- IDX_TO_OFF(m[i]->pindex), &runpg);
+ if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
+ &runpg) != 0) {
+ VM_OBJECT_LOCK(object);
+ vm_page_lock_queues();
+ for (; i < count; i++)
+ if (i != reqpage)
+ vm_page_free(m[i]);
+ vm_page_unlock_queues();
+ VM_OBJECT_UNLOCK(object);
+ return (VM_PAGER_ERROR);
+ }
if (firstaddr == -1) {
VM_OBJECT_LOCK(object);
if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
@@ -852,9 +861,7 @@
* to be zero based...
*/
if (first != 0) {
- for (i = first; i < count; i++) {
- m[i - first] = m[i];
- }
+ m += first;
count -= first;
reqpage -= first;
}
@@ -906,8 +913,8 @@
bp->b_runningbufspace = bp->b_bufsize;
atomic_add_int(&runningbufspace, bp->b_runningbufspace);
- cnt.v_vnodein++;
- cnt.v_vnodepgsin += count;
+ PCPU_INC(cnt.v_vnodein);
+ PCPU_ADD(cnt.v_vnodepgsin, count);
/* do the input */
bp->b_iooffset = dbtob(bp->b_blkno);
@@ -977,7 +984,7 @@
* now tell them that it is ok to use
*/
if (!error) {
- if (mt->flags & PG_WANTED)
+ if (mt->oflags & VPO_WANTED)
vm_page_activate(mt);
else
vm_page_deactivate(mt);
@@ -1154,8 +1161,8 @@
auio.uio_resid = maxsize;
auio.uio_td = (struct thread *) 0;
error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
- cnt.v_vnodeout++;
- cnt.v_vnodepgsout += ncount;
+ PCPU_INC(cnt.v_vnodeout);
+ PCPU_ADD(cnt.v_vnodepgsout, ncount);
if (error) {
if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
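
The central interface change in vnode_pager.c above is the new
vnode_pager_addr() signature: the VOP_BMAP() error is now the return
value and the disk address comes back through a pointer, so callers can
tell an I/O error apart from a hole (*rtaddress == -1). A sketch of the
new calling convention as it might look inside vnode_pager.c
(map_file_block() is an invented wrapper):

    static int
    map_file_block(struct vnode *vp, vm_ooffset_t address, daddr_t *blkp)
    {
            int error;

            /* Errno return; block via *blkp; -1 there means a hole. */
            error = vnode_pager_addr(vp, address, blkp, NULL);
            if (error != 0)
                    return (VM_PAGER_ERROR);        /* VOP_BMAP() failed */
            return (*blkp == -1 ? VM_PAGER_BAD : VM_PAGER_OK);
    }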
Index: vm_kern.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_kern.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_kern.h -L sys/vm/vm_kern.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_kern.h
+++ sys/vm/vm_kern.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm_kern.h,v 1.28 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_kern.h,v 1.29 2006/11/20 16:23:34 ru Exp $
*/
#ifndef _VM_VM_KERN_H_
@@ -67,7 +67,6 @@
extern vm_map_t buffer_map;
extern vm_map_t kernel_map;
extern vm_map_t kmem_map;
-extern vm_map_t clean_map;
extern vm_map_t exec_map;
extern vm_map_t pipe_map;
extern u_int vm_kmem_size;
Index: uma_dbg.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_dbg.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_dbg.h -L sys/vm/uma_dbg.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma_dbg.h
+++ sys/vm/uma_dbg.h
@@ -24,7 +24,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: src/sys/vm/uma_dbg.h,v 1.8.2.1 2005/08/20 13:31:05 rwatson Exp $
+ * $FreeBSD: src/sys/vm/uma_dbg.h,v 1.9 2005/07/16 09:51:52 rwatson Exp $
*
*/
Index: vm.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm.h -L sys/vm/vm.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm.h
+++ sys/vm/vm.h
@@ -55,7 +55,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm.h,v 1.26 2005/04/01 20:00:11 jhb Exp $
+ * $FreeBSD: src/sys/vm/vm.h,v 1.27 2006/07/21 23:22:49 alc Exp $
*/
#ifndef VM_H
@@ -115,19 +115,6 @@
#endif /* _KERNEL */
/*
- * Virtual memory MPSAFE temporary workarounds.
- */
-extern int debug_mpsafevm; /* defined in vm/vm_meter.c */
-#define VM_LOCK_GIANT() do { \
- if (!debug_mpsafevm) \
- mtx_lock(&Giant); \
-} while (0)
-#define VM_UNLOCK_GIANT() do { \
- if (!debug_mpsafevm) \
- mtx_unlock(&Giant); \
-} while (0)
-
-/*
* Information passed from the machine-independent VM initialization code
* for use by machine-dependent code (mainly for MMU support)
*/
Index: memguard.h
===================================================================
RCS file: /home/cvs/src/sys/vm/memguard.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/memguard.h -L sys/vm/memguard.h -u -r1.1.1.1 -r1.2
--- sys/vm/memguard.h
+++ sys/vm/memguard.h
@@ -23,9 +23,12 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: src/sys/vm/memguard.h,v 1.2 2005/02/16 21:45:59 bmilekic Exp $
+ * $FreeBSD: src/sys/vm/memguard.h,v 1.3 2005/12/30 11:45:07 pjd Exp $
*/
+extern u_int vm_memguard_divisor;
+
void memguard_init(vm_map_t parent_map, unsigned long size);
void *memguard_alloc(unsigned long size, int flags);
void memguard_free(void *addr);
+int memguard_cmp(struct malloc_type *mtp);
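
The newly exported memguard_cmp() and vm_memguard_divisor let the
malloc(9) layer decide per allocation whether to detour through
MemGuard. A hedged sketch of that dispatch (uma_or_malloc() is an
invented stand-in for the regular path; the real kern_malloc.c glue
differs in detail):

    #include <sys/param.h>
    #include <sys/malloc.h>
    #include <vm/memguard.h>

    static void *
    dbg_malloc(unsigned long size, struct malloc_type *mtp, int flags)
    {

            /* Route matching types through the guarded allocator. */
            if (memguard_cmp(mtp))
                    return (memguard_alloc(size, flags));
            return (uma_or_malloc(size, flags));
    }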
Index: pmap.h
===================================================================
RCS file: /home/cvs/src/sys/vm/pmap.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/pmap.h -L sys/vm/pmap.h -u -r1.1.1.1 -r1.2
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/pmap.h,v 1.71.2.1 2005/11/13 21:45:48 alc Exp $
+ * $FreeBSD: src/sys/vm/pmap.h,v 1.79.4.1 2008/01/19 18:15:07 kib Exp $
*/
/*
@@ -90,8 +90,6 @@
*/
extern vm_offset_t kernel_vm_end;
-extern int pmap_pagedaemon_waken;
-
void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
void pmap_clear_modify(vm_page_t m);
void pmap_clear_reference(vm_page_t m);
@@ -99,8 +97,10 @@
void pmap_copy_page(vm_page_t, vm_page_t);
void pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
boolean_t);
-vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
- vm_prot_t prot, vm_page_t mpte);
+void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot);
+void pmap_enter_object(pmap_t pmap, vm_offset_t start,
+ vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va);
vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va,
vm_prot_t prot);
@@ -114,8 +114,7 @@
vm_object_t object, vm_pindex_t pindex, vm_size_t size);
boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m);
void pmap_page_init(vm_page_t m);
-void pmap_page_protect(vm_page_t m, vm_prot_t prot);
-void pmap_pinit(pmap_t);
+int pmap_pinit(pmap_t);
void pmap_pinit0(pmap_t);
void pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void pmap_qenter(vm_offset_t, vm_page_t *, int);
@@ -123,14 +122,14 @@
void pmap_release(pmap_t);
void pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
void pmap_remove_all(vm_page_t m);
-void pmap_remove_pages(pmap_t, vm_offset_t, vm_offset_t);
+void pmap_remove_pages(pmap_t);
+void pmap_remove_write(vm_page_t m);
void pmap_zero_page(vm_page_t);
void pmap_zero_page_area(vm_page_t, int off, int size);
void pmap_zero_page_idle(vm_page_t);
int pmap_mincore(pmap_t pmap, vm_offset_t addr);
void pmap_activate(struct thread *td);
vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size);
-void pmap_init2(void);
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
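
Two prototype changes above are worth calling out: pmap_pinit() now
returns int so the MD layer can report allocation failure instead of
blocking, and pmap_page_protect() is replaced by the narrower
pmap_remove_write(). Callers of pmap_pinit() must now handle failure;
roughly (vmspace_create() is an invented wrapper, and the back-out path
is simplified):

    #include <vm/vm.h>
    #include <vm/pmap.h>
    #include <vm/vm_map.h>

    static struct vmspace *
    vmspace_create(struct vmspace *vm)
    {

            if (pmap_pinit(vmspace_pmap(vm)) == 0)
                    return (NULL);          /* MD resources unavailable */
            return (vm);
    }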
Index: uma_core.c
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_core.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/vm/uma_core.c -L sys/vm/uma_core.c -u -r1.1.1.2 -r1.2
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1,7 +1,7 @@
/*-
* Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
* Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
- * Copyright (c) 2004-2005 Robert N. M. Watson
+ * Copyright (c) 2004-2006 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -48,7 +48,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/uma_core.c,v 1.119.2.15 2006/02/14 03:37:58 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/uma_core.c,v 1.147.2.1 2007/10/18 18:45:17 jhb Exp $");
/* I should really use ktr.. */
/*
@@ -111,6 +111,9 @@
*/
static uma_zone_t hashzone;
+/* The boot-time adjusted value for cache line alignment. */
+static int uma_align_cache = 16 - 1;
+
static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
/*
@@ -238,27 +241,21 @@
static int uma_zalloc_bucket(uma_zone_t zone, int flags);
static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
-static void zone_drain(uma_zone_t);
static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
uma_fini fini, int align, u_int32_t flags);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
-static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
#ifdef WITNESS
static int nosleepwithlocks = 1;
-SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
- 0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
#else
static int nosleepwithlocks = 0;
+#endif
SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
-#endif
-SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
- NULL, 0, sysctl_vm_zone, "A", "Zone Info");
SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
@@ -685,7 +682,7 @@
* Returns:
* Nothing.
*/
-static void
+void
zone_drain(uma_zone_t zone)
{
struct slabhead freeslabs = { 0 };
@@ -1713,13 +1710,22 @@
args.size = size;
args.uminit = uminit;
args.fini = fini;
- args.align = align;
+ args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
args.flags = flags;
args.zone = zone;
return (uma_zalloc_internal(kegs, &args, M_WAITOK));
}
/* See uma.h */
+void
+uma_set_align(int align)
+{
+
+ if (align != UMA_ALIGN_CACHE)
+ uma_align_cache = align;
+}
+
+/* See uma.h */
uma_zone_t
uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
uma_init uminit, uma_fini fini, int align, u_int32_t flags)
@@ -1777,7 +1783,6 @@
uma_cache_t cache;
uma_bucket_t bucket;
int cpu;
- int badness;
/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
@@ -1786,29 +1791,9 @@
CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
zone->uz_name, flags);
- if (!(flags & M_NOWAIT)) {
- KASSERT(curthread->td_intr_nesting_level == 0,
- ("malloc(M_WAITOK) in interrupt context"));
- if (nosleepwithlocks) {
-#ifdef WITNESS
- badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
- NULL,
- "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
- zone->uz_name);
-#else
- badness = 1;
-#endif
- } else {
- badness = 0;
-#ifdef WITNESS
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "malloc(M_WAITOK) of \"%s\"", zone->uz_name);
-#endif
- }
- if (badness) {
- flags &= ~M_WAITOK;
- flags |= M_NOWAIT;
- }
+ if (flags & M_WAITOK) {
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
}
/*
@@ -1981,7 +1966,7 @@
* caller can't handle that.
*/
if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
- if ((zone != slabzone) && (zone != slabrefzone))
+ if (zone != slabzone && zone != slabrefzone && zone != zones)
return (NULL);
slab = NULL;
@@ -2417,8 +2402,7 @@
* If nothing else caught this, we'll just do an internal free.
*/
zfree_internal:
- uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL |
- ZFREE_STATFREE);
+ uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
return;
}
@@ -2502,8 +2486,13 @@
if (keg->uk_pages < keg->uk_maxpages)
keg->uk_flags &= ~UMA_ZFLAG_FULL;
- /* We can handle one more allocation */
- wakeup_one(keg);
+ /*
+ * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
+ * wake up all procs blocked on pages. This should be uncommon, so
+ * keeping this simple for now (rather than adding count of blocked
+ * threads etc).
+ */
+ wakeup(keg);
}
ZONE_UNLOCK(zone);
@@ -2689,6 +2678,24 @@
bucket_zone_drain();
}
+/* See uma.h */
+int
+uma_zone_exhausted(uma_zone_t zone)
+{
+ int full;
+
+ ZONE_LOCK(zone);
+ full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL);
+ ZONE_UNLOCK(zone);
+ return (full);
+}
+
+int
+uma_zone_exhausted_nolock(uma_zone_t zone)
+{
+ return (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL);
+}
+
void *
uma_large_malloc(int size, int wait)
{
@@ -2776,6 +2783,7 @@
}
}
+#ifdef DDB
/*
* Generate statistics across both the zone and its per-cpu cache's. Return
* desired statistics if the pointer is non-NULL for that statistic.
@@ -2817,83 +2825,7 @@
if (freesp != NULL)
*freesp = frees;
}
-
-/*
- * Sysctl handler for vm.zone
- *
- * stolen from vm_zone.c
- */
-static int
-sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
-{
- int error, len, cnt;
- const int linesize = 128; /* conservative */
- int totalfree;
- char *tmpbuf, *offset;
- uma_zone_t z;
- uma_keg_t zk;
- char *p;
- int cachefree;
- uma_bucket_t bucket;
- u_int64_t allocs, frees;
-
- cnt = 0;
- mtx_lock(&uma_mtx);
- LIST_FOREACH(zk, &uma_kegs, uk_link) {
- LIST_FOREACH(z, &zk->uk_zones, uz_link)
- cnt++;
- }
- mtx_unlock(&uma_mtx);
- MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
- M_TEMP, M_WAITOK);
- len = snprintf(tmpbuf, linesize,
- "\nITEM SIZE LIMIT USED FREE REQUESTS\n\n");
- if (cnt == 0)
- tmpbuf[len - 1] = '\0';
- error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
- if (error || cnt == 0)
- goto out;
- offset = tmpbuf;
- mtx_lock(&uma_mtx);
- LIST_FOREACH(zk, &uma_kegs, uk_link) {
- LIST_FOREACH(z, &zk->uk_zones, uz_link) {
- if (cnt == 0) /* list may have changed size */
- break;
- ZONE_LOCK(z);
- cachefree = 0;
- if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
- uma_zone_sumstat(z, &cachefree, &allocs, &frees);
- } else {
- allocs = z->uz_allocs;
- frees = z->uz_frees;
- }
-
- LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
- cachefree += bucket->ub_cnt;
- }
- totalfree = zk->uk_free + cachefree;
- len = snprintf(offset, linesize,
- "%-12.12s %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
- z->uz_name, zk->uk_size,
- zk->uk_maxpages * zk->uk_ipers,
- (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
- totalfree,
- (unsigned long long)allocs);
- ZONE_UNLOCK(z);
- for (p = offset + 12; p > offset && *p == ' '; --p)
- /* nothing */ ;
- p[1] = ':';
- cnt--;
- offset += len;
- }
- }
- mtx_unlock(&uma_mtx);
- *offset++ = '\0';
- error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
-out:
- FREE(tmpbuf, M_TEMP);
- return (error);
-}
+#endif /* DDB */
static int
sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
@@ -3055,8 +2987,8 @@
uma_zone_t z;
int cachefree;
- db_printf("%18s %12s %12s %12s %8s\n", "Zone", "Allocs", "Frees",
- "Used", "Cache");
+ db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free",
+ "Requests");
LIST_FOREACH(kz, &uma_kegs, uk_link) {
LIST_FOREACH(z, &kz->uk_zones, uz_link) {
if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
@@ -3071,8 +3003,10 @@
cachefree += kz->uk_free;
LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
cachefree += bucket->ub_cnt;
- db_printf("%18s %12ju %12ju %12ju %8d\n", z->uz_name,
- allocs, frees, allocs - frees, cachefree);
+ db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name,
+ (uintmax_t)kz->uk_size,
+ (intmax_t)(allocs - frees), cachefree,
+ (uintmax_t)allocs);
}
}
}
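
Together with the UMA_ALIGN_CACHE change in uma.h below, the new
uma_set_align() lets machine-dependent boot code install the real
cache-line mask before any cache-aligned zone is created; until then
the old 16-byte default (uma_align_cache = 16 - 1) applies. A sketch of
such an MD hook (the function and its parameter are invented; note the
argument is a mask, i.e. size - 1):

    /* Called from MD startup before the first UMA_ALIGN_CACHE zone. */
    static void
    cpu_set_uma_align(int cpu_cacheline_size)
    {

            uma_set_align(cpu_cacheline_size - 1);
    }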
Index: vm_pageq.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pageq.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_pageq.c -L sys/vm/vm_pageq.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_pageq.c
+++ sys/vm/vm_pageq.c
@@ -26,13 +26,15 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pageq.c,v 1.18 2005/06/10 03:33:36 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pageq.c,v 1.35 2007/09/25 06:25:06 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
@@ -44,21 +46,16 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/vm_extern.h>
-struct vpgqueues vm_page_queues[PQ_COUNT];
+struct vpgqueues vm_page_queues[PQ_MAXCOUNT];
void
-vm_pageq_init(void)
+vm_pageq_init(void)
{
int i;
- for (i = 0; i < PQ_L2_SIZE; i++) {
- vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
- }
- for (i = 0; i < PQ_L2_SIZE; i++) {
- vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
- }
vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
@@ -71,7 +68,7 @@
void
vm_pageq_requeue(vm_page_t m)
{
- int queue = m->queue;
+ int queue = VM_PAGE_GETQUEUE(m);
struct vpgqueues *vpq;
if (queue != PQ_NONE) {
@@ -90,84 +87,9 @@
struct vpgqueues *vpq;
vpq = &vm_page_queues[queue];
- m->queue = queue;
+ VM_PAGE_SETQUEUE2(m, queue);
TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
++*vpq->cnt;
- ++vpq->lcnt;
-}
-
-/*
- * vm_add_new_page:
- *
- * Add a new page to the freelist for use by the system.
- */
-vm_page_t
-vm_pageq_add_new_page(vm_paddr_t pa)
-{
- vm_paddr_t bad;
- vm_page_t m;
- char *cp, *list, *pos;
-
- GIANT_REQUIRED;
-
- /*
- * See if a physical address in this page has been listed
- * in the blacklist tunable. Entries in the tunable are
- * separated by spaces or commas. If an invalid integer is
- * encountered then the rest of the string is skipped.
- */
- if (testenv("vm.blacklist")) {
- list = getenv("vm.blacklist");
- for (pos = list; *pos != '\0'; pos = cp) {
- bad = strtoq(pos, &cp, 0);
- if (*cp != '\0') {
- if (*cp == ' ' || *cp == ',') {
- cp++;
- if (cp == pos)
- continue;
- } else
- break;
- }
- if (pa == trunc_page(bad)) {
- printf("Skipping page with pa 0x%jx\n",
- (uintmax_t)pa);
- freeenv(list);
- return (NULL);
- }
- }
- freeenv(list);
- }
-
- ++cnt.v_page_count;
- m = PHYS_TO_VM_PAGE(pa);
- m->phys_addr = pa;
- m->flags = 0;
- m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
- pmap_page_init(m);
- vm_pageq_enqueue(m->pc + PQ_FREE, m);
- return (m);
-}
-
-/*
- * vm_pageq_remove_nowakeup:
- *
- * vm_page_unqueue() without any wakeup
- *
- * The queue containing the given page must be locked.
- * This routine may not block.
- */
-void
-vm_pageq_remove_nowakeup(vm_page_t m)
-{
- int queue = m->queue;
- struct vpgqueues *pq;
- if (queue != PQ_NONE) {
- pq = &vm_page_queues[queue];
- m->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pl, m, pageq);
- (*pq->cnt)--;
- pq->lcnt--;
- }
}
/*
@@ -181,87 +103,13 @@
void
vm_pageq_remove(vm_page_t m)
{
- int queue = m->queue;
+ int queue = VM_PAGE_GETQUEUE(m);
struct vpgqueues *pq;
if (queue != PQ_NONE) {
- m->queue = PQ_NONE;
+ VM_PAGE_SETQUEUE2(m, PQ_NONE);
pq = &vm_page_queues[queue];
TAILQ_REMOVE(&pq->pl, m, pageq);
(*pq->cnt)--;
- pq->lcnt--;
- if ((queue - m->pc) == PQ_CACHE) {
- if (vm_paging_needed())
- pagedaemon_wakeup();
- }
- }
-}
-
-#if PQ_L2_SIZE > 1
-
-/*
- * vm_pageq_find:
- *
- * Find a page on the specified queue with color optimization.
- *
- * The page coloring optimization attempts to locate a page
- * that does not overload other nearby pages in the object in
- * the cpu's L2 cache. We need this optimization because cpu
- * caches tend to be physical caches, while object spaces tend
- * to be virtual.
- *
- * The specified queue must be locked.
- * This routine may not block.
- *
- * This routine may only be called from the vm_pageq_find()
- * function in this file.
- */
-static __inline vm_page_t
-_vm_pageq_find(int basequeue, int index)
-{
- int i;
- vm_page_t m = NULL;
- struct vpgqueues *pq;
-
- pq = &vm_page_queues[basequeue];
-
- /*
- * Note that for the first loop, index+i and index-i wind up at the
- * same place. Even though this is not totally optimal, we've already
- * blown it by missing the cache case so we do not care.
- */
- for (i = PQ_L2_SIZE / 2; i > 0; --i) {
- if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
- break;
-
- if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
- break;
}
- return (m);
}
-#endif /* PQ_L2_SIZE > 1 */
-
-vm_page_t
-vm_pageq_find(int basequeue, int index, boolean_t prefer_zero)
-{
- vm_page_t m;
-
-#if PQ_L2_SIZE > 1
- if (prefer_zero) {
- m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist);
- } else {
- m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
- }
- if (m == NULL) {
- m = _vm_pageq_find(basequeue, index);
- }
-#else
- if (prefer_zero) {
- m = TAILQ_LAST(&vm_page_queues[basequeue].pl, pglist);
- } else {
- m = TAILQ_FIRST(&vm_page_queues[basequeue].pl);
- }
-#endif
- return (m);
-}
-
Index: uma.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma.h -L sys/vm/uma.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -24,7 +24,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: src/sys/vm/uma.h,v 1.22.2.6 2005/11/13 08:44:24 alc Exp $
+ * $FreeBSD: src/sys/vm/uma.h,v 1.31 2007/02/11 20:13:52 rwatson Exp $
*
*/
@@ -48,6 +48,8 @@
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
+void zone_drain(uma_zone_t);
+
/*
* Item constructor
*
@@ -234,7 +236,7 @@
#define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */
#define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */
#define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */
-#define UMA_ALIGN_CACHE (16 - 1) /* Cache line size align */
+#define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */
/*
* Destroys an empty uma zone. If the zone is not empty uma complains loudly.
@@ -386,6 +388,18 @@
void uma_reclaim(void);
/*
+ * Sets the alignment mask to be used for all zones requesting cache
+ * alignment. Should be called by MD boot code prior to starting VM/UMA.
+ *
+ * Arguments:
+ * align The alignment mask
+ *
+ * Returns:
+ * Nothing
+ */
+void uma_set_align(int align);
+
+/*
* Switches the backing object of a zone
*
* Arguments:
@@ -509,6 +523,18 @@
u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
/*
+ * Used to determine if a fixed-size zone is exhausted.
+ *
+ * Arguments:
+ * zone The zone to check
+ *
+ * Returns:
+ * Non-zero if zone is exhausted.
+ */
+int uma_zone_exhausted(uma_zone_t zone);
+int uma_zone_exhausted_nolock(uma_zone_t zone);
+
+/*
* Exported statistics structures to be used by user space monitoring tools.
* Statistics stream consists of a uma_stream_header, followed by a series of
* alternative uma_type_header and uma_type_stat structures. Statistics
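
The exhaustion probes added above report whether a capped zone's keg
has hit uk_maxpages (UMA_ZFLAG_FULL); the _nolock variant is for
callers that cannot take the zone lock. A sketch of a consumer (the
helper name is invented; a capped swap-metadata zone is the sort of
user this was added for):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <vm/uma.h>

    static void
    zone_full_check(uma_zone_t zone, const char *name)
    {

            if (uma_zone_exhausted(zone))
                    printf("warning: zone %s is exhausted\n", name);
    }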
Index: vm_meter.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_meter.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_meter.c -L sys/vm/vm_meter.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_meter.c,v 1.85 2005/05/08 23:56:16 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_meter.c,v 1.96 2007/07/27 20:01:21 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -52,18 +52,6 @@
#include <vm/vm_object.h>
#include <sys/sysctl.h>
-/*
- * Virtual memory MPSAFE temporary workarounds.
- */
-#if !defined(__arm__) && !defined(__powerpc__)
-int debug_mpsafevm = 1;
-#else
-int debug_mpsafevm;
-#endif
-TUNABLE_INT("debug.mpsafevm", &debug_mpsafevm);
-SYSCTL_INT(_debug, OID_AUTO, mpsafevm, CTLFLAG_RD, &debug_mpsafevm, 0,
- "Enable/disable MPSAFE virtual memory support");
-
struct vmmeter cnt;
int maxslp = MAXSLP;
@@ -109,15 +97,15 @@
{
/* XXXKSE almost completely broken */
struct proc *p;
- struct vmtotal total, *totalp;
+ struct vmtotal total;
vm_map_entry_t entry;
vm_object_t object;
vm_map_t map;
int paging;
struct thread *td;
+ struct vmspace *vm;
- totalp = &total;
- bzero(totalp, sizeof *totalp);
+ bzero(&total, sizeof(total));
/*
* Mark all objects as inactive.
*/
@@ -143,49 +131,58 @@
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_flag & P_SYSTEM)
continue;
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
switch (p->p_state) {
case PRS_NEW:
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
continue;
break;
default:
FOREACH_THREAD_IN_PROC(p, td) {
/* Need new statistics XXX */
+ thread_lock(td);
switch (td->td_state) {
case TDS_INHIBITED:
+ /*
+ * XXX stats no longer synchronized.
+ */
if (TD_ON_LOCK(td) ||
(td->td_inhibitors ==
TDI_SWAPPED)) {
- totalp->t_sw++;
+ total.t_sw++;
} else if (TD_IS_SLEEPING(td) ||
TD_AWAITING_INTR(td) ||
TD_IS_SUSPENDED(td)) {
if (td->td_priority <= PZERO)
- totalp->t_dw++;
+ total.t_dw++;
else
- totalp->t_sl++;
+ total.t_sl++;
}
break;
case TDS_CAN_RUN:
- totalp->t_sw++;
+ total.t_sw++;
break;
case TDS_RUNQ:
case TDS_RUNNING:
- totalp->t_rq++;
+ total.t_rq++;
+ thread_unlock(td);
continue;
default:
break;
}
+ thread_unlock(td);
}
}
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
/*
* Note active objects.
*/
paging = 0;
- map = &p->p_vmspace->vm_map;
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL)
+ continue;
+ map = &vm->vm_map;
vm_map_lock_read(map);
for (entry = map->header.next;
entry != &map->header; entry = entry->next) {
@@ -198,8 +195,9 @@
VM_OBJECT_UNLOCK(object);
}
vm_map_unlock_read(map);
+ vmspace_free(vm);
if (paging)
- totalp->t_pw++;
+ total.t_pw++;
}
sx_sunlock(&allproc_lock);
/*
@@ -219,25 +217,32 @@
*/
continue;
}
- totalp->t_vm += object->size;
- totalp->t_rm += object->resident_page_count;
+ if (object->ref_count == 0) {
+ /*
+ * Also skip unreferenced objects, including
+ * vnodes representing mounted file systems.
+ */
+ continue;
+ }
+ total.t_vm += object->size;
+ total.t_rm += object->resident_page_count;
if (object->flags & OBJ_ACTIVE) {
- totalp->t_avm += object->size;
- totalp->t_arm += object->resident_page_count;
+ total.t_avm += object->size;
+ total.t_arm += object->resident_page_count;
}
if (object->shadow_count > 1) {
/* shared object */
- totalp->t_vmshr += object->size;
- totalp->t_rmshr += object->resident_page_count;
+ total.t_vmshr += object->size;
+ total.t_rmshr += object->resident_page_count;
if (object->flags & OBJ_ACTIVE) {
- totalp->t_avmshr += object->size;
- totalp->t_armshr += object->resident_page_count;
+ total.t_avmshr += object->size;
+ total.t_armshr += object->resident_page_count;
}
}
}
mtx_unlock(&vm_object_list_mtx);
- totalp->t_free = cnt.v_free_count + cnt.v_cache_count;
- return (sysctl_handle_opaque(oidp, totalp, sizeof total, req));
+ total.t_free = cnt.v_free_count + cnt.v_cache_count;
+ return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
}
/*
@@ -324,6 +329,8 @@
&cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups");
SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD,
&cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans");
+SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tcached, CTLTYPE_UINT|CTLFLAG_RD,
+ &cnt.v_tcached, 0, vcnt, "IU", "Total pages cached");
SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD,
&cnt.v_dfree, 0, vcnt, "IU", "");
SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD,
@@ -379,13 +386,3 @@
SYSCTL_INT(_vm_stats_misc, OID_AUTO,
zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, "");
-#if 0
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
- page_mask, CTLFLAG_RD, &page_mask, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
- page_shift, CTLFLAG_RD, &page_shift, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
- first_page, CTLFLAG_RD, &first_page, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
- last_page, CTLFLAG_RD, &last_page, 0, "");
-#endif
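
The vmtotal() rework above stops dereferencing p->p_vmspace directly
and instead takes a reference with vmspace_acquire_ref(), so the map
cannot be torn down by exit(2) while it is being scanned. The pattern,
as a self-contained sketch (count_map_entries() is an invented helper):

    #include <sys/param.h>
    #include <sys/proc.h>
    #include <vm/vm.h>
    #include <vm/vm_map.h>

    static int
    count_map_entries(struct proc *p)
    {
            struct vmspace *vm;
            vm_map_entry_t entry;
            vm_map_t map;
            int n;

            vm = vmspace_acquire_ref(p);    /* NULL while the space exits */
            if (vm == NULL)
                    return (0);
            map = &vm->vm_map;
            vm_map_lock_read(map);
            n = 0;
            for (entry = map->header.next; entry != &map->header;
                entry = entry->next)
                    n++;
            vm_map_unlock_read(map);
            vmspace_free(vm);               /* drop our reference */
            return (n);
    }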
Index: vm_fault.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_fault.c -L sys/vm/vm_fault.c -u -r1.2 -r1.3
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -72,7 +72,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_fault.c,v 1.205.2.4 2006/03/08 23:53:39 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_fault.c,v 1.237 2007/10/08 20:09:53 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -130,17 +130,17 @@
struct vnode *vp;
};
-static __inline void
+static inline void
release_page(struct faultstate *fs)
{
- vm_page_lock_queues();
vm_page_wakeup(fs->m);
+ vm_page_lock_queues();
vm_page_deactivate(fs->m);
vm_page_unlock_queues();
fs->m = NULL;
}
-static __inline void
+static inline void
unlock_map(struct faultstate *fs)
{
if (fs->lookup_still_valid) {
@@ -152,7 +152,6 @@
static void
unlock_and_deallocate(struct faultstate *fs)
{
- boolean_t firstobjneedgiant;
vm_object_pip_wakeup(fs->object);
VM_OBJECT_UNLOCK(fs->object);
@@ -165,7 +164,6 @@
VM_OBJECT_UNLOCK(fs->first_object);
fs->first_m = NULL;
}
- firstobjneedgiant = (fs->first_object->flags & OBJ_NEEDGIANT) != 0;
vm_object_deallocate(fs->first_object);
unlock_map(fs);
if (fs->vp != NULL) {
@@ -176,8 +174,6 @@
fs->vp = NULL;
VFS_UNLOCK_GIANT(vfslocked);
}
- if (firstobjneedgiant)
- VM_UNLOCK_GIANT();
}
/*
@@ -223,7 +219,7 @@
hardfault = 0;
growstack = TRUE;
- atomic_add_int(&cnt.v_vm_faults, 1);
+ PCPU_INC(cnt.v_vm_faults);
RetryFault:;
@@ -302,7 +298,7 @@
KASSERT((fs.first_object->flags & OBJ_NEEDGIANT) == 0 ||
!fs.map->system_map,
("vm_fault: Object requiring giant mapped by system map"));
- if (fs.first_object->flags & OBJ_NEEDGIANT && debug_mpsafevm)
+ if (fs.first_object->flags & OBJ_NEEDGIANT)
mtx_unlock(&Giant);
vm_object_pip_add(fs.first_object, 1);
@@ -332,8 +328,6 @@
*/
fs.m = vm_page_lookup(fs.object, fs.pindex);
if (fs.m != NULL) {
- int queue;
-
/*
* check for page-based copy on write.
* We check fs.object == fs.first_object so
@@ -355,7 +349,7 @@
/*
* Wait/Retry if the page is busy. We have to do this
- * if the page is busy via either PG_BUSY or
+ * if the page is busy via either VPO_BUSY or
* vm_page_t->busy because the vm_pager may be using
* vm_page_t->busy for pageouts ( and even pageins if
* it is the vnode pager ), and we could end up trying
@@ -369,7 +363,7 @@
* around with a vm_page_t->busy page except, perhaps,
* to pmap it.
*/
- if ((fs.m->flags & PG_BUSY) || fs.m->busy) {
+ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
vm_page_unlock_queues();
VM_OBJECT_UNLOCK(fs.object);
if (fs.object != fs.first_object) {
@@ -393,30 +387,17 @@
VM_OBJECT_LOCK(fs.object);
if (fs.m == vm_page_lookup(fs.object,
fs.pindex)) {
- vm_page_lock_queues();
- if (!vm_page_sleep_if_busy(fs.m, TRUE,
- "vmpfw"))
- vm_page_unlock_queues();
+ vm_page_sleep_if_busy(fs.m, TRUE,
+ "vmpfw");
}
vm_object_pip_wakeup(fs.object);
VM_OBJECT_UNLOCK(fs.object);
- atomic_add_int(&cnt.v_intrans, 1);
- if (fs.first_object->flags & OBJ_NEEDGIANT)
- VM_UNLOCK_GIANT();
+ PCPU_INC(cnt.v_intrans);
vm_object_deallocate(fs.first_object);
goto RetryFault;
}
- queue = fs.m->queue;
-
- vm_pageq_remove_nowakeup(fs.m);
-
- if ((queue - fs.m->pc) == PQ_CACHE && vm_page_count_severe()) {
- vm_page_activate(fs.m);
- vm_page_unlock_queues();
- unlock_and_deallocate(&fs);
- VM_WAITPFAULT;
- goto RetryFault;
- }
+ vm_pageq_remove(fs.m);
+ vm_page_unlock_queues();
/*
* Mark page busy for other processes, and the
@@ -425,7 +406,6 @@
* found the page ).
*/
vm_page_busy(fs.m);
- vm_page_unlock_queues();
if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
fs.m->object != kernel_object && fs.m->object != kmem_object) {
goto readrest;
@@ -456,7 +436,8 @@
unlock_and_deallocate(&fs);
VM_WAITPFAULT;
goto RetryFault;
- }
+ } else if ((fs.m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
+ break;
}
readrest:
@@ -471,7 +452,7 @@
*/
if (TRYPAGER) {
int rv;
- int reqpage;
+ int reqpage = 0;
int ahead, behind;
u_char behavior = vm_map_entry_behavior(fs.entry);
@@ -517,7 +498,8 @@
if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL))
break;
if (mt->busy ||
- (mt->flags & (PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED)) ||
+ (mt->oflags & VPO_BUSY) ||
+ (mt->flags & (PG_FICTITIOUS | PG_UNMANAGED)) ||
mt->hold_count ||
mt->wire_count)
continue;
@@ -546,7 +528,7 @@
* return value is the index into the marray for the
* vm_page_t passed to the routine.
*
- * fs.m plus the additional pages are PG_BUSY'd.
+ * fs.m plus the additional pages are VPO_BUSY'd.
*
* XXX vm_fault_additional_pages() can block
* without releasing the map lock.
@@ -566,7 +548,7 @@
/*
* Call the pager to retrieve the data, if any, after
* releasing the lock on the map. We hold a ref on
- * fs.object and the pages are PG_BUSY'd.
+ * fs.object and the pages are VPO_BUSY'd.
*/
unlock_map(&fs);
@@ -674,9 +656,9 @@
if ((fs.m->flags & PG_ZERO) == 0) {
pmap_zero_page(fs.m);
} else {
- atomic_add_int(&cnt.v_ozfod, 1);
+ PCPU_INC(cnt.v_ozfod);
}
- atomic_add_int(&cnt.v_zfod, 1);
+ PCPU_INC(cnt.v_zfod);
fs.m->valid = VM_PAGE_BITS_ALL;
break; /* break to PAGE HAS BEEN FOUND */
} else {
@@ -691,7 +673,7 @@
}
}
- KASSERT((fs.m->flags & PG_BUSY) != 0,
+ KASSERT((fs.m->oflags & VPO_BUSY) != 0,
("vm_fault: not busy after main loop"));
/*
@@ -747,7 +729,6 @@
/*
* get rid of the unnecessary page
*/
- pmap_remove_all(fs.first_m);
vm_page_free(fs.first_m);
/*
* grab the page and put it into the
@@ -755,11 +736,11 @@
* automatically made dirty.
*/
vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
- vm_page_busy(fs.m);
vm_page_unlock_queues();
+ vm_page_busy(fs.m);
fs.first_m = fs.m;
fs.m = NULL;
- atomic_add_int(&cnt.v_cow_optim, 1);
+ PCPU_INC(cnt.v_cow_optim);
} else {
/*
* Oh, well, lets copy it.
@@ -787,7 +768,7 @@
fs.m = fs.first_m;
if (!is_first_object_locked)
VM_OBJECT_LOCK(fs.object);
- atomic_add_int(&cnt.v_cow_faults, 1);
+ PCPU_INC(cnt.v_cow_faults);
} else {
prot &= ~VM_PROT_WRITE;
}
@@ -813,7 +794,7 @@
&fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
/*
- * If we don't need the page any longer, put it on the active
+ * If we don't need the page any longer, put it on the inactive
* list (the easiest thing to do here). If no one needs it,
* pageout will grab it eventually.
*/
@@ -848,16 +829,14 @@
}
}
if (prot & VM_PROT_WRITE) {
- vm_page_lock_queues();
- vm_page_flag_set(fs.m, PG_WRITEABLE);
- vm_object_set_writeable_dirty(fs.m->object);
+ vm_object_set_writeable_dirty(fs.object);
/*
* If the fault is a write, we know that this page is being
* written NOW so dirty it explicitly to save on
* pmap_is_modified() calls later.
*
- * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
+ * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
* if the page is already dirty to prevent data written with
* the expectation of being synced from not being synced.
* Likewise if this entry does not request NOSYNC then make
@@ -869,11 +848,10 @@
*/
if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
if (fs.m->dirty == 0)
- vm_page_flag_set(fs.m, PG_NOSYNC);
+ fs.m->oflags |= VPO_NOSYNC;
} else {
- vm_page_flag_clear(fs.m, PG_NOSYNC);
+ fs.m->oflags &= ~VPO_NOSYNC;
}
- vm_page_unlock_queues();
if (fault_flags & VM_FAULT_DIRTY) {
vm_page_dirty(fs.m);
vm_pager_page_unswapped(fs.m);
@@ -883,7 +861,7 @@
/*
* Page had better still be busy
*/
- KASSERT(fs.m->flags & PG_BUSY,
+ KASSERT(fs.m->oflags & VPO_BUSY,
("vm_fault: page %p not busy!", fs.m));
/*
* Sanity check: page must be completely valid or it is not fit to
@@ -921,22 +899,17 @@
} else {
vm_page_activate(fs.m);
}
- vm_page_wakeup(fs.m);
vm_page_unlock_queues();
+ vm_page_wakeup(fs.m);
/*
* Unlock everything, and return
*/
unlock_and_deallocate(&fs);
- PROC_LOCK(curproc);
- if ((curproc->p_sflag & PS_INMEM) && curproc->p_stats) {
- if (hardfault) {
- curproc->p_stats->p_ru.ru_majflt++;
- } else {
- curproc->p_stats->p_ru.ru_minflt++;
- }
- }
- PROC_UNLOCK(curproc);
+ if (hardfault)
+ curthread->td_ru.ru_majflt++;
+ else
+ curthread->td_ru.ru_minflt++;
return (KERN_SUCCESS);
}
@@ -953,7 +926,7 @@
int i;
vm_offset_t addr, starta;
vm_pindex_t pindex;
- vm_page_t m, mpte;
+ vm_page_t m;
vm_object_t object;
if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
@@ -968,7 +941,6 @@
starta = 0;
}
- mpte = NULL;
for (i = 0; i < PAGEORDER_SIZE; i++) {
vm_object_t backing_object, lobject;
@@ -1004,13 +976,10 @@
}
if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
(m->busy == 0) &&
- (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
+ (m->flags & PG_FICTITIOUS) == 0) {
vm_page_lock_queues();
- if ((m->queue - m->pc) == PQ_CACHE)
- vm_page_deactivate(m);
- mpte = pmap_enter_quick(pmap, addr, m,
- entry->protection, mpte);
+ pmap_enter_quick(pmap, addr, m, entry->protection);
vm_page_unlock_queues();
}
VM_OBJECT_UNLOCK(lobject);
@@ -1198,17 +1167,15 @@
* Enter it in the pmap...
*/
pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE);
- VM_OBJECT_LOCK(dst_object);
- vm_page_lock_queues();
- if ((prot & VM_PROT_WRITE) != 0)
- vm_page_flag_set(dst_m, PG_WRITEABLE);
/*
* Mark it no longer busy, and put it on the active list.
*/
+ VM_OBJECT_LOCK(dst_object);
+ vm_page_lock_queues();
vm_page_activate(dst_m);
- vm_page_wakeup(dst_m);
vm_page_unlock_queues();
+ vm_page_wakeup(dst_m);
}
VM_OBJECT_UNLOCK(dst_object);
}
@@ -1248,15 +1215,7 @@
object = m->object;
pindex = m->pindex;
-
- /*
- * we don't fault-ahead for device pager
- */
- if (object->type == OBJT_DEVICE) {
- *reqpage = 0;
- marray[0] = m;
- return 1;
- }
+ cbehind = cahead = 0;
/*
* if the requested page is not available, then give up now
@@ -1280,17 +1239,6 @@
}
/*
- * try to do any readahead that we might have free pages for.
- */
- if ((rahead + rbehind) >
- ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) {
- pagedaemon_wakeup();
- marray[0] = m;
- *reqpage = 0;
- return 1;
- }
-
- /*
* scan backward for the read behind pages -- in memory
*/
if (pindex > 0) {
@@ -1301,30 +1249,29 @@
startpindex = pindex - rbehind;
}
- for (tpindex = pindex - 1; tpindex >= startpindex; tpindex -= 1) {
- if (vm_page_lookup(object, tpindex)) {
- startpindex = tpindex + 1;
- break;
- }
- if (tpindex == 0)
- break;
- }
-
- for (i = 0, tpindex = startpindex; tpindex < pindex; i++, tpindex++) {
+ if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
+ rtm->pindex >= startpindex)
+ startpindex = rtm->pindex + 1;
+
+ /* tpindex is unsigned; beware of numeric underflow. */
+ for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
+ tpindex < pindex; i++, tpindex--) {
- rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+ rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
+ VM_ALLOC_IFNOTCACHED);
if (rtm == NULL) {
- vm_page_lock_queues();
+ /*
+ * Shift the allocated pages to the
+ * beginning of the array.
+ */
for (j = 0; j < i; j++) {
- vm_page_free(marray[j]);
+ marray[j] = marray[j + tpindex + 1 -
+ startpindex];
}
- vm_page_unlock_queues();
- marray[0] = m;
- *reqpage = 0;
- return 1;
+ break;
}
- marray[i] = rtm;
+ marray[tpindex - startpindex] = rtm;
}
} else {
startpindex = 0;
@@ -1342,16 +1289,15 @@
* scan forward for the read ahead pages
*/
endpindex = tpindex + rahead;
+ if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
+ endpindex = rtm->pindex;
if (endpindex > object->size)
endpindex = object->size;
for (; tpindex < endpindex; i++, tpindex++) {
- if (vm_page_lookup(object, tpindex)) {
- break;
- }
-
- rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+ rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
+ VM_ALLOC_IFNOTCACHED);
if (rtm == NULL) {
break;
}
@@ -1359,6 +1305,6 @@
marray[i] = rtm;
}
- /* return number of bytes of pages */
+ /* return number of pages */
return i;
}
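
The rewritten read-behind loop above carries a subtle guard worth
spelling out: vm_pindex_t is unsigned, so decrementing tpindex past
zero wraps around to a huge value instead of going negative, and the
extra "tpindex < pindex" test is what terminates the loop. A minimal
sketch of the same idiom (scan_behind() is an invented helper):

    #include <vm/vm.h>

    /* Count indices from pindex - 1 down to startpindex, safely even
     * when startpindex == 0. */
    static int
    scan_behind(vm_pindex_t startpindex, vm_pindex_t pindex)
    {
            vm_pindex_t tpindex;
            int i;

            for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
                tpindex < pindex; i++, tpindex--)
                    ;       /* wrap after 0 fails "tpindex < pindex" */
            return (i);
    }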
Index: vm_object.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_object.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_object.c -L sys/vm/vm_object.c -u -r1.2 -r1.3
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -63,7 +63,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_object.c,v 1.349.2.4 2006/03/13 03:08:21 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_object.c,v 1.385.2.1 2007/10/19 05:48:45 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -110,6 +110,7 @@
static void vm_object_qcollapse(vm_object_t object);
static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
+static void vm_object_vndeallocate(vm_object_t object);
/*
* Virtual memory objects maintain the actual data
@@ -143,20 +144,17 @@
struct vm_object kernel_object_store;
struct vm_object kmem_object_store;
+SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
+
static long object_collapses;
-static long object_bypasses;
+SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
+ &object_collapses, 0, "VM object collapses");
-/*
- * next_index determines the page color that is assigned to the next
- * allocated object. Accesses to next_index are not synchronized
- * because the effects of two or more object allocations using
- * next_index simultaneously are inconsequential. At any given time,
- * numerous objects have the same page color.
- */
-static int next_index;
+static long object_bypasses;
+SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
+ &object_bypasses, 0, "VM object bypasses");
static uma_zone_t obj_zone;
-#define VM_OBJECTS_INIT 256
static int vm_object_zinit(void *mem, int size, int flags);
@@ -172,6 +170,9 @@
KASSERT(TAILQ_EMPTY(&object->memq),
("object %p has resident pages",
object));
+ KASSERT(object->cache == NULL,
+ ("object %p has cached pages",
+ object));
KASSERT(object->paging_in_progress == 0,
("object %p paging_in_progress = %d",
object, object->paging_in_progress));
@@ -203,7 +204,6 @@
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
{
- int incr;
TAILQ_INIT(&object->memq);
LIST_INIT(&object->shadow_head);
@@ -216,15 +216,11 @@
object->flags = 0;
if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
object->flags = OBJ_ONEMAPPING;
- if (size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
- incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
- else
- incr = size;
- object->pg_color = next_index;
- next_index = (object->pg_color + incr) & PQ_L2_MASK;
+ object->pg_color = 0;
object->handle = NULL;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
+ object->cache = NULL;
mtx_lock(&vm_object_list_mtx);
TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -243,11 +239,11 @@
mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
- _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
+ _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
kernel_object);
VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
- _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
+ _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
kmem_object);
/*
@@ -262,7 +258,6 @@
NULL,
#endif
vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
- uma_prealloc(obj_zone, VM_OBJECTS_INIT);
}
void
@@ -393,7 +388,7 @@
/*
* Handle deallocating an object of type OBJT_VNODE.
*/
-void
+static void
vm_object_vndeallocate(vm_object_t object)
{
struct vnode *vp = (struct vnode *) object->handle;
@@ -440,23 +435,37 @@
while (object != NULL) {
int vfslocked;
- /*
- * In general, the object should be locked when working with
- * its type. In this case, in order to maintain proper lock
- * ordering, an exception is possible because a vnode-backed
- * object never changes its type.
- */
+
vfslocked = 0;
- if (object->type == OBJT_VNODE) {
- struct vnode *vp = (struct vnode *) object->handle;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- }
+ restart:
VM_OBJECT_LOCK(object);
if (object->type == OBJT_VNODE) {
+ struct vnode *vp = (struct vnode *) object->handle;
+
+ /*
+ * Conditionally acquire Giant for a vnode-backed
+ * object. We have to be careful since the type of
+ * a vnode object can change while the object is
+ * unlocked.
+ */
+ if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
+ vfslocked = 1;
+ if (!mtx_trylock(&Giant)) {
+ VM_OBJECT_UNLOCK(object);
+ mtx_lock(&Giant);
+ goto restart;
+ }
+ }
vm_object_vndeallocate(object);
VFS_UNLOCK_GIANT(vfslocked);
return;
- }
+ } else
+ /*
+ * This is to handle the case that the object
+ * changed type while we dropped its lock to
+ * obtain Giant.
+ */
+ VFS_UNLOCK_GIANT(vfslocked);
KASSERT(object->ref_count != 0,
("vm_object_deallocate: object deallocated too many times: %d", object->type));
@@ -497,7 +506,7 @@
* priority than the current thread.
* Let the lower priority thread run.
*/
- tsleep(&proc0, PVM, "vmo_de", 1);
+ pause("vmo_de", 1);
continue;
}
/*
@@ -517,8 +526,11 @@
VM_OBJECT_UNLOCK(object);
vm_object_pip_wait(robject,
"objde1");
- VM_OBJECT_LOCK(object);
- goto retry;
+ temp = robject->backing_object;
+ if (object == temp) {
+ VM_OBJECT_LOCK(object);
+ goto retry;
+ }
} else if (object->paging_in_progress) {
VM_OBJECT_UNLOCK(robject);
object->flags |= OBJ_PIPWNT;
@@ -526,10 +538,14 @@
VM_OBJECT_MTX(object),
PDROP | PVM, "objde2", 0);
VM_OBJECT_LOCK(robject);
- VM_OBJECT_LOCK(object);
- goto retry;
- }
- VM_OBJECT_UNLOCK(object);
+ temp = robject->backing_object;
+ if (object == temp) {
+ VM_OBJECT_LOCK(object);
+ goto retry;
+ }
+ } else
+ VM_OBJECT_UNLOCK(object);
+
if (robject->ref_count == 1) {
robject->ref_count--;
object = robject;
@@ -624,7 +640,7 @@
*/
vm_page_lock_queues();
while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
- KASSERT(!p->busy && (p->flags & PG_BUSY) == 0,
+ KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
("vm_object_terminate: freeing busy page %p "
"p->busy = %d, p->flags %x\n", p, p->busy, p->flags));
if (p->wire_count == 0) {
@@ -636,6 +652,9 @@
}
vm_page_unlock_queues();
+ if (__predict_false(object->cache != NULL))
+ vm_page_cache_free(object, 0, 0);
+
/*
* Let the pager know object is dead.
*/
@@ -660,7 +679,7 @@
*
* Clean all dirty pages in the specified range of object. Leaves page
* on whatever queue it is currently on. If NOSYNC is set then do not
- * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
+ * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
* leaving the object dirty.
*
* When stuffing pages asynchronously, allow clustering. XXX we need a
@@ -720,8 +739,7 @@
while (tscan < tend) {
curgeneration = object->generation;
p = vm_page_lookup(object, tscan);
- if (p == NULL || p->valid == 0 ||
- (p->queue - p->pc) == PQ_CACHE) {
+ if (p == NULL || p->valid == 0) {
if (--scanlimit == 0)
break;
++tscan;
@@ -738,7 +756,7 @@
* If we have been asked to skip nosync pages and
* this is a nosync page, we can't continue.
*/
- if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
if (--scanlimit == 0)
break;
++tscan;
@@ -777,17 +795,17 @@
*/
clearobjflags = 1;
TAILQ_FOREACH(p, &object->memq, listq) {
- vm_page_flag_set(p, PG_CLEANCHK);
- if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
+ p->oflags |= VPO_CLEANCHK;
+ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
clearobjflags = 0;
else
- pmap_page_protect(p, VM_PROT_READ);
+ pmap_remove_write(p);
}
if (clearobjflags && (tstart == 0) && (tend == object->size)) {
struct vnode *vp;
- vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+ vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
if (object->type == OBJT_VNODE &&
(vp = (struct vnode *)object->handle) != NULL) {
VI_LOCK(vp);
@@ -807,17 +825,16 @@
again:
pi = p->pindex;
- if (((p->flags & PG_CLEANCHK) == 0) ||
+ if ((p->oflags & VPO_CLEANCHK) == 0 ||
(pi < tstart) || (pi >= tend) ||
- (p->valid == 0) ||
- ((p->queue - p->pc) == PQ_CACHE)) {
- vm_page_flag_clear(p, PG_CLEANCHK);
+ p->valid == 0) {
+ p->oflags &= ~VPO_CLEANCHK;
continue;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0) {
- vm_page_flag_clear(p, PG_CLEANCHK);
+ p->oflags &= ~VPO_CLEANCHK;
continue;
}
@@ -826,8 +843,8 @@
* nosync page, skip it. Note that the object flags were
* not cleared in this case so we do not have to set them.
*/
- if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
- vm_page_flag_clear(p, PG_CLEANCHK);
+ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
+ p->oflags &= ~VPO_CLEANCHK;
continue;
}
@@ -883,18 +900,14 @@
vm_page_t tp;
if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
- if ((tp->flags & PG_BUSY) ||
+ if ((tp->oflags & VPO_BUSY) ||
((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
- (tp->flags & PG_CLEANCHK) == 0) ||
+ (tp->oflags & VPO_CLEANCHK) == 0) ||
(tp->busy != 0))
break;
- if((tp->queue - tp->pc) == PQ_CACHE) {
- vm_page_flag_clear(tp, PG_CLEANCHK);
- break;
- }
vm_page_test_dirty(tp);
if ((tp->dirty & tp->valid) == 0) {
- vm_page_flag_clear(tp, PG_CLEANCHK);
+ tp->oflags &= ~VPO_CLEANCHK;
break;
}
maf[ i - 1 ] = tp;
@@ -911,18 +924,14 @@
vm_page_t tp;
if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
- if ((tp->flags & PG_BUSY) ||
+ if ((tp->oflags & VPO_BUSY) ||
((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
- (tp->flags & PG_CLEANCHK) == 0) ||
+ (tp->oflags & VPO_CLEANCHK) == 0) ||
(tp->busy != 0))
break;
- if ((tp->queue - tp->pc) == PQ_CACHE) {
- vm_page_flag_clear(tp, PG_CLEANCHK);
- break;
- }
vm_page_test_dirty(tp);
if ((tp->dirty & tp->valid) == 0) {
- vm_page_flag_clear(tp, PG_CLEANCHK);
+ tp->oflags &= ~VPO_CLEANCHK;
break;
}
mab[ i - 1 ] = tp;
@@ -936,22 +945,22 @@
for(i = 0; i < maxb; i++) {
int index = (maxb - i) - 1;
ma[index] = mab[i];
- vm_page_flag_clear(ma[index], PG_CLEANCHK);
+ ma[index]->oflags &= ~VPO_CLEANCHK;
}
- vm_page_flag_clear(p, PG_CLEANCHK);
+ p->oflags &= ~VPO_CLEANCHK;
ma[maxb] = p;
for(i = 0; i < maxf; i++) {
int index = (maxb + i) + 1;
ma[index] = maf[i];
- vm_page_flag_clear(ma[index], PG_CLEANCHK);
+ ma[index]->oflags &= ~VPO_CLEANCHK;
}
runlen = maxb + maxf + 1;
vm_pageout_flush(ma, runlen, pagerflags);
for (i = 0; i < runlen; i++) {
if (ma[i]->valid & ma[i]->dirty) {
- pmap_page_protect(ma[i], VM_PROT_READ);
- vm_page_flag_set(ma[i], PG_CLEANCHK);
+ pmap_remove_write(ma[i]);
+ ma[i]->oflags |= VPO_CLEANCHK;
/*
* maxf will end up being the actual number of pages
@@ -1092,6 +1101,13 @@
}
}
m = vm_page_lookup(tobject, tpindex);
+ if (m == NULL && advise == MADV_WILLNEED) {
+ /*
+ * If the page is cached, reactivate it.
+ */
+ m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
+ VM_ALLOC_NOBUSY);
+ }
if (m == NULL) {
/*
* There may be swap even if there is no backing page
@@ -1125,12 +1141,13 @@
vm_page_unlock_queues();
goto unlock_tobject;
}
- if ((m->flags & PG_BUSY) || m->busy) {
- vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
+ if ((m->oflags & VPO_BUSY) || m->busy) {
+ vm_page_flag_set(m, PG_REFERENCED);
+ vm_page_unlock_queues();
if (object != tobject)
VM_OBJECT_UNLOCK(object);
- VM_OBJECT_UNLOCK(tobject);
- msleep(m, &vm_page_queue_mtx, PDROP | PVM, "madvpo", 0);
+ m->oflags |= VPO_WANTED;
+ msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0);
VM_OBJECT_LOCK(object);
goto relookup;
}
@@ -1231,17 +1248,8 @@
LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
source->shadow_count++;
source->generation++;
- if (length < source->size)
- length = source->size;
- if (length > PQ_L2_SIZE / 3 + PQ_PRIME1 ||
- source->generation > 1)
- length = PQ_L2_SIZE / 3 + PQ_PRIME1;
- result->pg_color = (source->pg_color +
- length * source->generation) & PQ_L2_MASK;
result->flags |= source->flags & OBJ_NEEDGIANT;
VM_OBJECT_UNLOCK(source);
- next_index = (result->pg_color + PQ_L2_SIZE / 3 + PQ_PRIME1) &
- PQ_L2_MASK;
}
@@ -1262,10 +1270,10 @@
void
vm_object_split(vm_map_entry_t entry)
{
- vm_page_t m;
+ vm_page_t m, m_next;
vm_object_t orig_object, new_object, source;
- vm_pindex_t offidxstart, offidxend;
- vm_size_t idx, size;
+ vm_pindex_t idx, offidxstart;
+ vm_size_t size;
orig_object = entry->object.vm_object;
if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
@@ -1275,8 +1283,7 @@
VM_OBJECT_UNLOCK(orig_object);
offidxstart = OFF_TO_IDX(entry->offset);
- offidxend = offidxstart + OFF_TO_IDX(entry->end - entry->start);
- size = offidxend - offidxstart;
+ size = atop(entry->end - entry->start);
/*
* If swap_pager_copy() is later called, it will convert new_object
@@ -1284,11 +1291,23 @@
*/
new_object = vm_object_allocate(OBJT_DEFAULT, size);
+ /*
+ * At this point, the new object is still private, so the order in
+ * which the original and new objects are locked does not matter.
+ */
VM_OBJECT_LOCK(new_object);
VM_OBJECT_LOCK(orig_object);
source = orig_object->backing_object;
if (source != NULL) {
VM_OBJECT_LOCK(source);
+ if ((source->flags & OBJ_DEAD) != 0) {
+ VM_OBJECT_UNLOCK(source);
+ VM_OBJECT_UNLOCK(orig_object);
+ VM_OBJECT_UNLOCK(new_object);
+ vm_object_deallocate(new_object);
+ VM_OBJECT_LOCK(orig_object);
+ return;
+ }
LIST_INSERT_HEAD(&source->shadow_head,
new_object, shadow_list);
source->shadow_count++;
@@ -1301,12 +1320,18 @@
new_object->backing_object = source;
}
new_object->flags |= orig_object->flags & OBJ_NEEDGIANT;
+retry:
+ if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
+ if (m->pindex < offidxstart) {
+ m = vm_page_splay(offidxstart, orig_object->root);
+ if ((orig_object->root = m)->pindex < offidxstart)
+ m = TAILQ_NEXT(m, listq);
+ }
+ }
vm_page_lock_queues();
- for (idx = 0; idx < size; idx++) {
- retry:
- m = vm_page_lookup(orig_object, offidxstart + idx);
- if (m == NULL)
- continue;
+ for (; m != NULL && (idx = m->pindex - offidxstart) < size;
+ m = m_next) {
+ m_next = TAILQ_NEXT(m, listq);
/*
* We must wait for pending I/O to complete before we can
@@ -1315,14 +1340,13 @@
* We do not have to VM_PROT_NONE the page as mappings should
* not be changed by this operation.
*/
- if ((m->flags & PG_BUSY) || m->busy) {
- vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
- VM_OBJECT_UNLOCK(orig_object);
+ if ((m->oflags & VPO_BUSY) || m->busy) {
+ vm_page_flag_set(m, PG_REFERENCED);
+ vm_page_unlock_queues();
VM_OBJECT_UNLOCK(new_object);
- msleep(m, &vm_page_queue_mtx, PDROP | PVM, "spltwt", 0);
+ m->oflags |= VPO_WANTED;
+ msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
VM_OBJECT_LOCK(new_object);
- VM_OBJECT_LOCK(orig_object);
- vm_page_lock_queues();
goto retry;
}
vm_page_rename(m, new_object, idx);
@@ -1336,12 +1360,17 @@
* and new_object's locks are released and reacquired.
*/
swap_pager_copy(orig_object, new_object, offidxstart, 0);
+
+ /*
+ * Transfer any cached pages from orig_object to new_object.
+ */
+ if (__predict_false(orig_object->cache != NULL))
+ vm_page_cache_transfer(orig_object, offidxstart,
+ new_object);
}
VM_OBJECT_UNLOCK(orig_object);
- vm_page_lock_queues();
TAILQ_FOREACH(m, &new_object->memq, listq)
vm_page_wakeup(m);
- vm_page_unlock_queues();
VM_OBJECT_UNLOCK(new_object);
entry->object.vm_object = new_object;
entry->offset = 0LL;
@@ -1372,8 +1401,8 @@
*/
if (op & OBSC_TEST_ALL_SHADOWED) {
/*
- * We do not want to have to test for the existence of
- * swap pages in the backing object. XXX but with the
+ * We do not want to have to test for the existence of cache
+ * or swap pages in the backing object. XXX but with the
* new swapper this would be pretty easy to do.
*
* XXX what about anonymous MAP_SHARED memory that hasn't
@@ -1442,20 +1471,20 @@
vm_page_t pp;
if (op & OBSC_COLLAPSE_NOWAIT) {
- if ((p->flags & PG_BUSY) ||
+ if ((p->oflags & VPO_BUSY) ||
!p->valid ||
p->busy) {
p = next;
continue;
}
} else if (op & OBSC_COLLAPSE_WAIT) {
- if ((p->flags & PG_BUSY) || p->busy) {
+ if ((p->oflags & VPO_BUSY) || p->busy) {
vm_page_lock_queues();
- vm_page_flag_set(p,
- PG_WANTED | PG_REFERENCED);
- VM_OBJECT_UNLOCK(backing_object);
+ vm_page_flag_set(p, PG_REFERENCED);
+ vm_page_unlock_queues();
VM_OBJECT_UNLOCK(object);
- msleep(p, &vm_page_queue_mtx,
+ p->oflags |= VPO_WANTED;
+ msleep(p, VM_OBJECT_MTX(backing_object),
PDROP | PVM, "vmocol", 0);
VM_OBJECT_LOCK(object);
VM_OBJECT_LOCK(backing_object);
@@ -1646,6 +1675,12 @@
backing_object,
object,
OFF_TO_IDX(object->backing_object_offset), TRUE);
+
+ /*
+ * Free any cached pages from backing_object.
+ */
+ if (__predict_false(backing_object->cache != NULL))
+ vm_page_cache_free(backing_object, 0, 0);
}
/*
* Object now shadows whatever backing_object did.
@@ -1765,14 +1800,15 @@
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->resident_page_count == 0)
- return;
+ goto skipmemq;
/*
* Since physically-backed objects do not use managed pages, we can't
* remove pages from the object (we must instead remove the page
* references, and then destroy the object).
*/
- KASSERT(object->type != OBJT_PHYS,
+ KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
+ object == kmem_object,
("attempt to remove pages from a physical object"));
vm_object_pip_add(object, 1);
@@ -1804,7 +1840,7 @@
if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
goto again;
if (clean_only && p->valid) {
- pmap_page_protect(p, VM_PROT_READ | VM_PROT_EXECUTE);
+ pmap_remove_write(p);
if (p->valid & p->dirty)
continue;
}
@@ -1813,6 +1849,9 @@
}
vm_page_unlock_queues();
vm_object_pip_wakeup(object);
+skipmemq:
+ if (__predict_false(object->cache != NULL))
+ vm_page_cache_free(object, start, end);
}
/*
@@ -1903,10 +1942,9 @@
struct vnode *vp;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
- if ((object->flags & (OBJ_MIGHTBEDIRTY|OBJ_WRITEABLE)) ==
- (OBJ_MIGHTBEDIRTY|OBJ_WRITEABLE))
+ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0)
return;
- vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+ vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
if (object->type == OBJT_VNODE &&
(vp = (struct vnode *)object->handle) != NULL) {
VI_LOCK(vp);
@@ -1968,7 +2006,7 @@
struct proc *p;
/* sx_slock(&allproc_lock); */
- LIST_FOREACH(p, &allproc, p_list) {
+ FOREACH_PROC_IN_SYSTEM(p) {
if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
continue;
if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
@@ -2090,7 +2128,7 @@
TAILQ_FOREACH(object, &vm_object_list, object_list) {
vm_pindex_t idx, fidx;
vm_pindex_t osize;
- vm_paddr_t pa = -1, padiff;
+ vm_paddr_t pa = -1;
int rcount;
vm_page_t m;
@@ -2132,17 +2170,8 @@
continue;
}
if (rcount) {
- padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
- padiff >>= PAGE_SHIFT;
- padiff &= PQ_L2_MASK;
- if (padiff == 0) {
- pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
- ++rcount;
- continue;
- }
- db_printf(" index(%ld)run(%d)pa(0x%lx)",
+ db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
(long)fidx, rcount, (long)pa);
- db_printf("pd(%ld)\n", (long)padiff);
if (nl > 18) {
c = cngetc();
if (c != ' ')
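The recurring change throughout the vm_object.c hunks above is that a page's busy and wanted state moves from the flags field (PG_BUSY/PG_WANTED, covered by the global page queues mutex) into oflags (VPO_BUSY/VPO_WANTED, covered by the owning object's mutex), so waiters now sleep on VM_OBJECT_MTX() instead of vm_page_queue_mtx. A minimal userland analogue of that handshake, with pthreads standing in for msleep()/wakeup() and all names invented for illustration:

/*
 * Userland sketch of the VPO_BUSY/VPO_WANTED handshake: both bits live
 * under one per-object mutex, so waiters contend per object rather
 * than on a single global lock.
 */
#include <pthread.h>
#include <stdio.h>

struct page {
	pthread_mutex_t *obj_lock;	/* stands in for VM_OBJECT_MTX(obj) */
	pthread_cond_t	 cv;		/* stands in for msleep()/wakeup() */
	int		 busy;		/* VPO_BUSY */
	int		 wanted;	/* VPO_WANTED */
};

static void
page_sleep_if_busy(struct page *p)
{
	pthread_mutex_lock(p->obj_lock);
	while (p->busy) {
		p->wanted = 1;		/* m->oflags |= VPO_WANTED */
		pthread_cond_wait(&p->cv, p->obj_lock);
	}
	pthread_mutex_unlock(p->obj_lock);
}

static void
page_wakeup(struct page *p)		/* vm_page_wakeup() analogue */
{
	pthread_mutex_lock(p->obj_lock);
	p->busy = 0;
	if (p->wanted) {
		p->wanted = 0;
		pthread_cond_broadcast(&p->cv);
	}
	pthread_mutex_unlock(p->obj_lock);
}

int
main(void)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct page p = { &lock, PTHREAD_COND_INITIALIZER, 1, 0 };

	page_wakeup(&p);	/* unbusy the page; no waiters yet */
	page_sleep_if_busy(&p);	/* returns immediately */
	printf("busy=%d wanted=%d\n", p.busy, p.wanted);
	return (0);
}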
Index: vm_mmap.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_mmap.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/vm/vm_mmap.c -L sys/vm/vm_mmap.c -u -r1.1.1.2 -r1.2
--- sys/vm/vm_mmap.c
+++ sys/vm/vm_mmap.c
@@ -41,9 +41,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_mmap.c,v 1.200.2.2 2005/12/26 13:47:20 dds Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_mmap.c,v 1.213 2007/08/20 12:05:45 kib Exp $");
#include "opt_compat.h"
+#include "opt_hwpmc_hooks.h"
#include "opt_mac.h"
#include <sys/param.h>
@@ -53,13 +54,13 @@
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
-#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
@@ -67,6 +68,8 @@
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
+#include <security/mac/mac_framework.h>
+
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
@@ -79,6 +82,10 @@
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
@@ -201,6 +208,9 @@
struct thread *td;
struct mmap_args *uap;
{
+#ifdef HWPMC_HOOKS
+ struct pmckern_map_in pkm;
+#endif
struct file *fp;
struct vnode *vp;
vm_offset_t addr;
@@ -297,7 +307,7 @@
if ((error = fget(td, uap->fd, &fp)) != 0)
goto done;
if (fp->f_type != DTYPE_VNODE) {
- error = EINVAL;
+ error = ENODEV;
goto done;
}
/*
@@ -364,6 +374,15 @@
error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
flags, handle_type, handle, pos);
+#ifdef HWPMC_HOOKS
+ /* inform hwpmc(4) if an executable is being mapped */
+ if (error == 0 && handle_type == OBJT_VNODE &&
+ (prot & PROT_EXEC)) {
+ pkm.pm_file = handle;
+ pkm.pm_address = (uintptr_t) addr;
+ PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
+ }
+#endif
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
@@ -373,6 +392,20 @@
return (error);
}
+int
+freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
+{
+ struct mmap_args oargs;
+
+ oargs.addr = uap->addr;
+ oargs.len = uap->len;
+ oargs.prot = uap->prot;
+ oargs.flags = uap->flags;
+ oargs.fd = uap->fd;
+ oargs.pos = uap->pos;
+ return (mmap(td, &oargs));
+}
+
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
@@ -495,6 +528,10 @@
struct thread *td;
struct munmap_args *uap;
{
+#ifdef HWPMC_HOOKS
+ struct pmckern_map_out pkm;
+ vm_map_entry_t entry;
+#endif
vm_offset_t addr;
vm_size_t size, pageoff;
vm_map_t map;
@@ -525,6 +562,26 @@
vm_map_unlock(map);
return (EINVAL);
}
+#ifdef HWPMC_HOOKS
+ /*
+ * Inform hwpmc if the address range being unmapped contains
+ * an executable region.
+ */
+ if (vm_map_lookup_entry(map, addr, &entry)) {
+ for (;
+ entry != &map->header && entry->start < addr + size;
+ entry = entry->next) {
+ if (vm_map_check_protection(map, entry->start,
+ entry->end, VM_PROT_EXECUTE) == TRUE) {
+ pkm.pm_address = (uintptr_t) addr;
+ pkm.pm_size = (size_t) size;
+ PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
+ (void *) &pkm);
+ break;
+ }
+ }
+ }
+#endif
/* returns nothing but KERN_SUCCESS anyway */
vm_map_delete(map, addr, addr + size);
vm_map_unlock(map);
@@ -642,7 +699,7 @@
* "immortal."
*/
if (uap->behav == MADV_PROTECT) {
- error = suser(td);
+ error = priv_check(td, PRIV_VM_MADV_PROTECT);
if (error == 0) {
p = td->td_proc;
PROC_LOCK(p);
@@ -716,7 +773,7 @@
end = addr + (vm_size_t)round_page(uap->len);
map = &td->td_proc->p_vmspace->vm_map;
if (end > vm_map_max(map) || end < addr)
- return (EINVAL);
+ return (ENOMEM);
/*
* Address of byte vector
@@ -729,8 +786,10 @@
RestartScan:
timestamp = map->timestamp;
- if (!vm_map_lookup_entry(map, addr, &entry))
- entry = entry->next;
+ if (!vm_map_lookup_entry(map, addr, &entry)) {
+ vm_map_unlock_read(map);
+ return (ENOMEM);
+ }
/*
* Do this on a map entry basis so that if the pages are not
@@ -743,6 +802,16 @@
current = current->next) {
/*
+ * check for contiguity
+ */
+ if (current->end < end &&
+ (entry->next == &map->header ||
+ current->next->start > current->end)) {
+ vm_map_unlock_read(map);
+ return (ENOMEM);
+ }
+
+ /*
* ignore submaps (for now) or null objects
*/
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
@@ -897,7 +966,7 @@
vm_size_t npages, size;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
@@ -962,7 +1031,7 @@
}
PROC_UNLOCK(td->td_proc);
#else
- error = suser(td);
+ error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
#endif
@@ -1007,7 +1076,7 @@
int error;
map = &td->td_proc->p_vmspace->vm_map;
- error = suser(td);
+ error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
@@ -1041,7 +1110,7 @@
vm_size_t size;
int error;
- error = suser(td);
+ error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
@@ -1236,7 +1305,7 @@
vm_ooffset_t foff)
{
boolean_t fitit;
- vm_object_t object;
+ vm_object_t object = NULL;
int rv = KERN_SUCCESS;
int docow, error;
struct thread *td = curthread;
@@ -1272,7 +1341,6 @@
if (*addr != trunc_page(*addr))
return (EINVAL);
fitit = FALSE;
- (void) vm_map_remove(map, *addr, *addr + size);
}
/*
* Lookup/allocate object.
@@ -1294,6 +1362,7 @@
/* FALLTHROUGH */
default:
error = EINVAL;
+ break;
}
if (error)
return (error);
@@ -1330,8 +1399,11 @@
if (flags & MAP_STACK)
rv = vm_map_stack(map, *addr, size, prot, maxprot,
docow | MAP_STACK_GROWS_DOWN);
+ else if (fitit)
+ rv = vm_map_find(map, object, foff, addr, size, TRUE,
+ prot, maxprot, docow);
else
- rv = vm_map_find(map, object, foff, addr, size, fitit,
+ rv = vm_map_fixed(map, object, foff, addr, size,
prot, maxprot, docow);
if (rv != KERN_SUCCESS) {
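Two orthogonal changes run through the vm_mmap.c diff above: blanket suser() checks become named privileges (priv_check() with PRIV_VM_MLOCK and friends), and munmap() gains an hwpmc(4) hook that scans the map entries overlapping [addr, addr + size) and fires PMC_FN_MUNMAP once if any of them is executable. That scan reduces to a simple walk of intervals ordered by start address; a runnable toy version, with the interval list and all names invented:

/* Toy version of the executable-range scan in the munmap() hook:
 * walk entries ordered by start address, stop once past the range,
 * report at the first executable overlap.  Purely illustrative. */
#include <stdio.h>

struct entry { unsigned long start, end; int exec; };

static int
range_has_exec(const struct entry *e, int n, unsigned long addr,
    unsigned long size)
{
	for (int i = 0; i < n && e[i].start < addr + size; i++)
		if (e[i].end > addr && e[i].exec)
			return (1);	/* would fire PMC_FN_MUNMAP once */
	return (0);
}

int
main(void)
{
	struct entry map[] = {
		{ 0x1000, 0x2000, 0 },
		{ 0x2000, 0x3000, 1 },	/* a text segment */
		{ 0x5000, 0x6000, 0 },
	};

	printf("%d\n", range_has_exec(map, 3, 0x1800, 0x1000)); /* 1 */
	printf("%d\n", range_has_exec(map, 3, 0x5000, 0x1000)); /* 0 */
	return (0);
}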
Index: vm_param.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_param.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_param.h -L sys/vm/vm_param.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_param.h
+++ sys/vm/vm_param.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm_param.h,v 1.21 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_param.h,v 1.22 2007/04/19 04:52:47 alc Exp $
*/
/*
@@ -79,8 +79,8 @@
#define VM_V_FREE_TARGET 4 /* cnt.v_free_target */
#define VM_V_FREE_RESERVED 5 /* cnt.v_free_reserved */
#define VM_V_INACTIVE_TARGET 6 /* cnt.v_inactive_target */
-#define VM_V_CACHE_MIN 7 /* cnt.v_cache_max */
-#define VM_V_CACHE_MAX 8 /* cnt.v_cache_min */
+#define VM_V_CACHE_MIN 7 /* cnt.v_cache_min */
+#define VM_V_CACHE_MAX 8 /* cnt.v_cache_max */
#define VM_V_PAGEOUT_FREE_MIN 9 /* cnt.v_pageout_free_min */
#define VM_PAGEOUT_ALGORITHM 10 /* pageout algorithm */
#define VM_SWAPPING_ENABLED 11 /* swapping enabled */
Index: vm_contig.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_contig.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_contig.c -L sys/vm/vm_contig.c -u -r1.2 -r1.3
--- sys/vm/vm_contig.c
+++ sys/vm/vm_contig.c
@@ -60,12 +60,13 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_contig.c,v 1.43.2.3.2.1 2006/04/25 15:29:50 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_contig.c,v 1.63 2007/09/25 06:25:06 alc Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kernel.h>
@@ -83,6 +84,7 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
#include <vm/vm_extern.h>
static int
@@ -92,6 +94,7 @@
vm_page_t m_tmp;
struct vnode *vp;
struct mount *mp;
+ int vfslocked;
object = m->object;
if (!VM_OBJECT_TRYLOCK(object))
@@ -115,11 +118,13 @@
vm_object_reference_locked(object);
VM_OBJECT_UNLOCK(object);
(void) vn_start_write(vp, &mp, V_WAIT);
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
VM_OBJECT_LOCK(object);
vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
VM_OBJECT_UNLOCK(object);
VOP_UNLOCK(vp, 0, curthread);
+ VFS_UNLOCK_GIANT(vfslocked);
vm_object_deallocate(object);
vn_finished_write(mp);
vm_page_lock_queues();
@@ -150,7 +155,7 @@
if ((m->flags & PG_MARKER) != 0)
continue;
- KASSERT(m->queue == queue,
+ KASSERT(VM_PAGE_INQUEUE2(m, queue),
("vm_contig_launder: page %p's queue is not %d", m, queue));
error = vm_contig_launder_page(m);
if (error == 0)
@@ -161,191 +166,6 @@
return (FALSE);
}
-/*
- * This interface is for merging with malloc() someday.
- * Even if we never implement compaction so that contiguous allocation
- * works after initialization time, malloc()'s data structures are good
- * for statistics and for allocations of less than a page.
- */
-static void *
-contigmalloc1(
- unsigned long size, /* should be size_t here and for malloc() */
- struct malloc_type *type,
- int flags,
- vm_paddr_t low,
- vm_paddr_t high,
- unsigned long alignment,
- unsigned long boundary,
- vm_map_t map)
-{
- int i, start;
- vm_paddr_t phys;
- vm_object_t object;
- vm_offset_t addr, tmp_addr;
- int pass, pqtype;
- int inactl, actl, inactmax, actmax;
- vm_page_t pga = vm_page_array;
-
- size = round_page(size);
- if (size == 0)
- panic("contigmalloc1: size must not be 0");
- if ((alignment & (alignment - 1)) != 0)
- panic("contigmalloc1: alignment must be a power of 2");
- if ((boundary & (boundary - 1)) != 0)
- panic("contigmalloc1: boundary must be a power of 2");
-
- start = 0;
- for (pass = 2; pass >= 0; pass--) {
- vm_page_lock_queues();
-again0:
- mtx_lock_spin(&vm_page_queue_free_mtx);
-again:
- /*
- * Find first page in array that is free, within range,
- * aligned, and such that the boundary won't be crossed.
- */
- for (i = start; i < cnt.v_page_count; i++) {
- phys = VM_PAGE_TO_PHYS(&pga[i]);
- pqtype = pga[i].queue - pga[i].pc;
- if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
- (phys >= low) && (phys < high) &&
- ((phys & (alignment - 1)) == 0) &&
- (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
- break;
- }
-
- /*
- * If the above failed or we will exceed the upper bound, fail.
- */
- if ((i == cnt.v_page_count) ||
- ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- /*
- * Instead of racing to empty the inactive/active
- * queues, give up, even with more left to free,
- * if we try more than the initial amount of pages.
- *
- * There's no point attempting this on the last pass.
- */
- if (pass > 0) {
- inactl = actl = 0;
- inactmax = vm_page_queues[PQ_INACTIVE].lcnt;
- actmax = vm_page_queues[PQ_ACTIVE].lcnt;
-again1:
- if (inactl < inactmax &&
- vm_contig_launder(PQ_INACTIVE)) {
- inactl++;
- goto again1;
- }
- if (actl < actmax &&
- vm_contig_launder(PQ_ACTIVE)) {
- actl++;
- goto again1;
- }
- }
- vm_page_unlock_queues();
- continue;
- }
- start = i;
-
- /*
- * Check successive pages for contiguous and free.
- */
- for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
- pqtype = pga[i].queue - pga[i].pc;
- if ((VM_PAGE_TO_PHYS(&pga[i]) !=
- (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
- ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
- start++;
- goto again;
- }
- }
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- for (i = start; i < (start + size / PAGE_SIZE); i++) {
- vm_page_t m = &pga[i];
-
- if ((m->queue - m->pc) == PQ_CACHE) {
- if (m->hold_count != 0) {
- start++;
- goto again0;
- }
- object = m->object;
- if (!VM_OBJECT_TRYLOCK(object)) {
- start++;
- goto again0;
- }
- if ((m->flags & PG_BUSY) || m->busy != 0) {
- VM_OBJECT_UNLOCK(object);
- start++;
- goto again0;
- }
- vm_page_free(m);
- VM_OBJECT_UNLOCK(object);
- }
- }
- mtx_lock_spin(&vm_page_queue_free_mtx);
- for (i = start; i < (start + size / PAGE_SIZE); i++) {
- pqtype = pga[i].queue - pga[i].pc;
- if (pqtype != PQ_FREE) {
- start++;
- goto again;
- }
- }
- for (i = start; i < (start + size / PAGE_SIZE); i++) {
- vm_page_t m = &pga[i];
- vm_pageq_remove_nowakeup(m);
- m->valid = VM_PAGE_BITS_ALL;
- if (m->flags & PG_ZERO)
- vm_page_zero_count--;
- /* Don't clear the PG_ZERO flag, we'll need it later. */
- m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
- KASSERT(m->dirty == 0,
- ("contigmalloc1: page %p was dirty", m));
- m->wire_count = 0;
- m->busy = 0;
- }
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- vm_page_unlock_queues();
- /*
- * We've found a contiguous chunk that meets our requirements.

- * Allocate kernel VM, unfree and assign the physical pages to
- * it and return kernel VM pointer.
- */
- vm_map_lock(map);
- if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
- KERN_SUCCESS) {
- /*
- * XXX We almost never run out of kernel virtual
- * space, so we don't make the allocated memory
- * above available.
- */
- vm_map_unlock(map);
- return (NULL);
- }
- vm_object_reference(kernel_object);
- vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
- addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
- vm_map_unlock(map);
-
- tmp_addr = addr;
- VM_OBJECT_LOCK(kernel_object);
- for (i = start; i < (start + size / PAGE_SIZE); i++) {
- vm_page_t m = &pga[i];
- vm_page_insert(m, kernel_object,
- OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
- if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
- pmap_zero_page(m);
- tmp_addr += PAGE_SIZE;
- }
- VM_OBJECT_UNLOCK(kernel_object);
- vm_map_wire(map, addr, addr + size,
- VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
-
- return ((void *)addr);
- }
- return (NULL);
-}
-
static void
vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
{
@@ -355,7 +175,7 @@
}
}
-void
+static void
vm_page_release_contig(vm_page_t m, vm_pindex_t count)
{
vm_page_lock_queues();
@@ -363,162 +183,6 @@
vm_page_unlock_queues();
}
-static int
-vm_contig_unqueue_free(vm_page_t m)
-{
- int error = 0;
-
- mtx_lock_spin(&vm_page_queue_free_mtx);
- if ((m->queue - m->pc) == PQ_FREE)
- vm_pageq_remove_nowakeup(m);
- else
- error = EAGAIN;
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- if (error)
- return (error);
- m->valid = VM_PAGE_BITS_ALL;
- if (m->flags & PG_ZERO)
- vm_page_zero_count--;
- /* Don't clear the PG_ZERO flag; we'll need it later. */
- m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
- KASSERT(m->dirty == 0,
- ("contigmalloc2: page %p was dirty", m));
- m->wire_count = 0;
- m->busy = 0;
- return (error);
-}
-
-vm_page_t
-vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
- vm_offset_t alignment, vm_offset_t boundary)
-{
- vm_object_t object;
- vm_offset_t size;
- vm_paddr_t phys;
- vm_page_t pga = vm_page_array;
- int i, pass, pqtype, start;
-
- size = npages << PAGE_SHIFT;
- if (size == 0)
- panic("vm_page_alloc_contig: size must not be 0");
- if ((alignment & (alignment - 1)) != 0)
- panic("vm_page_alloc_contig: alignment must be a power of 2");
- if ((boundary & (boundary - 1)) != 0)
- panic("vm_page_alloc_contig: boundary must be a power of 2");
-
- for (pass = 0; pass < 2; pass++) {
- if (atop(high) < vm_page_array_size)
- start = atop(high) - npages + 1;
- else
- start = vm_page_array_size - npages + 1;
- vm_page_lock_queues();
-retry:
- start--;
- /*
- * Find last page in array that is free, within range,
- * aligned, and such that the boundary won't be crossed.
- */
- for (i = start; i >= 0; i--) {
- phys = VM_PAGE_TO_PHYS(&pga[i]);
- pqtype = pga[i].queue - pga[i].pc;
- if (pass == 0) {
- if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
- continue;
- } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
- pga[i].queue != PQ_ACTIVE &&
- pga[i].queue != PQ_INACTIVE)
- continue;
- if (phys >= low && phys + size <= high &&
- ((phys & (alignment - 1)) == 0) &&
- ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
- break;
- }
- /* There are no candidates at all. */
- if (i == -1) {
- vm_page_unlock_queues();
- continue;
- }
- start = i;
- /*
- * Check successive pages for contiguous and free.
- */
- for (i = start + npages - 1; i > start; i--) {
- pqtype = pga[i].queue - pga[i].pc;
- if (VM_PAGE_TO_PHYS(&pga[i]) !=
- VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE) {
- start = i - npages + 1;
- goto retry;
- }
- if (pass == 0) {
- if (pqtype != PQ_FREE && pqtype != PQ_CACHE) {
- start = i - npages + 1;
- goto retry;
- }
- } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
- pga[i].queue != PQ_ACTIVE &&
- pga[i].queue != PQ_INACTIVE) {
- start = i - npages + 1;
- goto retry;
- }
- }
- for (i = start + npages - 1; i >= start; i--) {
- vm_page_t m = &pga[i];
-
-retry_page:
- pqtype = m->queue - m->pc;
- if (pass != 0 && pqtype != PQ_FREE &&
- pqtype != PQ_CACHE) {
- switch (m->queue) {
- case PQ_ACTIVE:
- case PQ_INACTIVE:
- if (vm_contig_launder_page(m) != 0)
- goto cleanup_freed;
- pqtype = m->queue - m->pc;
- if (pqtype == PQ_FREE ||
- pqtype == PQ_CACHE)
- break;
- default:
-cleanup_freed:
- vm_page_release_contigl(&pga[i + 1],
- start + npages - 1 - i);
- start = i - npages + 1;
- goto retry;
- }
- }
- if (pqtype == PQ_CACHE) {
- if (m->hold_count != 0) {
- start = i - npages + 1;
- goto retry;
- }
- object = m->object;
- if (!VM_OBJECT_TRYLOCK(object)) {
- start = i - npages + 1;
- goto retry;
- }
- if ((m->flags & PG_BUSY) || m->busy != 0) {
- VM_OBJECT_UNLOCK(object);
- start = i - npages + 1;
- goto retry;
- }
- vm_page_free(m);
- VM_OBJECT_UNLOCK(object);
- }
- /*
- * There is no good API for freeing a page
- * directly to PQ_NONE on our behalf, so spin.
- */
- if (vm_contig_unqueue_free(m) != 0)
- goto retry_page;
- }
- vm_page_unlock_queues();
- /*
- * We've found a contiguous chunk that meets our requirements.
- */
- return (&pga[start]);
- }
- return (NULL);
-}
-
static void *
contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
{
@@ -546,7 +210,7 @@
for (i = 0; i < npages; i++) {
vm_page_insert(&m[i], object,
OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
- if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
+ if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO))
pmap_zero_page(&m[i]);
tmp_addr += PAGE_SIZE;
}
@@ -556,11 +220,6 @@
return ((void *)addr);
}
-static int vm_old_contigmalloc = 0;
-SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
- CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
-TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
-
void *
contigmalloc(
unsigned long size, /* should be size_t here and for malloc() */
@@ -573,26 +232,41 @@
{
void * ret;
vm_page_t pages;
- vm_pindex_t npgs;
+ unsigned long npgs;
+ int actl, actmax, inactl, inactmax, tries;
npgs = round_page(size) >> PAGE_SHIFT;
- mtx_lock(&Giant);
- if (vm_old_contigmalloc) {
- ret = contigmalloc1(size, type, flags, low, high, alignment,
- boundary, kernel_map);
- } else {
- pages = vm_page_alloc_contig(npgs, low, high,
- alignment, boundary);
- if (pages == NULL) {
- ret = NULL;
- } else {
- ret = contigmalloc2(pages, npgs, flags);
- if (ret == NULL)
- vm_page_release_contig(pages, npgs);
+ tries = 0;
+retry:
+ pages = vm_phys_alloc_contig(npgs, low, high, alignment, boundary);
+ if (pages == NULL) {
+ if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+ vm_page_lock_queues();
+ inactl = 0;
+ inactmax = tries < 1 ? 0 : cnt.v_inactive_count;
+ actl = 0;
+ actmax = tries < 2 ? 0 : cnt.v_active_count;
+again:
+ if (inactl < inactmax &&
+ vm_contig_launder(PQ_INACTIVE)) {
+ inactl++;
+ goto again;
+ }
+ if (actl < actmax &&
+ vm_contig_launder(PQ_ACTIVE)) {
+ actl++;
+ goto again;
+ }
+ vm_page_unlock_queues();
+ tries++;
+ goto retry;
}
-
+ ret = NULL;
+ } else {
+ ret = contigmalloc2(pages, npgs, flags);
+ if (ret == NULL)
+ vm_page_release_contig(pages, npgs);
}
- mtx_unlock(&Giant);
malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
return (ret);
}
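The replacement contigmalloc() above no longer walks vm_page_array itself; it calls vm_phys_alloc_contig() and, on failure, launders progressively more aggressively before retrying: nothing on the first try, the inactive queue on the second, inactive plus active on the third, with M_NOWAIT callers limited to a single retry. The escalation schedule in isolation, as a runnable sketch where alloc_ok() stands in for vm_phys_alloc_contig():

/* Runnable sketch of the retry schedule in the new contigmalloc(). */
#include <stdio.h>
#include <stdbool.h>

static bool alloc_ok(int try) { return (try == 2); } /* pretend 3rd works */

int
main(void)
{
	int inactmax, actmax, tries = 0, m_nowait = 0;

retry:
	if (!alloc_ok(tries)) {
		if (tries < (m_nowait ? 1 : 3)) {
			inactmax = tries < 1 ? 0 : 100; /* cnt.v_inactive_count */
			actmax = tries < 2 ? 0 : 100;	/* cnt.v_active_count */
			printf("try %d: launder <=%d inactive, <=%d active\n",
			    tries, inactmax, actmax);
			tries++;
			goto retry;
		}
		printf("giving up\n");
		return (1);
	}
	printf("allocated on try %d\n", tries);
	return (0);
}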
Index: uma_int.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_int.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_int.h -L sys/vm/uma_int.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -24,7 +24,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD: src/sys/vm/uma_int.h,v 1.31.2.6 2005/08/15 09:01:11 rwatson Exp $
+ * $FreeBSD: src/sys/vm/uma_int.h,v 1.38 2007/05/09 22:53:34 rwatson Exp $
*
*/
@@ -75,13 +75,13 @@
* pair, as well as with its own set of small per-CPU caches, layered above
* the Zone's general Bucket cache.
*
- * The PCPU caches are protected by their own locks, while the Zones backed
- * by the same Keg all share a common Keg lock (to coalesce contention on
- * the backing slabs). The backing Keg typically only serves one Zone but
- * in the case of multiple Zones, one of the Zones is considered the
- * Master Zone and all Zone-related stats from the Keg are done in the
- * Master Zone. For an example of a Multi-Zone setup, refer to the
- * Mbuf allocation code.
+ * The PCPU caches are protected by critical sections, and may be accessed
+ * safely only from their associated CPU, while the Zones backed by the same
+ * Keg all share a common Keg lock (to coalesce contention on the backing
+ * slabs). The backing Keg typically only serves one Zone but in the case of
+ * multiple Zones, one of the Zones is considered the Master Zone and all
+ * Zone-related stats from the Keg are done in the Master Zone. For an
+ * example of a Multi-Zone setup, refer to the Mbuf allocation code.
*/
/*
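The rewritten comment above documents UMA's switch from per-CPU cache mutexes to critical sections: a cache may be touched only from its own CPU, with preemption disabled, so the fast path takes no lock and only keg refills contend. A runnable userland analogue, with thread-local storage standing in for the per-CPU cache that the kernel reaches under critical_enter(); the names and bucket size are invented:

/* Each "CPU" (here, each thread) owns a private cache that only it may
 * touch, so the fast path is lock-free; the shared keg needs a mutex. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t keg_lock = PTHREAD_MUTEX_INITIALIZER;
static int keg_free = 1024;		/* items in the shared keg */
static _Thread_local int cache = 0;	/* per-"CPU" bucket, lock-free */

static int
zalloc(void)
{
	if (cache > 0) {		/* fast path: no lock at all */
		cache--;
		return (1);
	}
	pthread_mutex_lock(&keg_lock);	/* slow path: refill from the keg */
	if (keg_free >= 32) {
		keg_free -= 32;
		cache = 32;
	}
	pthread_mutex_unlock(&keg_lock);
	if (cache > 0) {
		cache--;
		return (1);
	}
	return (0);			/* keg exhausted */
}

int
main(void)
{
	for (int i = 0; i < 100; i++)
		zalloc();
	printf("keg_free=%d cache=%d\n", keg_free, cache);
	return (0);
}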
Index: vm_object.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_object.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_object.h -L sys/vm/vm_object.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -57,7 +57,7 @@
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
- * $FreeBSD: src/sys/vm/vm_object.h,v 1.111 2005/05/03 11:11:26 jeff Exp $
+ * $FreeBSD: src/sys/vm/vm_object.h,v 1.114 2007/09/25 06:25:06 alc Exp $
*/
/*
@@ -100,6 +100,7 @@
struct vm_object *backing_object; /* object that I'm a shadow of */
vm_ooffset_t backing_object_offset;/* Offset in backing object */
TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
+ vm_page_t cache; /* root of the cache page splay tree */
void *handle;
union {
/*
@@ -140,7 +141,6 @@
#define OBJ_DEAD 0x0008 /* dead objects (during rundown) */
#define OBJ_NOSPLIT 0x0010 /* dont split this object */
#define OBJ_PIPWNT 0x0040 /* paging in progress wanted */
-#define OBJ_WRITEABLE 0x0080 /* object has been made writable */
#define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */
#define OBJ_CLEANING 0x0200
#define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */
@@ -201,7 +201,6 @@
void vm_object_collapse (vm_object_t);
void vm_object_deallocate (vm_object_t);
void vm_object_terminate (vm_object_t);
-void vm_object_vndeallocate (vm_object_t);
void vm_object_set_writeable_dirty (vm_object_t);
void vm_object_init (void);
void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
Index: vm_zeroidle.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_zeroidle.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_zeroidle.c -L sys/vm/vm_zeroidle.c -u -r1.2 -r1.3
--- sys/vm/vm_zeroidle.c
+++ sys/vm/vm_zeroidle.c
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/vm/vm_zeroidle.c,v 1.34.2.2 2006/06/16 22:11:55 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_zeroidle.c,v 1.49 2007/07/14 19:00:44 alc Exp $");
#include "opt_sched.h"
@@ -51,23 +51,14 @@
#include <vm/vm.h>
#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
-SYSCTL_DECL(_vm_stats_misc);
-
-static int cnt_prezero;
-SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
- &cnt_prezero, 0, "");
-
-static int idlezero_enable_default = 1;
+static int idlezero_enable_default = 0;
TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default);
/* Defer setting the enable flag until the kthread is running. */
static int idlezero_enable = 0;
SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, "");
-static int idlezero_maxrun = 16;
-SYSCTL_INT(_vm, OID_AUTO, idlezero_maxrun, CTLFLAG_RW, &idlezero_maxrun, 0, "");
-TUNABLE_INT("vm.idlezero_maxrun", &idlezero_maxrun);
-
/*
* Implement the pre-zeroed page mechanism.
*/
@@ -99,30 +90,16 @@
return (1);
}
-static int
+static void
vm_page_zero_idle(void)
{
- static int free_rover;
- vm_page_t m;
- mtx_lock_spin(&vm_page_queue_free_mtx);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
zero_state = 0;
- m = vm_pageq_find(PQ_FREE, free_rover, FALSE);
- if (m != NULL && (m->flags & PG_ZERO) == 0) {
- vm_pageq_remove_nowakeup(m);
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- pmap_zero_page_idle(m);
- mtx_lock_spin(&vm_page_queue_free_mtx);
- m->flags |= PG_ZERO;
- vm_pageq_enqueue(PQ_FREE + m->pc, m);
- ++vm_page_zero_count;
- ++cnt_prezero;
+ if (vm_phys_zero_pages_idle()) {
if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
zero_state = 1;
}
- free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK;
- mtx_unlock_spin(&vm_page_queue_free_mtx);
- return (1);
}
/* Called by vm_page_free to hint that a new page is available. */
@@ -130,7 +107,7 @@
vm_page_zero_idle_wakeup(void)
{
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
if (wakeup_needed && vm_page_zero_check()) {
wakeup_needed = FALSE;
wakeup(&zero_state);
@@ -143,21 +120,21 @@
idlezero_enable = idlezero_enable_default;
+ mtx_lock(&vm_page_queue_free_mtx);
for (;;) {
if (vm_page_zero_check()) {
vm_page_zero_idle();
#ifndef PREEMPTION
if (sched_runnable()) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(curthread);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(curthread);
}
#endif
} else {
- vm_page_lock_queues();
wakeup_needed = TRUE;
- msleep(&zero_state, &vm_page_queue_mtx,
- PDROP, "pgzero", hz * 300);
+ msleep(&zero_state, &vm_page_queue_free_mtx, 0,
+ "pgzero", hz * 300);
}
}
}
@@ -180,11 +157,11 @@
PROC_LOCK(pagezero_proc);
pagezero_proc->p_flag |= P_NOLOAD;
PROC_UNLOCK(pagezero_proc);
- mtx_lock_spin(&sched_lock);
td = FIRST_THREAD_IN_PROC(pagezero_proc);
- sched_class(td->td_ksegrp, PRI_IDLE);
+ thread_lock(td);
+ sched_class(td, PRI_IDLE);
sched_prio(td, PRI_MAX_IDLE);
- setrunqueue(td, SRQ_BORING);
- mtx_unlock_spin(&sched_lock);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
}
SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)
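vm_page_zero_idle() now assumes the free-queue mutex is held and delegates the actual zeroing to vm_phys_zero_pages_idle(); the zero_state flag still implements hysteresis, zeroing until the prezeroed count reaches a high watermark and resuming only after consumption drops it below a low one. The low-watermark test lives in vm_page_zero_check(), which the hunk does not show; the 2/3 and 4/5 ratios below are assumptions based on FreeBSD's ZIDLE_LO/ZIDLE_HI macros, not taken from this diff:

/* Hysteresis toy: zero until the high watermark, then stay idle until
 * the prezeroed count falls under the low watermark. */
#include <stdio.h>

#define ZIDLE_LO(v)	((v) * 2 / 3)	/* assumed ratio */
#define ZIDLE_HI(v)	((v) * 4 / 5)	/* assumed ratio */

int
main(void)
{
	int free_count = 1000, zeroed = 500, zero_state = 1;

	for (int tick = 0; tick < 6; tick++) {
		if (zeroed < ZIDLE_LO(free_count))
			zero_state = 0;		/* start zeroing again */
		if (zero_state == 0 && zeroed < ZIDLE_HI(free_count)) {
			zeroed += 100;		/* vm_phys_zero_pages_idle() */
			if (zeroed >= ZIDLE_HI(free_count))
				zero_state = 1;	/* reached high watermark */
		}
		zeroed -= 50;			/* consumers take pages */
		printf("tick %d: zeroed=%d state=%d\n", tick, zeroed,
		    zero_state);
	}
	return (0);
}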
Index: vm_kern.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_kern.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_kern.c -L sys/vm/vm_kern.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -63,11 +63,12 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_kern.c,v 1.122 2005/01/07 02:29:27 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_kern.c,v 1.128.4.1 2008/01/17 14:57:50 pjd Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h> /* for ticks and hz */
+#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
@@ -81,6 +82,7 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
+#include <vm/uma.h>
vm_map_t kernel_map=0;
vm_map_t kmem_map=0;
@@ -175,9 +177,8 @@
mem = vm_page_grab(kernel_object, OFF_TO_IDX(offset + i),
VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
mem->valid = VM_PAGE_BITS_ALL;
- vm_page_lock_queues();
- vm_page_unmanage(mem);
- vm_page_unlock_queues();
+ KASSERT((mem->flags & PG_UNMANAGED) != 0,
+ ("kmem_alloc: page %p is managed", mem));
}
VM_OBJECT_UNLOCK(kernel_object);
@@ -295,10 +296,25 @@
vm_map_lock(map);
if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
vm_map_unlock(map);
- if ((flags & M_NOWAIT) == 0)
- panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
- (long)size, (long)map->size);
- return (0);
+ if ((flags & M_NOWAIT) == 0) {
+ for (i = 0; i < 8; i++) {
+ EVENTHANDLER_INVOKE(vm_lowmem, 0);
+ uma_reclaim();
+ vm_map_lock(map);
+ if (vm_map_findspace(map, vm_map_min(map),
+ size, &addr) == 0) {
+ break;
+ }
+ vm_map_unlock(map);
+ tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
+ }
+ if (i == 8) {
+ panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
+ (long)size, (long)map->size);
+ }
+ } else {
+ return (0);
+ }
}
offset = addr - VM_MIN_KERNEL_ADDRESS;
vm_object_reference(kmem_object);
@@ -364,9 +380,8 @@
if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
m->valid = VM_PAGE_BITS_ALL;
- vm_page_lock_queues();
- vm_page_unmanage(m);
- vm_page_unlock_queues();
+ KASSERT((m->flags & PG_UNMANAGED) != 0,
+ ("kmem_malloc: page %p is managed", m));
}
VM_OBJECT_UNLOCK(kmem_object);
@@ -390,9 +405,7 @@
vm_map_simplify_entry(map, entry);
/*
- * Loop thru pages, entering them in the pmap. (We cannot add them to
- * the wired count without wrapping the vm_page_queue_lock in
- * splimp...)
+ * Loop thru pages, entering them in the pmap.
*/
VM_OBJECT_LOCK(kmem_object);
for (i = 0; i < size; i += PAGE_SIZE) {
@@ -401,10 +414,7 @@
* Because this is kernel_pmap, this call will not block.
*/
pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
- vm_page_lock_queues();
- vm_page_flag_set(m, PG_WRITEABLE | PG_REFERENCED);
vm_page_wakeup(m);
- vm_page_unlock_queues();
}
VM_OBJECT_UNLOCK(kmem_object);
vm_map_unlock(map);
@@ -492,7 +502,8 @@
/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
kernel_map = m;
(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
- VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0);
+ VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL,
+ MAP_NOFAULT);
/* ... and ending with the completion of the above `insert' */
vm_map_unlock(m);
}
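Rather than panicking on the first kmem_map exhaustion, the sleeping path above now fires the vm_lowmem event handlers and uma_reclaim(), then retries vm_map_findspace() up to eight times with a growing sleep of (hz / 4) * (i + 1) ticks before finally panicking. The backoff schedule alone, runnable (hz is assumed to be 1000 for the printout; reclaim would happen before each retry):

/* The retry backoff from the new kmem_malloc() sleeping path. */
#include <stdio.h>

#define HZ 1000		/* assumed tick rate for illustration */

int
main(void)
{
	int total = 0;

	for (int i = 0; i < 8; i++) {
		/* EVENTHANDLER_INVOKE(vm_lowmem, 0); uma_reclaim(); retry */
		int ticks = (HZ / 4) * (i + 1);
		total += ticks;
		printf("attempt %d: sleep %d ticks\n", i, ticks);
	}
	printf("worst case ~%d ms before panic\n", total * 1000 / HZ);
	return (0);
}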
Index: phys_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/phys_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/phys_pager.c -L sys/vm/phys_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/phys_pager.c
+++ sys/vm/phys_pager.c
@@ -24,7 +24,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/phys_pager.c,v 1.23 2005/01/07 02:29:26 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/phys_pager.c,v 1.28.2.1 2007/11/10 11:21:17 remko Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -42,9 +42,7 @@
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
-/* prevent concurrent creation races */
-static int phys_pager_alloc_lock;
-/* list of device pager objects */
+/* list of phys pager objects */
static struct pagerlst phys_pager_object_list;
/* protect access to phys_pager_object_list */
static struct mtx phys_pager_mtx;
@@ -64,7 +62,7 @@
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff)
{
- vm_object_t object;
+ vm_object_t object, object1;
vm_pindex_t pindex;
/*
@@ -76,42 +74,41 @@
pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
if (handle != NULL) {
- mtx_lock(&Giant);
- /*
- * Lock to prevent object creation race condition.
- */
- while (phys_pager_alloc_lock) {
- phys_pager_alloc_lock = -1;
- tsleep(&phys_pager_alloc_lock, PVM, "swpalc", 0);
- }
- phys_pager_alloc_lock = 1;
-
+ mtx_lock(&phys_pager_mtx);
/*
* Look up pager, creating as necessary.
*/
+ object1 = NULL;
object = vm_pager_object_lookup(&phys_pager_object_list, handle);
if (object == NULL) {
/*
* Allocate object and associate it with the pager.
*/
- object = vm_object_allocate(OBJT_PHYS, pindex);
- object->handle = handle;
- mtx_lock(&phys_pager_mtx);
- TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
- pager_object_list);
mtx_unlock(&phys_pager_mtx);
+ object1 = vm_object_allocate(OBJT_PHYS, pindex);
+ mtx_lock(&phys_pager_mtx);
+ object = vm_pager_object_lookup(&phys_pager_object_list,
+ handle);
+ if (object != NULL) {
+ /*
+ * We raced with another thread while
+ * allocating the object.
+ */
+ if (pindex > object->size)
+ object->size = pindex;
+ } else {
+ object = object1;
+ object1 = NULL;
+ object->handle = handle;
+ TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
+ pager_object_list);
+ }
} else {
- /*
- * Gain a reference to the object.
- */
- vm_object_reference(object);
if (pindex > object->size)
object->size = pindex;
}
- if (phys_pager_alloc_lock == -1)
- wakeup(&phys_pager_alloc_lock);
- phys_pager_alloc_lock = 0;
- mtx_unlock(&Giant);
+ mtx_unlock(&phys_pager_mtx);
+ vm_object_deallocate(object1);
} else {
object = vm_object_allocate(OBJT_PHYS, pindex);
}
@@ -127,9 +124,11 @@
{
if (object->handle != NULL) {
+ VM_OBJECT_UNLOCK(object);
mtx_lock(&phys_pager_mtx);
TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list);
mtx_unlock(&phys_pager_mtx);
+ VM_OBJECT_LOCK(object);
}
}
@@ -150,19 +149,13 @@
}
KASSERT(m[i]->valid == VM_PAGE_BITS_ALL,
("phys_pager_getpages: partially valid page %p", m[i]));
- }
- vm_page_lock_queues();
- for (i = 0; i < count; i++) {
- /* Switch off pv_entries */
- vm_page_unmanage(m[i]);
m[i]->dirty = 0;
/* The requested page must remain busy, the others not. */
if (reqpage != i) {
- vm_page_flag_clear(m[i], PG_BUSY);
+ m[i]->oflags &= ~VPO_BUSY;
m[i]->busy = 0;
}
}
- vm_page_unlock_queues();
return (VM_PAGER_OK);
}
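The rewritten phys_pager_alloc() above drops the hand-rolled phys_pager_alloc_lock sleep lock in favor of a standard optimistic create: release the list mutex, allocate a candidate object, retake the mutex and look the handle up again, and discard the candidate if another thread got there first. The pattern in outline (a sketch assembled from the hunks above, with error handling and the object-size update elided):

vm_object_t object, object1 = NULL;

mtx_lock(&phys_pager_mtx);
object = vm_pager_object_lookup(&phys_pager_object_list, handle);
if (object == NULL) {
	/* Cannot allocate while holding the mutex; drop it first. */
	mtx_unlock(&phys_pager_mtx);
	object1 = vm_object_allocate(OBJT_PHYS, pindex);
	mtx_lock(&phys_pager_mtx);
	object = vm_pager_object_lookup(&phys_pager_object_list, handle);
	if (object == NULL) {
		/* We won the race: publish our candidate. */
		object = object1;
		object1 = NULL;
		object->handle = handle;
		TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
		    pager_object_list);
	}
	/* Otherwise another thread won; object1 is discarded below. */
}
mtx_unlock(&phys_pager_mtx);
vm_object_deallocate(object1);	/* accepts NULL, frees the loser */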
--- /dev/null
+++ sys/vm/vm_phys.c
@@ -0,0 +1,750 @@
+/*-
+ * Copyright (c) 2002-2006 Rice University
+ * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Alan L. Cox,
+ * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/vm/vm_phys.c,v 1.4 2007/09/25 06:25:06 alc Exp $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+struct vm_freelist {
+ struct pglist pl;
+ int lcnt;
+};
+
+struct vm_phys_seg {
+ vm_paddr_t start;
+ vm_paddr_t end;
+ vm_page_t first_page;
+ struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
+};
+
+static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+
+static int vm_phys_nsegs;
+
+static struct vm_freelist
+ vm_phys_free_queues[VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+
+static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
+
+static int cnt_prezero;
+SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
+ &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
+
+static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
+
+static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+
+static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
+static int vm_phys_paddr_to_segind(vm_paddr_t pa);
+static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
+ int order);
+
+/*
+ * Outputs the state of the physical memory allocator, specifically,
+ * the amount of physical memory in each free list.
+ */
+static int
+sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ struct vm_freelist *fl;
+ char *cbuf;
+ const int cbufsize = vm_nfreelists*(VM_NFREEORDER + 1)*81;
+ int error, flind, oind, pind;
+
+ cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+ sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
+ "\n ORDER (SIZE) | NUMBER"
+ "\n ", flind);
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ sbuf_printf(&sbuf, " | POOL %d", pind);
+ sbuf_printf(&sbuf, "\n-- ");
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ sbuf_printf(&sbuf, "-- -- ");
+ sbuf_printf(&sbuf, "--\n");
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ sbuf_printf(&sbuf, " %2.2d (%6.6dK)", oind,
+ 1 << (PAGE_SHIFT - 10 + oind));
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[flind][pind];
+ sbuf_printf(&sbuf, " | %6.6d", fl[oind].lcnt);
+ }
+ sbuf_printf(&sbuf, "\n");
+ }
+ }
+ sbuf_finish(&sbuf);
+ error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+ sbuf_delete(&sbuf);
+ free(cbuf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Outputs the set of physical memory segments.
+ */
+static int
+sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ struct vm_phys_seg *seg;
+ char *cbuf;
+ const int cbufsize = VM_PHYSSEG_MAX*(VM_NFREEORDER + 1)*81;
+ int error, segind;
+
+ cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+ sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
+ seg = &vm_phys_segs[segind];
+ sbuf_printf(&sbuf, "start: %#jx\n",
+ (uintmax_t)seg->start);
+ sbuf_printf(&sbuf, "end: %#jx\n",
+ (uintmax_t)seg->end);
+ sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
+ }
+ sbuf_finish(&sbuf);
+ error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+ sbuf_delete(&sbuf);
+ free(cbuf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Create a physical memory segment.
+ */
+static void
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+{
+ struct vm_phys_seg *seg;
+#ifdef VM_PHYSSEG_SPARSE
+ long pages;
+ int segind;
+
+ pages = 0;
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ pages += atop(seg->end - seg->start);
+ }
+#endif
+ KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
+ ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
+ seg = &vm_phys_segs[vm_phys_nsegs++];
+ seg->start = start;
+ seg->end = end;
+#ifdef VM_PHYSSEG_SPARSE
+ seg->first_page = &vm_page_array[pages];
+#else
+ seg->first_page = PHYS_TO_VM_PAGE(start);
+#endif
+ seg->free_queues = &vm_phys_free_queues[flind];
+}
+
+/*
+ * Initialize the physical memory allocator.
+ */
+void
+vm_phys_init(void)
+{
+ struct vm_freelist *fl;
+ int flind, i, oind, pind;
+
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+#ifdef VM_FREELIST_ISADMA
+ if (phys_avail[i] < 16777216) {
+ if (phys_avail[i + 1] > 16777216) {
+ vm_phys_create_seg(phys_avail[i], 16777216,
+ VM_FREELIST_ISADMA);
+ vm_phys_create_seg(16777216, phys_avail[i + 1],
+ VM_FREELIST_DEFAULT);
+ } else {
+ vm_phys_create_seg(phys_avail[i],
+ phys_avail[i + 1], VM_FREELIST_ISADMA);
+ }
+ if (VM_FREELIST_ISADMA >= vm_nfreelists)
+ vm_nfreelists = VM_FREELIST_ISADMA + 1;
+ } else
+#endif
+#ifdef VM_FREELIST_HIGHMEM
+ if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
+ if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
+ vm_phys_create_seg(phys_avail[i],
+ VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
+ vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
+ phys_avail[i + 1], VM_FREELIST_HIGHMEM);
+ } else {
+ vm_phys_create_seg(phys_avail[i],
+ phys_avail[i + 1], VM_FREELIST_HIGHMEM);
+ }
+ if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
+ vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
+ } else
+#endif
+ vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
+ VM_FREELIST_DEFAULT);
+ }
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[flind][pind];
+ for (oind = 0; oind < VM_NFREEORDER; oind++)
+ TAILQ_INIT(&fl[oind].pl);
+ }
+ }
+}
+
+/*
+ * Split a contiguous, power of two-sized set of physical pages.
+ */
+static __inline void
+vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
+{
+ vm_page_t m_buddy;
+
+ while (oind > order) {
+ oind--;
+ m_buddy = &m[1 << oind];
+ KASSERT(m_buddy->order == VM_NFREEORDER,
+ ("vm_phys_split_pages: page %p has unexpected order %d",
+ m_buddy, m_buddy->order));
+ m_buddy->order = oind;
+ TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
+ fl[oind].lcnt++;
+ }
+}
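vm_phys_split_pages() peels buddies off a larger free block until it reaches the requested order; the buddy of the retained half at order oind - 1 is simply &m[1 << (oind - 1)]. The index arithmetic as a runnable toy, using page indices in place of vm_page_t pointers:

/* Splitting an order-4 block (16 pages starting at index 0) down to an
 * order-1 allocation: each step frees the upper half of the remaining
 * block at the next lower order. */
#include <stdio.h>

int
main(void)
{
	int base = 0, oind = 4, order = 1;

	while (oind > order) {
		oind--;
		printf("free buddy: pages %d..%d at order %d\n",
		    base + (1 << oind), base + (2 << oind) - 1, oind);
	}
	printf("returned: pages %d..%d at order %d\n",
	    base, base + (1 << order) - 1, order);
	return (0);
}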
+
+/*
+ * Initialize a physical page and add it to the free lists.
+ */
+void
+vm_phys_add_page(vm_paddr_t pa)
+{
+ vm_page_t m;
+
+ cnt.v_page_count++;
+ m = vm_phys_paddr_to_vm_page(pa);
+ m->phys_addr = pa;
+ m->segind = vm_phys_paddr_to_segind(pa);
+ m->flags = PG_FREE;
+ KASSERT(m->order == VM_NFREEORDER,
+ ("vm_phys_add_page: page %p has unexpected order %d",
+ m, m->order));
+ m->pool = VM_FREEPOOL_DEFAULT;
+ pmap_page_init(m);
+ mtx_lock(&vm_page_queue_free_mtx);
+ cnt.v_free_count++;
+ vm_phys_free_pages(m, 0);
+ mtx_unlock(&vm_page_queue_free_mtx);
+}
+
+/*
+ * Allocate a contiguous, power of two-sized set of physical pages
+ * from the free lists.
+ *
+ * The free page queues must be locked.
+ */
+vm_page_t
+vm_phys_alloc_pages(int pool, int order)
+{
+ struct vm_freelist *fl;
+ struct vm_freelist *alt;
+ int flind, oind, pind;
+ vm_page_t m;
+
+ KASSERT(pool < VM_NFREEPOOL,
+ ("vm_phys_alloc_pages: pool %d is out of range", pool));
+ KASSERT(order < VM_NFREEORDER,
+ ("vm_phys_alloc_pages: order %d is out of range", order));
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ fl = vm_phys_free_queues[flind][pool];
+ for (oind = order; oind < VM_NFREEORDER; oind++) {
+ m = TAILQ_FIRST(&fl[oind].pl);
+ if (m != NULL) {
+ TAILQ_REMOVE(&fl[oind].pl, m, pageq);
+ fl[oind].lcnt--;
+ m->order = VM_NFREEORDER;
+ vm_phys_split_pages(m, oind, fl, order);
+ return (m);
+ }
+ }
+
+ /*
+ * The given pool was empty. Find the largest
+ * contiguous, power-of-two-sized set of pages in any
+ * pool. Transfer these pages to the given pool, and
+ * use them to satisfy the allocation.
+ */
+ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ alt = vm_phys_free_queues[flind][pind];
+ m = TAILQ_FIRST(&alt[oind].pl);
+ if (m != NULL) {
+ TAILQ_REMOVE(&alt[oind].pl, m, pageq);
+ alt[oind].lcnt--;
+ m->order = VM_NFREEORDER;
+ vm_phys_set_pool(pool, m, oind);
+ vm_phys_split_pages(m, oind, fl, order);
+ return (m);
+ }
+ }
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Allocate physical memory from phys_avail[].
+ */
+vm_paddr_t
+vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
+{
+ vm_paddr_t pa;
+ int i;
+
+ size = round_page(size);
+ for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ if (phys_avail[i + 1] - phys_avail[i] < size)
+ continue;
+ pa = phys_avail[i];
+ phys_avail[i] += size;
+ return (pa);
+ }
+ panic("vm_phys_bootstrap_alloc");
+}
+
+/*
+ * Find the vm_page corresponding to the given physical address.
+ */
+vm_page_t
+vm_phys_paddr_to_vm_page(vm_paddr_t pa)
+{
+ struct vm_phys_seg *seg;
+ int segind;
+
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ if (pa >= seg->start && pa < seg->end)
+ return (&seg->first_page[atop(pa - seg->start)]);
+ }
+ panic("vm_phys_paddr_to_vm_page: paddr %#jx is not in any segment",
+ (uintmax_t)pa);
+}
+
+/*
+ * Find the segment containing the given physical address.
+ */
+static int
+vm_phys_paddr_to_segind(vm_paddr_t pa)
+{
+ struct vm_phys_seg *seg;
+ int segind;
+
+ for (segind = 0; segind < vm_phys_nsegs; segind++) {
+ seg = &vm_phys_segs[segind];
+ if (pa >= seg->start && pa < seg->end)
+ return (segind);
+ }
+ panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
+ (uintmax_t)pa);
+}
+
+/*
+ * Free a contiguous, power of two-sized set of physical pages.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_free_pages(vm_page_t m, int order)
+{
+ struct vm_freelist *fl;
+ struct vm_phys_seg *seg;
+ vm_paddr_t pa, pa_buddy;
+ vm_page_t m_buddy;
+
+ KASSERT(m->order == VM_NFREEORDER,
+ ("vm_phys_free_pages: page %p has unexpected order %d",
+ m, m->order));
+ KASSERT(m->pool < VM_NFREEPOOL,
+ ("vm_phys_free_pages: page %p has unexpected pool %d",
+ m, m->pool));
+ KASSERT(order < VM_NFREEORDER,
+ ("vm_phys_free_pages: order %d is out of range", order));
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ pa = VM_PAGE_TO_PHYS(m);
+ seg = &vm_phys_segs[m->segind];
+ while (order < VM_NFREEORDER - 1) {
+ pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
+ if (pa_buddy < seg->start ||
+ pa_buddy >= seg->end)
+ break;
+ m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
+ if (m_buddy->order != order)
+ break;
+ fl = (*seg->free_queues)[m_buddy->pool];
+ TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
+ fl[m_buddy->order].lcnt--;
+ m_buddy->order = VM_NFREEORDER;
+ if (m_buddy->pool != m->pool)
+ vm_phys_set_pool(m->pool, m_buddy, order);
+ order++;
+ pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
+ m = &seg->first_page[atop(pa - seg->start)];
+ }
+ m->order = order;
+ fl = (*seg->free_queues)[m->pool];
+ TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
+ fl[order].lcnt++;
+}
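Freeing runs the split in reverse: the candidate buddy's physical address is the block's address with bit (PAGE_SHIFT + order) flipped, and coalescing continues only while the buddy lies in the same segment and is itself free at exactly the same order. The address math, runnable (a PAGE_SHIFT of 12, i.e. 4 KB pages, is an assumption):

/* Buddy address computation from vm_phys_free_pages(): flip one bit to
 * find the buddy, mask low bits to find the merged block's base. */
#include <stdio.h>

#define PAGE_SHIFT 12

int
main(void)
{
	unsigned long pa = 0x5000;	/* free one page at 20 KB */

	for (int order = 0; order < 3; order++) {
		unsigned long pa_buddy = pa ^ (1UL << (PAGE_SHIFT + order));
		printf("order %d: block 0x%lx, buddy 0x%lx\n",
		    order, pa, pa_buddy);
		/* if the buddy is free at this order, merge: */
		pa &= ~((1UL << (PAGE_SHIFT + order + 1)) - 1);
	}
	return (0);
}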
+
+/*
+ * Set the pool for a contiguous, power of two-sized set of physical pages.
+ */
+void
+vm_phys_set_pool(int pool, vm_page_t m, int order)
+{
+ vm_page_t m_tmp;
+
+ for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
+ m_tmp->pool = pool;
+}
+
+/*
+ * Remove the given physical page "m" from the free lists.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_unfree_page(vm_page_t m)
+{
+ struct vm_freelist *fl;
+ struct vm_phys_seg *seg;
+ vm_paddr_t pa, pa_half;
+ vm_page_t m_set, m_tmp;
+ int order;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+ /*
+ * First, find the contiguous, power of two-sized set of free
+ * physical pages containing the given physical page "m" and
+ * assign it to "m_set".
+ */
+ seg = &vm_phys_segs[m->segind];
+ for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
+ order < VM_NFREEORDER; ) {
+ order++;
+ pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
+ KASSERT(pa >= seg->start && pa < seg->end,
+ ("vm_phys_unfree_page: paddr %#jx is not within segment %p",
+ (uintmax_t)pa, seg));
+ m_set = &seg->first_page[atop(pa - seg->start)];
+ }
+ KASSERT(m_set->order >= order, ("vm_phys_unfree_page: page %p's order"
+ " (%d) is less than expected (%d)", m_set, m_set->order, order));
+ KASSERT(m_set->order < VM_NFREEORDER,
+ ("vm_phys_unfree_page: page %p has unexpected order %d",
+ m_set, m_set->order));
+ KASSERT(order < VM_NFREEORDER,
+ ("vm_phys_unfree_page: order %d is out of range", order));
+
+ /*
+ * Next, remove "m_set" from the free lists. Finally, extract
+ * "m" from "m_set" using an iterative algorithm: While "m_set"
+ * is larger than a page, shrink "m_set" by returning the half
+ * of "m_set" that does not contain "m" to the free lists.
+ */
+ fl = (*seg->free_queues)[m_set->pool];
+ order = m_set->order;
+ TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
+ fl[order].lcnt--;
+ m_set->order = VM_NFREEORDER;
+ while (order > 0) {
+ order--;
+ pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
+ if (m->phys_addr < pa_half)
+ m_tmp = &seg->first_page[atop(pa_half - seg->start)];
+ else {
+ m_tmp = m_set;
+ m_set = &seg->first_page[atop(pa_half - seg->start)];
+ }
+ m_tmp->order = order;
+ TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
+ fl[order].lcnt++;
+ }
+ KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
+}
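vm_phys_unfree_page() pulls a single page out of a larger free block by iterative halving: at each step the half that does not contain the target page goes back on the free lists one order down, exactly as the second comment in the function describes. An index-level runnable version of that loop:

/* Extracting page 5 from a free order-3 block (pages 0..7): return the
 * half not containing the target to the free lists, keep halving the
 * half that does. */
#include <stdio.h>

int
main(void)
{
	int page = 5, base = 0, order = 3;

	while (order > 0) {
		order--;
		int half = base + (1 << order);	/* start of upper half */
		if (page < half)
			printf("free pages %d..%d at order %d\n",
			    half, half + (1 << order) - 1, order);
		else {
			printf("free pages %d..%d at order %d\n",
			    base, half - 1, order);
			base = half;	/* target is in the upper half */
		}
	}
	printf("page %d isolated\n", base);
	return (0);
}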
+
+/*
+ * Try to zero one physical page. Used by an idle priority thread.
+ */
+boolean_t
+vm_phys_zero_pages_idle(void)
+{
+ static struct vm_freelist *fl = vm_phys_free_queues[0][0];
+ static int flind, oind, pind;
+ vm_page_t m, m_tmp;
+
+ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ for (;;) {
+ TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
+ for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
+ if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
+ vm_phys_unfree_page(m_tmp);
+ cnt.v_free_count--;
+ mtx_unlock(&vm_page_queue_free_mtx);
+ pmap_zero_page_idle(m_tmp);
+ m_tmp->flags |= PG_ZERO;
+ mtx_lock(&vm_page_queue_free_mtx);
+ cnt.v_free_count++;
+ vm_phys_free_pages(m_tmp, 0);
+ vm_page_zero_count++;
+ cnt_prezero++;
+ return (TRUE);
+ }
+ }
+ }
+ oind++;
+ if (oind == VM_NFREEORDER) {
+ oind = 0;
+ pind++;
+ if (pind == VM_NFREEPOOL) {
+ pind = 0;
+ flind++;
+ if (flind == vm_nfreelists)
+ flind = 0;
+ }
+ fl = vm_phys_free_queues[flind][pind];
+ }
+ }
+}
+
+/*
+ * Allocate a contiguous set of physical pages of the given size
+ * "npages" from the free lists. All of the physical pages must be at
+ * or above the given physical address "low" and below the given
+ * physical address "high". The given value "alignment" determines the
+ * alignment of the first physical page in the set. If the given value
+ * "boundary" is non-zero, then the set of physical pages cannot cross
+ * any physical address boundary that is a multiple of that value. Both
+ * "alignment" and "boundary" must be a power of two.
+ */
+vm_page_t
+vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
+ unsigned long alignment, unsigned long boundary)
+{
+ struct vm_freelist *fl;
+ struct vm_phys_seg *seg;
+ vm_object_t m_object;
+ vm_paddr_t pa, pa_last, size;
+ vm_page_t m, m_ret;
+ int flind, i, oind, order, pind;
+
+ size = npages << PAGE_SHIFT;
+ KASSERT(size != 0,
+ ("vm_phys_alloc_contig: size must not be 0"));
+ KASSERT((alignment & (alignment - 1)) == 0,
+ ("vm_phys_alloc_contig: alignment must be a power of 2"));
+ KASSERT((boundary & (boundary - 1)) == 0,
+ ("vm_phys_alloc_contig: boundary must be a power of 2"));
+ /* Compute the queue that is the best fit for npages. */
+ for (order = 0; (1 << order) < npages; order++);
+ mtx_lock(&vm_page_queue_free_mtx);
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[flind][pind];
+ TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
+ /*
+ * A free list may contain physical pages
+ * from one or more segments.
+ */
+ seg = &vm_phys_segs[m_ret->segind];
+ if (seg->start > high ||
+ low >= seg->end)
+ continue;
+
+ /*
+ * Is the size of this allocation request
+ * larger than the largest block size?
+ */
+ if (order >= VM_NFREEORDER) {
+ /*
+ * Determine if a sufficient number
+ * of subsequent blocks to satisfy
+ * the allocation request are free.
+ */
+ pa = VM_PAGE_TO_PHYS(m_ret);
+ pa_last = pa + size;
+ for (;;) {
+ pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
+ if (pa >= pa_last)
+ break;
+ if (pa < seg->start ||
+ pa >= seg->end)
+ break;
+ m = &seg->first_page[atop(pa - seg->start)];
+ if (m->order != VM_NFREEORDER - 1)
+ break;
+ }
+ /* If not, continue to the next block. */
+ if (pa < pa_last)
+ continue;
+ }
+
+ /*
+ * Determine if the blocks are within the given range,
+ * satisfy the given alignment, and do not cross the
+ * given boundary.
+ */
+ pa = VM_PAGE_TO_PHYS(m_ret);
+ if (pa >= low &&
+ pa + size <= high &&
+ (pa & (alignment - 1)) == 0 &&
+ ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
+ goto done;
+ }
+ }
+ }
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ return (NULL);
+done:
+ for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
+ fl = (*seg->free_queues)[m->pool];
+ TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
+ fl[m->order].lcnt--;
+ m->order = VM_NFREEORDER;
+ }
+ if (m_ret->pool != VM_FREEPOOL_DEFAULT)
+ vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
+ fl = (*seg->free_queues)[m_ret->pool];
+ vm_phys_split_pages(m_ret, oind, fl, order);
+ for (i = 0; i < npages; i++) {
+ m = &m_ret[i];
+ KASSERT(m->queue == PQ_NONE,
+ ("vm_phys_alloc_contig: page %p has unexpected queue %d",
+ m, m->queue));
+ m_object = m->object;
+ if ((m->flags & PG_CACHED) != 0)
+ vm_page_cache_remove(m);
+ else {
+ KASSERT(VM_PAGE_IS_FREE(m),
+ ("vm_phys_alloc_contig: page %p is not free", m));
+ cnt.v_free_count--;
+ }
+ m->valid = VM_PAGE_BITS_ALL;
+ if (m->flags & PG_ZERO)
+ vm_page_zero_count--;
+ /* Don't clear the PG_ZERO flag; we'll need it later. */
+ m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
+ m->oflags = 0;
+ KASSERT(m->dirty == 0,
+ ("vm_phys_alloc_contig: page %p was dirty", m));
+ m->wire_count = 0;
+ m->busy = 0;
+ if (m_object != NULL &&
+ m_object->type == OBJT_VNODE &&
+ m_object->cache == NULL) {
+ mtx_unlock(&vm_page_queue_free_mtx);
+ vdrop(m_object->handle);
+ mtx_lock(&vm_page_queue_free_mtx);
+ }
+ }
+ for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
+ m = &m_ret[i];
+ KASSERT(m->order == VM_NFREEORDER,
+ ("vm_phys_alloc_contig: page %p has unexpected order %d",
+ m, m->order));
+ vm_phys_free_pages(m, 0);
+ }
+ mtx_unlock(&vm_page_queue_free_mtx);
+ return (m_ret);
+}
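
The acceptance test near the end of the search is worth unpacking: (pa & (alignment - 1)) == 0 checks power-of-two alignment, and ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0 verifies that the first and last byte of the run agree in every bit at or above the boundary bit, i.e. the whole run lies inside one boundary-sized window. A standalone sketch of the predicate (illustrative, not kernel API):

#include <assert.h>
#include <stdint.h>

/*
 * Would a run of "size" bytes at physical address "pa" be suitably
 * aligned and avoid crossing any "boundary"-multiple? Both alignment
 * and boundary must be powers of two; boundary == 0 disables the
 * crossing check, exactly as in vm_phys_alloc_contig().
 */
static int
range_ok(uint64_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
{
	if ((pa & (alignment - 1)) != 0)
		return (0);
	return (((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0);
}

int
main(void)
{
	/* 8K at 0x3000 crosses the 16K line at 0x4000... */
	assert(!range_ok(0x3000, 0x2000, 0x1000, 0x4000));
	/* ...but the same 8K at 0x4000 does not. */
	assert(range_ok(0x4000, 0x2000, 0x1000, 0x4000));
	return (0);
}
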
+
+#ifdef DDB
+/*
+ * Show the number of physical pages in each of the free lists.
+ */
+DB_SHOW_COMMAND(freepages, db_show_freepages)
+{
+ struct vm_freelist *fl;
+ int flind, oind, pind;
+
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ db_printf("FREE LIST %d:\n"
+ "\n ORDER (SIZE) | NUMBER"
+ "\n ", flind);
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ db_printf(" | POOL %d", pind);
+ db_printf("\n-- ");
+ for (pind = 0; pind < VM_NFREEPOOL; pind++)
+ db_printf("-- -- ");
+ db_printf("--\n");
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ db_printf(" %2.2d (%6.6dK)", oind,
+ 1 << (PAGE_SHIFT - 10 + oind));
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[flind][pind];
+ db_printf(" | %6.6d", fl[oind].lcnt);
+ }
+ db_printf("\n");
+ }
+ db_printf("\n");
+ }
+}
+#endif
Index: vm_map.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_map.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_map.c -L sys/vm/vm_map.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -63,7 +63,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_map.c,v 1.366.2.2 2005/11/13 21:45:49 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_map.c,v 1.388.2.1.2.2 2008/01/19 18:15:07 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -148,6 +148,29 @@
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
+/*
+ * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
+ * stable.
+ */
+#define PROC_VMSPACE_LOCK(p) do { } while (0)
+#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
+
+/*
+ * VM_MAP_RANGE_CHECK: [ internal use only ]
+ *
+ * Asserts that the starting and ending region
+ * addresses fall within the valid range of the map.
+ */
+#define VM_MAP_RANGE_CHECK(map, start, end) \
+ { \
+ if (start < vm_map_min(map)) \
+ start = vm_map_min(map); \
+ if (end > vm_map_max(map)) \
+ end = vm_map_max(map); \
+ if (start > end) \
+ start = end; \
+ }
+
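Note that VM_MAP_RANGE_CHECK() clamps rather than rejects: with a map spanning [0x1000, 0x9000), a request for [0x0, 0x800) first has its start raised to 0x1000 and is then collapsed to the empty range [0x800, 0x800), so callers simply find nothing to do. A user-space rendering of the same clamping (illustrative, not kernel API):

#include <stdint.h>

/*
 * Clamp [*start, *end) into [lo, hi), collapsing disjoint requests to
 * an empty range, as VM_MAP_RANGE_CHECK() does with the map bounds.
 */
static void
range_check(uint64_t lo, uint64_t hi, uint64_t *start, uint64_t *end)
{
	if (*start < lo)
		*start = lo;
	if (*end > hi)
		*end = hi;
	if (*start > *end)
		*start = *end;
}
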
void
vm_map_startup(void)
{
@@ -166,7 +189,6 @@
uma_prealloc(kmapentzone, MAX_KMAPENT);
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- uma_prealloc(mapentzone, MAX_MAPENT);
}
static void
@@ -175,7 +197,6 @@
struct vmspace *vm;
vm = (struct vmspace *)mem;
- pmap_release(vmspace_pmap(vm));
vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
}
@@ -186,8 +207,8 @@
vm = (struct vmspace *)mem;
+ vm->vm_map.pmap = NULL;
(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
- pmap_pinit(vmspace_pmap(vm));
return (0);
}
@@ -250,6 +271,10 @@
struct vmspace *vm;
vm = uma_zalloc(vmspace_zone, M_WAITOK);
+ if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
+ uma_zfree(vmspace_zone, vm);
+ return (NULL);
+ }
CTR1(KTR_VM, "vmspace_alloc: %p", vm);
_vm_map_init(&vm->vm_map, min, max);
vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
@@ -262,7 +287,6 @@
vm->vm_taddr = 0;
vm->vm_daddr = 0;
vm->vm_maxsaddr = 0;
- vm->vm_exitingcnt = 0;
return (vm);
}
@@ -279,10 +303,9 @@
NULL,
#endif
vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- pmap_init2();
}
-static __inline void
+static inline void
vmspace_dofree(struct vmspace *vm)
{
CTR1(KTR_VM, "vmspace_free: %p", vm);
@@ -298,11 +321,15 @@
* Delete all of the mappings and pages they hold, then call
* the pmap module to reclaim anything left.
*/
- vm_map_lock(&vm->vm_map);
- (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
+ (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
vm->vm_map.max_offset);
- vm_map_unlock(&vm->vm_map);
+ /*
+ * XXX Comment out the pmap_release call for now. The
+ * vmspace_zone is marked as UMA_ZONE_NOFREE, and bugs cause
+ * pmap.resident_count to be != 0 on exit sometimes.
+ */
+/* pmap_release(vmspace_pmap(vm)); */
uma_zfree(vmspace_zone, vm);
}
@@ -317,7 +344,7 @@
do
refcnt = vm->vm_refcnt;
while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
- if (refcnt == 1 && vm->vm_exitingcnt == 0)
+ if (refcnt == 1)
vmspace_dofree(vm);
}
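
The do/while around atomic_cmpset_int() is the standard compare-and-swap retry loop: read the count, try to install the decremented value, and retry if another CPU raced in between; the caller that observed a count of 1 performs the teardown. The same shape in C11 atomics, on a hypothetical refcounted object (not the kernel API):

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_uint refcnt;
	/* ... payload ... */
};

/* Drop one reference; the caller that moves 1 -> 0 frees the object. */
static void
obj_release(struct obj *o)
{
	unsigned int refcnt;

	do {
		refcnt = atomic_load(&o->refcnt);
	} while (!atomic_compare_exchange_weak(&o->refcnt, &refcnt,
	    refcnt - 1));
	if (refcnt == 1)
		free(o);
}

In plain C11 this reduces to if (atomic_fetch_sub(&o->refcnt, 1) == 1) free(o); the explicit loop is shown only to mirror the atomic_cmpset_int() idiom, which vmspace_exit() below extends with extra work inside the retry.
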
@@ -325,28 +352,93 @@
vmspace_exitfree(struct proc *p)
{
struct vmspace *vm;
- int exitingcnt;
+ PROC_VMSPACE_LOCK(p);
vm = p->p_vmspace;
p->p_vmspace = NULL;
+ PROC_VMSPACE_UNLOCK(p);
+ KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
+ vmspace_free(vm);
+}
+
+void
+vmspace_exit(struct thread *td)
+{
+ int refcnt;
+ struct vmspace *vm;
+ struct proc *p;
/*
- * cleanup by parent process wait()ing on exiting child. vm_refcnt
- * may not be 0 (e.g. fork() and child exits without exec()ing).
- * exitingcnt may increment above 0 and drop back down to zero
- * several times while vm_refcnt is held non-zero. vm_refcnt
- * may also increment above 0 and drop back down to zero several
- * times while vm_exitingcnt is held non-zero.
+ * Release user portion of address space.
+ * This releases references to vnodes,
+ * which could cause I/O if the file has been unlinked.
+ * Need to do this early enough that we can still sleep.
*
- * The last wait on the exiting child's vmspace will clean up
- * the remainder of the vmspace.
+ * The last exiting process to reach this point releases as
+ * much of the environment as it can. vmspace_dofree() is the
+ * slower fallback in case another process had a temporary
+ * reference to the vmspace.
*/
- do
- exitingcnt = vm->vm_exitingcnt;
- while (!atomic_cmpset_int(&vm->vm_exitingcnt, exitingcnt,
- exitingcnt - 1));
- if (vm->vm_refcnt == 0 && exitingcnt == 1)
+
+ p = td->td_proc;
+ vm = p->p_vmspace;
+ atomic_add_int(&vmspace0.vm_refcnt, 1);
+ do {
+ refcnt = vm->vm_refcnt;
+ if (refcnt > 1 && p->p_vmspace != &vmspace0) {
+ /* Switch now since other proc might free vmspace */
+ PROC_VMSPACE_LOCK(p);
+ p->p_vmspace = &vmspace0;
+ PROC_VMSPACE_UNLOCK(p);
+ pmap_activate(td);
+ }
+ } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
+ if (refcnt == 1) {
+ if (p->p_vmspace != vm) {
+ /* vmspace not yet freed, switch back */
+ PROC_VMSPACE_LOCK(p);
+ p->p_vmspace = vm;
+ PROC_VMSPACE_UNLOCK(p);
+ pmap_activate(td);
+ }
+ pmap_remove_pages(vmspace_pmap(vm));
+ /* Switch now since this proc will free vmspace */
+ PROC_VMSPACE_LOCK(p);
+ p->p_vmspace = &vmspace0;
+ PROC_VMSPACE_UNLOCK(p);
+ pmap_activate(td);
vmspace_dofree(vm);
+ }
+}
+
+/* Acquire reference to vmspace owned by another process. */
+
+struct vmspace *
+vmspace_acquire_ref(struct proc *p)
+{
+ struct vmspace *vm;
+ int refcnt;
+
+ PROC_VMSPACE_LOCK(p);
+ vm = p->p_vmspace;
+ if (vm == NULL) {
+ PROC_VMSPACE_UNLOCK(p);
+ return (NULL);
+ }
+ do {
+ refcnt = vm->vm_refcnt;
+ if (refcnt <= 0) { /* Avoid 0->1 transition */
+ PROC_VMSPACE_UNLOCK(p);
+ return (NULL);
+ }
+ } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
+ if (vm != p->p_vmspace) {
+ PROC_VMSPACE_UNLOCK(p);
+ vmspace_free(vm);
+ return (NULL);
+ }
+ PROC_VMSPACE_UNLOCK(p);
+ return (vm);
}
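
vmspace_acquire_ref() is the matching try-acquire: the CAS loop refuses the 0 -> 1 transition so a dying vmspace can never be revived, and the final p->p_vmspace recheck catches the process swapping vmspaces between the read and the increment. A sketch of the try-ref half, reusing struct obj from the previous sketch:

/* Take a reference only if the object is still alive (count > 0). */
static int
obj_try_ref(struct obj *o)
{
	unsigned int refcnt = atomic_load(&o->refcnt);

	do {
		if (refcnt == 0)
			return (0);	/* refuse the 0 -> 1 transition */
	} while (!atomic_compare_exchange_weak(&o->refcnt, &refcnt,
	    refcnt + 1));
	return (1);
}
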
void
@@ -356,7 +448,7 @@
if (map->system_map)
_mtx_lock_flags(&map->system_mtx, 0, file, line);
else
- _sx_xlock(&map->lock, file, line);
+ (void)_sx_xlock(&map->lock, 0, file, line);
map->timestamp++;
}
@@ -377,7 +469,7 @@
if (map->system_map)
_mtx_lock_flags(&map->system_mtx, 0, file, line);
else
- _sx_xlock(&map->lock, file, line);
+ (void)_sx_xlock(&map->lock, 0, file, line);
}
void
@@ -564,7 +656,7 @@
* Set the expected access behavior, either normal, random, or
* sequential.
*/
-static __inline void
+static inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
@@ -576,7 +668,7 @@
*
* Set the max_free field in a vm_map_entry.
*/
-static __inline void
+static inline void
vm_map_entry_set_max_free(vm_map_entry_t entry)
{
@@ -1078,6 +1170,25 @@
return (0);
}
+int
+vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr /* IN/OUT */, vm_size_t length, vm_prot_t prot,
+ vm_prot_t max, int cow)
+{
+ vm_offset_t start, end;
+ int result;
+
+ start = *addr;
+ vm_map_lock(map);
+ end = start + length;
+ VM_MAP_RANGE_CHECK(map, start, end);
+ (void) vm_map_delete(map, start, end);
+ result = vm_map_insert(map, object, offset, start, end, prot,
+ max, cow);
+ vm_map_unlock(map);
+ return (result);
+}
+
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
@@ -1288,22 +1399,6 @@
}
/*
- * VM_MAP_RANGE_CHECK: [ internal use only ]
- *
- * Asserts that the starting and ending region
- * addresses fall within the valid range of the map.
- */
-#define VM_MAP_RANGE_CHECK(map, start, end) \
- { \
- if (start < vm_map_min(map)) \
- start = vm_map_min(map); \
- if (end > vm_map_max(map)) \
- end = vm_map_max(map); \
- if (start > end) \
- start = end; \
- }
-
-/*
* vm_map_submap: [ kernel use only ]
*
* Mark the given range as handled by a subordinate map.
@@ -1362,17 +1457,18 @@
/*
* vm_map_pmap_enter:
*
- * Preload read-only mappings for the given object into the specified
- * map. This eliminates the soft faults on process startup and
- * immediately after an mmap(2).
+ * Preload read-only mappings for the given object's resident pages into
+ * the given map. This eliminates the soft faults on process startup and
+ * immediately after an mmap(2). Unless the given flags include
+ * MAP_PREFAULT_MADVISE, cached pages are not reactivated and mapped.
*/
void
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
{
- vm_offset_t tmpidx;
- int psize;
- vm_page_t p, mpte;
+ vm_offset_t start;
+ vm_page_t p, p_start;
+ vm_pindex_t psize, tmpidx;
boolean_t are_queues_locked;
if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
@@ -1398,7 +1494,8 @@
}
are_queues_locked = FALSE;
- mpte = NULL;
+ start = 0;
+ p_start = NULL;
if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
if (p->pindex < pindex) {
@@ -1421,20 +1518,32 @@
*/
if ((flags & MAP_PREFAULT_MADVISE) &&
cnt.v_free_count < cnt.v_free_reserved) {
+ psize = tmpidx;
break;
}
if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
- (p->busy == 0) &&
- (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
+ (p->busy == 0)) {
+ if (p_start == NULL) {
+ start = addr + ptoa(tmpidx);
+ p_start = p;
+ }
+ } else if (p_start != NULL) {
if (!are_queues_locked) {
are_queues_locked = TRUE;
vm_page_lock_queues();
}
- if ((p->queue - p->pc) == PQ_CACHE)
- vm_page_deactivate(p);
- mpte = pmap_enter_quick(map->pmap,
- addr + ptoa(tmpidx), p, prot, mpte);
+ pmap_enter_object(map->pmap, start, addr +
+ ptoa(tmpidx), p_start, prot);
+ p_start = NULL;
+ }
+ }
+ if (p_start != NULL) {
+ if (!are_queues_locked) {
+ are_queues_locked = TRUE;
+ vm_page_lock_queues();
}
+ pmap_enter_object(map->pmap, start, addr + ptoa(psize),
+ p_start, prot);
}
if (are_queues_locked)
vm_page_unlock_queues();
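
The rewritten loop replaces one pmap_enter_quick() call per page with run batching: it remembers the first page of a run of fully valid, unbusied pages and submits the whole run to pmap_enter_object() when the run ends, with one more submission after the loop for a run that reaches the end. The same shape in a generic C sketch, where process_run() is a hypothetical stand-in for pmap_enter_object():

/*
 * Batch consecutive "good" items into maximal runs and hand each run
 * [run_start, end) to process_run(), mirroring the p_start/start logic.
 */
static void
batch_runs(const int *good, int n, void (*process_run)(int start, int end))
{
	int i, run_start = -1;

	for (i = 0; i < n; i++) {
		if (good[i]) {
			if (run_start < 0)
				run_start = i;		/* open a new run */
		} else if (run_start >= 0) {
			process_run(run_start, i);	/* run ended at i */
			run_start = -1;
		}
	}
	if (run_start >= 0)
		process_run(run_start, n);		/* run reached the end */
}

With good[] marking the resident, fully valid pages, each process_run() call corresponds to one pmap_enter_object() covering a whole run.
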
@@ -2104,7 +2213,8 @@
/*
* Make a first pass to check for user-wired memory and holes.
*/
- for (current = entry; current->start < end; current = current->next) {
+ for (current = entry; current != &map->header && current->start < end;
+ current = current->next) {
if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ARGUMENT);
@@ -2117,16 +2227,15 @@
}
}
- if (invalidate) {
- VM_LOCK_GIANT();
+ if (invalidate)
pmap_remove(map->pmap, start, end);
- VM_UNLOCK_GIANT();
- }
+
/*
* Make a second pass, cleaning/uncaching pages from the indicated
* objects as we go.
*/
- for (current = entry; current->start < end; current = current->next) {
+ for (current = entry; current != &map->header && current->start < end;
+ current = current->next) {
offset = current->offset + (start - current->start);
size = (end <= current->end ? end : current->end) - start;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2193,8 +2302,7 @@
VM_OBJECT_LOCK(object);
if (object->ref_count != 1 &&
((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
- object == kernel_object || object == kmem_object) &&
- (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
+ object == kernel_object || object == kmem_object)) {
vm_object_collapse(object);
vm_object_page_remove(object, offidxstart, offidxend, FALSE);
if (object->type == OBJT_SWAP)
@@ -2285,11 +2393,7 @@
vm_map_entry_unwire(map, entry);
}
- if (!map->system_map)
- VM_LOCK_GIANT();
pmap_remove(map->pmap, entry->start, entry->end);
- if (!map->system_map)
- VM_UNLOCK_GIANT();
/*
* Delete the entry (which may delete the object) only after
@@ -2489,16 +2593,14 @@
vm_map_lock(old_map);
vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
+ if (vm2 == NULL)
+ goto unlock_and_return;
vm2->vm_taddr = vm1->vm_taddr;
vm2->vm_daddr = vm1->vm_daddr;
vm2->vm_maxsaddr = vm1->vm_maxsaddr;
new_map = &vm2->vm_map; /* XXX */
new_map->timestamp = 1;
- /* Do not inherit the MAP_WIREFUTURE property. */
- if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE)
- new_map->flags &= ~MAP_WIREFUTURE;
-
old_entry = old_map->header.next;
while (old_entry != &old_map->header) {
@@ -2584,7 +2686,7 @@
}
old_entry = old_entry->next;
}
-
+unlock_and_return:
vm_map_unlock(old_map);
return (vm2);
@@ -2610,7 +2712,9 @@
cow &= ~orient;
KASSERT(orient != 0, ("No stack grow direction"));
- if (addrbos < vm_map_min(map) || addrbos > map->max_offset)
+ if (addrbos < vm_map_min(map) ||
+ addrbos > vm_map_max(map) ||
+ addrbos + max_ssize < addrbos)
return (KERN_NO_SPACE);
init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
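
The new third clause, addrbos + max_ssize < addrbos, is the portable unsigned-overflow test: unsigned addition wraps modulo 2^N, so a sum smaller than either operand means the requested range wrapped past the top of the address space. A minimal demonstration (values are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Does [base, base + len) fit without wrapping the address space? */
static int
range_fits(uintptr_t base, uintptr_t len)
{
	return (base + len >= base);
}

int
main(void)
{
	printf("%d\n", range_fits(UINTPTR_MAX - 0x1000, 0x800));	/* 1 */
	printf("%d\n", range_fits(UINTPTR_MAX - 0x1000, 0x2000));	/* 0 */
	return (0);
}
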
@@ -2912,13 +3016,15 @@
* Unshare the specified VM space for exec. If other processes are
* mapped to it, then create a new one. The new vmspace is null.
*/
-void
+int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
newvmspace = vmspace_alloc(minuser, maxuser);
+ if (newvmspace == NULL)
+ return (ENOMEM);
newvmspace->vm_swrss = oldvmspace->vm_swrss;
/*
* This code is written like this for prototype purposes. The
@@ -2927,29 +3033,37 @@
* run it down. Even though there is little or no chance of blocking
* here, it is a good idea to keep this form for future mods.
*/
+ PROC_VMSPACE_LOCK(p);
p->p_vmspace = newvmspace;
+ PROC_VMSPACE_UNLOCK(p);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
vmspace_free(oldvmspace);
+ return (0);
}
/*
* Unshare the specified VM space for forcing COW. This
* is called by rfork, for the (RFMEM|RFPROC) == 0 case.
*/
-void
+int
vmspace_unshare(struct proc *p)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
if (oldvmspace->vm_refcnt == 1)
- return;
+ return (0);
newvmspace = vmspace_fork(oldvmspace);
+ if (newvmspace == NULL)
+ return (ENOMEM);
+ PROC_VMSPACE_LOCK(p);
p->p_vmspace = newvmspace;
+ PROC_VMSPACE_UNLOCK(p);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
vmspace_free(oldvmspace);
+ return (0);
}
/*
Index: softdep.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/softdep.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/softdep.h -L sys/ufs/ffs/softdep.h -u -r1.2 -r1.3
--- sys/ufs/ffs/softdep.h
+++ sys/ufs/ffs/softdep.h
@@ -36,7 +36,7 @@
* SUCH DAMAGE.
*
* @(#)softdep.h 9.7 (McKusick) 6/21/00
- * $FreeBSD: src/sys/ufs/ffs/softdep.h,v 1.17.2.2 2006/03/13 03:08:00 jeff Exp $
+ * $FreeBSD: src/sys/ufs/ffs/softdep.h,v 1.19 2006/03/02 05:50:23 jeff Exp $
*/
#include <sys/queue.h>
Index: ffs_rawread.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_rawread.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_rawread.c -L sys/ufs/ffs/ffs_rawread.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_rawread.c
+++ sys/ufs/ffs/ffs_rawread.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.25.2.2 2006/03/09 00:18:45 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.29 2007/02/04 23:42:02 tegge Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -129,8 +129,16 @@
upgraded = 0;
- /* Attempt to msync mmap() regions to clean dirty mmap */
VI_LOCK(vp);
+ /* Check if vnode was reclaimed while unlocked. */
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ VI_UNLOCK(vp);
+ if (upgraded != 0)
+ VOP_LOCK(vp, LK_DOWNGRADE, td);
+ vn_finished_write(mp);
+ return (EIO);
+ }
+ /* Attempt to msync mmap() regions to clean dirty mmap */
if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
VI_UNLOCK(vp);
if (vp->v_object != NULL) {
@@ -150,6 +158,7 @@
VI_UNLOCK(vp);
if (upgraded != 0)
VOP_LOCK(vp, LK_DOWNGRADE, td);
+ vn_finished_write(mp);
return (error);
}
/* Flush dirty buffers */
@@ -159,6 +168,7 @@
if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
if (upgraded != 0)
VOP_LOCK(vp, LK_DOWNGRADE, td);
+ vn_finished_write(mp);
return (error);
}
VI_LOCK(vp);
@@ -300,7 +310,7 @@
/* XXX: Leave some bufs for swap */
bp = getpbuf(&ffsrawbufcnt);
sa = bp->b_data;
- bp->b_vp = vp;
+ pbgetvp(vp, bp);
error = ffs_rawread_readahead(vp, udata, offset,
resid, td, bp, sa);
if (error != 0)
@@ -314,7 +324,7 @@
nbp = NULL;
if (nbp != NULL) {
nsa = nbp->b_data;
- nbp->b_vp = vp;
+ pbgetvp(vp, nbp);
nerror = ffs_rawread_readahead(vp,
udata +
@@ -327,6 +337,7 @@
nbp,
nsa);
if (nerror) {
+ pbrelvp(nbp);
relpbuf(nbp, &ffsrawbufcnt);
nbp = NULL;
}
@@ -375,6 +386,7 @@
nsa = tsa;
if (resid <= bp->b_bufsize) { /* No more readaheads */
+ pbrelvp(nbp);
relpbuf(nbp, &ffsrawbufcnt);
nbp = NULL;
} else { /* Setup next readahead */
@@ -389,6 +401,7 @@
nbp,
nsa);
if (nerror != 0) {
+ pbrelvp(nbp);
relpbuf(nbp, &ffsrawbufcnt);
nbp = NULL;
}
@@ -403,13 +416,16 @@
}
}
- if (bp != NULL)
+ if (bp != NULL) {
+ pbrelvp(bp);
relpbuf(bp, &ffsrawbufcnt);
+ }
if (nbp != NULL) { /* Run down readahead buffer */
spl = splbio();
bwait(nbp, PRIBIO, "rawrd");
splx(spl);
vunmapbuf(nbp);
+ pbrelvp(nbp);
relpbuf(nbp, &ffsrawbufcnt);
}
Index: ffs_inode.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_inode.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_inode.c -L sys/ufs/ffs/ffs_inode.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_inode.c
+++ sys/ufs/ffs/ffs_inode.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_inode.c,v 1.106 2005/04/05 08:49:41 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_inode.c,v 1.108 2007/06/01 01:12:45 jeff Exp $");
#include "opt_quota.h"
@@ -66,9 +66,11 @@
* IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode
* to disk if the IN_MODIFIED flag is set (it may be set initially, or by
* the timestamp update). The IN_LAZYMOD flag is set to force a write
- * later if not now. If we write now, then clear both IN_MODIFIED and
- * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is
- * set, then wait for the write to complete.
+ * later if not now. The IN_LAZYACCESS flag is set instead of IN_MODIFIED if
+ * the fs is currently being suspended (or is suspended) and the vnode has
+ * been accessed.
+ * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to
+ * reflect the presumably successful write, and if waitfor is set, then wait
+ * for the write to complete.
*/
int
ffs_update(vp, waitfor)
@@ -80,12 +82,12 @@
struct inode *ip;
int error;
- ASSERT_VOP_LOCKED(vp, "ffs_update");
+ ASSERT_VOP_ELOCKED(vp, "ffs_update");
ufs_itimes(vp);
ip = VTOI(vp);
if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
return (0);
- ip->i_flag &= ~(IN_LAZYMOD | IN_MODIFIED);
+ ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
fs = ip->i_fs;
if (fs->fs_ronly)
return (0);
@@ -557,7 +559,7 @@
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
if ((bp->b_flags & B_CACHE) == 0) {
- curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
+ curthread->td_ru.ru_inblock++; /* pay for read */
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
Index: ffs_balloc.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_balloc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_balloc.c -L sys/ufs/ffs/ffs_balloc.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_balloc.c
+++ sys/ufs/ffs/ffs_balloc.c
@@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.50 2005/02/08 17:23:39 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.50.14.1 2008/01/19 18:12:25 kib Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -102,6 +102,7 @@
ufs2_daddr_t newb;
ufs1_daddr_t *bap, pref;
ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
+ ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
int unwindidx = -1;
ip = VTOI(vp);
@@ -231,6 +232,7 @@
nb = dp->di_ib[indirs[0].in_off];
allocib = NULL;
allocblk = allociblk;
+ lbns_remfree = lbns;
if (nb == 0) {
UFS_LOCK(ump);
pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
@@ -239,6 +241,7 @@
return (error);
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
bp->b_blkno = fsbtodb(fs, nb);
vfs_bio_clrbuf(bp);
@@ -289,6 +292,7 @@
}
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
vfs_bio_clrbuf(nbp);
@@ -342,6 +346,7 @@
}
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & BA_CLRBUF)
@@ -403,9 +408,18 @@
* have an error to return to the user.
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
- for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
- ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
+ blkp < allocblk; blkp++, lbns_remfree++) {
+ /*
+ * We shall not leave the freed blocks on the vnode
+ * buffer object lists.
+ */
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ if (bp != NULL) {
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ }
deallocated += fs->fs_bsize;
}
if (allocib != NULL) {
@@ -441,6 +455,14 @@
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
(void) ffs_syncvnode(vp, MNT_WAIT);
+ /*
+ * After the buffers are invalidated and on-disk pointers are
+ * cleared, free the blocks.
+ */
+ for (blkp = allociblk; blkp < allocblk; blkp++) {
+ ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+ ip->i_number);
+ }
return (error);
}
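
The reworked error path enforces an ordering invariant: first invalidate any in-core buffers for the rolled-back logical blocks (so a delayed write cannot push a stale pointer to a block about to be freed), then sync the cleared on-disk pointers, and only then return the blocks to the free list. A toy model of that sequencing, whose helpers merely log and stand in for the getblk()/brelse(), ffs_syncvnode(), and ffs_blkfree() steps (all names hypothetical):

#include <stdio.h>

static void invalidate_buffer(long lbn)  { printf("invalidate lbn %ld\n", lbn); }
static void clear_disk_pointer(long lbn) { printf("clear ptr lbn %ld\n", lbn); }
static void free_block(long blkno)       { printf("free blk %ld\n", blkno); }

int
main(void)
{
	long lbns[]   = { 12, 13, 14 };		/* logical blocks rolled back */
	long blknos[] = { 900, 901, 902 };	/* their physical blocks */
	int i, n = 3;

	for (i = 0; i < n; i++)
		invalidate_buffer(lbns[i]);	/* no dirty buffer may survive */
	for (i = 0; i < n; i++)
		clear_disk_pointer(lbns[i]);	/* metadata drops its references */
	for (i = 0; i < n; i++)
		free_block(blknos[i]);		/* only now are they reusable */
	return (0);
}
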
@@ -464,6 +486,7 @@
struct indir indirs[NIADDR + 2];
ufs2_daddr_t nb, newb, *bap, pref;
ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
+ ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
int deallocated, osize, nsize, num, i, error;
int unwindidx = -1;
@@ -703,6 +726,7 @@
nb = dp->di_ib[indirs[0].in_off];
allocib = NULL;
allocblk = allociblk;
+ lbns_remfree = lbns;
if (nb == 0) {
UFS_LOCK(ump);
pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
@@ -711,6 +735,7 @@
return (error);
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = indirs[1].in_lbn;
bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
bp->b_blkno = fsbtodb(fs, nb);
vfs_bio_clrbuf(bp);
@@ -761,6 +786,7 @@
}
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = indirs[i].in_lbn;
nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
vfs_bio_clrbuf(nbp);
@@ -814,6 +840,7 @@
}
nb = newb;
*allocblk++ = nb;
+ *lbns_remfree++ = lbn;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & BA_CLRBUF)
@@ -881,9 +908,18 @@
* have an error to return to the user.
*/
(void) ffs_syncvnode(vp, MNT_WAIT);
- for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
- ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
- ip->i_number);
+ for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
+ blkp < allocblk; blkp++, lbns_remfree++) {
+ /*
+ * We shall not leave the freed blocks on the vnode
+ * buffer object lists.
+ */
+ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+ if (bp != NULL) {
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ }
deallocated += fs->fs_bsize;
}
if (allocib != NULL) {
@@ -919,5 +955,13 @@
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
(void) ffs_syncvnode(vp, MNT_WAIT);
+ /*
+ * After the buffers are invalidated and on-disk pointers are
+ * cleared, free the blocks.
+ */
+ for (blkp = allociblk; blkp < allocblk; blkp++) {
+ ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+ ip->i_number);
+ }
return (error);
}
Index: ffs_alloc.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_alloc.c -L sys/ufs/ffs/ffs_alloc.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_alloc.c
+++ sys/ufs/ffs/ffs_alloc.c
@@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.132.2.4 2006/03/13 03:07:32 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.147 2007/09/10 14:12:29 bz Exp $");
#include "opt_quota.h"
@@ -71,6 +71,7 @@
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/filedesc.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
@@ -121,7 +122,7 @@
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
- * If no block preference is given the following heirarchy is used
+ * If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
@@ -142,6 +143,7 @@
int cg, reclaimed;
static struct timeval lastfail;
static int curfail;
+ int64_t delta;
#ifdef QUOTA
int error;
#endif
@@ -171,7 +173,7 @@
#endif
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
- if (suser_cred(cred, SUSER_ALLOWJAIL) &&
+ if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
goto nospace;
if (bpref >= fs->fs_size)
@@ -182,11 +184,18 @@
cg = dtog(fs, bpref);
bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
if (bno > 0) {
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
+ delta = btodb(size);
+ if (ip->i_flag & IN_SPACECOUNTED) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks += delta;
+ UFS_UNLOCK(ump);
+ }
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
+nospace:
#ifdef QUOTA
UFS_UNLOCK(ump);
/*
@@ -195,7 +204,6 @@
(void) chkdq(ip, -btodb(size), cred, FORCE);
UFS_LOCK(ump);
#endif
-nospace:
if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
reclaimed = 1;
softdep_request_cleanup(fs, ITOV(ip));
@@ -236,6 +244,7 @@
ufs2_daddr_t bno;
static struct timeval lastfail;
static int curfail;
+ int64_t delta;
*bpp = 0;
vp = ITOV(ip);
@@ -259,7 +268,7 @@
#endif /* DIAGNOSTIC */
reclaimed = 0;
retry:
- if (suser_cred(cred, SUSER_ALLOWJAIL) &&
+ if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
goto nospace;
}
@@ -301,7 +310,13 @@
if (bno) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("ffs_realloccg: bad blockno");
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
+ delta = btodb(nsize - osize);
+ if (ip->i_flag & IN_SPACECOUNTED) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks += delta;
+ UFS_UNLOCK(ump);
+ }
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
@@ -370,7 +385,13 @@
ffs_blkfree(ump, fs, ip->i_devvp,
bno + numfrags(fs, nsize),
(long)(request - nsize), ip->i_number);
- DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
+ delta = btodb(nsize - osize);
+ if (ip->i_flag & IN_SPACECOUNTED) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks += delta;
+ UFS_UNLOCK(ump);
+ }
+ DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
@@ -879,7 +900,7 @@
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
- * If no inode preference is given the following heirarchy is used
+ * If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
@@ -1052,7 +1073,10 @@
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
if (dirsize < curdirsize)
dirsize = curdirsize;
- maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
+ if (dirsize <= 0)
+ maxcontigdirs = 0; /* dirsize overflowed */
+ else
+ maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
if (fs->fs_avgfpdir > 0)
maxcontigdirs = min(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
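
The new dirsize <= 0 branch exists because dirsize is built up from tunables whose product can overflow to a non-positive value, and the division that follows would then yield a meaningless maxcontigdirs. The guarded-division idiom in isolation (names illustrative):

/*
 * Divide, treating an overflowed (non-positive) divisor as "no
 * meaningful answer" and capping the result, as maxcontigdirs does.
 */
static int
safe_ratio(long numer, long denom, int cap)
{
	long r;

	if (denom <= 0)
		return (0);	/* divisor overflowed or degenerate */
	r = numer / denom;
	return (r > cap ? cap : (int)r);
}

Here safe_ratio(avgbfree * bsize, dirsize, 255) would correspond to the min(..., 255) computation above.
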
@@ -2131,13 +2155,13 @@
blksfree = cg_blksfree(cgp);
len = howmany(fs->fs_fpg, NBBY) - start;
loc = scanc((u_int)len, (u_char *)&blksfree[start],
- (u_char *)fragtbl[fs->fs_frag],
+ fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len, (u_char *)&blksfree[0],
- (u_char *)fragtbl[fs->fs_frag],
+ fragtbl[fs->fs_frag],
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
if (loc == 0) {
printf("start = %d, len = %d, fs = %s\n",
@@ -2430,6 +2454,11 @@
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
break;
ip = VTOI(vp);
+ if (ip->i_flag & IN_SPACECOUNTED) {
+ UFS_LOCK(ump);
+ fs->fs_pendingblocks += cmd.size;
+ UFS_UNLOCK(ump);
+ }
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
ip->i_flag |= IN_CHANGE;
vput(vp);
--- sys/ufs/ffs/README.softupdates
+++ /dev/null
@@ -1,58 +0,0 @@
-$FreeBSD: src/sys/ufs/ffs/README.softupdates,v 1.9 2000/07/08 02:31:21 mckusick Exp $
-
-Using Soft Updates
-
-To enable the soft updates feature in your kernel, add option
-SOFTUPDATES to your kernel configuration.
-
-Once you are running a kernel with soft update support, you need to enable
-it for whichever filesystems you wish to run with the soft update policy.
-This is done with the -n option to tunefs(8) on the UNMOUNTED filesystems,
-e.g. from single-user mode you'd do something like:
-
- tunefs -n enable /usr
-
-To permanently enable soft updates on the /usr filesystem (or at least
-until a corresponding ``tunefs -n disable'' is done).
-
-
-Soft Updates Copyright Restrictions
-
-As of June 2000 the restrictive copyright has been removed and
-replaced with a `Berkeley-style' copyright. The files implementing
-soft updates now reside in the sys/ufs/ffs directory and are
-compiled into the generic kernel by default.
-
-
-Soft Updates Status
-
-The soft updates code has been running in production on many
-systems for the past two years generally quite successfully.
-The two current sets of shortcomings are:
-
-1) On filesystems that are chronically full, the two minute lag
- from the time a file is deleted until its free space shows up
- will result in premature filesystem full failures. This
- failure mode is most evident in small filesystems such as
- the root. For this reason, use of soft updates is not
- recommended on the root filesystem.
-
-2) If your system routines runs parallel processes each of which
- remove many files, the kernel memory rate limiting code may
- not be able to slow removal operations to a level sustainable
- by the disk subsystem. The result is that the kernel runs out
- of memory and hangs.
-
-Both of these problems are being addressed, but have not yet
-been resolved. There are no other known problems at this time.
-
-
-How Soft Updates Work
-
-For more general information on soft updates, please see:
- http://www.mckusick.com/softdep/
- http://www.ece.cmu.edu/~ganger/papers/CSE-TR-254-95/
-
---
-Marshall Kirk McKusick <mckusick at mckusick.com>
-July 2000
Index: ffs_softdep.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_softdep.c -L sys/ufs/ffs/ffs_softdep.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.181.2.8 2006/04/04 18:14:30 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.211 2007/06/22 13:22:36 kib Exp $");
/*
* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
@@ -81,6 +81,7 @@
#include <vm/vm.h>
#include "opt_ffs.h"
+#include "opt_quota.h"
#ifndef SOFTUPDATES
@@ -164,7 +165,7 @@
struct buf *bp;
{
- panic("softdep_setup_allocdirect called");
+ panic("softdep_setup_allocext called");
}
void
@@ -479,7 +480,7 @@
#define TYPENAME(type) \
((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
/*
- * End system adaptaion definitions.
+ * End system adaptation definitions.
*/
/*
@@ -728,6 +729,7 @@
for (;;) {
kthread_suspend_check(softdepproc);
+ vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
ACQUIRE_LOCK(&lk);
/*
* If requested, try removing inode or removal dependencies.
@@ -743,6 +745,7 @@
wakeup_one(&proc_waiting);
}
FREE_LOCK(&lk);
+ VFS_UNLOCK_GIANT(vfslocked);
remaining = 0;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
@@ -803,7 +806,7 @@
if (wk->wk_state & ONWORKLIST)
panic("add_to_worklist: already on list");
wk->wk_state |= ONWORKLIST;
- if (LIST_FIRST(&ump->softdep_workitem_pending) == NULL)
+ if (LIST_EMPTY(&ump->softdep_workitem_pending))
LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
else
LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
@@ -990,7 +993,7 @@
{
struct worklist *wk, *wktail;
- if (LIST_FIRST(&newbp->b_dep) != NULL)
+ if (!LIST_EMPTY(&newbp->b_dep))
panic("softdep_move_dependencies: need merge code");
wktail = 0;
ACQUIRE_LOCK(&lk);
@@ -1058,7 +1061,7 @@
error = 0;
if (i == 10) {
error = EBUSY;
- printf("softdep_waitidle: Failed to flush worklist for %p",
+ printf("softdep_waitidle: Failed to flush worklist for %p\n",
mp);
}
@@ -1423,8 +1426,14 @@
struct buf *bp;
int error, cyl;
- mp->mnt_flag &= ~MNT_ASYNC;
- mp->mnt_flag |= MNT_SOFTDEP;
+ MNT_ILOCK(mp);
+ mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
+ if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
+ mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
+ MNTK_SOFTDEP;
+ mp->mnt_noasync++;
+ }
+ MNT_IUNLOCK(mp);
ump = VFSTOUFS(mp);
LIST_INIT(&ump->softdep_workitem_pending);
ump->softdep_worklist_tail = NULL;
@@ -1516,7 +1525,8 @@
ACQUIRE_LOCK(&lk);
if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
&inodedep)))
- panic("softdep_setup_inomapdep: found inode");
+ panic("softdep_setup_inomapdep: dependency for new inode "
+ "already exists");
inodedep->id_buf = bp;
inodedep->id_state &= ~DEPCOMPLETE;
bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
@@ -1769,7 +1779,7 @@
if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
- if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
+ if (!LIST_EMPTY(&oldadp->ad_newdirblk))
panic("allocdirect_merge: extra newdirblk");
WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
}
@@ -2215,7 +2225,7 @@
}
/*
* If the file was removed, then the space being freed was
- * accounted for then (see softdep_filereleased()). If the
+ * accounted for then (see softdep_releasefile()). If the
* file is merely being truncated, then we account for it now.
*/
if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
@@ -2497,7 +2507,7 @@
if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
newdirblk = WK_NEWDIRBLK(wk);
WORKLIST_REMOVE(&newdirblk->db_list);
- if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
+ if (!LIST_EMPTY(&adp->ad_newdirblk))
panic("free_allocdirect: extra newdirblk");
if (delay)
WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
@@ -2540,7 +2550,7 @@
* If no dependencies remain, the pagedep will be freed.
*/
for (i = 0; i < DAHASHSZ; i++)
- if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
+ if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
break;
if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
LIST_REMOVE(pagedep, pd_hash);
@@ -2593,6 +2603,7 @@
}
WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
FREE_LOCK(&lk);
+ ip->i_flag |= IN_MODIFIED;
}
/*
@@ -2617,13 +2628,13 @@
mtx_assert(&lk, MA_OWNED);
if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
- LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
- LIST_FIRST(&inodedep->id_bufwait) != NULL ||
- LIST_FIRST(&inodedep->id_inowait) != NULL ||
- TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
+ !LIST_EMPTY(&inodedep->id_pendinghd) ||
+ !LIST_EMPTY(&inodedep->id_bufwait) ||
+ !LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
inodedep->id_nlinkdelta != 0)
return (0);
@@ -2660,13 +2671,13 @@
mtx_assert(&lk, MA_OWNED);
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
- LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
- LIST_FIRST(&inodedep->id_bufwait) != NULL ||
- LIST_FIRST(&inodedep->id_inowait) != NULL ||
- TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
+ !LIST_EMPTY(&inodedep->id_pendinghd) ||
+ !LIST_EMPTY(&inodedep->id_bufwait) ||
+ !LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
return (0);
LIST_REMOVE(inodedep, id_hash);
@@ -2733,7 +2744,7 @@
if ((bn = freeblks->fb_iblks[level]) == 0)
continue;
if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
- level, baselbns[level], &blocksreleased)) == 0)
+ level, baselbns[level], &blocksreleased)) != 0)
allerror = error;
ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
fs->fs_bsize, freeblks->fb_previousinum);
@@ -2842,7 +2853,7 @@
panic("indir_trunc: lost indirdep");
WORKLIST_REMOVE(wk);
WORKITEM_FREE(indirdep, D_INDIRDEP);
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
panic("indir_trunc: dangling dep");
ump->um_numindirdeps -= 1;
FREE_LOCK(&lk);
@@ -3500,9 +3511,9 @@
int extblocks;
if (ip->i_effnlink > 0)
- panic("softdep_filerelease: file still referenced");
+ panic("softdep_releasefile: file still referenced");
/*
- * We may be called several times as the real reference count
+ * We may be called several times as the on-disk link count
* drops to zero. We only want to account for the space once.
*/
if (ip->i_flag & IN_SPACECOUNTED)
@@ -3616,9 +3627,12 @@
dirrem->dm_oldinum = dirrem->dm_dirinum;
if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
+ if (xp != NULL)
+ add_to_worklist(&dirrem->dm_list);
FREE_LOCK(&lk);
vput(vp);
- handle_workitem_remove(dirrem, NULL);
+ if (xp == NULL)
+ handle_workitem_remove(dirrem, NULL);
return;
}
WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
@@ -3757,7 +3771,7 @@
* will be writing the real pointers, so the
* dependency can be freed.
*/
- if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
+ if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
struct buf *bp;
bp = indirdep->ir_savebp;
@@ -3894,7 +3908,7 @@
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = 0;
- if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
+ if (TAILQ_EMPTY(&inodedep->id_inoupdt))
return;
/*
* Set the dependencies to busy.
@@ -4037,8 +4051,8 @@
*/
inodedep->id_savedsize = dp->di_size;
inodedep->id_savedextsize = dp->di_extsize;
- if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
- TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
+ if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
+ TAILQ_EMPTY(&inodedep->id_extupdt))
return;
/*
* Set the ext data dependencies to busy.
@@ -4895,10 +4909,10 @@
* allocdirects that are completed by the merger.
*/
merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
- if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
+ if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
- if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
+ if (!TAILQ_EMPTY(&inodedep->id_extupdt))
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
/*
* Now that the inode has been pushed into the buffer, the
@@ -4995,7 +5009,7 @@
struct buf *bp;
struct fs *fs;
struct thread *td = curthread;
- int error, flushparent;
+ int error, flushparent, pagedep_new_block;
ino_t parentino;
ufs_lbn_t lbn;
@@ -5007,12 +5021,12 @@
FREE_LOCK(&lk);
return (0);
}
- if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
- LIST_FIRST(&inodedep->id_bufwait) != NULL ||
- TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
- TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
+ if (!LIST_EMPTY(&inodedep->id_inowait) ||
+ !LIST_EMPTY(&inodedep->id_bufwait) ||
+ !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newinoupdt))
panic("softdep_fsync: pending ops");
for (error = 0, flushparent = 0; ; ) {
if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
@@ -5073,15 +5087,36 @@
* then we do the slower ffs_syncvnode of the directory.
*/
if (flushparent) {
+ int locked;
+
if ((error = ffs_update(pvp, 1)) != 0) {
vput(pvp);
return (error);
}
- if ((pagedep->pd_state & NEWBLOCK) &&
- (error = ffs_syncvnode(pvp, MNT_WAIT))) {
- vput(pvp);
- return (error);
+ ACQUIRE_LOCK(&lk);
+ locked = 1;
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
+ if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
+ if (wk->wk_type != D_DIRADD)
+ panic("softdep_fsync: Unexpected type %s",
+ TYPENAME(wk->wk_type));
+ dap = WK_DIRADD(wk);
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ pagedep_new_block = pagedep->pd_state & NEWBLOCK;
+ FREE_LOCK(&lk);
+ locked = 0;
+ if (pagedep_new_block &&
+ (error = ffs_syncvnode(pvp, MNT_WAIT))) {
+ vput(pvp);
+ return (error);
+ }
+ }
}
+ if (locked)
+ FREE_LOCK(&lk);
}
/*
* Flush directory page containing the inode's name.
@@ -5268,7 +5303,7 @@
goto restart;
FREE_LOCK(&lk);
if ((error = bwrite(nbp)) != 0) {
- break;
+ goto loop_end;
}
ACQUIRE_LOCK(&lk);
goto restart;
@@ -5299,7 +5334,7 @@
flush_pagedep_deps(vp, wk->wk_mp,
&pagedep->pd_diraddhd[i]))) {
FREE_LOCK(&lk);
- break;
+ goto loop_end;
}
}
continue;
@@ -5351,6 +5386,7 @@
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
+ loop_end:
/* We reach here only in error and unlocked */
if (error == 0)
panic("softdep_sync_metadata: zero error");
@@ -5501,6 +5537,7 @@
int error = 0;
struct buf *bp;
ino_t inum;
+ struct worklist *wk;
ump = VFSTOUFS(mp);
while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
@@ -5545,8 +5582,53 @@
}
VI_LOCK(vp);
drain_output(vp);
+ /*
+ * If first block is still dirty with a D_MKDIR
+ * dependency then it needs to be written now.
+ */
+ for (;;) {
+ error = 0;
+ bp = gbincore(&vp->v_bufobj, 0);
+ if (bp == NULL)
+ break; /* First block not present */
+ error = BUF_LOCK(bp,
+ LK_EXCLUSIVE |
+ LK_SLEEPFAIL |
+ LK_INTERLOCK,
+ VI_MTX(vp));
+ VI_LOCK(vp);
+ if (error == ENOLCK)
+ continue; /* Slept, retry */
+ if (error != 0)
+ break; /* Failed */
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ BUF_UNLOCK(bp);
+ break; /* Buffer not dirty */
+ }
+ for (wk = LIST_FIRST(&bp->b_dep);
+ wk != NULL;
+ wk = LIST_NEXT(wk, wk_list))
+ if (wk->wk_type == D_MKDIR)
+ break;
+ if (wk == NULL)
+ BUF_UNLOCK(bp); /* Dependency gone */
+ else {
+ /*
+ * D_MKDIR dependency remains,
+ * must write buffer to stable
+ * storage.
+ */
+ VI_UNLOCK(vp);
+ bremfree(bp);
+ error = bwrite(bp);
+ VI_LOCK(vp);
+ }
+ break;
+ }
VI_UNLOCK(vp);
vput(vp);
+ if (error != 0)
+ break; /* Flushing of first block failed */
ACQUIRE_LOCK(&lk);
/*
* If that cleared dependencies, go on to next.
@@ -5819,7 +5901,7 @@
if (next >= pagedep_hash)
next = 0;
LIST_FOREACH(pagedep, pagedephd, pd_hash) {
- if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
+ if (LIST_EMPTY(&pagedep->pd_dirremhd))
continue;
mp = pagedep->pd_list.wk_mp;
ino = pagedep->pd_ino;
Index: ffs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_vfsops.c -L sys/ufs/ffs/ffs_vfsops.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vfsops.c,v 1.290.2.9 2006/03/22 17:54:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vfsops.c,v 1.329 2007/04/04 07:29:53 delphij Exp $");
#include "opt_mac.h"
#include "opt_quota.h"
@@ -40,9 +40,9 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/kernel.h>
-#include <sys/mac.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
@@ -52,7 +52,10 @@
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <security/mac/mac_framework.h>
+
#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/gjournal.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
@@ -70,7 +73,6 @@
static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
-static int ffs_sbupdate(struct ufsmount *, int, int);
static int ffs_reload(struct mount *, struct thread *);
static int ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void ffs_oldfscompat_read(struct fs *, struct ufsmount *,
@@ -85,7 +87,6 @@
static vfs_mount_t ffs_mount;
static vfs_statfs_t ffs_statfs;
static vfs_fhtovp_t ffs_fhtovp;
-static vfs_vptofh_t ffs_vptofh;
static vfs_sync_t ffs_sync;
static struct vfsops ufs_vfsops = {
@@ -101,10 +102,10 @@
.vfs_uninit = ffs_uninit,
.vfs_unmount = ffs_unmount,
.vfs_vget = ffs_vget,
- .vfs_vptofh = ffs_vptofh,
};
VFS_SET(ufs_vfsops, ufs, 0);
+MODULE_VERSION(ufs, 1);
static b_strategy_t ffs_geom_strategy;
static b_write_t ffs_bufwrite;
@@ -114,12 +115,17 @@
.bop_write = ffs_bufwrite,
.bop_strategy = ffs_geom_strategy,
.bop_sync = bufsync,
+#ifdef NO_FFS_SNAPSHOT
+ .bop_bdflush = bufbdflush,
+#else
+ .bop_bdflush = ffs_bdflush,
+#endif
};
static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",
"clusterw", "exec", "export", "force", "from", "multilabel",
"snapshot", "suid", "suiddir", "symfollow", "sync",
- "update", "union", NULL };
+ "union", NULL };
static int
ffs_mount(struct mount *mp, struct thread *td)
@@ -128,9 +134,9 @@
struct ufsmount *ump = 0;
struct fs *fs;
int error, flags;
+ u_int mntorflags, mntandnotflags;
mode_t accessmode;
struct nameidata ndp;
- struct export_args export;
char *fspec;
if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
@@ -151,6 +157,38 @@
if (error)
return (error);
+ mntorflags = 0;
+ mntandnotflags = 0;
+ if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
+ mntorflags |= MNT_ACLS;
+
+ if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0)
+ mntorflags |= MNT_ASYNC;
+
+ if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0)
+ mntorflags |= MNT_FORCE;
+
+ if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0)
+ mntorflags |= MNT_MULTILABEL;
+
+ if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0)
+ mntandnotflags |= MNT_ASYNC;
+
+ if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0)
+ mntorflags |= MNT_NOATIME;
+
+ if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0)
+ mntorflags |= MNT_NOCLUSTERR;
+
+ if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0)
+ mntorflags |= MNT_NOCLUSTERW;
+
+ if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0)
+ mntorflags |= MNT_SNAPSHOT;
+
+ MNT_ILOCK(mp);
+ mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags;
+ MNT_IUNLOCK(mp);
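+
+/* (See the flag-folding sketch below for the pattern used here.) */

Rather than toggling mp->mnt_flag once per option, with one interlock round-trip each, the new code accumulates all bits to set in mntorflags and all bits to clear in mntandnotflags, then folds them in with a single locked read-modify-write. The same idiom in a generic sketch, with a pthread mutex standing in for MNT_ILOCK (illustrative, not kernel API):

#include <pthread.h>

static pthread_mutex_t flag_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int flags;

/*
 * Apply a batch of flag updates atomically: set "orflags", clear
 * "andnotflags", in one critical section instead of one per bit.
 */
static void
update_flags(unsigned int orflags, unsigned int andnotflags)
{
	pthread_mutex_lock(&flag_lock);
	flags = (flags | orflags) & ~andnotflags;
	pthread_mutex_unlock(&flag_lock);
}
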
/*
* If updating, check whether changing from read-only to
* read/write; if there is no device name, that's all we do.
@@ -210,7 +248,9 @@
g_topology_unlock();
PICKUP_GIANT();
fs->fs_ronly = 1;
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
+ MNT_IUNLOCK(mp);
}
if ((mp->mnt_flag & MNT_RELOAD) &&
(error = ffs_reload(mp, td)) != 0)
@@ -221,15 +261,16 @@
* If upgrade to read-write by non-root, then verify
* that user has necessary permissions on the device.
*/
- if (suser(td)) {
- vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
- if ((error = VOP_ACCESS(devvp, VREAD | VWRITE,
- td->td_ucred, td)) != 0) {
- VOP_UNLOCK(devvp, 0, td);
- return (error);
- }
+ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_ACCESS(devvp, VREAD | VWRITE,
+ td->td_ucred, td);
+ if (error)
+ error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+ if (error) {
VOP_UNLOCK(devvp, 0, td);
+ return (error);
}
+ VOP_UNLOCK(devvp, 0, td);
fs->fs_flags &= ~FS_UNCLEAN;
if (fs->fs_clean == 0) {
fs->fs_flags |= FS_UNCLEAN;
@@ -262,7 +303,9 @@
if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
return (error);
fs->fs_ronly = 0;
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
+ MNT_IUNLOCK(mp);
fs->fs_clean = 0;
if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
vn_finished_write(mp);
@@ -285,19 +328,22 @@
* Softdep_mount() clears it in an initial mount
* or ro->rw remount.
*/
- if (mp->mnt_flag & MNT_SOFTDEP)
+ if (mp->mnt_flag & MNT_SOFTDEP) {
+ /* XXX: Reset too late ? */
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_ASYNC;
+ MNT_IUNLOCK(mp);
+ }
/*
* Keep MNT_ACLS flag if it is stored in superblock.
*/
- if ((fs->fs_flags & FS_ACLS) != 0)
+ if ((fs->fs_flags & FS_ACLS) != 0) {
+ /* XXX: Set too late ? */
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_ACLS;
- /*
- * If not updating name, process export requests.
- */
- error = vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof export);
- if (error == 0 && export.ex_flags != 0)
- return (vfs_export(mp, &export));
+ MNT_IUNLOCK(mp);
+ }
+
/*
* If this is a snapshot request, take the snapshot.
*/
@@ -323,14 +369,15 @@
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
- if (suser(td)) {
- accessmode = VREAD;
- if ((mp->mnt_flag & MNT_RDONLY) == 0)
- accessmode |= VWRITE;
- if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){
- vput(devvp);
- return (error);
- }
+ accessmode = VREAD;
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ accessmode |= VWRITE;
+ error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
+ if (error)
+ error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+ if (error) {
+ vput(devvp);
+ return (error);
}
if (mp->mnt_flag & MNT_UPDATE) {
@@ -558,6 +605,7 @@
int32_t *lp;
struct ucred *cred;
struct g_consumer *cp;
+ struct mount *nmp;
dev = devvp->v_rdev;
cred = td ? td->td_ucred : NOCRED;
@@ -594,7 +642,14 @@
* Try reading the superblock in each of its possible locations.
*/
for (i = 0; sblock_try[i] != -1; i++) {
- if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE,
+ if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
+ error = EINVAL;
+ vfs_mount_error(mp,
+ "Invalid sectorsize %d for superblock size %d",
+ cp->provider->sectorsize, SBLOCKSIZE);
+ goto out;
+ }
+ if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE,
cred, &bp)) != 0)
goto out;
fs = (struct fs *)bp->b_data;
@@ -647,6 +702,35 @@
fs->fs_pendingblocks = 0;
fs->fs_pendinginodes = 0;
}
+ if ((fs->fs_flags & FS_GJOURNAL) != 0) {
+#ifdef UFS_GJOURNAL
+ /*
+ * Get journal provider name.
+ */
+ size = 1024;
+ mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK);
+ if (g_io_getattr("GJOURNAL::provider", cp, &size,
+ mp->mnt_gjprovider) == 0) {
+ mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size,
+ M_UFSMNT, M_WAITOK);
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_GJOURNAL;
+ MNT_IUNLOCK(mp);
+ } else {
+ printf(
+"WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n",
+ mp->mnt_stat.f_mntonname);
+ free(mp->mnt_gjprovider, M_UFSMNT);
+ mp->mnt_gjprovider = NULL;
+ }
+#else
+ printf(
+"WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n",
+ mp->mnt_stat.f_mntonname);
+#endif
+ } else {
+ mp->mnt_gjprovider = NULL;
+ }
ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
ump->um_cp = cp;
ump->um_bo = &devvp->v_bufobj;
@@ -707,27 +791,39 @@
mp->mnt_data = (qaddr_t)ump;
mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
+ nmp = NULL;
if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
- vfs_getvfs(&mp->mnt_stat.f_fsid))
+ (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
+ if (nmp)
+ vfs_rel(nmp);
vfs_getnewfsid(mp);
+ }
mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
- if ((fs->fs_flags & FS_MULTILABEL) != 0)
+ MNT_IUNLOCK(mp);
+ if ((fs->fs_flags & FS_MULTILABEL) != 0) {
#ifdef MAC
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_MULTILABEL;
+ MNT_IUNLOCK(mp);
#else
printf(
"WARNING: %s: multilabel flag on fs but no MAC support\n",
- fs->fs_fsmnt);
+ mp->mnt_stat.f_mntonname);
#endif
- if ((fs->fs_flags & FS_ACLS) != 0)
+ }
+ if ((fs->fs_flags & FS_ACLS) != 0) {
#ifdef UFS_ACL
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_ACLS;
+ MNT_IUNLOCK(mp);
#else
printf(
"WARNING: %s: ACLs flag on fs but no ACLs support\n",
- fs->fs_fsmnt);
+ mp->mnt_stat.f_mntonname);
#endif
+ }
ump->um_mountp = mp;
ump->um_dev = dev;
ump->um_devvp = devvp;
@@ -784,9 +880,9 @@
(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
-#ifndef QUOTA
+ MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_MPSAFE;
-#endif
+ MNT_IUNLOCK(mp);
return (0);
out:
if (bp)
@@ -800,6 +896,10 @@
}
if (ump) {
mtx_destroy(UFS_MTX(ump));
+ if (mp->mnt_gjprovider != NULL) {
+ free(mp->mnt_gjprovider, M_UFSMNT);
+ mp->mnt_gjprovider = NULL;
+ }
free(ump->um_fs, M_UFSMNT);
free(ump, M_UFSMNT);
mp->mnt_data = (qaddr_t)0;
@@ -850,13 +950,13 @@
}
if (fs->fs_magic == FS_UFS1_MAGIC &&
fs->fs_old_inodefmt < FS_44INODEFMT) {
- fs->fs_maxfilesize = (u_quad_t) 1LL << 39;
+ fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
fs->fs_qbmask = ~fs->fs_bmask;
fs->fs_qfmask = ~fs->fs_fmask;
}
if (fs->fs_magic == FS_UFS1_MAGIC) {
ump->um_savedmaxfilesize = fs->fs_maxfilesize;
- maxfilesize = (u_int64_t)0x40000000 * fs->fs_bsize - 1;
+ maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
if (fs->fs_maxfilesize > maxfilesize)
fs->fs_maxfilesize = maxfilesize;
}
@@ -959,11 +1059,17 @@
PICKUP_GIANT();
vrele(ump->um_devvp);
mtx_destroy(UFS_MTX(ump));
+ if (mp->mnt_gjprovider != NULL) {
+ free(mp->mnt_gjprovider, M_UFSMNT);
+ mp->mnt_gjprovider = NULL;
+ }
free(fs->fs_csp, M_UFSMNT);
free(fs, M_UFSMNT);
free(ump, M_UFSMNT);
mp->mnt_data = (qaddr_t)0;
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_LOCAL;
+ MNT_IUNLOCK(mp);
return (error);
}
@@ -987,8 +1093,6 @@
if (error)
return (error);
for (i = 0; i < MAXQUOTAS; i++) {
- if (ump->um_quotas[i] == NULLVP)
- continue;
quotaoff(td, mp, i);
}
/*
@@ -1211,6 +1315,7 @@
struct vnode *vp;
struct cdev *dev;
int error;
+ struct thread *td;
error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
if (error || *vpp != NULL)
@@ -1275,7 +1380,15 @@
}
#endif
- error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
+ td = curthread;
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ uma_zfree(uma_inode, ip);
+ *vpp = NULL;
+ return (error);
+ }
+ error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
if (error || *vpp != NULL)
return (error);
@@ -1393,26 +1506,6 @@
}
/*
- * Vnode pointer to File handle
- */
-/* ARGSUSED */
-static int
-ffs_vptofh(vp, fhp)
- struct vnode *vp;
- struct fid *fhp;
-{
- struct inode *ip;
- struct ufid *ufhp;
-
- ip = VTOI(vp);
- ufhp = (struct ufid *)fhp;
- ufhp->ufid_len = sizeof(struct ufid);
- ufhp->ufid_ino = ip->i_number;
- ufhp->ufid_gen = ip->i_gen;
- return (0);
-}
-
-/*
* Initialize the filesystem.
*/
static int
@@ -1441,7 +1534,7 @@
/*
* Write a superblock and associated information back to disk.
*/
-static int
+int
ffs_sbupdate(mp, waitfor, suspended)
struct ufsmount *mp;
int waitfor;
@@ -1569,10 +1662,10 @@
/*
* Process dependencies then return any unfinished ones.
*/
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
buf_complete(bp);
#ifdef SOFTUPDATES
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
softdep_move_dependencies(bp, origbp);
#endif
/*
@@ -1690,7 +1783,7 @@
#ifdef SOFTUPDATES
/* move over the dependencies */
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
softdep_move_dependencies(bp, newbp);
#endif
@@ -1728,6 +1821,7 @@
if ((vp->v_vflag & VV_COPYONWRITE) &&
vp->v_rdev->si_snapdata != NULL) {
if ((bp->b_flags & B_CLUSTER) != 0) {
+ runningbufwakeup(bp);
TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
b_cluster.cluster_entry) {
error = ffs_copyonwrite(vp, tbp);
@@ -1739,6 +1833,9 @@
return;
}
}
+ bp->b_runningbufspace = bp->b_bufsize;
+ atomic_add_int(&runningbufspace,
+ bp->b_runningbufspace);
} else {
error = ffs_copyonwrite(vp, bp);
if (error != 0 && error != EOPNOTSUPP) {
@@ -1753,11 +1850,11 @@
if ((bp->b_flags & B_CLUSTER) != 0) {
TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
b_cluster.cluster_entry) {
- if (LIST_FIRST(&tbp->b_dep) != NULL)
+ if (!LIST_EMPTY(&tbp->b_dep))
buf_start(tbp);
}
} else {
- if (LIST_FIRST(&bp->b_dep) != NULL)
+ if (!LIST_EMPTY(&bp->b_dep))
buf_start(bp);
}
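
The ffs_vfsops.c hunks above also rework the UFS1 size caps: pre-4.4BSD
inode formats are now limited to 2^31 - 1 bytes, and the general UFS1
limit grows from 0x40000000 to 0x80000000 filesystem blocks. A minimal
userland sketch of that arithmetic, assuming an illustrative 16 KB block
size (no names here come from the patch):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t bsize = 16384; /* assumed fs_bsize, for illustration */

        /* Pre-FS_44INODEFMT inodes: capped at 2^31 - 1 bytes. */
        uint64_t oldfmt_max = ((uint64_t)1 << 31) - 1;

        /* General UFS1 cap: 0x80000000 addressable blocks. */
        uint64_t ufs1_max = (uint64_t)0x80000000 * bsize - 1;

        printf("old-format cap: %ju bytes\n", (uintmax_t)oldfmt_max);
        printf("UFS1 cap at 16K blocks: %ju bytes\n", (uintmax_t)ufs1_max);
        return (0);
    }

With a 16 KB block the general cap works out to 2^45 - 1 bytes, double
the previous 0x40000000-block limit.
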
Index: fs.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/fs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/fs.h -L sys/ufs/ffs/fs.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/fs.h
+++ sys/ufs/ffs/fs.h
@@ -27,7 +27,7 @@
* SUCH DAMAGE.
*
* @(#)fs.h 8.13 (Berkeley) 3/21/95
- * $FreeBSD: src/sys/ufs/ffs/fs.h,v 1.48 2005/02/20 08:02:15 delphij Exp $
+ * $FreeBSD: src/sys/ufs/ffs/fs.h,v 1.49 2006/10/31 21:48:53 pjd Exp $
*/
#ifndef _UFS_FFS_FS_H_
@@ -323,7 +323,8 @@
u_int *fs_active; /* (u) used by snapshots to track fs */
int32_t fs_old_cpc; /* cyl per cycle in postbl */
int32_t fs_maxbsize; /* maximum blocking factor permitted */
- int64_t fs_sparecon64[17]; /* old rotation block list head */
+ int64_t fs_unrefs; /* number of unreferenced inodes */
+ int64_t fs_sparecon64[16]; /* old rotation block list head */
int64_t fs_sblockloc; /* byte offset of standard superblock */
struct csum_total fs_cstotal; /* (u) cylinder summary information */
ufs_time_t fs_time; /* last time written */
@@ -406,6 +407,7 @@
#define FS_INDEXDIRS 0x08 /* kernel supports indexed directories */
#define FS_ACLS 0x10 /* file system has ACLs enabled */
#define FS_MULTILABEL 0x20 /* file system is MAC multi-label */
+#define FS_GJOURNAL 0x40 /* gjournaled file system */
#define FS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */
/*
@@ -475,7 +477,8 @@
int32_t cg_nclusterblks; /* number of clusters this cg */
int32_t cg_niblk; /* number of inode blocks this cg */
int32_t cg_initediblk; /* last initialized inode */
- int32_t cg_sparecon32[3]; /* reserved for future use */
+ int32_t cg_unrefs; /* number of unreferenced inodes */
+ int32_t cg_sparecon32[2]; /* reserved for future use */
ufs_time_t cg_time; /* time last written */
int64_t cg_sparecon64[3]; /* reserved for future use */
u_int8_t cg_space[1]; /* space for cylinder group maps */
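
Both fs.h hunks grow the on-disk format without breaking it: fs_unrefs
is carved out of fs_sparecon64 (17 slots become 16) and cg_unrefs out of
cg_sparecon32 (3 become 2), so the superblock and cylinder group keep
their sizes and every later field keeps its offset. A compile-checkable
sketch of the technique, with made-up struct names:

    #include <stdint.h>

    /* Layout before: all eight 64-bit slots are spares. */
    struct layout_v1 {
        int64_t spare[8];
    };

    /* Layout after: one spare becomes a real field. */
    struct layout_v2 {
        int64_t unrefs;     /* number of unreferenced inodes */
        int64_t spare[7];
    };

    /* Same size, so old and new kernels read the same disk layout. */
    _Static_assert(sizeof(struct layout_v1) == sizeof(struct layout_v2),
        "carving a spare must not change the on-disk size");

    int
    main(void)
    {
        return (0);
    }
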
Index: ffs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_vnops.c -L sys/ufs/ffs/ffs_vnops.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_vnops.c
+++ sys/ufs/ffs/ffs_vnops.c
@@ -62,7 +62,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vnops.c,v 1.157.2.1 2005/10/29 06:43:55 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vnops.c,v 1.173 2007/07/13 18:51:08 rodrigc Exp $");
#include <sys/param.h>
#include <sys/bio.h>
@@ -74,6 +74,7 @@
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
@@ -103,7 +104,7 @@
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t ffs_fsync;
-static vop_lock_t ffs_lock;
+static vop_lock1_t ffs_lock;
static vop_getpages_t ffs_getpages;
static vop_read_t ffs_read;
static vop_write_t ffs_write;
@@ -117,6 +118,7 @@
static vop_listextattr_t ffs_listextattr;
static vop_openextattr_t ffs_openextattr;
static vop_setextattr_t ffs_setextattr;
+static vop_vptofh_t ffs_vptofh;
/* Global vfs data structures for ufs. */
@@ -124,16 +126,18 @@
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_getpages = ffs_getpages,
- .vop_lock = ffs_lock,
+ .vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
+ .vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops1 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
.vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */
+ .vop_vptofh = ffs_vptofh,
};
/* Global vfs data structures for ufs. */
@@ -141,7 +145,7 @@
.vop_default = &ufs_vnodeops,
.vop_fsync = ffs_fsync,
.vop_getpages = ffs_getpages,
- .vop_lock = ffs_lock,
+ .vop_lock1 = ffs_lock,
.vop_read = ffs_read,
.vop_reallocblks = ffs_reallocblks,
.vop_write = ffs_write,
@@ -151,12 +155,13 @@
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
+ .vop_vptofh = ffs_vptofh,
};
struct vop_vector ffs_fifoops2 = {
.vop_default = &ufs_fifoops,
.vop_fsync = ffs_fsync,
- .vop_lock = ffs_lock,
+ .vop_lock1 = ffs_lock,
.vop_reallocblks = ffs_reallocblks,
.vop_strategy = ffsext_strategy,
.vop_closeextattr = ffs_closeextattr,
@@ -165,6 +170,7 @@
.vop_listextattr = ffs_listextattr,
.vop_openextattr = ffs_openextattr,
.vop_setextattr = ffs_setextattr,
+ .vop_vptofh = ffs_vptofh,
};
/*
@@ -226,7 +232,7 @@
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
continue;
VI_UNLOCK(vp);
- if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
+ if (!wait && !LIST_EMPTY(&bp->b_dep) &&
(bp->b_flags & B_DEFERRED) == 0 &&
buf_countdeps(bp, 0)) {
bp->b_flags |= B_DEFERRED;
@@ -332,13 +338,62 @@
static int
ffs_lock(ap)
- struct vop_lock_args /* {
+ struct vop_lock1_args /* {
struct vnode *a_vp;
int a_flags;
struct thread *a_td;
+ char *file;
+ int line;
} */ *ap;
{
- return (VOP_LOCK_APV(&ufs_vnodeops, ap));
+#ifndef NO_FFS_SNAPSHOT
+ struct vnode *vp;
+ int flags;
+ struct lock *lkp;
+ int result;
+
+ switch (ap->a_flags & LK_TYPE_MASK) {
+ case LK_SHARED:
+ case LK_UPGRADE:
+ case LK_EXCLUSIVE:
+ vp = ap->a_vp;
+ flags = ap->a_flags;
+ for (;;) {
+ /*
+ * vnode interlock must be held to ensure that
+ * the possibly external lock isn't freed,
+ * e.g. when mutating from snapshot file vnode
+ * to regular file vnode.
+ */
+ if ((flags & LK_INTERLOCK) == 0) {
+ VI_LOCK(vp);
+ flags |= LK_INTERLOCK;
+ }
+ lkp = vp->v_vnlock;
+ result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
+ if (lkp == vp->v_vnlock || result != 0)
+ break;
+ /*
+ * Apparent success, except that the vnode
+ * mutated between snapshot file vnode and
+ * regular file vnode while this process
+ * slept. The lock currently held is not the
+ * right lock. Release it, and try to get the
+ * new lock.
+ */
+ (void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
+ if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
+ flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
+ flags &= ~LK_INTERLOCK;
+ }
+ break;
+ default:
+ result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
+ }
+ return (result);
+#else
+ return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
+#endif
}
/*
@@ -510,7 +565,7 @@
break;
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
/*
* If there are no dependencies, and it's VMIO,
* then we don't need the buf, mark it available
@@ -537,7 +592,7 @@
*/
if (bp != NULL) {
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
@@ -546,8 +601,11 @@
}
if ((error == 0 || uio->uio_resid != orig_resid) &&
- (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+ (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+ VI_LOCK(vp);
ip->i_flag |= IN_ACCESS;
+ VI_UNLOCK(vp);
+ }
return (error);
}
@@ -689,7 +747,7 @@
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
bp->b_flags |= B_RELBUF;
}
@@ -730,10 +788,12 @@
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
- if (resid > uio->uio_resid && ap->a_cred &&
- suser_cred(ap->a_cred, SUSER_ALLOWJAIL)) {
- ip->i_mode &= ~(ISUID | ISGID);
- DIP_SET(ip, i_mode, ip->i_mode);
+ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
+ ap->a_cred) {
+ if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
+ ip->i_mode &= ~(ISUID | ISGID);
+ DIP_SET(ip, i_mode, ip->i_mode);
+ }
}
if (error) {
if (ioflag & IO_UNIT) {
@@ -906,7 +966,7 @@
break;
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
/*
* If there are no dependencies, and it's VMIO,
* then we don't need the buf, mark it available
@@ -933,7 +993,7 @@
*/
if (bp != NULL) {
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
@@ -942,8 +1002,11 @@
}
if ((error == 0 || uio->uio_resid != orig_resid) &&
- (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+ (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+ VI_LOCK(vp);
ip->i_flag |= IN_ACCESS;
+ VI_UNLOCK(vp);
+ }
return (error);
}
@@ -965,6 +1028,9 @@
fs = ip->i_fs;
dp = ip->i_din2;
+ KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
+ ip->i_number));
+
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
panic("ffs_extwrite: mode");
@@ -1024,7 +1090,7 @@
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ (LIST_EMPTY(&bp->b_dep))) {
bp->b_flags |= B_RELBUF;
}
@@ -1053,10 +1119,11 @@
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
- if (resid > uio->uio_resid && ucred &&
- suser_cred(ucred, SUSER_ALLOWJAIL)) {
- ip->i_mode &= ~(ISUID | ISGID);
- dp->di_mode = ip->i_mode;
+ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
+ if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
+ ip->i_mode &= ~(ISUID | ISGID);
+ dp->di_mode = ip->i_mode;
+ }
}
if (error) {
if (ioflag & IO_UNIT) {
@@ -1125,14 +1192,18 @@
{
struct inode *ip;
struct ufs2_dinode *dp;
+ struct fs *fs;
struct uio luio;
struct iovec liovec;
int easize, error;
u_char *eae;
ip = VTOI(vp);
+ fs = ip->i_fs;
dp = ip->i_din2;
easize = dp->di_extsize;
+ if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
+ return (EFBIG);
eae = malloc(easize + extra, M_TEMP, M_WAITOK);
@@ -1296,6 +1367,9 @@
if (ap->a_vp->v_type == VCHR)
return (EOPNOTSUPP);
+ if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
+ return (EROFS);
+
return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}
@@ -1330,6 +1404,9 @@
if (strlen(ap->a_name) == 0)
return (EINVAL);
+ if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, IWRITE);
if (error) {
@@ -1551,6 +1628,9 @@
if (ap->a_uio == NULL)
return (EOPNOTSUPP);
+ if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
+ return (EROFS);
+
error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
ap->a_cred, ap->a_td, IWRITE);
if (error) {
@@ -1633,3 +1713,26 @@
error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
return(error);
}
+
+/*
+ * Vnode pointer to File handle
+ */
+static int
+ffs_vptofh(struct vop_vptofh_args *ap)
+/*
+vop_vptofh {
+ IN struct vnode *a_vp;
+ IN struct fid *a_fhp;
+};
+*/
+{
+ struct inode *ip;
+ struct ufid *ufhp;
+
+ ip = VTOI(ap->a_vp);
+ ufhp = (struct ufid *)ap->a_fhp;
+ ufhp->ufid_len = sizeof(struct ufid);
+ ufhp->ufid_ino = ip->i_number;
+ ufhp->ufid_gen = ip->i_gen;
+ return (0);
+}
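
The new ffs_lock() has to cope with vp->v_vnlock being swapped while the
thread sleeps, because snapshot vnodes share the snapshot lock: it locks
whatever the pointer currently references, re-checks the pointer after
acquisition, and backs out and retries on mutation. A userland model of
that retry loop, using pthreads in place of lockmgr and glossing over
the vnode interlock that protects the pointer read in the kernel:

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t *lockp;     /* may be repointed while we sleep */
        pthread_mutex_t real_lock;
    };

    /*
     * Acquire whatever lock the object currently points at; if the
     * pointer mutated while we were blocked, drop the stale lock and
     * try again with the new one.
     */
    static void
    obj_lock(struct obj *o)
    {
        pthread_mutex_t *lkp;

        for (;;) {
            lkp = o->lockp;
            pthread_mutex_lock(lkp);
            if (lkp == o->lockp)
                return;             /* still the right lock */
            pthread_mutex_unlock(lkp);
        }
    }

    int
    main(void)
    {
        struct obj o;

        pthread_mutex_init(&o.real_lock, NULL);
        o.lockp = &o.real_lock;
        obj_lock(&o);
        pthread_mutex_unlock(o.lockp);
        printf("locked and unlocked through the indirect pointer\n");
        return (0);
    }
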
Index: ffs_snapshot.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_snapshot.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_snapshot.c -L sys/ufs/ffs/ffs_snapshot.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_snapshot.c
+++ sys/ufs/ffs/ffs_snapshot.c
@@ -34,7 +34,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_snapshot.c,v 1.103.2.5 2006/03/22 17:42:31 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_snapshot.c,v 1.136 2007/06/05 00:00:56 jeff Exp $");
+
+#include "opt_quota.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -66,6 +68,62 @@
#define KERNCRED thread0.td_ucred
#define DEBUG 1
+#include "opt_ffs.h"
+
+#ifdef NO_FFS_SNAPSHOT
+int
+ffs_snapshot(mp, snapfile)
+ struct mount *mp;
+ char *snapfile;
+{
+ return (EINVAL);
+}
+
+int
+ffs_snapblkfree(fs, devvp, bno, size, inum)
+ struct fs *fs;
+ struct vnode *devvp;
+ ufs2_daddr_t bno;
+ long size;
+ ino_t inum;
+{
+ return (EINVAL);
+}
+
+void
+ffs_snapremove(vp)
+ struct vnode *vp;
+{
+}
+
+void
+ffs_snapshot_mount(mp)
+ struct mount *mp;
+{
+}
+
+void
+ffs_snapshot_unmount(mp)
+ struct mount *mp;
+{
+}
+
+void
+ffs_snapgone(ip)
+ struct inode *ip;
+{
+}
+
+int
+ffs_copyonwrite(devvp, bp)
+ struct vnode *devvp;
+ struct buf *bp;
+{
+ return (EINVAL);
+}
+
+#else
+
TAILQ_HEAD(snaphead, inode);
struct snapdata {
@@ -104,6 +162,8 @@
struct fs *, ufs_lbn_t, int);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void process_deferred_inactive(struct mount *);
+static void try_free_snapdata(struct vnode *devvp, struct thread *td);
+static int ffs_bp_snapblk(struct vnode *, struct buf *);
/*
* To ensure the consistency of snapshots across crashes, we must
@@ -135,7 +195,7 @@
ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
int error, cg, snaploc;
int i, size, len, loc;
- int flag = mp->mnt_flag;
+ int flag;
struct timespec starttime = {0, 0}, endtime;
char saved_nice = 0;
long redo = 0, snaplistsize = 0;
@@ -156,10 +216,10 @@
ump = VFSTOUFS(mp);
fs = ump->um_fs;
- /*
- * XXX: make sure we don't go to out1 before we setup sn
- */
- sn = (void *)0xdeadbeef;
+ sn = NULL;
+ MNT_ILOCK(mp);
+ flag = mp->mnt_flag;
+ MNT_IUNLOCK(mp);
/*
* Need to serialize access to snapshot code per filesystem.
@@ -203,6 +263,7 @@
wrtmp = NULL;
if (wrtmp != mp)
panic("ffs_snapshot: mount mismatch");
+ vfs_rel(wrtmp);
if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
@@ -221,6 +282,7 @@
return (error);
}
vp = nd.ni_vp;
+ vp->v_vflag |= VV_SYSTEM;
ip = VTOI(vp);
devvp = ip->i_devvp;
/*
@@ -235,9 +297,10 @@
ip->i_size = lblktosize(fs, (off_t)numblks);
DIP_SET(ip, i_size, ip->i_size);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
- if ((error = readblock(vp, bp, numblks - 1)) != 0)
- goto out;
+ error = readblock(vp, bp, numblks - 1);
bawrite(bp);
+ if (error != 0)
+ goto out;
/*
* Preallocate critical data structures so that we can copy
* them in without further allocation after we suspend all
@@ -326,12 +389,15 @@
* Rescind nice scheduling while running with the filesystem suspended.
*/
if (td->td_proc->p_nice > 0) {
- PROC_LOCK(td->td_proc);
- mtx_lock_spin(&sched_lock);
- saved_nice = td->td_proc->p_nice;
- sched_nice(td->td_proc, 0);
- mtx_unlock_spin(&sched_lock);
- PROC_UNLOCK(td->td_proc);
+ struct proc *p;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ saved_nice = p->p_nice;
+ sched_nice(p, 0);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
}
/*
* Suspend operation on filesystem.
@@ -348,8 +414,23 @@
vn_start_write(NULL, &wrtmp, V_WAIT);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (ip->i_effnlink == 0) {
+ error = ENOENT; /* Snapshot file unlinked */
+ goto out1;
+ }
if (collectsnapstats)
nanotime(&starttime);
+
+ /* The last block might have changed. Copy it again to be sure. */
+ error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
+ fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
+ if (error != 0)
+ goto out1;
+ error = readblock(vp, bp, numblks - 1);
+ bp->b_flags |= B_VALIDSUSPWRT;
+ bawrite(bp);
+ if (error != 0)
+ goto out1;
/*
* First, copy all the cylinder group maps that have changed.
*/
@@ -551,7 +632,6 @@
}
lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
VI_MTX(vp), td);
- transferlockers(&vp->v_lock, vp->v_vnlock);
lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
/*
* If this is the first snapshot on this filesystem, then we need
@@ -595,9 +675,10 @@
devvp->v_vflag |= VV_COPYONWRITE;
VI_UNLOCK(devvp);
ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
- vp->v_vflag |= VV_SYSTEM;
out1:
- KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@"));
+ KASSERT((sn != NULL && sbp != NULL && error == 0) ||
+ (sn == NULL && sbp == NULL && error != 0),
+ ("email phk@ and mckusick@"));
/*
* Resume operation on filesystem.
*/
@@ -625,6 +706,13 @@
else
error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
BLK_SNAP);
+ if (error == 0 && xp->i_effnlink == 0) {
+ error = ffs_freefile(ump,
+ copy_fs,
+ vp,
+ xp->i_number,
+ xp->i_mode);
+ }
if (error) {
fs->fs_snapinum[snaploc] = 0;
goto done;
@@ -707,21 +795,30 @@
* the inode for this snapshot then a deadlock can occur. Drop
* the snapshot lock until the buffer has been written.
*/
+ VREF(vp); /* Protect against ffs_snapgone() */
VOP_UNLOCK(vp, 0, td);
(void) bread(ip->i_devvp,
fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
(int) fs->fs_bsize, NOCRED, &nbp);
brelse(nbp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (ip->i_effnlink == 0)
+ error = ENOENT; /* Snapshot file unlinked */
+ else
+ vrele(vp); /* Drop extra reference */
done:
FREE(copy_fs->fs_csp, M_UFSMNT);
bawrite(sbp);
out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
if (saved_nice > 0) {
- PROC_LOCK(td->td_proc);
- mtx_lock_spin(&sched_lock);
+ struct proc *p;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
sched_nice(td->td_proc, saved_nice);
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
PROC_UNLOCK(td->td_proc);
}
UFS_LOCK(ump);
@@ -730,7 +827,9 @@
fs->fs_active = 0;
}
UFS_UNLOCK(ump);
- mp->mnt_flag = flag;
+ MNT_ILOCK(mp);
+ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
+ MNT_IUNLOCK(mp);
if (error)
(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
(void) ffs_syncvnode(vp, MNT_WAIT);
@@ -908,11 +1007,11 @@
}
/*
* Set a snapshot inode to be a zero length file, regular files
- * to be completely unallocated.
+ * or unlinked snapshots to be completely unallocated.
*/
dip = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
- if (expungetype == BLK_NOCOPY)
+ if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
dip->di_mode = 0;
dip->di_size = 0;
dip->di_blocks = 0;
@@ -1469,18 +1568,16 @@
{
struct inode *ip;
struct vnode *devvp;
- struct lock *lkp;
struct buf *ibp;
struct fs *fs;
struct thread *td = curthread;
- ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
+ ufs2_daddr_t numblks, blkno, dblk;
int error, loc, last;
struct snapdata *sn;
ip = VTOI(vp);
fs = ip->i_fs;
devvp = ip->i_devvp;
- sn = devvp->v_rdev->si_snapdata;
/*
* If active, delete from incore list (this snapshot may
* already have been in the process of being deleted, so
@@ -1488,29 +1585,23 @@
*
* Clear copy-on-write flag if last snapshot.
*/
+ VI_LOCK(devvp);
if (ip->i_nextsnap.tqe_prev != 0) {
- lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
- VI_LOCK(devvp);
+ sn = devvp->v_rdev->si_snapdata;
TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
ip->i_nextsnap.tqe_prev = 0;
- lkp = vp->v_vnlock;
+ VI_UNLOCK(devvp);
+ lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
+ VI_LOCK(vp);
+ KASSERT(vp->v_vnlock == &sn->sn_lock,
+ ("ffs_snapremove: lost lock mutation"));
vp->v_vnlock = &vp->v_lock;
- lockmgr(lkp, LK_RELEASE, NULL, td);
- if (TAILQ_FIRST(&sn->sn_head) != 0) {
- VI_UNLOCK(devvp);
- } else {
- snapblklist = sn->sn_blklist;
- sn->sn_blklist = 0;
- sn->sn_listsize = 0;
- devvp->v_rdev->si_snapdata = NULL;
- devvp->v_vflag &= ~VV_COPYONWRITE;
- lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
- lockmgr(lkp, LK_RELEASE, NULL, td);
- lockdestroy(lkp);
- free(sn, M_UFSMNT);
- FREE(snapblklist, M_UFSMNT);
- }
- }
+ VI_UNLOCK(vp);
+ VI_LOCK(devvp);
+ lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+ try_free_snapdata(devvp, td);
+ } else
+ VI_UNLOCK(devvp);
/*
* Clear all BLK_NOCOPY fields. Pass any block claims to other
* snapshots that want them (see ffs_snapblkfree below).
@@ -1575,6 +1666,13 @@
ip->i_flags &= ~SF_SNAPSHOT;
DIP_SET(ip, i_flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
+#ifdef QUOTA
+ /*
+ * Reenable disk quotas for ex-snapshot file.
+ */
+ if (!getinoquota(ip))
+ (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
+#endif
}
/*
@@ -1792,6 +1890,7 @@
struct thread *td = curthread;
struct snapdata *sn;
struct vnode *vp;
+ struct vnode *lastvp;
struct inode *ip;
struct uio auio;
struct iovec aiov;
@@ -1809,6 +1908,7 @@
* Process each snapshot listed in the superblock.
*/
vp = NULL;
+ lastvp = NULL;
sn = devvp->v_rdev->si_snapdata;
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
@@ -1866,7 +1966,6 @@
}
lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
VI_MTX(vp), td);
- transferlockers(&vp->v_lock, vp->v_vnlock);
lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
/*
* Link it onto the active snapshot list.
@@ -1880,7 +1979,9 @@
vp->v_vflag |= VV_SYSTEM;
VI_UNLOCK(devvp);
VOP_UNLOCK(vp, 0, td);
+ lastvp = vp;
}
+ vp = lastvp;
/*
* No usable snapshots found.
*/
@@ -1939,31 +2040,149 @@
struct snapdata *sn;
struct inode *xp;
struct vnode *vp;
+ struct thread *td = curthread;
- sn = devvp->v_rdev->si_snapdata;
VI_LOCK(devvp);
- while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
+ sn = devvp->v_rdev->si_snapdata;
+ while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
vp = ITOV(xp);
- vp->v_vnlock = &vp->v_lock;
TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
xp->i_nextsnap.tqe_prev = 0;
- if (xp->i_effnlink > 0) {
- VI_UNLOCK(devvp);
+ lockmgr(&sn->sn_lock,
+ LK_INTERLOCK | LK_EXCLUSIVE,
+ VI_MTX(devvp),
+ td);
+ VI_LOCK(vp);
+ lockmgr(&vp->v_lock,
+ LK_INTERLOCK | LK_EXCLUSIVE,
+ VI_MTX(vp), td);
+ VI_LOCK(vp);
+ KASSERT(vp->v_vnlock == &sn->sn_lock,
+ ("ffs_snapshot_unmount: lost lock mutation"));
+ vp->v_vnlock = &vp->v_lock;
+ VI_UNLOCK(vp);
+ lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
+ lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+ if (xp->i_effnlink > 0)
vrele(vp);
- VI_LOCK(devvp);
- }
+ VI_LOCK(devvp);
+ sn = devvp->v_rdev->si_snapdata;
}
- devvp->v_rdev->si_snapdata = NULL;
- devvp->v_vflag &= ~VV_COPYONWRITE;
+ try_free_snapdata(devvp, td);
+ ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
+}
+
+/*
+ * Check whether the buffer's block belongs to a device buffer that must
+ * be locked after snaplk.  devvp must be locked on entry and is left
+ * locked upon exit.
+ */
+static int
+ffs_bp_snapblk(devvp, bp)
+ struct vnode *devvp;
+ struct buf *bp;
+{
+ struct snapdata *sn;
+ struct fs *fs;
+ ufs2_daddr_t lbn, *snapblklist;
+ int lower, upper, mid;
+
+ ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
+ KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
+ sn = devvp->v_rdev->si_snapdata;
+ if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
+ return (0);
+ fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
+ lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+ snapblklist = sn->sn_blklist;
+ upper = sn->sn_listsize - 1;
+ lower = 1;
+ while (lower <= upper) {
+ mid = (lower + upper) / 2;
+ if (snapblklist[mid] == lbn)
+ break;
+ if (snapblklist[mid] < lbn)
+ lower = mid + 1;
+ else
+ upper = mid - 1;
+ }
+ if (lower <= upper)
+ return (1);
+ return (0);
+}
+
+void
+ffs_bdflush(bo, bp)
+ struct bufobj *bo;
+ struct buf *bp;
+{
+ struct thread *td;
+ struct vnode *vp, *devvp;
+ struct buf *nbp;
+ int bp_bdskip;
+
+ if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
+ return;
+
+ td = curthread;
+ vp = bp->b_vp;
+ devvp = bo->__bo_vnode;
+ KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
+
+ VI_LOCK(devvp);
+ bp_bdskip = ffs_bp_snapblk(devvp, bp);
+ if (bp_bdskip)
+ bdwriteskip++;
VI_UNLOCK(devvp);
- if (sn->sn_blklist != NULL) {
- FREE(sn->sn_blklist, M_UFSMNT);
- sn->sn_blklist = NULL;
- sn->sn_listsize = 0;
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
+ (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
+ altbufferflushes++;
+ } else {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /*
+ * Don't countdeps with the bo lock
+ * held.
+ */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (bp_bdskip) {
+ VI_LOCK(devvp);
+ if (!ffs_bp_snapblk(vp, nbp)) {
+ if (BO_MTX(bo) != VI_MTX(vp)) {
+ VI_UNLOCK(devvp);
+ BO_LOCK(bo);
+ }
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ VI_UNLOCK(devvp);
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
}
- lockdestroy(&sn->sn_lock);
- free(sn, M_UFSMNT);
- ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}
/*
@@ -1984,8 +2203,9 @@
ufs2_daddr_t lbn, blkno, *snapblklist;
int lower, upper, mid, indiroff, error = 0;
int launched_async_io, prev_norunningbuf;
+ long saved_runningbufspace;
- if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
+ if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
return (0); /* Update on a snapshot file */
if (td->td_pflags & TDP_COWINPROGRESS)
panic("ffs_copyonwrite: recursive call");
@@ -1996,7 +2216,7 @@
VI_LOCK(devvp);
sn = devvp->v_rdev->si_snapdata;
if (sn == NULL ||
- TAILQ_FIRST(&sn->sn_head) == NULL) {
+ TAILQ_EMPTY(&sn->sn_head)) {
VI_UNLOCK(devvp);
return (0); /* No snapshot */
}
@@ -2026,7 +2246,9 @@
* for a long time waiting on snaplk, back it out of
* runningbufspace, possibly waking other threads waiting for space.
*/
- runningbufwakeup(bp);
+ saved_runningbufspace = bp->b_runningbufspace;
+ if (saved_runningbufspace != 0)
+ runningbufwakeup(bp);
/*
* Not in the precomputed list, so check the snapshots.
*/
@@ -2036,11 +2258,13 @@
VI_LOCK(devvp);
sn = devvp->v_rdev->si_snapdata;
if (sn == NULL ||
- TAILQ_FIRST(&sn->sn_head) == NULL) {
+ TAILQ_EMPTY(&sn->sn_head)) {
VI_UNLOCK(devvp);
- if (bp->b_runningbufspace)
+ if (saved_runningbufspace != 0) {
+ bp->b_runningbufspace = saved_runningbufspace;
atomic_add_int(&runningbufspace,
bp->b_runningbufspace);
+ }
return (0); /* Snapshot gone */
}
}
@@ -2161,8 +2385,10 @@
/*
* I/O on bp will now be started, so count it in runningbufspace.
*/
- if (bp->b_runningbufspace)
+ if (saved_runningbufspace != 0) {
+ bp->b_runningbufspace = saved_runningbufspace;
atomic_add_int(&runningbufspace, bp->b_runningbufspace);
+ }
return (error);
}
@@ -2184,25 +2410,24 @@
bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
bip->bio_data = bp->b_data;
bip->bio_length = bp->b_bcount;
+ bip->bio_done = NULL;
g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
-
- do
- msleep(bip, NULL, PRIBIO, "snaprdb", hz/10);
- while (!(bip->bio_flags & BIO_DONE));
- bp->b_error = bip->bio_error;
+ bp->b_error = biowait(bip, "snaprdb");
g_destroy_bio(bip);
return (bp->b_error);
}
/*
* Process file deletes that were deferred by ufs_inactive() due to
- * the file system being suspended.
+ * the file system being suspended. Transfer IN_LAZYACCESS into
+ * IN_MODIFIED for vnodes that were accessed during suspension.
*/
static void
process_deferred_inactive(struct mount *mp)
{
struct vnode *vp, *mvp;
+ struct inode *ip;
struct thread *td;
int error;
@@ -2212,9 +2437,15 @@
loop:
MNT_VNODE_FOREACH(vp, mp, mvp) {
VI_LOCK(vp);
- if ((vp->v_iflag & (VI_DOOMED | VI_OWEINACT)) != VI_OWEINACT ||
- vp->v_usecount > 0 ||
- vp->v_type == VNON) {
+ /*
+ * IN_LAZYACCESS is checked here without holding any
+ * vnode lock, but this flag is set only while holding
+ * vnode interlock.
+ */
+ if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
+ ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
+ ((vp->v_iflag & VI_OWEINACT) == 0 ||
+ vp->v_usecount > 0))) {
VI_UNLOCK(vp);
continue;
}
@@ -2229,8 +2460,13 @@
MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
goto loop;
}
+ ip = VTOI(vp);
+ if ((ip->i_flag & IN_LAZYACCESS) != 0) {
+ ip->i_flag &= ~IN_LAZYACCESS;
+ ip->i_flag |= IN_MODIFIED;
+ }
VI_LOCK(vp);
- if ((vp->v_iflag & VI_OWEINACT) == 0) {
+ if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
VI_UNLOCK(vp);
VOP_UNLOCK(vp, 0, td);
vdrop(vp);
@@ -2259,3 +2495,33 @@
MNT_IUNLOCK(mp);
vn_finished_secondary_write(mp);
}
+
+/* Try to free snapdata associated with devvp */
+static void
+try_free_snapdata(struct vnode *devvp,
+ struct thread *td)
+{
+ struct snapdata *sn;
+ ufs2_daddr_t *snapblklist;
+
+ sn = devvp->v_rdev->si_snapdata;
+
+ if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
+ (devvp->v_vflag & VV_COPYONWRITE) == 0) {
+ VI_UNLOCK(devvp);
+ return;
+ }
+
+ devvp->v_rdev->si_snapdata = NULL;
+ devvp->v_vflag &= ~VV_COPYONWRITE;
+ snapblklist = sn->sn_blklist;
+ sn->sn_blklist = NULL;
+ sn->sn_listsize = 0;
+ lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
+ lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+ lockdestroy(&sn->sn_lock);
+ free(sn, M_UFSMNT);
+ if (snapblklist != NULL)
+ FREE(snapblklist, M_UFSMNT);
+}
+#endif
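
ffs_bp_snapblk() above decides whether a buffer's block is claimed by a
snapshot with a binary search of the sorted sn_blklist; the search runs
over indices 1..sn_listsize-1 because slot 0 of the list holds the list
length elsewhere in this file, not a block number. A self-contained
sketch of the same search with illustrative data:

    #include <stdio.h>

    typedef long long blkno_t;      /* stand-in for ufs2_daddr_t */

    /* Return 1 if lbn appears in list[1..listsize-1], sorted ascending. */
    static int
    snapblk_member(const blkno_t *list, int listsize, blkno_t lbn)
    {
        int lower, upper, mid;

        lower = 1;
        upper = listsize - 1;
        while (lower <= upper) {
            mid = (lower + upper) / 2;
            if (list[mid] == lbn)
                return (1);
            if (list[mid] < lbn)
                lower = mid + 1;
            else
                upper = mid - 1;
        }
        return (0);
    }

    int
    main(void)
    {
        /* slot 0 is skipped by the search, as with sn_blklist */
        blkno_t list[] = { 0, 8, 24, 96, 128 };
        int n = sizeof(list) / sizeof(list[0]);

        printf("24 -> %d, 25 -> %d\n",
            snapblk_member(list, n, 24), snapblk_member(list, n, 25));
        return (0);
    }
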
Index: ffs_extern.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_extern.h -L sys/ufs/ffs/ffs_extern.h -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_extern.h
+++ sys/ufs/ffs/ffs_extern.h
@@ -27,7 +27,7 @@
* SUCH DAMAGE.
*
* @(#)ffs_extern.h 8.6 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/ufs/ffs/ffs_extern.h,v 1.69.2.1 2006/03/13 03:07:37 jeff Exp $
+ * $FreeBSD: src/sys/ufs/ffs/ffs_extern.h,v 1.74 2007/02/17 08:25:43 mckusick Exp $
*/
#ifndef _UFS_FFS_EXTERN_H
@@ -61,6 +61,7 @@
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);
@@ -72,6 +73,7 @@
int ffs_reallocblks(struct vop_reallocblks_args *);
int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
ufs2_daddr_t, int, int, struct ucred *, struct buf **);
+int ffs_sbupdate(struct ufsmount *, int, int);
void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t);
void ffs_snapremove(struct vnode *vp);
Index: ufs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vfsops.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_vfsops.c -L sys/ufs/ufs/ufs_vfsops.c -u -r1.1.1.2 -r1.2
--- sys/ufs/ufs/ufs_vfsops.c
+++ sys/ufs/ufs/ufs_vfsops.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vfsops.c,v 1.45.2.1 2006/02/20 00:53:15 yar Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vfsops.c,v 1.48 2007/02/01 02:13:53 mpp Exp $");
#include "opt_quota.h"
#include "opt_ufs.h"
@@ -60,7 +60,7 @@
#include <ufs/ufs/dirhash.h>
#endif
-MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
+MALLOC_DEFINE(M_UFSMNT, "ufs_mount", "UFS mount structure");
/*
* Return the root of a filesystem.
@@ -86,11 +86,11 @@
* Do operations associated with quotas
*/
int
-ufs_quotactl(mp, cmds, uid, arg, td)
+ufs_quotactl(mp, cmds, id, arg, td)
struct mount *mp;
int cmds;
- uid_t uid;
- caddr_t arg;
+ uid_t id;
+ void *arg;
struct thread *td;
{
#ifndef QUOTA
@@ -98,10 +98,23 @@
#else
int cmd, type, error;
- if (uid == -1)
- uid = td->td_ucred->cr_ruid;
cmd = cmds >> SUBCMDSHIFT;
type = cmds & SUBCMDMASK;
+ if (id == -1) {
+ switch (type) {
+
+ case USRQUOTA:
+ id = td->td_ucred->cr_ruid;
+ break;
+
+ case GRPQUOTA:
+ id = td->td_ucred->cr_rgid;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ }
if ((u_int)type >= MAXQUOTAS)
return (EINVAL);
@@ -118,15 +131,15 @@
break;
case Q_SETQUOTA:
- error = setquota(td, mp, uid, type, arg);
+ error = setquota(td, mp, id, type, arg);
break;
case Q_SETUSE:
- error = setuse(td, mp, uid, type, arg);
+ error = setuse(td, mp, id, type, arg);
break;
case Q_GETQUOTA:
- error = getquota(td, mp, uid, type, arg);
+ error = getquota(td, mp, id, type, arg);
break;
case Q_SYNC:
@@ -205,6 +218,6 @@
return (ESTALE);
}
*vpp = nvp;
- vnode_create_vobject_off(*vpp, DIP(ip, i_size), curthread);
+ vnode_create_vobject(*vpp, DIP(ip, i_size), curthread);
return (0);
}
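
ufs_quotactl() no longer assumes that an id of -1 means the caller's
real uid: the default now depends on the quota type, the real uid for
USRQUOTA and the real gid for GRPQUOTA, with anything else rejected as
EINVAL. A compact userland sketch of that defaulting rule (the constant
values are illustrative, not the kernel's):

    #include <stdio.h>
    #include <sys/types.h>

    #define USRQUOTA 0          /* illustrative values */
    #define GRPQUOTA 1

    /* Resolve id == -1 to the caller's real uid or gid by quota type. */
    static int
    resolve_quota_id(int type, uid_t ruid, gid_t rgid, long *idp)
    {
        if (*idp != -1)
            return (0);
        switch (type) {
        case USRQUOTA:
            *idp = ruid;
            return (0);
        case GRPQUOTA:
            *idp = rgid;
            return (0);
        default:
            return (-1);        /* EINVAL in the kernel */
        }
    }

    int
    main(void)
    {
        long id = -1;

        if (resolve_quota_id(GRPQUOTA, 1001, 20, &id) == 0)
            printf("group quota id defaults to rgid %ld\n", id);
        return (0);
    }
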
Index: ufs_dirhash.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_dirhash.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_dirhash.c -L sys/ufs/ufs/ufs_dirhash.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_dirhash.c
+++ sys/ufs/ufs/ufs_dirhash.c
@@ -28,7 +28,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.21.2.1 2005/08/20 04:27:15 iedowse Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.23 2005/10/31 15:41:28 rwatson Exp $");
#include "opt_ufs.h"
@@ -62,7 +62,7 @@
#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0)
#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
-static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
+static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
--- /dev/null
+++ sys/ufs/ufs/ufs_gjournal.c
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_gjournal.c,v 1.2 2007/05/28 00:28:15 pjd Exp $");
+
+#include "opt_ufs.h"
+
+#ifdef UFS_GJOURNAL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/gjournal.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+/*
+ * Change the number of unreferenced inodes.
+ */
+static int
+ufs_gjournal_modref(struct vnode *vp, int count)
+{
+ struct cg *cgp;
+ struct buf *bp;
+ ufs2_daddr_t cgbno;
+ int error, cg;
+ struct cdev *dev;
+ struct inode *ip;
+ struct ufsmount *ump;
+ struct fs *fs;
+ struct vnode *devvp;
+ ino_t ino;
+
+ ip = VTOI(vp);
+ ump = ip->i_ump;
+ fs = ip->i_fs;
+ devvp = ip->i_devvp;
+ ino = ip->i_number;
+
+ cg = ino_to_cg(fs, ino);
+ if (devvp->v_type != VCHR) {
+ /* devvp is a snapshot */
+ dev = VTOI(devvp)->i_devvp->v_rdev;
+ cgbno = fragstoblks(fs, cgtod(fs, cg));
+ } else {
+ /* devvp is a normal disk device */
+ dev = devvp->v_rdev;
+ cgbno = fsbtodb(fs, cgtod(fs, cg));
+ }
+ if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+ panic("ufs_gjournal_modref: range: dev = %s, ino = %lu, fs = %s",
+ devtoname(dev), (u_long)ino, fs->fs_fsmnt);
+ if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
+ brelse(bp);
+ return (error);
+ }
+ cgp = (struct cg *)bp->b_data;
+ if (!cg_chkmagic(cgp)) {
+ brelse(bp);
+ return (0);
+ }
+ bp->b_xflags |= BX_BKGRDWRITE;
+ cgp->cg_unrefs += count;
+ UFS_LOCK(ump);
+ fs->fs_unrefs += count;
+ fs->fs_fmod = 1;
+ ACTIVECLEAR(fs, cg);
+ UFS_UNLOCK(ump);
+ bdwrite(bp);
+ return (0);
+}
+
+void
+ufs_gjournal_orphan(struct vnode *vp)
+{
+ struct inode *ip;
+
+ if (vp->v_mount->mnt_gjprovider == NULL)
+ return;
+ if (vp->v_usecount < 2 || (vp->v_vflag & VV_DELETED))
+ return;
+ ip = VTOI(vp);
+ if ((vp->v_type == VDIR && ip->i_nlink > 2) ||
+ (vp->v_type != VDIR && ip->i_nlink > 1)) {
+ return;
+ }
+ vp->v_vflag |= VV_DELETED;
+
+ ufs_gjournal_modref(vp, 1);
+}
+
+void
+ufs_gjournal_close(struct vnode *vp)
+{
+ struct inode *ip;
+
+ if (vp->v_mount->mnt_gjprovider == NULL)
+ return;
+ if (!(vp->v_vflag & VV_DELETED))
+ return;
+ ip = VTOI(vp);
+ if (ip->i_nlink > 0)
+ return;
+ ufs_gjournal_modref(vp, -1);
+}
+
+#endif /* UFS_GJOURNAL */
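
The new ufs_gjournal.c maintains gjournal's count of inodes that are
unlinked but still open: ufs_gjournal_orphan() bumps cg_unrefs and
fs_unrefs once when the last name of an open inode goes away, setting
VV_DELETED so the inode is never counted twice, and ufs_gjournal_close()
drops the counts when the last reference is released. A userland model
of that one-shot pairing (all names hypothetical):

    #include <stdio.h>

    struct node {
        int nlink;      /* directory entries */
        int usecount;   /* open references */
        int deleted;    /* models VV_DELETED: orphan already counted */
    };

    static int unrefs;  /* models fs_unrefs */

    static void
    node_orphan(struct node *n)
    {
        /* Count once, and only if someone still has the node open. */
        if (n->deleted || n->usecount < 2 || n->nlink > 1)
            return;
        n->deleted = 1;
        unrefs++;
    }

    static void
    node_close(struct node *n)
    {
        if (!n->deleted || n->nlink > 0)
            return;
        n->deleted = 0;
        unrefs--;
    }

    int
    main(void)
    {
        struct node n = { 1, 2, 0 };    /* one name, open twice */

        node_orphan(&n);                /* unlink while open */
        n.nlink = 0;
        printf("unrefs after unlink: %d\n", unrefs);
        n.usecount = 0;
        node_close(&n);                 /* last close */
        printf("unrefs after close:  %d\n", unrefs);
        return (0);
    }
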
Index: dir.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/dir.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/dir.h -L sys/ufs/ufs/dir.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/dir.h
+++ sys/ufs/ufs/dir.h
@@ -32,7 +32,7 @@
* SUCH DAMAGE.
*
* @(#)dir.h 8.2 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/ufs/ufs/dir.h,v 1.11 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/dir.h,v 1.12 2007/07/02 01:31:43 peter Exp $
*/
#ifndef _UFS_UFS_DIR_H_
@@ -110,7 +110,7 @@
*
*/
#define DIRECTSIZ(namlen) \
- (((int)&((struct direct *)0)->d_name + \
+ (((uintptr_t)&((struct direct *)0)->d_name + \
((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3)
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define DIRSIZ(oldfmt, dp) \
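
The dir.h fix swaps an (int) cast for (uintptr_t) in DIRECTSIZ: the
macro takes the offset of d_name through a null pointer, adds the
NUL-terminated name, and rounds up to a 4-byte boundary, and truncating
that address arithmetic to int is unsafe once pointers are wider than
int. A sketch of the same computation using the standard offsetof on a
stand-in struct:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct direct_like {        /* illustrative, not struct direct */
        uint32_t d_ino;
        uint16_t d_reclen;
        uint8_t  d_type;
        uint8_t  d_namlen;
        char     d_name[256];
    };

    /* Header plus name plus NUL, rounded up to a 4-byte boundary. */
    #define DIRECTSIZ(namlen) \
        ((offsetof(struct direct_like, d_name) + ((namlen) + 1) + 3) & ~3)

    int
    main(void)
    {
        int len;

        for (len = 1; len <= 8; len++)
            printf("namlen %d -> entry size %zu\n", len,
                (size_t)DIRECTSIZ(len));
        return (0);
    }
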
Index: quota.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/quota.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/quota.h -L sys/ufs/ufs/quota.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/quota.h
+++ sys/ufs/ufs/quota.h
@@ -30,7 +30,7 @@
* SUCH DAMAGE.
*
* @(#)quota.h 8.3 (Berkeley) 8/19/94
- * $FreeBSD: src/sys/ufs/ufs/quota.h,v 1.27 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/quota.h,v 1.30 2007/03/14 08:54:07 kib Exp $
*/
#ifndef _UFS_UFS_QUOTA_H_
@@ -113,15 +113,18 @@
* filesystem. There is one allocated for each quota that exists on any
* filesystem for the current user or group. A cache is kept of recently
* used entries.
+ * (h) protected by dqhlock
*/
struct dquot {
- LIST_ENTRY(dquot) dq_hash; /* hash list */
- TAILQ_ENTRY(dquot) dq_freelist; /* free list */
+ LIST_ENTRY(dquot) dq_hash; /* (h) hash list */
+ TAILQ_ENTRY(dquot) dq_freelist; /* (h) free list */
+ struct mtx dq_lock; /* lock for concurrency */
u_int16_t dq_flags; /* flags, see below */
u_int16_t dq_type; /* quota type of this dquot */
- u_int32_t dq_cnt; /* count of active references */
+ u_int32_t dq_cnt; /* (h) count of active references */
u_int32_t dq_id; /* identifier this applies to */
- struct ufsmount *dq_ump; /* filesystem that this is taken from */
+ struct ufsmount *dq_ump; /* (h) filesystem that this is
+ taken from */
struct dqblk dq_dqb; /* actual usage & quotas */
};
/*
@@ -167,6 +170,23 @@
#define DQREF(dq) (dq)->dq_cnt++
#endif
+#define DQI_LOCK(dq) mtx_lock(&(dq)->dq_lock)
+#define DQI_UNLOCK(dq) mtx_unlock(&(dq)->dq_lock)
+
+#define DQI_WAIT(dq, prio, msg) do { \
+ while ((dq)->dq_flags & DQ_LOCK) { \
+ (dq)->dq_flags |= DQ_WANT; \
+ (void) msleep((dq), \
+ &(dq)->dq_lock, (prio), (msg), 0); \
+ } \
+} while (0)
+
+#define DQI_WAKEUP(dq) do { \
+ if ((dq)->dq_flags & DQ_WANT) \
+ wakeup((dq)); \
+ (dq)->dq_flags &= ~(DQ_WANT|DQ_LOCK); \
+} while (0)
+
struct inode;
struct mount;
struct thread;
@@ -174,17 +194,17 @@
struct vnode;
int chkdq(struct inode *, int64_t, struct ucred *, int);
-int chkiq(struct inode *, ino_t, struct ucred *, int);
+int chkiq(struct inode *, int, struct ucred *, int);
void dqinit(void);
void dqrele(struct vnode *, struct dquot *);
void dquninit(void);
int getinoquota(struct inode *);
-int getquota(struct thread *, struct mount *, u_long, int, caddr_t);
+int getquota(struct thread *, struct mount *, u_long, int, void *);
int qsync(struct mount *mp);
int quotaoff(struct thread *td, struct mount *, int);
-int quotaon(struct thread *td, struct mount *, int, caddr_t);
-int setquota(struct thread *, struct mount *, u_long, int, caddr_t);
-int setuse(struct thread *, struct mount *, u_long, int, caddr_t);
+int quotaon(struct thread *td, struct mount *, int, void *);
+int setquota(struct thread *, struct mount *, u_long, int, void *);
+int setuse(struct thread *, struct mount *, u_long, int, void *);
vfs_quotactl_t ufs_quotactl;
#else /* !_KERNEL */
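
DQI_WAIT and DQI_WAKEUP above implement a hand-rolled sleep point over
the new dq_lock: a waiter sets DQ_WANT and msleeps until DQ_LOCK clears,
and the releaser wakes sleepers only when DQ_WANT is set before clearing
both bits. A userland rendition with a condition variable standing in
for msleep()/wakeup() (names invented for the sketch):

    #include <pthread.h>

    #define DQ_LOCK 0x01        /* structure is busy */
    #define DQ_WANT 0x02        /* someone is sleeping on it */

    struct dq {
        pthread_mutex_t lock;   /* plays the role of dq_lock */
        pthread_cond_t  cv;     /* plays the role of msleep/wakeup */
        int flags;
    };

    /* Called with dq->lock held: sleep until DQ_LOCK is released. */
    static void
    dq_wait(struct dq *dq)
    {
        while (dq->flags & DQ_LOCK) {
            dq->flags |= DQ_WANT;
            pthread_cond_wait(&dq->cv, &dq->lock);
        }
    }

    /* Called with dq->lock held: release DQ_LOCK, waking any sleepers. */
    static void
    dq_wakeup(struct dq *dq)
    {
        if (dq->flags & DQ_WANT)
            pthread_cond_broadcast(&dq->cv);
        dq->flags &= ~(DQ_WANT | DQ_LOCK);
    }

    int
    main(void)
    {
        struct dq dq = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, 0 };

        pthread_mutex_lock(&dq.lock);
        dq.flags |= DQ_LOCK;    /* take the busy bit */
        dq_wakeup(&dq);         /* release it; no sleepers yet */
        dq_wait(&dq);           /* returns at once: DQ_LOCK is clear */
        pthread_mutex_unlock(&dq.lock);
        return (0);
    }
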
Index: ufs_lookup.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_lookup.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_lookup.c -L sys/ufs/ufs/ufs_lookup.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_lookup.c
+++ sys/ufs/ufs/ufs_lookup.c
@@ -35,10 +35,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_lookup.c,v 1.77.2.2 2006/03/09 00:21:23 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_lookup.c,v 1.83 2007/03/14 08:50:27 kib Exp $");
#include "opt_ffs_broken_fixme.h"
#include "opt_ufs.h"
+#include "opt_quota.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -593,10 +594,12 @@
struct mount *mp;
mp = ITOV(ip)->v_mount;
- (void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
- mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
- panic("ufs_dirbad: bad dir");
+ panic("ufs_dirbad: %s: bad dir ino %lu at offset %ld: %s",
+ mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
+ else
+ (void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
+ mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
}
/*
@@ -700,7 +703,7 @@
struct buf *bp;
u_int dsize;
struct direct *ep, *nep;
- int error, ret, blkoff, loc, spacefree, flags;
+ int error, ret, blkoff, loc, spacefree, flags, namlen;
char *dirbuf;
td = curthread; /* XXX */
@@ -721,6 +724,13 @@
flags = BA_CLRBUF;
if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp))
flags |= IO_SYNC;
+#ifdef QUOTA
+ if ((error = getinoquota(dp)) != 0) {
+ if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
+ bdwrite(newdirbp);
+ return (error);
+ }
+#endif
if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ,
cr, flags, &bp)) != 0) {
if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
@@ -875,8 +885,16 @@
* Update the pointer fields in the previous entry (if any),
* copy in the new entry, and write out the block.
*/
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ if (OFSFMT(dvp))
+ namlen = ep->d_type;
+ else
+ namlen = ep->d_namlen;
+# else
+ namlen = ep->d_namlen;
+# endif
if (ep->d_ino == 0 ||
- (ep->d_ino == WINO &&
+ (ep->d_ino == WINO && namlen == dirp->d_namlen &&
bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
if (spacefree + dsize < newentrysize)
panic("ufs_direnter: compact1");
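
The ufs_direnter() hunk tightens whiteout reuse: an entry with d_ino ==
WINO is only overwritten when its name length matches in addition to its
bytes, and on little-endian old-format filesystems that length must be
read from d_type, where the low byte of the old 16-bit namlen field
lands. A simplified sketch of that field selection (flags passed as
plain ints):

    #include <stdint.h>
    #include <stdio.h>

    struct dirent_like {        /* stand-in for struct direct */
        uint8_t d_type;
        uint8_t d_namlen;
    };

    /*
     * Old-format (4.3BSD) directories stored a 16-bit name length; on a
     * little-endian machine its low byte sits where d_type now lives.
     */
    static int
    entry_namlen(const struct dirent_like *ep, int oldfmt, int little_endian)
    {
        if (oldfmt && little_endian)
            return (ep->d_type);
        return (ep->d_namlen);
    }

    int
    main(void)
    {
        struct dirent_like e = { 5, 3 };

        printf("new format: %d, old format on LE: %d\n",
            entry_namlen(&e, 0, 1), entry_namlen(&e, 1, 1));
        return (0);
    }
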
Index: ufs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_vnops.c -L sys/ufs/ufs/ufs_vnops.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vnops.c,v 1.271.2.4 2006/03/22 17:46:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vnops.c,v 1.291 2007/06/12 00:12:01 rwatson Exp $");
#include "opt_mac.h"
#include "opt_quota.h"
@@ -53,17 +53,20 @@
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/refcount.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/lockf.h>
#include <sys/conf.h>
#include <sys/acl.h>
-#include <sys/mac.h>
#include <sys/jail.h>
#include <machine/mutex.h>
+#include <security/mac/mac_framework.h>
+
#include <sys/file.h> /* XXX */
#include <vm/vm.h>
@@ -81,6 +84,9 @@
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
+#ifdef UFS_GJOURNAL
+#include <ufs/ufs/gjournal.h>
+#endif
#include <ufs/ffs/ffs_extern.h>
@@ -121,39 +127,56 @@
0, DIRBLKSIZ - 12, 2, ".."
};
-void
-ufs_itimes(vp)
- struct vnode *vp;
+static void
+ufs_itimes_locked(struct vnode *vp)
{
struct inode *ip;
struct timespec ts;
+ ASSERT_VI_LOCKED(vp, __func__);
+
ip = VTOI(vp);
+ if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
+ goto out;
if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
return;
+
if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
ip->i_flag |= IN_LAZYMOD;
- else
+ else if (((vp->v_mount->mnt_kern_flag &
+ (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
+ (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
ip->i_flag |= IN_MODIFIED;
- if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
- vfs_timestamp(&ts);
- if (ip->i_flag & IN_ACCESS) {
- DIP_SET(ip, i_atime, ts.tv_sec);
- DIP_SET(ip, i_atimensec, ts.tv_nsec);
- }
- if (ip->i_flag & IN_UPDATE) {
- DIP_SET(ip, i_mtime, ts.tv_sec);
- DIP_SET(ip, i_mtimensec, ts.tv_nsec);
- ip->i_modrev++;
- }
- if (ip->i_flag & IN_CHANGE) {
- DIP_SET(ip, i_ctime, ts.tv_sec);
- DIP_SET(ip, i_ctimensec, ts.tv_nsec);
- }
+ else if (ip->i_flag & IN_ACCESS)
+ ip->i_flag |= IN_LAZYACCESS;
+ vfs_timestamp(&ts);
+ if (ip->i_flag & IN_ACCESS) {
+ DIP_SET(ip, i_atime, ts.tv_sec);
+ DIP_SET(ip, i_atimensec, ts.tv_nsec);
+ }
+ if (ip->i_flag & IN_UPDATE) {
+ DIP_SET(ip, i_mtime, ts.tv_sec);
+ DIP_SET(ip, i_mtimensec, ts.tv_nsec);
+ ip->i_modrev++;
+ }
+ if (ip->i_flag & IN_CHANGE) {
+ DIP_SET(ip, i_ctime, ts.tv_sec);
+ DIP_SET(ip, i_ctimensec, ts.tv_nsec);
}
+
+ out:
ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
}
+void
+ufs_itimes(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ ufs_itimes_locked(vp);
+ VI_UNLOCK(vp);
+}
+
/*
* Create a regular file
*/
@@ -245,7 +268,7 @@
if ((ip->i_flags & APPEND) &&
(ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
return (EPERM);
- vnode_create_vobject_off(vp, DIP(ip, i_size), ap->a_td);
+ vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
return (0);
}
@@ -265,10 +288,12 @@
} */ *ap;
{
struct vnode *vp = ap->a_vp;
+ int usecount;
VI_LOCK(vp);
- if (vp->v_usecount > 1)
- ufs_itimes(vp);
+ usecount = vp->v_usecount;
+ if (usecount > 1)
+ ufs_itimes_locked(vp);
VI_UNLOCK(vp);
return (0);
}
@@ -302,10 +327,6 @@
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
-#ifdef QUOTA
- if ((error = getinoquota(ip)) != 0)
- return (error);
-#endif
break;
default:
break;
@@ -364,7 +385,16 @@
struct inode *ip = VTOI(vp);
struct vattr *vap = ap->a_vap;
- ufs_itimes(vp);
+ VI_LOCK(vp);
+ ufs_itimes_locked(vp);
+ if (ip->i_ump->um_fstype == UFS1) {
+ vap->va_atime.tv_sec = ip->i_din1->di_atime;
+ vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
+ } else {
+ vap->va_atime.tv_sec = ip->i_din2->di_atime;
+ vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
+ }
+ VI_UNLOCK(vp);
/*
* Copy from inode table
*/
@@ -377,8 +407,6 @@
if (ip->i_ump->um_fstype == UFS1) {
vap->va_rdev = ip->i_din1->di_rdev;
vap->va_size = ip->i_din1->di_size;
- vap->va_atime.tv_sec = ip->i_din1->di_atime;
- vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
@@ -389,8 +417,6 @@
} else {
vap->va_rdev = ip->i_din2->di_rdev;
vap->va_size = ip->i_din2->di_size;
- vap->va_atime.tv_sec = ip->i_din2->di_atime;
- vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
@@ -465,8 +491,7 @@
* is non-zero; otherwise, they behave like unprivileged
* processes.
*/
- if (!suser_cred(cred,
- jail_chflags_allowed ? SUSER_ALLOWJAIL : 0)) {
+ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
if (ip->i_flags
& (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
error = securelevel_gt(cred, 0);
@@ -508,22 +533,35 @@
}
if (vap->va_size != VNOVAL) {
/*
- * Disallow write attempts on read-only filesystems;
- * unless the file is a socket, fifo, or a block or
- * character device resident on the filesystem.
+ * XXX most of the following special cases should be in
+ * callers instead of in N filesystems. The VDIR check
+ * mostly already is.
*/
switch (vp->v_type) {
case VDIR:
return (EISDIR);
case VLNK:
case VREG:
+ /*
+ * Truncation should have an effect in these cases.
+ * Disallow it if the filesystem is read-only or
+ * the file is being snapshotted.
+ */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
if ((ip->i_flags & SF_SNAPSHOT) != 0)
return (EPERM);
break;
default:
- break;
+ /*
+ * According to POSIX, the result is unspecified
+ * for file types other than regular files,
+ * directories and shared memory objects. We
+ * don't support shared memory objects in the file
+ * system, and have dubious support for truncating
+ * symlinks. Just ignore the request in other cases.
+ */
+ return (0);
}
if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL,
cred, td)) != 0)
@@ -543,10 +581,19 @@
* super-user.
* If times is non-NULL, ... The caller must be the owner of
* the file or be the super-user.
+ *
+ * Possibly for historical reasons, try to use VADMIN in
+ * preference to VWRITE for a NULL timestamp. This means we
+ * will return EACCES in preference to EPERM if neither
+ * check succeeds.
*/
- if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
- ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
- (error = VOP_ACCESS(vp, VWRITE, cred, td))))
+ if (vap->va_vaflags & VA_UTIMES_NULL) {
+ error = VOP_ACCESS(vp, VADMIN, cred, td);
+ if (error)
+ error = VOP_ACCESS(vp, VWRITE, cred, td);
+ } else
+ error = VOP_ACCESS(vp, VADMIN, cred, td);
+ if (error)
return (error);
if (vap->va_atime.tv_sec != VNOVAL)
ip->i_flag |= IN_ACCESS;
@@ -612,11 +659,11 @@
* jail(8).
*/
if (vp->v_type != VDIR && (mode & S_ISTXT)) {
- if (suser_cred(cred, SUSER_ALLOWJAIL))
+ if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
return (EFTYPE);
}
if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
- error = suser_cred(cred, SUSER_ALLOWJAIL);
+ error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
if (error)
return (error);
}
@@ -653,19 +700,19 @@
if (gid == (gid_t)VNOVAL)
gid = ip->i_gid;
/*
- * To modify the ownership of a file, must possess VADMIN
- * for that file.
+ * To modify the ownership of a file, must possess VADMIN for that
+ * file.
*/
if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
return (error);
/*
- * To change the owner of a file, or change the group of a file
- * to a group of which we are not a member, the caller must
- * have privilege.
+ * To change the owner of a file, or change the group of a file to a
+ * group of which we are not a member, the caller must have
+ * privilege.
*/
if ((uid != ip->i_uid ||
(gid != ip->i_gid && !groupmember(gid, cred))) &&
- (error = suser_cred(cred, SUSER_ALLOWJAIL)))
+ (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
return (error);
ogid = ip->i_gid;
ouid = ip->i_uid;
@@ -736,9 +783,11 @@
panic("ufs_chown: lost quota");
#endif /* QUOTA */
ip->i_flag |= IN_CHANGE;
- if (suser_cred(cred, SUSER_ALLOWJAIL) && (ouid != uid || ogid != gid)) {
- ip->i_mode &= ~(ISUID | ISGID);
- DIP_SET(ip, i_mode, ip->i_mode);
+ if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
+ if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
+ ip->i_mode &= ~(ISUID | ISGID);
+ DIP_SET(ip, i_mode, ip->i_mode);
+ }
}
return (0);
}
@@ -764,6 +813,9 @@
error = EPERM;
goto out;
}
+#ifdef UFS_GJOURNAL
+ ufs_gjournal_orphan(vp);
+#endif
error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
if (ip->i_nlink <= 0)
vp->v_vflag |= VV_NOSYNC;
@@ -1047,7 +1099,7 @@
/*
* If ".." must be changed (ie the directory gets a new
* parent) then the source directory must not be in the
- * directory heirarchy above the target, as this would
+ * directory hierarchy above the target, as this would
* orphan everything below the source directory. Also
* the user must have write permission in the source so
* as to be able to change "..". We must repeat the call
@@ -1200,7 +1252,7 @@
DIP_SET(xp, i_nlink, xp->i_nlink);
xp->i_flag |= IN_CHANGE;
ioflag = IO_NORMAL;
- if (DOINGASYNC(tvp))
+ if (!DOINGASYNC(tvp))
ioflag |= IO_SYNC;
if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
tcnp->cn_cred, tcnp->cn_thread)) != 0)
@@ -1380,7 +1432,7 @@
* XXX This seems to never be accessed out of
* our context so a stack variable is ok.
*/
- ucred.cr_ref = 1;
+ refcount_init(&ucred.cr_ref, 1);
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups[0] = dp->i_gid;
@@ -1670,6 +1722,9 @@
error = EINVAL;
goto out;
}
+#ifdef UFS_GJOURNAL
+ ufs_gjournal_orphan(vp);
+#endif
/*
* Delete reference to directory before purging
* inode. If we crash in between, the directory
@@ -1707,7 +1762,7 @@
DIP_SET(ip, i_nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
ioflag = IO_NORMAL;
- if (DOINGASYNC(vp))
+ if (!DOINGASYNC(vp))
ioflag |= IO_SYNC;
error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
cnp->cn_thread);
@@ -1776,7 +1831,7 @@
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
- int *ncookies;
+ int *a_ncookies;
u_long **a_cookies;
} */ *ap;
{
@@ -1978,10 +2033,12 @@
} */ *ap;
{
struct vnode *vp = ap->a_vp;
+ int usecount;
VI_LOCK(vp);
- if (vp->v_usecount > 1)
- ufs_itimes(vp);
+ usecount = vp->v_usecount;
+ if (usecount > 1)
+ ufs_itimes_locked(vp);
VI_UNLOCK(vp);
return (fifo_specops.vop_close(ap));
}
@@ -2211,7 +2268,7 @@
* XXX This seems to never be accessed out of our
* context so a stack variable is ok.
*/
- ucred.cr_ref = 1;
+ refcount_init(&ucred.cr_ref, 1);
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups[0] = pdir->i_gid;
@@ -2307,7 +2364,7 @@
if (DOINGSOFTDEP(tvp))
softdep_change_linkcnt(ip);
if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
- suser_cred(cnp->cn_cred, SUSER_ALLOWJAIL)) {
+ priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
ip->i_mode &= ~ISGID;
DIP_SET(ip, i_mode, ip->i_mode);
}
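
The recurring change in the ufs_vnops.c hunks above is the retirement of
suser_cred() in favor of fine-grained priv(9) checks (PRIV_VFS_SYSFLAGS,
PRIV_VFS_STICKYFILE, PRIV_VFS_SETGID, PRIV_VFS_CHOWN,
PRIV_VFS_RETAINSUGID). A minimal sketch of the resulting idiom, assuming
the FreeBSD 7-era interface where priv_check_cred() returns 0 when the
credential holds the privilege and an errno otherwise; the helper below
is hypothetical:

#include <sys/param.h>
#include <sys/priv.h>
#include <sys/ucred.h>

/* Hypothetical helper: may this credential change a file's owner? */
static int
may_chown(struct ucred *cred, uid_t cur_uid, uid_t new_uid)
{

	if (new_uid == cur_uid)
		return (0);		/* not changing the owner */
	/* 0 means privileged; an errno (EPERM) means not. */
	return (priv_check_cred(cred, PRIV_VFS_CHOWN, 0));
}

The jail-awareness that SUSER_ALLOWJAIL used to express at each call
site now lives with the privilege itself: whether jailed root passes is
decided centrally in the priv(9) framework, per privilege.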
Index: ufsmount.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufsmount.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufsmount.h -L sys/ufs/ufs/ufsmount.h -u -r1.2 -r1.3
--- sys/ufs/ufs/ufsmount.h
+++ sys/ufs/ufs/ufsmount.h
@@ -27,7 +27,7 @@
* SUCH DAMAGE.
*
* @(#)ufsmount.h 8.6 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.34.2.2 2006/04/04 18:14:31 tegge Exp $
+ * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.37 2006/04/03 22:23:23 tegge Exp $
*/
#ifndef _UFS_UFS_UFSMOUNT_H_
Index: ufs_acl.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_acl.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_acl.c -L sys/ufs/ufs/ufs_acl.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_acl.c
+++ sys/ufs/ufs/ufs_acl.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999-2001, 2003 Robert N. M. Watson
+ * Copyright (c) 1999-2003 Robert N. M. Watson
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_acl.c,v 1.20 2004/08/15 06:24:42 jmg Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_acl.c,v 1.21 2007/01/08 17:55:32 rwatson Exp $");
#include "opt_ufs.h"
#include "opt_quota.h"
Index: ufs_bmap.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_bmap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_bmap.c -L sys/ufs/ufs/ufs_bmap.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_bmap.c
+++ sys/ufs/ufs/ufs_bmap.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.64.2.1 2005/11/26 21:19:20 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.66 2007/06/01 01:12:45 jeff Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -226,7 +226,7 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
- curproc->p_stats->p_ru.ru_inblock++; /* XXX */
+ curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
brelse(bp);
--- /dev/null
+++ sys/ufs/ufs/gjournal.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/ufs/ufs/gjournal.h,v 1.1 2006/10/31 21:48:54 pjd Exp $
+ */
+
+#ifndef _UFS_UFS_GJOURNAL_H_
+#define _UFS_UFS_GJOURNAL_H_
+
+/*
+ * GEOM journal function prototypes.
+ */
+void ufs_gjournal_orphan(struct vnode *fvp);
+void ufs_gjournal_close(struct vnode *vp);
+#endif /* !_UFS_UFS_GJOURNAL_H_ */
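
The new header only declares the two GEOM-journal hooks; the
UFS_GJOURNAL hunks in ufs_vnops.c above and ufs_inode.c below show where
this commit wires them in. Condensed from those hunks, with the
surrounding code elided:

#ifdef UFS_GJOURNAL
#include <ufs/ufs/gjournal.h>
#endif

	/* ufs_remove()/ufs_rmdir(): record the orphan before its
	 * last directory entry goes away. */
#ifdef UFS_GJOURNAL
	ufs_gjournal_orphan(vp);
#endif
	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);

	/* ufs_inactive(): notify the journal when the last in-core
	 * reference to the inode is dropped. */
#ifdef UFS_GJOURNAL
	ufs_gjournal_close(vp);
#endif

Both calls compile in only with "options UFS_GJOURNAL".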
Index: extattr.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/extattr.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/extattr.h -L sys/ufs/ufs/extattr.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/extattr.h
+++ sys/ufs/ufs/extattr.h
@@ -25,7 +25,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: src/sys/ufs/ufs/extattr.h,v 1.20 2005/01/31 08:16:45 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/extattr.h,v 1.21 2007/03/06 08:13:20 mckusick Exp $
*/
/*
* Developed by the TrustedBSD Project.
@@ -69,6 +69,48 @@
/* data follows the header */
};
+/*
+ * This structure defines the required fields of an extended-attribute header.
+ */
+struct extattr {
+ int32_t ea_length; /* length of this attribute */
+ int8_t ea_namespace; /* name space of this attribute */
+ int8_t ea_contentpadlen; /* bytes of padding at end of attribute */
+ int8_t ea_namelength; /* length of attribute name */
+ char ea_name[1]; /* null-terminated attribute name */
+ /* extended attribute content follows */
+};
+
+/*
+ * These macros are used to access and manipulate an extended attribute:
+ *
+ * EXTATTR_NEXT(eap) returns a pointer to the next extended attribute
+ * following eap.
+ * EXTATTR_CONTENT(eap) returns a pointer to the extended attribute
+ * content referenced by eap.
+ * EXTATTR_CONTENT_SIZE(eap) returns the size of the extended attribute
+ * content referenced by eap.
+ * EXTATTR_SET_LENGTHS(eap, contentsize) called after initializing the
+ * attribute name to calculate and set the ea_length, ea_namelength,
+ * and ea_contentpadlen fields of the extended attribute structure.
+ */
+#define EXTATTR_NEXT(eap) \
+ ((struct extattr *)(((void *)(eap)) + (eap)->ea_length))
+#define EXTATTR_CONTENT(eap) (((void *)(eap)) + EXTATTR_BASE_LENGTH(eap))
+#define EXTATTR_CONTENT_SIZE(eap) \
+ ((eap)->ea_length - EXTATTR_BASE_LENGTH(eap) - (eap)->ea_contentpadlen)
+#define EXTATTR_BASE_LENGTH(eap) \
+ ((sizeof(struct extattr) + (eap)->ea_namelength + 7) & ~7)
+#define EXTATTR_SET_LENGTHS(eap, contentsize) do { \
+ KASSERT(((eap)->ea_name[0] != 0), \
+ ("Must initialize name before setting lengths")); \
+ (eap)->ea_namelength = strlen((eap)->ea_name); \
+ (eap)->ea_contentpadlen = ((contentsize) % 8) ? \
+ 8 - ((contentsize) % 8) : 0; \
+ (eap)->ea_length = EXTATTR_BASE_LENGTH(eap) + \
+ (contentsize) + (eap)->ea_contentpadlen; \
+} while (0)
+
#ifdef _KERNEL
#ifdef MALLOC_DECLARE
@@ -106,6 +148,13 @@
int ufs_setextattr(struct vop_setextattr_args *ap);
void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td);
+#else
+
+/* User-level definition of KASSERT for macros above */
+#define KASSERT(cond, str) do { \
+ if (!(cond)) { printf("panic: "); printf(str); printf("\n"); exit(1); }\
+} while (0)
+
#endif /* !_KERNEL */
#endif /* !_UFS_UFS_EXTATTR_H_ */
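
Because the header now carries a user-level KASSERT fallback, the new
macros can be exercised outside the kernel. A self-contained sketch: the
buffer, attribute name and content are illustrative,
EXTATTR_NAMESPACE_USER is assumed to come from <sys/extattr.h>, and the
macros rely on gcc's arithmetic on void pointers:

#include <sys/types.h>
#include <sys/extattr.h>		/* EXTATTR_NAMESPACE_USER */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ufs/ufs/extattr.h>

int
main(void)
{
	static char buf[256];		/* holds one packed attribute */
	struct extattr *eap = (struct extattr *)buf;
	const char content[] = "hello";

	eap->ea_namespace = EXTATTR_NAMESPACE_USER;
	strcpy(eap->ea_name, "comment");	/* name first ... */
	EXTATTR_SET_LENGTHS(eap, sizeof(content)); /* ... then lengths */
	memcpy(EXTATTR_CONTENT(eap), content, sizeof(content));

	/* ea_length comes out 8-byte aligned, so EXTATTR_NEXT() can
	 * walk a packed sequence of attributes. */
	printf("length=%d pad=%d contentsize=%d\n", (int)eap->ea_length,
	    (int)eap->ea_contentpadlen, (int)EXTATTR_CONTENT_SIZE(eap));
	return (0);
}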
Index: ufs_inode.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_inode.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_inode.c -L sys/ufs/ufs/ufs_inode.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_inode.c
+++ sys/ufs/ufs/ufs_inode.c
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_inode.c,v 1.63.2.2 2006/03/13 03:08:12 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_inode.c,v 1.69 2007/06/22 13:22:37 kib Exp $");
#include "opt_quota.h"
#include "opt_ufs.h"
@@ -57,6 +57,9 @@
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#endif
+#ifdef UFS_GJOURNAL
+#include <ufs/ufs/gjournal.h>
+#endif
/*
* Last reference to an inode. If necessary, write or delete it.
@@ -83,9 +86,12 @@
*/
if (ip->i_mode == 0)
goto out;
- if (ip->i_effnlink == 0 && DOINGSOFTDEP(vp))
- softdep_releasefile(ip);
- if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+#ifdef UFS_GJOURNAL
+ ufs_gjournal_close(vp);
+#endif
+ if ((ip->i_effnlink == 0 && DOINGSOFTDEP(vp)) ||
+ (ip->i_nlink <= 0 &&
+ (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)) {
loop:
if (vn_start_secondary_write(vp, &mp, V_NOWAIT) != 0) {
/* Cannot delete file while file system is suspended */
@@ -112,6 +118,10 @@
return (0);
}
}
+ }
+ if (ip->i_effnlink == 0 && DOINGSOFTDEP(vp))
+ softdep_releasefile(ip);
+ if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
#ifdef QUOTA
if (!getinoquota(ip))
(void)chkiq(ip, -1, NOCRED, FORCE);
@@ -184,10 +194,9 @@
* Destroy the vm object and flush associated pages.
*/
vnode_destroy_vobject(vp);
- if (ip->i_flag & IN_LAZYMOD) {
+ if (ip->i_flag & IN_LAZYMOD)
ip->i_flag |= IN_MODIFIED;
- UFS_UPDATE(vp, 0);
- }
+ UFS_UPDATE(vp, 0);
/*
* Remove the inode from its hash chain.
*/
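
The reordering in ufs_inactive() above appears intended to route both
the soft-updates release and the actual freeing through the same
suspension check before either is attempted. The `loop:' it guards is
the usual secondary-write retry; a sketch, assuming the FreeBSD 7-era
vn_start_secondary_write(9) interface:

	struct mount *mp;
	int error;

	for (;;) {
		/* Try to start a secondary write without sleeping. */
		if (vn_start_secondary_write(vp, &mp, V_NOWAIT) == 0)
			break;
		/* File system suspended: sleep interruptibly, retry. */
		error = vn_start_secondary_write(vp, &mp,
		    V_XSLEEP | PCATCH);
		if (error != 0)
			return (error);
	}
	/* ... release soft dependencies, truncate, free quota ... */
	vn_finished_secondary_write(mp);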
Index: ufs_quota.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_quota.c -L sys/ufs/ufs/ufs_quota.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_quota.c
+++ sys/ufs/ufs/ufs_quota.c
@@ -33,7 +33,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_quota.c,v 1.74.2.2.2.1 2006/04/26 01:23:59 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_quota.c,v 1.95 2007/06/12 00:12:01 rwatson Exp $");
+
+#include "opt_ffs.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -44,8 +46,10 @@
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
+#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
@@ -55,26 +59,26 @@
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
-SYSCTL_DECL(_security_bsd);
-
static int unprivileged_get_quota = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW,
&unprivileged_get_quota, 0,
"Unprivileged processes may retrieve quotas for other uids and gids");
-static MALLOC_DEFINE(M_DQUOT, "UFS quota", "UFS quota entries");
+static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries");
/*
* Quota name to error message mapping.
*/
static char *quotatypes[] = INITQFNAMES;
-static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int);
-static int chkiqchg(struct inode *, ino_t, struct ucred *, int);
+static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *);
+static int chkiqchg(struct inode *, int, struct ucred *, int, int *);
static int dqget(struct vnode *,
- u_long, struct ufsmount *, int, struct dquot **);
+ u_long, struct ufsmount *, int, struct dquot **);
static int dqsync(struct vnode *, struct dquot *);
static void dqflush(struct vnode *);
+static int quotaoff1(struct thread *td, struct mount *mp, int type);
+static int quotaoff_inchange(struct thread *td, struct mount *mp, int type);
#ifdef DIAGNOSTIC
static void dqref(struct dquot *);
@@ -94,16 +98,29 @@
struct inode *ip;
{
struct ufsmount *ump;
- struct vnode *vp = ITOV(ip);
+ struct vnode *vp;
int error;
+ vp = ITOV(ip);
+
+ /*
+ * Disk quotas must be turned off for system files. Currently
+ * snapshot and quota files.
+ */
+ if ((vp->v_vflag & VV_SYSTEM) != 0)
+ return (0);
+ /*
+ * XXX: Turn off quotas for files with a negative UID or GID.
+ * This prevents the creation of 100GB+ quota files.
+ */
+ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+ return (0);
ump = VFSTOUFS(vp->v_mount);
/*
* Set up the user quota based on file uid.
* EINVAL means that quotas are not enabled.
*/
- if (ip->i_dquot[USRQUOTA] == NODQUOT &&
- (error =
+ if ((error =
dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
error != EINVAL)
return (error);
@@ -111,8 +128,7 @@
* Set up the group quota based on file gid.
* EINVAL means that quotas are not enabled.
*/
- if (ip->i_dquot[GRPQUOTA] == NODQUOT &&
- (error =
+ if ((error =
dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
error != EINVAL)
return (error);
@@ -131,8 +147,21 @@
{
struct dquot *dq;
ufs2_daddr_t ncurblocks;
- int i, error;
+ struct vnode *vp = ITOV(ip);
+ int i, error, warn, do_check;
+ /*
+ * Disk quotas must be turned off for system files. Currently
+ * snapshot and quota files.
+ */
+ if ((vp->v_vflag & VV_SYSTEM) != 0)
+ return (0);
+ /*
+ * XXX: Turn off quotas for files with a negative UID or GID.
+ * This prevents the creation of 100GB+ quota files.
+ */
+ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+ return (0);
#ifdef DIAGNOSTIC
if ((flags & CHOWN) == 0)
chkdquot(ip);
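
From here on the ufs_quota.c hunks use DQI_LOCK/DQI_UNLOCK/DQI_WAIT/
DQI_WAKEUP, which come from this commit's quota.h changes (not shown at
this point in the diff). As a reading aid, a reconstruction of their
shape from the call sites: each dquot now carries a private mutex, and
the old open-coded DQ_LOCK/DQ_WANT tsleep() handshake moves behind
msleep() on it. Treat this as a sketch, not the authoritative
definitions:

#define	DQI_LOCK(dq)	mtx_lock(&(dq)->dq_lock)
#define	DQI_UNLOCK(dq)	mtx_unlock(&(dq)->dq_lock)
#define	DQI_WAIT(dq, prio, msg) do {				\
	while ((dq)->dq_flags & DQ_LOCK) {			\
		(dq)->dq_flags |= DQ_WANT;			\
		(void)msleep((dq), &(dq)->dq_lock,		\
		    (prio), (msg), 0);				\
	}							\
} while (0)
#define	DQI_WAKEUP(dq) do {					\
	if ((dq)->dq_flags & DQ_WANT)				\
		wakeup((dq));					\
	(dq)->dq_flags &= ~(DQ_WANT | DQ_LOCK);			\
} while (0)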
@@ -143,10 +172,8 @@
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "chkdq1", 0);
- }
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkdq1");
ncurblocks = dq->dq_curblocks + change;
if (ncurblocks >= 0)
dq->dq_curblocks = ncurblocks;
@@ -154,24 +181,46 @@
dq->dq_curblocks = 0;
dq->dq_flags &= ~DQ_BLKS;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
}
return (0);
}
- if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
- for (i = 0; i < MAXQUOTAS; i++) {
- if ((dq = ip->i_dquot[i]) == NODQUOT)
- continue;
- error = chkdqchg(ip, change, cred, i);
- if (error)
- return (error);
- }
- }
+ if ((flags & FORCE) == 0 &&
+ priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
+ do_check = 1;
+ else
+ do_check = 0;
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "chkdq2", 0);
+ warn = 0;
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkdq2");
+ if (do_check) {
+ error = chkdqchg(ip, change, cred, i, &warn);
+ if (error) {
+ /*
+ * Roll back user quota changes when
+ * group quota failed.
+ */
+ while (i > 0) {
+ --i;
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkdq3");
+ ncurblocks = dq->dq_curblocks - change;
+ if (ncurblocks >= 0)
+ dq->dq_curblocks = ncurblocks;
+ else
+ dq->dq_curblocks = 0;
+ dq->dq_flags &= ~DQ_BLKS;
+ dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
+ }
+ return (error);
+ }
}
/* Reset timer when crossing soft limit */
if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
@@ -180,6 +229,11 @@
VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i];
dq->dq_curblocks += change;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
+ if (warn)
+ uprintf("\n%s: warning, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[i], "disk quota exceeded");
}
return (0);
}
@@ -189,11 +243,12 @@
* Issue an error message if appropriate.
*/
static int
-chkdqchg(ip, change, cred, type)
+chkdqchg(ip, change, cred, type, warn)
struct inode *ip;
ufs2_daddr_t change;
struct ucred *cred;
int type;
+ int *warn;
{
struct dquot *dq = ip->i_dquot[type];
ufs2_daddr_t ncurblocks = dq->dq_curblocks + change;
@@ -204,11 +259,14 @@
if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
if ((dq->dq_flags & DQ_BLKS) == 0 &&
ip->i_uid == cred->cr_uid) {
+ dq->dq_flags |= DQ_BLKS;
+ DQI_UNLOCK(dq);
uprintf("\n%s: write failed, %s disk limit reached\n",
ITOV(ip)->v_mount->mnt_stat.f_mntonname,
quotatypes[type]);
- dq->dq_flags |= DQ_BLKS;
+ return (EDQUOT);
}
+ DQI_UNLOCK(dq);
return (EDQUOT);
}
/*
@@ -220,20 +278,21 @@
dq->dq_btime = time_second +
VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type];
if (ip->i_uid == cred->cr_uid)
- uprintf("\n%s: warning, %s %s\n",
- ITOV(ip)->v_mount->mnt_stat.f_mntonname,
- quotatypes[type], "disk quota exceeded");
+ *warn = 1;
return (0);
}
if (time_second > dq->dq_btime) {
if ((dq->dq_flags & DQ_BLKS) == 0 &&
ip->i_uid == cred->cr_uid) {
+ dq->dq_flags |= DQ_BLKS;
+ DQI_UNLOCK(dq);
uprintf("\n%s: write failed, %s %s\n",
ITOV(ip)->v_mount->mnt_stat.f_mntonname,
quotatypes[type],
"disk quota exceeded for too long");
- dq->dq_flags |= DQ_BLKS;
+ return (EDQUOT);
}
+ DQI_UNLOCK(dq);
return (EDQUOT);
}
}
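
Note why chkdqchg() (and chkiqchg() below) grew a `warn' out-parameter:
uprintf(9) may sleep, so the soft-limit warning can no longer be issued
while the dquot mutex is held. The checker latches the event and the
caller prints after unlocking; on the hard-limit paths the checker
unlocks itself before printing and returning EDQUOT. The caller side,
condensed from the chkdq() hunks above:

	warn = 0;
	DQI_LOCK(dq);
	DQI_WAIT(dq, PINOD+1, "chkdq2");
	error = chkdqchg(ip, change, cred, i, &warn);
	if (error)
		/* chkdqchg() returned with dq unlocked; the rollback
		 * of quotas already charged is elided here. */
		return (error);
	dq->dq_curblocks += change;
	dq->dq_flags |= DQ_MOD;
	DQI_UNLOCK(dq);
	if (warn)	/* print only after the lock is dropped */
		uprintf("\n%s: warning, %s disk quota exceeded\n",
		    mntonname, quotatypes[i]);

(mntonname stands in for ITOV(ip)->v_mount->mnt_stat.f_mntonname.)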
@@ -246,13 +305,13 @@
int
chkiq(ip, change, cred, flags)
struct inode *ip;
- ino_t change;
+ int change;
struct ucred *cred;
int flags;
{
struct dquot *dq;
ino_t ncurinodes;
- int i, error;
+ int i, error, warn, do_check;
#ifdef DIAGNOSTIC
if ((flags & CHOWN) == 0)
@@ -260,41 +319,62 @@
#endif
if (change == 0)
return (0);
- /* XXX: change is unsigned */
if (change < 0) {
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "chkiq1", 0);
- }
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkiq1");
ncurinodes = dq->dq_curinodes + change;
/* XXX: ncurinodes is unsigned */
- if (ncurinodes >= 0)
+ if (dq->dq_curinodes != 0 && ncurinodes >= 0)
dq->dq_curinodes = ncurinodes;
else
dq->dq_curinodes = 0;
dq->dq_flags &= ~DQ_INODS;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
}
return (0);
}
- if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
- for (i = 0; i < MAXQUOTAS; i++) {
- if ((dq = ip->i_dquot[i]) == NODQUOT)
- continue;
- error = chkiqchg(ip, change, cred, i);
- if (error)
- return (error);
- }
- }
+ if ((flags & FORCE) == 0 &&
+ priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
+ do_check = 1;
+ else
+ do_check = 0;
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "chkiq2", 0);
+ warn = 0;
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkiq2");
+ if (do_check) {
+ error = chkiqchg(ip, change, cred, i, &warn);
+ if (error) {
+ /*
+ * Roll back user quota changes when
+ * group quota failed.
+ */
+ while (i > 0) {
+ --i;
+ dq = ip->i_dquot[i];
+ if (dq == NODQUOT)
+ continue;
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "chkiq3");
+ ncurinodes = dq->dq_curinodes - change;
+ /* XXX: ncurinodes is unsigned */
+ if (dq->dq_curinodes != 0 &&
+ ncurinodes >= 0)
+ dq->dq_curinodes = ncurinodes;
+ else
+ dq->dq_curinodes = 0;
+ dq->dq_flags &= ~DQ_INODS;
+ dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
+ }
+ return (error);
+ }
}
/* Reset timer when crossing soft limit */
if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
@@ -303,6 +383,11 @@
VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i];
dq->dq_curinodes += change;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
+ if (warn)
+ uprintf("\n%s: warning, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[i], "inode quota exceeded");
}
return (0);
}
@@ -312,11 +397,12 @@
* Issue an error message if appropriate.
*/
static int
-chkiqchg(ip, change, cred, type)
+chkiqchg(ip, change, cred, type, warn)
struct inode *ip;
- ino_t change;
+ int change;
struct ucred *cred;
int type;
+ int *warn;
{
struct dquot *dq = ip->i_dquot[type];
ino_t ncurinodes = dq->dq_curinodes + change;
@@ -327,11 +413,14 @@
if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
if ((dq->dq_flags & DQ_INODS) == 0 &&
ip->i_uid == cred->cr_uid) {
+ dq->dq_flags |= DQ_INODS;
+ DQI_UNLOCK(dq);
uprintf("\n%s: write failed, %s inode limit reached\n",
ITOV(ip)->v_mount->mnt_stat.f_mntonname,
quotatypes[type]);
- dq->dq_flags |= DQ_INODS;
+ return (EDQUOT);
}
+ DQI_UNLOCK(dq);
return (EDQUOT);
}
/*
@@ -343,20 +432,21 @@
dq->dq_itime = time_second +
VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type];
if (ip->i_uid == cred->cr_uid)
- uprintf("\n%s: warning, %s %s\n",
- ITOV(ip)->v_mount->mnt_stat.f_mntonname,
- quotatypes[type], "inode quota exceeded");
+ *warn = 1;
return (0);
}
if (time_second > dq->dq_itime) {
if ((dq->dq_flags & DQ_INODS) == 0 &&
ip->i_uid == cred->cr_uid) {
- uprintf("\n%s: write failed, %s %s\n",
- ITOV(ip)->v_mount->mnt_stat.f_mntonname,
- quotatypes[type],
- "inode quota exceeded for too long");
dq->dq_flags |= DQ_INODS;
+ DQI_UNLOCK(dq);
+ uprintf("\n%s: write failed, %s %s\n",
+ ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+ quotatypes[type],
+ "inode quota exceeded for too long");
+ return (EDQUOT);
}
+ DQI_UNLOCK(dq);
return (EDQUOT);
}
}
@@ -373,17 +463,34 @@
struct inode *ip;
{
struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount);
+ struct vnode *vp = ITOV(ip);
int i;
+ /*
+ * Disk quotas must be turned off for system files. Currently
+ * these are snapshots and quota files.
+ */
+ if ((vp->v_vflag & VV_SYSTEM) != 0)
+ return;
+ /*
+ * XXX: Turn off quotas for files with a negative UID or GID.
+ * This prevents the creation of 100GB+ quota files.
+ */
+ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+ return;
+
+ UFS_LOCK(ump);
for (i = 0; i < MAXQUOTAS; i++) {
if (ump->um_quotas[i] == NULLVP ||
(ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING)))
continue;
if (ip->i_dquot[i] == NODQUOT) {
+ UFS_UNLOCK(ump);
vprint("chkdquot: missing dquot", ITOV(ip));
panic("chkdquot: missing dquot");
}
}
+ UFS_UNLOCK(ump);
}
#endif
@@ -399,40 +506,59 @@
struct thread *td;
struct mount *mp;
int type;
- caddr_t fname;
+ void *fname;
{
- struct ufsmount *ump = VFSTOUFS(mp);
+ struct ufsmount *ump;
struct vnode *vp, **vpp;
struct vnode *mvp;
struct dquot *dq;
- int error, flags;
+ int error, flags, vfslocked;
struct nameidata nd;
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_UFS_QUOTAON);
if (error)
return (error);
- vpp = &ump->um_quotas[type];
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td);
+ ump = VFSTOUFS(mp);
+ dq = NODQUOT;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, fname, td);
flags = FREAD | FWRITE;
- error = vn_open(&nd, &flags, 0, -1);
+ error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
+ vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0, td);
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
return (EACCES);
}
- if (*vpp != vp)
- quotaoff(td, mp, type);
- ump->um_qflags[type] |= QTF_OPENING;
+
+ UFS_LOCK(ump);
+ if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
+ UFS_UNLOCK(ump);
+ (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (EALREADY);
+ }
+ ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING;
+ MNT_ILOCK(mp);
mp->mnt_flag |= MNT_QUOTA;
+ MNT_IUNLOCK(mp);
+ UFS_UNLOCK(ump);
+
+ vpp = &ump->um_quotas[type];
+ if (*vpp != vp)
+ quotaoff1(td, mp, type);
+
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
vp->v_vflag |= VV_SYSTEM;
VOP_UNLOCK(vp, 0, td);
*vpp = vp;
+ VFS_UNLOCK_GIANT(vfslocked);
/*
* Save the credential of the process that turned on quotas.
* Set up the time limits for this quota.
@@ -448,6 +574,13 @@
dqrele(NULLVP, dq);
}
/*
+ * Allow the getdq from getinoquota below to read the quota
+ * from file.
+ */
+ UFS_LOCK(ump);
+ ump->um_qflags[type] &= ~QTF_CLOSING;
+ UFS_UNLOCK(ump);
+ /*
* Search vnodes associated with this mount point,
* adding references to quota file being opened.
* NB: only need to add dquot's for inodes being modified.
@@ -478,35 +611,49 @@
}
}
MNT_IUNLOCK(mp);
+
+ if (error)
+ quotaoff_inchange(td, mp, type);
+ UFS_LOCK(ump);
ump->um_qflags[type] &= ~QTF_OPENING;
- if (error)
- quotaoff(td, mp, type);
+ KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0,
+ ("quotaon: leaking flags"));
+ UFS_UNLOCK(ump);
+
return (error);
}
/*
- * Q_QUOTAOFF - turn off disk quotas for a filesystem.
+ * Main code to turn off disk quotas for a filesystem. Does not change
+ * flags.
*/
-int
-quotaoff(td, mp, type)
+static int
+quotaoff1(td, mp, type)
struct thread *td;
struct mount *mp;
int type;
{
struct vnode *vp;
struct vnode *qvp, *mvp;
- struct ufsmount *ump = VFSTOUFS(mp);
+ struct ufsmount *ump;
struct dquot *dq;
struct inode *ip;
+ struct ucred *cr;
+ int vfslocked;
int error;
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
- if (error)
- return (error);
+ ump = VFSTOUFS(mp);
- if ((qvp = ump->um_quotas[type]) == NULLVP)
+ UFS_LOCK(ump);
+ KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0,
+ ("quotaoff1: flags are invalid"));
+ if ((qvp = ump->um_quotas[type]) == NULLVP) {
+ UFS_UNLOCK(ump);
return (0);
- ump->um_qflags[type] |= QTF_CLOSING;
+ }
+ cr = ump->um_cred[type];
+ UFS_UNLOCK(ump);
+
/*
* Search vnodes associated with this mount point,
* deleting any references to quota file being closed.
@@ -535,24 +682,88 @@
MNT_ILOCK(mp);
}
MNT_IUNLOCK(mp);
+
dqflush(qvp);
+ /* Clear um_quotas before closing the quota vnode to prevent
+ * access to the closed vnode from dqget/dqsync
+ */
+ UFS_LOCK(ump);
+ ump->um_quotas[type] = NULLVP;
+ ump->um_cred[type] = NOCRED;
+ UFS_UNLOCK(ump);
+
+ vfslocked = VFS_LOCK_GIANT(qvp->v_mount);
vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY, td);
qvp->v_vflag &= ~VV_SYSTEM;
VOP_UNLOCK(qvp, 0, td);
error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
- ump->um_quotas[type] = NULLVP;
- crfree(ump->um_cred[type]);
- ump->um_cred[type] = NOCRED;
+ VFS_UNLOCK_GIANT(vfslocked);
+ crfree(cr);
+
+ return (error);
+}
+
+/*
+ * Turns off quotas, assumes that ump->um_qflags are already checked
+ * and QTF_CLOSING is set to indicate operation in progress. Fixes
+ * ump->um_qflags and mp->mnt_flag after.
+ */
+int
+quotaoff_inchange(td, mp, type)
+ struct thread *td;
+ struct mount *mp;
+ int type;
+{
+ struct ufsmount *ump;
+ int i;
+ int error;
+
+ error = quotaoff1(td, mp, type);
+
+ ump = VFSTOUFS(mp);
+ UFS_LOCK(ump);
ump->um_qflags[type] &= ~QTF_CLOSING;
- for (type = 0; type < MAXQUOTAS; type++)
- if (ump->um_quotas[type] != NULLVP)
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (ump->um_quotas[i] != NULLVP)
break;
- if (type == MAXQUOTAS)
+ if (i == MAXQUOTAS) {
+ MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_QUOTA;
+ MNT_IUNLOCK(mp);
+ }
+ UFS_UNLOCK(ump);
return (error);
}
/*
+ * Q_QUOTAOFF - turn off disk quotas for a filesystem.
+ */
+int
+quotaoff(td, mp, type)
+ struct thread *td;
+ struct mount *mp;
+ int type;
+{
+ struct ufsmount *ump;
+ int error;
+
+ error = priv_check(td, PRIV_UFS_QUOTAOFF);
+ if (error)
+ return (error);
+
+ ump = VFSTOUFS(mp);
+ UFS_LOCK(ump);
+ if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
+ UFS_UNLOCK(ump);
+ return (EALREADY);
+ }
+ ump->um_qflags[type] |= QTF_CLOSING;
+ UFS_UNLOCK(ump);
+
+ return (quotaoff_inchange(td, mp, type));
+}
+
+/*
* Q_GETQUOTA - return current values in a dqblk structure.
*/
int
@@ -561,7 +772,7 @@
struct mount *mp;
u_long id;
int type;
- caddr_t addr;
+ void *addr;
{
struct dquot *dq;
int error;
@@ -569,15 +780,16 @@
switch (type) {
case USRQUOTA:
if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_VFS_GETQUOTA);
if (error)
return (error);
}
break;
case GRPQUOTA:
- if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) {
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ if (!groupmember(id, td->td_ucred) &&
+ !unprivileged_get_quota) {
+ error = priv_check(td, PRIV_VFS_GETQUOTA);
if (error)
return (error);
}
@@ -587,10 +799,11 @@
return (EINVAL);
}
+ dq = NODQUOT;
error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq);
if (error)
return (error);
- error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk));
+ error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk));
dqrele(NULLVP, dq);
return (error);
}
@@ -604,29 +817,32 @@
struct mount *mp;
u_long id;
int type;
- caddr_t addr;
+ void *addr;
{
struct dquot *dq;
struct dquot *ndq;
- struct ufsmount *ump = VFSTOUFS(mp);
+ struct ufsmount *ump;
struct dqblk newlim;
int error;
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_VFS_SETQUOTA);
if (error)
return (error);
- error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk));
+ ump = VFSTOUFS(mp);
+ error = copyin(addr, &newlim, sizeof (struct dqblk));
if (error)
return (error);
+
+ ndq = NODQUOT;
+ ump = VFSTOUFS(mp);
+
error = dqget(NULLVP, id, ump, type, &ndq);
if (error)
return (error);
dq = ndq;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "setqta", 0);
- }
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "setqta");
/*
* Copy all but the current values.
* Reset time limit if previously had no soft limit or were
@@ -657,6 +873,7 @@
else
dq->dq_flags &= ~DQ_FAKE;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
dqrele(NULLVP, dq);
return (0);
}
@@ -670,29 +887,32 @@
struct mount *mp;
u_long id;
int type;
- caddr_t addr;
+ void *addr;
{
struct dquot *dq;
- struct ufsmount *ump = VFSTOUFS(mp);
+ struct ufsmount *ump;
struct dquot *ndq;
struct dqblk usage;
int error;
- error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+ error = priv_check(td, PRIV_UFS_SETUSE);
if (error)
return (error);
- error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk));
+ ump = VFSTOUFS(mp);
+ error = copyin(addr, &usage, sizeof (struct dqblk));
if (error)
return (error);
+
+ ump = VFSTOUFS(mp);
+ ndq = NODQUOT;
+
error = dqget(NULLVP, id, ump, type, &ndq);
if (error)
return (error);
dq = ndq;
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+1, "setuse", 0);
- }
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "setuse");
/*
* Reset time limit if have a soft limit and were
* previously under it, but are now over it.
@@ -710,6 +930,7 @@
if (dq->dq_curinodes < dq->dq_isoftlimit)
dq->dq_flags &= ~DQ_INODS;
dq->dq_flags |= DQ_MOD;
+ DQI_UNLOCK(dq);
dqrele(NULLVP, dq);
return (0);
}
@@ -731,9 +952,11 @@
* Check if the mount point has any quotas.
* If not, simply return.
*/
+ UFS_LOCK(ump);
for (i = 0; i < MAXQUOTAS; i++)
if (ump->um_quotas[i] != NULLVP)
break;
+ UFS_UNLOCK(ump);
if (i == MAXQUOTAS)
return (0);
/*
@@ -761,7 +984,7 @@
}
for (i = 0; i < MAXQUOTAS; i++) {
dq = VTOI(vp)->i_dquot[i];
- if (dq != NODQUOT && (dq->dq_flags & DQ_MOD))
+ if (dq != NODQUOT)
dqsync(vp, dq);
}
vput(vp);
@@ -786,6 +1009,18 @@
static TAILQ_HEAD(dqfreelist, dquot) dqfreelist;
static long numdquot, desireddquot = DQUOTINC;
+/*
+ * Lock to protect quota hash, dq free list and dq_cnt ref counters of
+ * _all_ dqs.
+ */
+struct mtx dqhlock;
+
+#define DQH_LOCK() mtx_lock(&dqhlock)
+#define DQH_UNLOCK() mtx_unlock(&dqhlock)
+
+static struct dquot *dqhashfind(struct dqhash *dqh, u_long id,
+ struct vnode *dqvp);
+
/*
* Initialize the quota system.
*/
@@ -793,6 +1028,7 @@
dqinit()
{
+ mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF);
dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
TAILQ_INIT(&dqfreelist);
}
@@ -808,8 +1044,35 @@
hashdestroy(dqhashtbl, M_DQUOT, dqhash);
while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) {
TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
+ mtx_destroy(&dq->dq_lock);
free(dq, M_DQUOT);
}
+ mtx_destroy(&dqhlock);
+}
+
+static struct dquot *
+dqhashfind(dqh, id, dqvp)
+ struct dqhash *dqh;
+ u_long id;
+ struct vnode *dqvp;
+{
+ struct dquot *dq;
+
+ mtx_assert(&dqhlock, MA_OWNED);
+ LIST_FOREACH(dq, dqh, dq_hash) {
+ if (dq->dq_id != id ||
+ dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+ continue;
+ /*
+ * Cache hit with no references. Take
+ * the structure off the free list.
+ */
+ if (dq->dq_cnt == 0)
+ TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
+ DQREF(dq);
+ return (dq);
+ }
+ return (NODQUOT);
}
/*
@@ -825,50 +1088,122 @@
struct dquot **dqp;
{
struct thread *td = curthread; /* XXX */
- struct dquot *dq;
+ struct dquot *dq, *dq1;
struct dqhash *dqh;
struct vnode *dqvp;
struct iovec aiov;
struct uio auio;
- int error;
+ int vfslocked, dqvplocked, error;
+#ifdef DEBUG_VFS_LOCKS
+ if (vp != NULLVP)
+ ASSERT_VOP_ELOCKED(vp, "dqget");
+#endif
+
+ if (vp != NULLVP && *dqp != NODQUOT) {
+ return (0);
+ }
+
+ /* XXX: Disallow negative id values to prevent the
+ * creation of 100GB+ quota data files.
+ */
+ if ((int)id < 0)
+ return (EINVAL);
+
+ UFS_LOCK(ump);
dqvp = ump->um_quotas[type];
if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) {
*dqp = NODQUOT;
+ UFS_UNLOCK(ump);
return (EINVAL);
}
+ vref(dqvp);
+ UFS_UNLOCK(ump);
+ error = 0;
+ dqvplocked = 0;
+
/*
* Check the cache first.
*/
dqh = DQHASH(dqvp, id);
- LIST_FOREACH(dq, dqh, dq_hash) {
- if (dq->dq_id != id ||
- dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
- continue;
+ DQH_LOCK();
+ dq = dqhashfind(dqh, id, dqvp);
+ if (dq != NULL) {
+ DQH_UNLOCK();
+hfound: DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+1, "dqget");
+ DQI_UNLOCK(dq);
+ if (dq->dq_ump == NULL) {
+ dqrele(vp, dq);
+ dq = NODQUOT;
+ error = EIO;
+ }
+ *dqp = dq;
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+ if (dqvplocked)
+ vput(dqvp);
+ else
+ vrele(dqvp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (error);
+ }
+
+ /*
+ * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there
+ * since new dq will appear on the hash chain DQ_LOCKed.
+ */
+ if (vp != dqvp) {
+ DQH_UNLOCK();
+ vn_lock(dqvp, LK_SHARED | LK_RETRY, td);
+ dqvplocked = 1;
+ DQH_LOCK();
/*
- * Cache hit with no references. Take
- * the structure off the free list.
+ * Recheck the cache after sleep for quota vnode lock.
*/
- if (dq->dq_cnt == 0)
- TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
- DQREF(dq);
- *dqp = dq;
- return (0);
+ dq = dqhashfind(dqh, id, dqvp);
+ if (dq != NULL) {
+ DQH_UNLOCK();
+ goto hfound;
+ }
}
+
/*
- * Not in cache, allocate a new one.
+ * Not in cache, allocate a new one or take it from the
+ * free list.
*/
if (TAILQ_FIRST(&dqfreelist) == NODQUOT &&
numdquot < MAXQUOTAS * desiredvnodes)
desireddquot += DQUOTINC;
if (numdquot < desireddquot) {
- dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT,
- M_WAITOK | M_ZERO);
numdquot++;
+ DQH_UNLOCK();
+ dq1 = (struct dquot *)malloc(sizeof *dq, M_DQUOT,
+ M_WAITOK | M_ZERO);
+ mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF);
+ DQH_LOCK();
+ /*
+ * Recheck the cache after sleep for memory.
+ */
+ dq = dqhashfind(dqh, id, dqvp);
+ if (dq != NULL) {
+ numdquot--;
+ DQH_UNLOCK();
+ mtx_destroy(&dq1->dq_lock);
+ free(dq1, M_DQUOT);
+ goto hfound;
+ }
+ dq = dq1;
} else {
if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) {
+ DQH_UNLOCK();
tablefull("dquot");
*dqp = NODQUOT;
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+ if (dqvplocked)
+ vput(dqvp);
+ else
+ vrele(dqvp);
+ VFS_UNLOCK_GIANT(vfslocked);
return (EUSERS);
}
if (dq->dq_cnt || (dq->dq_flags & DQ_MOD))
@@ -877,44 +1212,57 @@
if (dq->dq_ump != NULL)
LIST_REMOVE(dq, dq_hash);
}
+
/*
- * Initialize the contents of the dquot structure.
+ * Dq is put into hash already locked to prevent parallel
+ * usage while it is being read from file.
*/
- if (vp != dqvp)
- vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td);
- LIST_INSERT_HEAD(dqh, dq, dq_hash);
- DQREF(dq);
dq->dq_flags = DQ_LOCK;
dq->dq_id = id;
- dq->dq_ump = ump;
dq->dq_type = type;
+ dq->dq_ump = ump;
+ LIST_INSERT_HEAD(dqh, dq, dq_hash);
+ DQREF(dq);
+ DQH_UNLOCK();
+
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
- aiov.iov_base = (caddr_t)&dq->dq_dqb;
+ aiov.iov_base = &dq->dq_dqb;
aiov.iov_len = sizeof (struct dqblk);
auio.uio_resid = sizeof (struct dqblk);
- auio.uio_offset = (off_t)(id * sizeof (struct dqblk));
+ auio.uio_offset = (off_t)id * sizeof (struct dqblk);
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = (struct thread *)0;
+
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
- bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk));
- if (vp != dqvp)
- VOP_UNLOCK(dqvp, 0, td);
- if (dq->dq_flags & DQ_WANT)
- wakeup(dq);
- dq->dq_flags = 0;
+ bzero(&dq->dq_dqb, sizeof(struct dqblk));
+ if (dqvplocked)
+ vput(dqvp);
+ else
+ vrele(dqvp);
+ VFS_UNLOCK_GIANT(vfslocked);
/*
* I/O error in reading quota file, release
* quota structure and reflect problem to caller.
*/
if (error) {
+ DQH_LOCK();
+ dq->dq_ump = NULL;
LIST_REMOVE(dq, dq_hash);
+ DQH_UNLOCK();
+ DQI_LOCK(dq);
+ if (dq->dq_flags & DQ_WANT)
+ wakeup(dq);
+ dq->dq_flags = 0;
+ DQI_UNLOCK(dq);
dqrele(vp, dq);
*dqp = NODQUOT;
return (error);
}
+ DQI_LOCK(dq);
/*
* Check for no limit to enforce.
* Initialize time values if necessary.
@@ -923,11 +1271,21 @@
dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
dq->dq_flags |= DQ_FAKE;
if (dq->dq_id != 0) {
- if (dq->dq_btime == 0)
+ if (dq->dq_btime == 0) {
dq->dq_btime = time_second + ump->um_btime[type];
- if (dq->dq_itime == 0)
+ if (dq->dq_bsoftlimit &&
+ dq->dq_curblocks >= dq->dq_bsoftlimit)
+ dq->dq_flags |= DQ_MOD;
+ }
+ if (dq->dq_itime == 0) {
dq->dq_itime = time_second + ump->um_itime[type];
+ if (dq->dq_isoftlimit &&
+ dq->dq_curinodes >= dq->dq_isoftlimit)
+ dq->dq_flags |= DQ_MOD;
+ }
}
+ DQI_WAKEUP(dq);
+ DQI_UNLOCK(dq);
*dqp = dq;
return (0);
}
@@ -956,15 +1314,24 @@
if (dq == NODQUOT)
return;
+ DQH_LOCK();
if (dq->dq_cnt > 1) {
dq->dq_cnt--;
+ DQH_UNLOCK();
return;
}
- if (dq->dq_flags & DQ_MOD)
- (void) dqsync(vp, dq);
+ DQH_UNLOCK();
+
+ (void) dqsync(vp, dq);
+
+ DQH_LOCK();
if (--dq->dq_cnt > 0)
+ {
+ DQH_UNLOCK();
return;
+ }
TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist);
+ DQH_UNLOCK();
}
/*
@@ -979,48 +1346,75 @@
struct vnode *dqvp;
struct iovec aiov;
struct uio auio;
- int error;
+ int vfslocked, error;
struct mount *mp;
+ struct ufsmount *ump;
+
+#ifdef DEBUG_VFS_LOCKS
+ if (vp != NULL)
+ ASSERT_VOP_ELOCKED(vp, "dqsync");
+#endif
mp = NULL;
+ error = 0;
if (dq == NODQUOT)
panic("dqsync: dquot");
- if ((dq->dq_flags & DQ_MOD) == 0)
+ if ((ump = dq->dq_ump) == NULL)
return (0);
- if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
+ UFS_LOCK(ump);
+ if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP)
panic("dqsync: file");
+ vref(dqvp);
+ UFS_UNLOCK(ump);
+
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+ DQI_LOCK(dq);
+ if ((dq->dq_flags & DQ_MOD) == 0) {
+ DQI_UNLOCK(dq);
+ vrele(dqvp);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return (0);
+ }
+ DQI_UNLOCK(dq);
+
(void) vn_start_secondary_write(dqvp, &mp, V_WAIT);
if (vp != dqvp)
vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td);
- while (dq->dq_flags & DQ_LOCK) {
- dq->dq_flags |= DQ_WANT;
- (void) tsleep(dq, PINOD+2, "dqsync", 0);
- if ((dq->dq_flags & DQ_MOD) == 0) {
- if (vp != dqvp)
- VOP_UNLOCK(dqvp, 0, td);
- vn_finished_secondary_write(mp);
- return (0);
- }
- }
+
+ VFS_UNLOCK_GIANT(vfslocked);
+ DQI_LOCK(dq);
+ DQI_WAIT(dq, PINOD+2, "dqsync");
+ if ((dq->dq_flags & DQ_MOD) == 0)
+ goto out;
dq->dq_flags |= DQ_LOCK;
+ DQI_UNLOCK(dq);
+
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
- aiov.iov_base = (caddr_t)&dq->dq_dqb;
+ aiov.iov_base = &dq->dq_dqb;
aiov.iov_len = sizeof (struct dqblk);
auio.uio_resid = sizeof (struct dqblk);
- auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk));
+ auio.uio_offset = (off_t)dq->dq_id * sizeof (struct dqblk);
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = (struct thread *)0;
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
+ VFS_UNLOCK_GIANT(vfslocked);
if (auio.uio_resid && error == 0)
error = EIO;
- if (dq->dq_flags & DQ_WANT)
- wakeup(dq);
- dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT);
+
+ DQI_LOCK(dq);
+ DQI_WAKEUP(dq);
+ dq->dq_flags &= ~DQ_MOD;
+out: DQI_UNLOCK(dq);
+ vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
if (vp != dqvp)
- VOP_UNLOCK(dqvp, 0, td);
+ vput(dqvp);
+ else
+ vrele(dqvp);
vn_finished_secondary_write(mp);
+ VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1039,6 +1433,7 @@
* file off their hash chains (they will eventually
* fall off the head of the free list and be re-used).
*/
+ DQH_LOCK();
for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) {
for (dq = LIST_FIRST(dqh); dq; dq = nextdq) {
nextdq = LIST_NEXT(dq, dq_hash);
@@ -1050,4 +1445,5 @@
dq->dq_ump = (struct ufsmount *)0;
}
}
+ DQH_UNLOCK();
}
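
dqget() now drops its locks to sleep in two places (taking the quota
vnode lock, and the M_WAITOK malloc), and in both it re-probes the hash
afterwards, because another thread may have installed the same dquot in
the meantime. The malloc race, condensed from the hunk above:

	DQH_LOCK();
	dq = dqhashfind(dqh, id, dqvp);
	if (dq == NODQUOT) {
		DQH_UNLOCK();
		dq1 = malloc(sizeof(*dq1), M_DQUOT,
		    M_WAITOK | M_ZERO);			/* may sleep */
		mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF);
		DQH_LOCK();
		dq = dqhashfind(dqh, id, dqvp);		/* recheck */
		if (dq != NODQUOT) {
			/* Lost the race: discard our allocation. */
			DQH_UNLOCK();
			mtx_destroy(&dq1->dq_lock);
			free(dq1, M_DQUOT);
		} else
			dq = dq1;  /* insert into hash under DQH_LOCK */
	}

The same discipline explains the recheck at `hfound:' after
vn_lock(dqvp), and why a new dquot is inserted into the hash with
DQ_LOCK already set: readers that find it there block in DQI_WAIT()
until the read from the quota file completes.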
Index: inode.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/inode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/inode.h -L sys/ufs/ufs/inode.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/inode.h
+++ sys/ufs/ufs/inode.h
@@ -32,7 +32,7 @@
* SUCH DAMAGE.
*
* @(#)inode.h 8.9 (Berkeley) 5/14/95
- * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.49 2005/03/14 10:21:16 phk Exp $
+ * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.51 2006/10/10 09:20:54 kib Exp $
*/
#ifndef _UFS_UFS_INODE_H_
@@ -55,6 +55,13 @@
* is the permanent meta-data associated with the file which is read in
* from the permanent dinode from long term storage when the file becomes
* active, and is put back when the file is no longer being used.
+ *
+ * An inode may only be changed while holding either the exclusive
+ * vnode lock or the shared vnode lock and the vnode interlock. We use
+ * the latter only for "read" and "get" operations that require
+ * changing i_flag, or a timestamp. This locking protocol allows executing
+ * those operations without having to upgrade the vnode lock from shared to
+ * exclusive.
*/
struct inode {
TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */
@@ -119,6 +126,8 @@
#define IN_RENAME 0x0010 /* Inode is being renamed. */
#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */
#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */
+#define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the
+ suspension finished */
#define i_devvp i_ump->um_devvp
#define i_umbufobj i_ump->um_bo
@@ -166,7 +175,7 @@
/* Determine if soft dependencies are being done */
#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
-#define DOINGASYNC(vp) ((vp)->v_mount->mnt_flag & MNT_ASYNC)
+#define DOINGASYNC(vp) ((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
/* This overlays the fid structure (see mount.h). */
struct ufid {
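
DOINGASYNC() now keys off MNTK_ASYNC in mnt_kern_flag rather than the
user-visible MNT_ASYNC mount flag; the kernel clears MNTK_ASYNC while
writes are suspended, so the test reflects whether async writing is
actually in effect (a reading of the change, not a quote from the
commit). Together with the two `if (!DOINGASYNC(...))' fixes in the
ufs_vnops.c hunks above, which un-invert the old test, the net effect on
the truncate paths is:

	ioflag = IO_NORMAL;
	if (!DOINGASYNC(vp))	/* sync unless async is in effect */
		ioflag |= IO_SYNC;
	error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cred, td);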
Index: ufs_extattr.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_extattr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_extattr.c -L sys/ufs/ufs/ufs_extattr.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_extattr.c
+++ sys/ufs/ufs/ufs_extattr.c
@@ -38,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_extattr.c,v 1.81.2.3 2006/03/13 03:08:08 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_extattr.c,v 1.86 2007/06/01 14:33:11 kib Exp $");
#include "opt_ufs.h"
@@ -48,6 +48,7 @@
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
+#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
@@ -320,7 +321,7 @@
{
int error;
- error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, -1);
+ error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL);
if (error) {
printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed "
"with %d\n", error);
@@ -699,7 +700,8 @@
* Processes with privilege, but in jail, are not allowed to
* configure extended attributes.
*/
- if ((error = suser(td))) {
+ error = priv_check(td, PRIV_UFS_EXTATTRCTL);
+ if (error) {
if (filename_vp != NULL)
VOP_UNLOCK(filename_vp, 0, td);
return (error);
Index: dinode.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/dinode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/dinode.h -L sys/ufs/ufs/dinode.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/dinode.h
+++ sys/ufs/ufs/dinode.h
@@ -62,7 +62,7 @@
* SUCH DAMAGE.
*
* @(#)dinode.h 8.3 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.15 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.17 2006/05/21 21:55:29 maxim Exp $
*/
#ifndef _UFS_UFS_DINODE_H_
@@ -79,7 +79,7 @@
/*
* The Whiteout inode# is a dummy non-zero inode number which will
* never be allocated to a real file. It is used as a place holder
- * in the directory entry which has been tagged as a DT_W entry.
+ * in the directory entry which has been tagged as a DT_WHT entry.
* See the comments about ROOTINO above.
*/
#define WINO ((ino_t)1)
@@ -129,7 +129,7 @@
u_int32_t di_gid; /* 8: File group. */
u_int32_t di_blksize; /* 12: Inode blocksize. */
u_int64_t di_size; /* 16: File byte count. */
- u_int64_t di_blocks; /* 24: Bytes actually held. */
+ u_int64_t di_blocks; /* 24: Blocks actually held. */
ufs_time_t di_atime; /* 32: Last access time. */
ufs_time_t di_mtime; /* 40: Last modified time. */
ufs_time_t di_ctime; /* 48: Last inode change time. */