[Midnightbsd-cvs] src: sys/vm: Bring in ufs and vm changes from FreeBSD.

laffer1 at midnightbsd.org
Fri Sep 12 15:40:17 EDT 2008


Log Message:
-----------
Bring in ufs and vm changes from FreeBSD. On the vm side this merges the vm_phys physical
memory allocator and redzone files, moves the PG_BUSY/PG_WANTED page flags into the
object-locked VPO_* oflags, and removes page coloring and the per-color cache/free queues.
On the ufs side it adds gjournal support and removes README.softupdates.

Modified Files:
--------------
    src/sys/vm:
        device_pager.c (r1.1.1.1 -> r1.2)
        memguard.c (r1.1.1.1 -> r1.2)
        memguard.h (r1.1.1.1 -> r1.2)
        phys_pager.c (r1.1.1.1 -> r1.2)
        pmap.h (r1.1.1.1 -> r1.2)
        swap_pager.c (r1.1.1.1 -> r1.2)
        swap_pager.h (r1.1.1.1 -> r1.2)
        uma.h (r1.1.1.1 -> r1.2)
        uma_core.c (r1.1.1.2 -> r1.2)
        uma_dbg.c (r1.1.1.1 -> r1.2)
        uma_dbg.h (r1.1.1.1 -> r1.2)
        uma_int.h (r1.1.1.1 -> r1.2)
        vm.h (r1.1.1.1 -> r1.2)
        vm_contig.c (r1.2 -> r1.3)
        vm_extern.h (r1.2 -> r1.3)
        vm_fault.c (r1.2 -> r1.3)
        vm_glue.c (r1.2 -> r1.3)
        vm_kern.c (r1.1.1.1 -> r1.2)
        vm_kern.h (r1.1.1.1 -> r1.2)
        vm_map.c (r1.1.1.1 -> r1.2)
        vm_map.h (r1.1.1.1 -> r1.2)
        vm_meter.c (r1.1.1.1 -> r1.2)
        vm_mmap.c (r1.1.1.2 -> r1.2)
        vm_object.c (r1.2 -> r1.3)
        vm_object.h (r1.1.1.1 -> r1.2)
        vm_page.c (r1.1.1.1 -> r1.2)
        vm_page.h (r1.1.1.1 -> r1.2)
        vm_pageout.c (r1.2 -> r1.3)
        vm_pageq.c (r1.1.1.1 -> r1.2)
        vm_pager.c (r1.1.1.1 -> r1.2)
        vm_param.h (r1.1.1.1 -> r1.2)
        vm_zeroidle.c (r1.2 -> r1.3)
        vnode_pager.c (r1.2 -> r1.3)
    src/sys/ufs/ffs:
        ffs_alloc.c (r1.2 -> r1.3)
        ffs_balloc.c (r1.1.1.1 -> r1.2)
        ffs_extern.h (r1.2 -> r1.3)
        ffs_inode.c (r1.1.1.1 -> r1.2)
        ffs_rawread.c (r1.2 -> r1.3)
        ffs_snapshot.c (r1.2 -> r1.3)
        ffs_softdep.c (r1.2 -> r1.3)
        ffs_vfsops.c (r1.2 -> r1.3)
        ffs_vnops.c (r1.1.1.1 -> r1.2)
        fs.h (r1.1.1.1 -> r1.2)
        softdep.h (r1.2 -> r1.3)
    src/sys/ufs/ufs:
        dinode.h (r1.1.1.1 -> r1.2)
        dir.h (r1.1.1.1 -> r1.2)
        extattr.h (r1.1.1.1 -> r1.2)
        inode.h (r1.1.1.1 -> r1.2)
        quota.h (r1.1.1.1 -> r1.2)
        ufs_acl.c (r1.1.1.1 -> r1.2)
        ufs_bmap.c (r1.1.1.1 -> r1.2)
        ufs_dirhash.c (r1.1.1.1 -> r1.2)
        ufs_extattr.c (r1.2 -> r1.3)
        ufs_inode.c (r1.2 -> r1.3)
        ufs_lookup.c (r1.2 -> r1.3)
        ufs_quota.c (r1.2 -> r1.3)
        ufs_vfsops.c (r1.1.1.2 -> r1.2)
        ufs_vnops.c (r1.2 -> r1.3)
        ufsmount.h (r1.2 -> r1.3)

Added Files:
-----------
    src/sys/vm:
        redzone.c (r1.1)
        redzone.h (r1.1)
        vm_phys.c (r1.1)
        vm_phys.h (r1.1)
    src/sys/ufs/ufs:
        gjournal.h (r1.1)
        ufs_gjournal.c (r1.1)

Removed Files:
-------------
    src/sys/ufs/ffs:
        README.softupdates

-------------- next part --------------
Index: vm_pageout.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pageout.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_pageout.c -L sys/vm/vm_pageout.c -u -r1.2 -r1.3
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -73,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pageout.c,v 1.268.2.3 2006/03/09 00:02:51 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pageout.c,v 1.292 2007/09/25 06:25:06 alc Exp $");
 
 #include "opt_vm.h"
 #include <sys/param.h>
@@ -85,6 +85,7 @@
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
+#include <sys/mount.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
@@ -113,7 +114,6 @@
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_pmap_collect(void);
 static void vm_pageout_scan(int pass);
 
 struct proc *pageproc;
@@ -146,6 +146,9 @@
 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
 #endif
 static int vm_max_launder = 32;
 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
@@ -207,7 +210,7 @@
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(void);
+static void vm_req_vmdaemon(int req);
 #endif
 static void vm_pageout_page_stats(void);
 
@@ -237,7 +240,8 @@
 	 * Initialize our marker
 	 */
 	bzero(&marker, sizeof(marker));
-	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+	marker.flags = PG_FICTITIOUS | PG_MARKER;
+	marker.oflags = VPO_BUSY;
 	marker.queue = m->queue;
 	marker.wire_count = 1;
 
@@ -292,10 +296,10 @@
 	 */
 
 	/*
-	 * Don't mess with the page if it's busy, held, or special
+	 * Can't clean the page if it's busy or held.
 	 */
 	if ((m->hold_count != 0) ||
-	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
+	    ((m->busy != 0) || (m->oflags & VPO_BUSY))) {
 		return 0;
 	}
 
@@ -338,8 +342,7 @@
 			ib = 0;
 			break;
 		}
-		if (((p->queue - p->pc) == PQ_CACHE) ||
-		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
+		if ((p->oflags & VPO_BUSY) || p->busy) {
 			ib = 0;
 			break;
 		}
@@ -368,8 +371,7 @@
 
 		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
 			break;
-		if (((p->queue - p->pc) == PQ_CACHE) ||
-		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
+		if ((p->oflags & VPO_BUSY) || p->busy) {
 			break;
 		}
 		vm_page_test_dirty(p);
@@ -432,20 +434,19 @@
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		vm_page_io_start(mc[i]);
-		pmap_page_protect(mc[i], VM_PROT_READ);
+		pmap_remove_write(mc[i]);
 	}
 	vm_page_unlock_queues();
 	vm_object_pip_add(object, count);
 
-	vm_pager_put_pages(object, mc, count,
-	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
-	    pageout_status);
+	vm_pager_put_pages(object, mc, count, flags, pageout_status);
 
 	vm_page_lock_queues();
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
-		KASSERT((mt->flags & PG_WRITEABLE) == 0,
+		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
+		    (mt->flags & PG_WRITEABLE) == 0,
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
@@ -539,7 +540,8 @@
 			if (p->wire_count != 0 ||
 			    p->hold_count != 0 ||
 			    p->busy != 0 ||
-			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
+			    (p->oflags & VPO_BUSY) ||
+			    (p->flags & PG_UNMANAGED) ||
 			    !pmap_page_exists_quick(pmap, p)) {
 				p = next;
 				continue;
@@ -667,35 +669,6 @@
 #endif		/* !defined(NO_SWAPPING) */
 
 /*
- * This routine is very drastic, but can save the system
- * in a pinch.
- */
-static void
-vm_pageout_pmap_collect(void)
-{
-	int i;
-	vm_page_t m;
-	static int warningdone;
-
-	if (pmap_pagedaemon_waken == 0)
-		return;
-	if (warningdone < 5) {
-		printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
-		warningdone++;
-	}
-	vm_page_lock_queues();
-	for (i = 0; i < vm_page_array_size; i++) {
-		m = &vm_page_array[i];
-		if (m->wire_count || m->hold_count || m->busy ||
-		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
-			continue;
-		pmap_remove_all(m);
-	}
-	vm_page_unlock_queues();
-	pmap_pagedaemon_waken = 0;
-}
-	
-/*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  */
 static void
@@ -709,12 +682,10 @@
 	struct thread *td;
 	vm_offset_t size, bigsize;
 	vm_object_t object;
-	int actcount, cache_cur, cache_first_failure;
-	static int cache_last_free;
+	int actcount;
 	int vnodes_skipped = 0;
 	int maxlaunder;
 
-	mtx_lock(&Giant);
 	/*
 	 * Decrease registered cache sizes.
 	 */
@@ -723,10 +694,6 @@
 	 * We do this explicitly after the caches have been drained above.
 	 */
 	uma_reclaim();
-	/*
-	 * Do whatever cleanup that the pmap code can.
-	 */
-	vm_pageout_pmap_collect();
 
 	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);
 
@@ -740,7 +707,8 @@
 	 * Initialize our marker
 	 */
 	bzero(&marker, sizeof(marker));
-	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+	marker.flags = PG_FICTITIOUS | PG_MARKER;
+	marker.oflags = VPO_BUSY;
 	marker.queue = PQ_INACTIVE;
 	marker.wire_count = 1;
 
@@ -775,7 +743,7 @@
 
 		cnt.v_pdpages++;
 
-		if (m->queue != PQ_INACTIVE) {
+		if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE) {
 			goto rescan0;
 		}
 
@@ -807,7 +775,7 @@
 			addl_page_shortage++;
 			continue;
 		}
-		if (m->busy || (m->flags & PG_BUSY)) {
+		if (m->busy || (m->oflags & VPO_BUSY)) {
 			VM_OBJECT_UNLOCK(object);
 			addl_page_shortage++;
 			continue;
@@ -883,7 +851,6 @@
 			/*
 			 * Invalid pages can be easily freed
 			 */
-			pmap_remove_all(m);
 			vm_page_free(m);
 			cnt.v_dfree++;
 			--page_shortage;
@@ -917,9 +884,9 @@
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
 			 */
-			int swap_pageouts_ok;
+			int swap_pageouts_ok, vfslocked = 0;
 			struct vnode *vp = NULL;
-			struct mount *mp;
+			struct mount *mp = NULL;
 
 			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
 				swap_pageouts_ok = 1;
@@ -975,24 +942,24 @@
 			 */
 			if (object->type == OBJT_VNODE) {
 				vp = object->handle;
-				mp = NULL;
 				if (vp->v_type == VREG &&
 				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+					KASSERT(mp == NULL,
+					    ("vm_pageout_scan: mp != NULL"));
 					++pageout_lock_miss;
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
-					vp = NULL;
 					goto unlock_and_continue;
 				}
 				vm_page_unlock_queues();
-				VI_LOCK(vp);
+				vm_object_reference_locked(object);
 				VM_OBJECT_UNLOCK(object);
-				if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
-				    LK_TIMELOCK, curthread)) {
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
+				    curthread)) {
 					VM_OBJECT_LOCK(object);
 					vm_page_lock_queues();
 					++pageout_lock_miss;
-					vn_finished_write(mp);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					vp = NULL;
@@ -1004,12 +971,10 @@
 				 * The page might have been moved to another
 				 * queue during potential blocking in vget()
 				 * above.  The page might have been freed and
-				 * reused for another vnode.  The object might
-				 * have been reused for another vnode.
+				 * reused for another vnode.
 				 */
-				if (m->queue != PQ_INACTIVE ||
+				if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE ||
 				    m->object != object ||
-				    object->handle != vp ||
 				    TAILQ_NEXT(m, pageq) != &marker) {
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
@@ -1018,11 +983,11 @@
 	
 				/*
 				 * The page may have been busied during the
-				 * blocking in vput();  We don't move the
+				 * blocking in vget().  We don't move the
 				 * page back onto the end of the queue so that
 				 * statistics are more correct if we don't.
 				 */
-				if (m->busy || (m->flags & PG_BUSY)) {
+				if (m->busy || (m->oflags & VPO_BUSY)) {
 					goto unlock_and_continue;
 				}
 
@@ -1054,9 +1019,12 @@
 			}
 unlock_and_continue:
 			VM_OBJECT_UNLOCK(object);
-			if (vp) {
+			if (mp != NULL) {
 				vm_page_unlock_queues();
-				vput(vp);
+				if (vp != NULL)
+					vput(vp);
+				VFS_UNLOCK_GIANT(vfslocked);
+				vm_object_deallocate(object);
 				vn_finished_write(mp);
 				vm_page_lock_queues();
 			}
@@ -1086,7 +1054,7 @@
 
 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
 
-		KASSERT(m->queue == PQ_ACTIVE,
+		KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
 		    ("vm_pageout_scan: page %p isn't active", m));
 
 		next = TAILQ_NEXT(m, pageq);
@@ -1106,7 +1074,7 @@
 		 * Don't deactivate pages that are busy.
 		 */
 		if ((m->busy != 0) ||
-		    (m->flags & PG_BUSY) ||
+		    (m->oflags & VPO_BUSY) ||
 		    (m->hold_count != 0)) {
 			VM_OBJECT_UNLOCK(object);
 			vm_pageq_requeue(m);
@@ -1169,43 +1137,6 @@
 		VM_OBJECT_UNLOCK(object);
 		m = next;
 	}
-
-	/*
-	 * We try to maintain some *really* free pages, this allows interrupt
-	 * code to be guaranteed space.  Since both cache and free queues 
-	 * are considered basically 'free', moving pages from cache to free
-	 * does not effect other calculations.
-	 */
-	cache_cur = cache_last_free;
-	cache_first_failure = -1;
-	while (cnt.v_free_count < cnt.v_free_reserved && (cache_cur =
-	    (cache_cur + PQ_PRIME2) & PQ_L2_MASK) != cache_first_failure) {
-		TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE + cache_cur].pl,
-		    pageq) {
-			KASSERT(m->dirty == 0,
-			    ("Found dirty cache page %p", m));
-			KASSERT(!pmap_page_is_mapped(m),
-			    ("Found mapped cache page %p", m));
-			KASSERT((m->flags & PG_UNMANAGED) == 0,
-			    ("Found unmanaged cache page %p", m));
-			KASSERT(m->wire_count == 0,
-			    ("Found wired cache page %p", m));
-			if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object =
-			    m->object)) {
-				KASSERT((m->flags & PG_BUSY) == 0 &&
-				    m->busy == 0, ("Found busy cache page %p",
-				    m));
-				vm_page_free(m);
-				VM_OBJECT_UNLOCK(object);
-				cnt.v_dfree++;
-				cache_last_free = cache_cur;
-				cache_first_failure = -1;
-				break;
-			}
-		}
-		if (m == NULL && cache_first_failure == -1)
-			cache_first_failure = cache_cur;
-	}
 	vm_page_unlock_queues();
 #if !defined(NO_SWAPPING)
 	/*
@@ -1214,8 +1145,7 @@
 	if (vm_swap_idle_enabled) {
 		static long lsec;
 		if (time_second != lsec) {
-			vm_pageout_req_swapout |= VM_SWAP_IDLE;
-			vm_req_vmdaemon();
+			vm_req_vmdaemon(VM_SWAP_IDLE);
 			lsec = time_second;
 		}
 	}
@@ -1230,10 +1160,8 @@
 		if (vnodes_skipped && vm_page_count_min())
 			(void) speedup_syncer();
 #if !defined(NO_SWAPPING)
-		if (vm_swap_enabled && vm_page_count_target()) {
-			vm_req_vmdaemon();
-			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
-		}
+		if (vm_swap_enabled && vm_page_count_target())
+			vm_req_vmdaemon(VM_SWAP_NORMAL);
 #endif
 	}
 
@@ -1275,22 +1203,24 @@
 			 * If the process is in a non-running type state,
 			 * don't touch it.  Check all the threads individually.
 			 */
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
+					thread_unlock(td);
 					breakout = 1;
 					break;
 				}
+				thread_unlock(td);
 			}
+			PROC_SUNLOCK(p);
 			if (breakout) {
-				mtx_unlock_spin(&sched_lock);
 				PROC_UNLOCK(p);
 				continue;
 			}
-			mtx_unlock_spin(&sched_lock);
 			/*
 			 * get the process size
 			 */
@@ -1316,14 +1246,13 @@
 		sx_sunlock(&allproc_lock);
 		if (bigproc != NULL) {
 			killproc(bigproc, "out of swap space");
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(bigproc);
 			sched_nice(bigproc, PRIO_MIN);
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(bigproc);
 			PROC_UNLOCK(bigproc);
 			wakeup(&cnt.v_free_count);
 		}
 	}
-	mtx_unlock(&Giant);
 }
 
 /*
@@ -1363,7 +1292,7 @@
 	while ((m != NULL) && (pcount-- > 0)) {
 		int actcount;
 
-		KASSERT(m->queue == PQ_ACTIVE,
+		KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
 		    ("vm_pageout_page_stats: page %p isn't active", m));
 
 		next = TAILQ_NEXT(m, pageq);
@@ -1384,7 +1313,7 @@
 		 * Don't deactivate pages that are busy.
 		 */
 		if ((m->busy != 0) ||
-		    (m->flags & PG_BUSY) ||
+		    (m->oflags & VPO_BUSY) ||
 		    (m->hold_count != 0)) {
 			VM_OBJECT_UNLOCK(object);
 			vm_pageq_requeue(m);
@@ -1454,7 +1383,7 @@
 	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 	    cnt.v_interrupt_free_min;
 	cnt.v_free_reserved = vm_pageout_page_count +
-	    cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE;
+	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
 	cnt.v_free_severe = cnt.v_free_min / 2;
 	cnt.v_free_min += cnt.v_free_reserved;
 	cnt.v_free_severe += cnt.v_free_reserved;
@@ -1508,13 +1437,13 @@
 	 * The pageout daemon is never done, so loop forever.
 	 */
 	while (TRUE) {
-		vm_page_lock_queues();
 		/*
 		 * If we have enough free memory, wakeup waiters.  Do
 		 * not clear vm_pages_needed until we reach our target,
 		 * otherwise we may be woken up over and over again and
 		 * waste a lot of cpu.
 		 */
+		mtx_lock(&vm_page_queue_free_mtx);
 		if (vm_pages_needed && !vm_page_count_min()) {
 			if (!vm_paging_needed())
 				vm_pages_needed = 0;
@@ -1528,8 +1457,9 @@
 			 */
 			++pass;
 			if (pass > 1)
-				msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
-				       "psleep", hz/2);
+				msleep(&vm_pages_needed,
+				    &vm_page_queue_free_mtx, PVM, "psleep",
+				    hz / 2);
 		} else {
 			/*
 			 * Good enough, sleep & handle stats.  Prime the pass
@@ -1539,10 +1469,13 @@
 				pass = 1;
 			else
 				pass = 0;
-			error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
-				    "psleep", vm_pageout_stats_interval * hz);
+			error = msleep(&vm_pages_needed,
+			    &vm_page_queue_free_mtx, PVM, "psleep",
+			    vm_pageout_stats_interval * hz);
 			if (error && !vm_pages_needed) {
+				mtx_unlock(&vm_page_queue_free_mtx);
 				pass = 0;
+				vm_page_lock_queues();
 				vm_pageout_page_stats();
 				vm_page_unlock_queues();
 				continue;
@@ -1550,16 +1483,16 @@
 		}
 		if (vm_pages_needed)
 			cnt.v_pdwakeups++;
-		vm_page_unlock_queues();
+		mtx_unlock(&vm_page_queue_free_mtx);
 		vm_pageout_scan(pass);
 	}
 }
 
 /*
- * Unless the page queue lock is held by the caller, this function
+ * Unless the free page queue lock is held by the caller, this function
  * should be regarded as advisory.  Specifically, the caller should
  * not msleep() on &cnt.v_free_count following this function unless
- * the page queue lock is held until the msleep() is performed.
+ * the free page queue lock is held until the msleep() is performed.
  */
 void
 pagedaemon_wakeup()
@@ -1573,14 +1506,17 @@
 
 #if !defined(NO_SWAPPING)
 static void
-vm_req_vmdaemon()
+vm_req_vmdaemon(int req)
 {
 	static int lastrun = 0;
 
+	mtx_lock(&vm_daemon_mtx);
+	vm_pageout_req_swapout |= req;
 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
 		wakeup(&vm_daemon_needed);
 		lastrun = ticks;
 	}
+	mtx_unlock(&vm_daemon_mtx);
 }
 
 static void
@@ -1589,21 +1525,23 @@
 	struct rlimit rsslim;
 	struct proc *p;
 	struct thread *td;
-	int breakout;
+	int breakout, swapout_flags;
 
-	mtx_lock(&Giant);
 	while (TRUE) {
-		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
-		if (vm_pageout_req_swapout) {
-			swapout_procs(vm_pageout_req_swapout);
-			vm_pageout_req_swapout = 0;
-		}
+		mtx_lock(&vm_daemon_mtx);
+		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
+		swapout_flags = vm_pageout_req_swapout;
+		vm_pageout_req_swapout = 0;
+		mtx_unlock(&vm_daemon_mtx);
+		if (swapout_flags)
+			swapout_procs(swapout_flags);
+
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
 		sx_slock(&allproc_lock);
-		LIST_FOREACH(p, &allproc, p_list) {
+		FOREACH_PROC_IN_SYSTEM(p) {
 			vm_pindex_t limit, size;
 
 			/*
@@ -1619,17 +1557,20 @@
 			 * if the process is in a non-running type state,
 			 * don't touch it.
 			 */
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
+					thread_unlock(td);
 					breakout = 1;
 					break;
 				}
+				thread_unlock(td);
 			}
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			if (breakout) {
 				PROC_UNLOCK(p);
 				continue;
@@ -1646,7 +1587,7 @@
 			 * swapped out set the limit to nothing (will force a
 			 * swap-out.)
 			 */
-			if ((p->p_sflag & PS_INMEM) == 0)
+			if ((p->p_flag & P_INMEM) == 0)
 				limit = 0;	/* XXX */
 			PROC_UNLOCK(p);
 
Index: memguard.c
===================================================================
RCS file: /home/cvs/src/sys/vm/memguard.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/memguard.c -L sys/vm/memguard.c -u -r1.1.1.1 -r1.2
--- sys/vm/memguard.c
+++ sys/vm/memguard.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/memguard.c,v 1.5 2005/02/16 21:45:59 bmilekic Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/memguard.c,v 1.6 2005/12/30 11:45:07 pjd Exp $");
 
 /*
  * MemGuard is a simple replacement allocator for debugging only
@@ -44,6 +44,7 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
+#include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -59,6 +60,67 @@
  */
 #define	MAX_PAGES_PER_ITEM	64
 
+SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data");
+/*
+ * The vm_memguard_divisor variable controls how much of kmem_map should be
+ * reserved for MemGuard.
+ */
+u_int vm_memguard_divisor;
+SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RD, &vm_memguard_divisor,
+    0, "(kmem_size/memguard_divisor) == memguard submap size");     
+
+/*
+ * Short description (ks_shortdesc) of memory type to monitor.
+ */
+static char vm_memguard_desc[128] = "";
+static struct malloc_type *vm_memguard_mtype = NULL;
+TUNABLE_STR("vm.memguard.desc", vm_memguard_desc, sizeof(vm_memguard_desc));
+static int
+memguard_sysctl_desc(SYSCTL_HANDLER_ARGS)
+{
+	struct malloc_type_internal *mtip;
+	struct malloc_type_stats *mtsp;
+	struct malloc_type *mtp;
+	char desc[128];
+	long bytes;
+	int error, i;
+
+	strlcpy(desc, vm_memguard_desc, sizeof(desc));
+	error = sysctl_handle_string(oidp, desc, sizeof(desc), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/*
+	 * We can change memory type when no memory has been allocated for it
+	 * or when there is no such memory type yet (ie. it will be loaded with
+	 * kernel module).
+	 */
+	bytes = 0;
+	mtx_lock(&malloc_mtx);
+	mtp = malloc_desc2type(desc);
+	if (mtp != NULL) {
+		mtip = mtp->ks_handle;
+		for (i = 0; i < MAXCPU; i++) {
+			mtsp = &mtip->mti_stats[i];
+			bytes += mtsp->mts_memalloced;
+			bytes -= mtsp->mts_memfreed;
+		}
+	}
+	if (bytes > 0)
+		error = EBUSY;
+	else {
+		/*
+		 * If mtp is NULL, it will be initialized in memguard_cmp().
+		 */
+		vm_memguard_mtype = mtp;
+		strlcpy(vm_memguard_desc, desc, sizeof(vm_memguard_desc));
+	}
+	mtx_unlock(&malloc_mtx);
+	return (error);
+}
+SYSCTL_PROC(_vm_memguard, OID_AUTO, desc, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+    memguard_sysctl_desc, "A", "Short description of memory type to monitor");
+
 /*
  * Global MemGuard data.
  */
@@ -239,6 +301,34 @@
 	MEMGUARD_CRIT_SECTION_EXIT;
 }
 
+int
+memguard_cmp(struct malloc_type *mtp)
+{
+
+#if 1
+	/*
+	 * The safest way of comparsion is to always compare short description
+	 * string of memory type, but it is also the slowest way.
+	 */
+	return (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0);
+#else
+	/*
+	 * If we compare pointers, there are two possible problems:
+	 * 1. Memory type was unloaded and new memory type was allocated at the
+	 *    same address.
+	 * 2. Memory type was unloaded and loaded again, but allocated at a
+	 *    different address.
+	 */
+	if (vm_memguard_mtype != NULL)
+		return (mtp == vm_memguard_mtype);
+	if (strcmp(mtp->ks_shortdesc, vm_memguard_desc) == 0) {
+		vm_memguard_mtype = mtp;
+		return (1);
+	}
+	return (0);
+#endif
+}
+
 /*
  * Guard a page containing specified object (make it read-only so that
  * future writes to it fail).
Index: device_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/device_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/device_pager.c -L sys/vm/device_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/device_pager.c
+++ sys/vm/device_pager.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/device_pager.c,v 1.78 2005/06/10 17:27:54 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/device_pager.c,v 1.84 2007/08/18 16:41:31 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -64,8 +64,6 @@
 
 /* list of device pager objects */
 static struct pagerlst dev_pager_object_list;
-/* protect against object creation */
-static struct sx dev_pager_sx;
 /* protect list manipulation */
 static struct mtx dev_pager_mtx;
 
@@ -89,7 +87,6 @@
 dev_pager_init()
 {
 	TAILQ_INIT(&dev_pager_object_list);
-	sx_init(&dev_pager_sx, "dev_pager create");
 	mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF);
 	fakepg_zone = uma_zcreate("DP fakepg", sizeof(struct vm_page),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
@@ -103,7 +100,7 @@
 dev_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff)
 {
 	struct cdev *dev;
-	vm_object_t object;
+	vm_object_t object, object1;
 	vm_pindex_t pindex;
 	unsigned int npages;
 	vm_paddr_t paddr;
@@ -126,7 +123,6 @@
 	csw = dev_refthread(dev);
 	if (csw == NULL)
 		return (NULL);
-	mtx_lock(&Giant);
 
 	/*
 	 * Check that the specified range of the device allows the desired
@@ -137,42 +133,46 @@
 	npages = OFF_TO_IDX(size);
 	for (off = foff; npages--; off += PAGE_SIZE)
 		if ((*csw->d_mmap)(dev, off, &paddr, (int)prot) != 0) {
-			mtx_unlock(&Giant);
 			dev_relthread(dev);
 			return (NULL);
 		}
 
-	/*
-	 * Lock to prevent object creation race condition.
-	 */
-	sx_xlock(&dev_pager_sx);
+	mtx_lock(&dev_pager_mtx);
 
 	/*
 	 * Look up pager, creating as necessary.
 	 */
+	object1 = NULL;
 	object = vm_pager_object_lookup(&dev_pager_object_list, handle);
 	if (object == NULL) {
 		/*
 		 * Allocate object and associate it with the pager.
 		 */
-		object = vm_object_allocate(OBJT_DEVICE, pindex);
-		object->handle = handle;
-		TAILQ_INIT(&object->un_pager.devp.devp_pglist);
-		mtx_lock(&dev_pager_mtx);
-		TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list);
 		mtx_unlock(&dev_pager_mtx);
+		object1 = vm_object_allocate(OBJT_DEVICE, pindex);
+		mtx_lock(&dev_pager_mtx);
+		object = vm_pager_object_lookup(&dev_pager_object_list, handle);
+		if (object != NULL) {
+			/*
+			 * We raced with other thread while allocating object.
+			 */
+			if (pindex > object->size)
+				object->size = pindex;
+		} else {
+			object = object1;
+			object1 = NULL;
+			object->handle = handle;
+			TAILQ_INIT(&object->un_pager.devp.devp_pglist);
+			TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
+			    pager_object_list);
+		}
 	} else {
-		/*
-		 * Gain a reference to the object.
-		 */
-		vm_object_reference(object);
 		if (pindex > object->size)
 			object->size = pindex;
 	}
-
-	sx_xunlock(&dev_pager_sx);
-	mtx_unlock(&Giant);
+	mtx_unlock(&dev_pager_mtx);
 	dev_relthread(dev);
+	vm_object_deallocate(object1);
 	return (object);
 }
 
@@ -182,9 +182,11 @@
 {
 	vm_page_t m;
 
+	VM_OBJECT_UNLOCK(object);
 	mtx_lock(&dev_pager_mtx);
 	TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list);
 	mtx_unlock(&dev_pager_mtx);
+	VM_OBJECT_LOCK(object);
 	/*
 	 * Free up our fake pages.
 	 */
@@ -216,12 +218,10 @@
 	csw = dev_refthread(dev);
 	if (csw == NULL)
 		panic("dev_pager_getpage: no cdevsw");
-	mtx_lock(&Giant);
 	prot = PROT_READ;	/* XXX should pass in? */
 
 	ret = (*csw->d_mmap)(dev, (vm_offset_t)offset << PAGE_SHIFT, &paddr, prot);
 	KASSERT(ret == 0, ("dev_pager_getpage: map function returns error"));
-	mtx_unlock(&Giant);
 	dev_relthread(dev);
 
 	if ((m[reqpage]->flags & PG_FICTITIOUS) != 0) {
@@ -295,7 +295,8 @@
 
 	m = uma_zalloc(fakepg_zone, M_WAITOK);
 
-	m->flags = PG_BUSY | PG_FICTITIOUS;
+	m->flags = PG_FICTITIOUS;
+	m->oflags = VPO_BUSY;
 	m->valid = VM_PAGE_BITS_ALL;
 	m->dirty = 0;
 	m->busy = 0;
Index: vm_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_pager.c -L sys/vm/vm_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_pager.c
+++ sys/vm/vm_pager.c
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pager.c,v 1.105.2.1 2005/08/15 14:04:47 kan Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pager.c,v 1.108 2007/08/05 21:04:32 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -82,7 +82,7 @@
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
-MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data");
+MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "XXX: VM pager private data");
 
 int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */
 
@@ -261,17 +261,29 @@
  * vm_pager_has_page() - inline, see vm/vm_pager.h
  */
 
+/*
+ * Search the specified pager object list for an object with the
+ * specified handle.  If an object with the specified handle is found,
+ * increase its reference count and return it.  Otherwise, return NULL.
+ *
+ * The pager object list must be locked.
+ */
 vm_object_t
-vm_pager_object_lookup(pg_list, handle)
-	struct pagerlst *pg_list;
-	void *handle;
+vm_pager_object_lookup(struct pagerlst *pg_list, void *handle)
 {
 	vm_object_t object;
 
-	TAILQ_FOREACH(object, pg_list, pager_object_list)
-		if (object->handle == handle)
-			return (object);
-	return (NULL);
+	TAILQ_FOREACH(object, pg_list, pager_object_list) {
+		VM_OBJECT_LOCK(object);
+		if (object->handle == handle &&
+		    (object->flags & OBJ_DEAD) == 0) {
+			vm_object_reference_locked(object);
+			VM_OBJECT_UNLOCK(object);
+			break;
+		}
+		VM_OBJECT_UNLOCK(object);
+	}
+	return (object);
 }
 
 /*
Index: vm_extern.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_extern.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_extern.h -L sys/vm/vm_extern.h -u -r1.2 -r1.3
--- sys/vm/vm_extern.h
+++ sys/vm/vm_extern.h
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
- * $FreeBSD: src/sys/vm/vm_extern.h,v 1.76.2.1 2006/03/16 00:25:32 alc Exp $
+ * $FreeBSD: src/sys/vm/vm_extern.h,v 1.78.4.1 2008/01/19 18:15:07 kib Exp $
  */
 
 #ifndef _VM_EXTERN_H_
@@ -70,14 +70,16 @@
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
 void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
-void vm_forkproc(struct thread *, struct proc *, struct thread *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
 void vm_set_page_size(void);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
 struct vmspace *vmspace_fork(struct vmspace *);
-void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
-void vmspace_unshare(struct proc *);
+int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
+int vmspace_unshare(struct proc *);
+void vmspace_exit(struct thread *);
+struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
@@ -90,8 +92,8 @@
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 void vm_thread_dispose_altkstack(struct thread *td);
-void vm_thread_new(struct thread *td, int pages);
-void vm_thread_new_altkstack(struct thread *td, int pages);
+int vm_thread_new(struct thread *td, int pages);
+int vm_thread_new_altkstack(struct thread *td, int pages);
 void vm_thread_swapin(struct thread *td);
 void vm_thread_swapout(struct thread *td);
 #endif				/* _KERNEL */
Index: vm_page.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_page.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_page.h -L sys/vm/vm_page.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm_page.h,v 1.136.2.1 2005/08/15 09:02:01 rwatson Exp $
+ * $FreeBSD: src/sys/vm/vm_page.h,v 1.152 2007/09/27 04:21:59 alc Exp $
  */
 
 /*
@@ -67,10 +67,6 @@
 #ifndef	_VM_PAGE_
 #define	_VM_PAGE_
 
-#if !defined(KLD_MODULE) && !defined(LIBMEMSTAT)
-#include "opt_vmpage.h"
-#endif
-
 #include <vm/pmap.h>
 
 /*
@@ -114,12 +110,15 @@
 	vm_pindex_t pindex;		/* offset into object (O,P) */
 	vm_paddr_t phys_addr;		/* physical address of page */
 	struct md_page md;		/* machine dependant stuff */
-	u_short	queue;			/* page queue index */
-	u_short	flags,			/* see below */
-		pc;			/* page color */
+	uint8_t	queue;			/* page queue index */
+	int8_t segind;  
+	u_short	flags;			/* see below */
+	uint8_t	order;			/* index of the buddy queue */
+	uint8_t pool;
 	u_short wire_count;		/* wired down maps refs (P) */
 	u_int cow;			/* page cow mapping count */
 	short hold_count;		/* page hold count */
+	u_short oflags;			/* page flags (O) */
 	u_char	act_count;		/* page usage count */
 	u_char	busy;			/* page busy count (O) */
 	/* NOTE that these must support one bit per DEV_BSIZE in a page!!! */
@@ -139,6 +138,18 @@
 #endif
 };
 
+/*
+ * Page flags stored in oflags:
+ *
+ * Access to these page flags is synchronized by the lock on the object
+ * containing the page (O).
+ */
+#define	VPO_BUSY	0x0001	/* page is in transit */
+#define	VPO_WANTED	0x0002	/* someone is waiting for page */
+#define	VPO_CLEANCHK	0x0100	/* page will be checked for cleaning */
+#define	VPO_SWAPINPROG	0x0200	/* swap I/O in progress on page */
+#define	VPO_NOSYNC	0x0400	/* do not collect for syncer */
+
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
 #if PAGE_SIZE == 32768
 #ifdef CTASSERT
@@ -146,82 +157,33 @@
 #endif
 #endif
 
-#if !defined(KLD_MODULE)
-/*
- * Page coloring parameters
- */
+#define PQ_NONE		0
+#define	PQ_INACTIVE	1
+#define	PQ_ACTIVE	2
+#define	PQ_HOLD		3
+#define	PQ_COUNT	4
+#define	PQ_MAXCOUNT	4
+
+/* Returns the real queue a page is on. */
+#define VM_PAGE_GETQUEUE(m)	((m)->queue)
 
-/* Backward compatibility for existing PQ_*CACHE config options. */
-#if !defined(PQ_CACHESIZE)
-#if defined(PQ_HUGECACHE)
-#define PQ_CACHESIZE 1024
-#elif defined(PQ_LARGECACHE)
-#define PQ_CACHESIZE 512
-#elif defined(PQ_MEDIUMCACHE)
-#define PQ_CACHESIZE 256
-#elif defined(PQ_NORMALCACHE)
-#define PQ_CACHESIZE 64
-#elif defined(PQ_NOOPT)
-#define PQ_CACHESIZE 0
-#else
-#define PQ_CACHESIZE 128
-#endif
-#endif			/* !defined(PQ_CACHESIZE) */
+/* Returns the well known queue a page is on. */
+#define VM_PAGE_GETKNOWNQUEUE2(m)	VM_PAGE_GETQUEUE(m)
 
-#if PQ_CACHESIZE >= 1024
-#define PQ_PRIME1 31	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 23	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 256	/* A number of colors opt for 1M cache */
-
-#elif PQ_CACHESIZE >= 512
-#define PQ_PRIME1 31	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 23	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 128	/* A number of colors opt for 512K cache */
-
-#elif PQ_CACHESIZE >= 256
-#define PQ_PRIME1 13	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 7	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 64	/* A number of colors opt for 256K cache */
-
-#elif PQ_CACHESIZE >= 128
-#define PQ_PRIME1 9	/* Produces a good PQ_L2_SIZE/3 + PQ_PRIME1 */
-#define PQ_PRIME2 5	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 32	/* A number of colors opt for 128k cache */
-
-#elif PQ_CACHESIZE >= 64
-#define PQ_PRIME1 5	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_PRIME2 3	/* Prime number somewhat less than PQ_L2_SIZE */
-#define PQ_L2_SIZE 16	/* A reasonable number of colors (opt for 64K cache) */
+/* Returns true if the page is in the named well known queue. */
+#define VM_PAGE_INQUEUE2(m, q)	(VM_PAGE_GETKNOWNQUEUE2(m) == (q))
 
-#else
-#define PQ_PRIME1 1	/* Disable page coloring. */
-#define PQ_PRIME2 1
-#define PQ_L2_SIZE 1
-
-#endif
-
-#define PQ_L2_MASK (PQ_L2_SIZE - 1)
-
-/* PQ_CACHE and PQ_FREE represent PQ_L2_SIZE consecutive queues. */
-#define PQ_NONE 0
-#define PQ_FREE	1
-#define PQ_INACTIVE (1 + 1*PQ_L2_SIZE)
-#define PQ_ACTIVE (2 + 1*PQ_L2_SIZE)
-#define PQ_CACHE (3 + 1*PQ_L2_SIZE)
-#define PQ_HOLD  (3 + 2*PQ_L2_SIZE)
-#define PQ_COUNT (4 + 2*PQ_L2_SIZE)
+/* Sets the queue a page is on. */
+#define VM_PAGE_SETQUEUE2(m, q)	(VM_PAGE_GETQUEUE(m) = (q))
 
 struct vpgqueues {
 	struct pglist pl;
 	int	*cnt;
-	int	lcnt;
 };
 
-extern struct vpgqueues vm_page_queues[PQ_COUNT];
+extern struct vpgqueues vm_page_queues[PQ_MAXCOUNT];
 extern struct mtx vm_page_queue_free_mtx;
 
-#endif			/* !defined(KLD_MODULE) */
-
 /*
  * These are the flags defined for vm_page.
  *
@@ -232,16 +194,13 @@
  *	 pte mappings, nor can they be removed from their objects via 
  *	 the object, and such pages are also not on any PQ queue.
  */
-#define	PG_BUSY		0x0001		/* page is in transit (O) */
-#define	PG_WANTED	0x0002		/* someone is waiting for page (O) */
+#define	PG_CACHED	0x0001		/* page is cached */
+#define	PG_FREE		0x0002		/* page is free */
 #define PG_WINATCFLS	0x0004		/* flush dirty page on inactive q */
 #define	PG_FICTITIOUS	0x0008		/* physical page doesn't exist (O) */
 #define	PG_WRITEABLE	0x0010		/* page is mapped writeable */
 #define	PG_ZERO		0x0040		/* page is zeroed */
 #define PG_REFERENCED	0x0080		/* page has been referenced */
-#define PG_CLEANCHK	0x0100		/* page will be checked for cleaning */
-#define PG_SWAPINPROG	0x0200		/* swap I/O in progress on page	     */
-#define PG_NOSYNC	0x0400		/* do not collect for syncer */
 #define PG_UNMANAGED	0x0800		/* No PV management for page */
 #define PG_MARKER	0x1000		/* special queue marker page */
 #define	PG_SLAB		0x2000		/* object pointer is actually a slab */
@@ -255,18 +214,24 @@
 #define ACT_MAX			64
 
 #ifdef _KERNEL
+
+#include <vm/vm_param.h>
+
 /*
- * Each pageable resident page falls into one of four lists:
+ * Each pageable resident page falls into one of five lists:
  *
  *	free
  *		Available for allocation now.
  *
- * The following are all LRU sorted:
- *
  *	cache
- *		Almost available for allocation. Still in an
- *		object, but clean and immediately freeable at
- *		non-interrupt times.
+ *		Almost available for allocation. Still associated with
+ *		an object, but clean and immediately freeable.
+ *
+ *	hold
+ *		Will become free after a pending I/O operation
+ *		completes.
+ *
+ * The following lists are LRU sorted:
  *
  *	inactive
  *		Low activity, candidates for reclamation.
@@ -277,9 +242,6 @@
  *		Pages that are "active" i.e. they have been
  *		recently referenced.
  *
- *	zero
- *		Pages that are really free and have been pre-zeroed
- *
  */
 
 extern int vm_page_zero_count;
@@ -288,10 +250,25 @@
 extern int vm_page_array_size;		/* number of vm_page_t's */
 extern long first_page;			/* first physical page number */
 
+#define	VM_PAGE_IS_FREE(m)	(((m)->flags & PG_FREE) != 0)
+
 #define VM_PAGE_TO_PHYS(entry)	((entry)->phys_addr)
 
-#define PHYS_TO_VM_PAGE(pa) \
-		(&vm_page_array[atop(pa) - first_page ])
+vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+
+static __inline vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
+
+static __inline vm_page_t
+PHYS_TO_VM_PAGE(vm_paddr_t pa)
+{
+#ifdef VM_PHYSSEG_SPARSE
+	return (vm_phys_paddr_to_vm_page(pa));
+#elif defined(VM_PHYSSEG_DENSE)
+	return (&vm_page_array[atop(pa) - first_page]);
+#else
+#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
+#endif
+}
 
 extern struct mtx vm_page_queue_mtx;
 #define vm_page_lock_queues()   mtx_lock(&vm_page_queue_mtx)
@@ -318,6 +295,8 @@
 #define	VM_ALLOC_RETRY		0x0080	/* vm_page_grab() only */
 #define	VM_ALLOC_NOOBJ		0x0100	/* No associated object */
 #define	VM_ALLOC_NOBUSY		0x0200	/* Do not busy the page */
+#define	VM_ALLOC_IFCACHED	0x0400	/* Fail if the page is not cached */
+#define	VM_ALLOC_IFNOTCACHED	0x0800	/* Fail if the page is cached */
 
 void vm_page_flag_set(vm_page_t m, unsigned short bits);
 void vm_page_flag_clear(vm_page_t m, unsigned short bits);
@@ -329,25 +308,21 @@
 void vm_page_unhold(vm_page_t mem);
 void vm_page_free(vm_page_t m);
 void vm_page_free_zero(vm_page_t m);
-int vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg);
 void vm_page_dirty(vm_page_t m);
 void vm_page_wakeup(vm_page_t m);
 
 void vm_pageq_init(void);
-vm_page_t vm_pageq_add_new_page(vm_paddr_t pa);
 void vm_pageq_enqueue(int queue, vm_page_t m);
-void vm_pageq_remove_nowakeup(vm_page_t m);
 void vm_pageq_remove(vm_page_t m);
-vm_page_t vm_pageq_find(int basequeue, int index, boolean_t prefer_zero);
 void vm_pageq_requeue(vm_page_t m);
 
 void vm_page_activate (vm_page_t);
 vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
-vm_page_t vm_page_alloc_contig (vm_pindex_t, vm_paddr_t, vm_paddr_t,
-	    vm_offset_t, vm_offset_t);
-void vm_page_release_contig (vm_page_t, vm_pindex_t);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
 void vm_page_cache (register vm_page_t);
+void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
+void vm_page_cache_remove(vm_page_t);
+void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
 int vm_page_try_to_cache (vm_page_t);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_dontneed (register vm_page_t);
@@ -356,10 +331,9 @@
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 void vm_page_remove (vm_page_t);
 void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
-vm_page_t vm_page_select_cache(int);
+void vm_page_sleep(vm_page_t m, const char *msg);
 vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
-void vm_page_unmanage (vm_page_t);
 void vm_page_unwire (vm_page_t, int);
 void vm_page_wire (vm_page_t);
 void vm_page_set_validclean (vm_page_t, int, int);
@@ -376,6 +350,27 @@
 void vm_page_cowclear (vm_page_t);
 
 /*
+ *	vm_page_sleep_if_busy:
+ *
+ *	Sleep and release the page queues lock if VPO_BUSY is set or,
+ *	if also_m_busy is TRUE, busy is non-zero.  Returns TRUE if the
+ *	thread slept and the page queues lock was released.
+ *	Otherwise, retains the page queues lock and returns FALSE.
+ *
+ *	The object containing the given page must be locked.
+ */
+static __inline int
+vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+{
+
+	if ((m->oflags & VPO_BUSY) || (also_m_busy && m->busy)) {
+		vm_page_sleep(m, msg);
+		return (TRUE);
+	}
+	return (FALSE);
+}
+
+/*
  *	vm_page_undirty:
  *
  *	Set page to not be dirty.  Note: does not clear pmap modify bits
Index: vm_glue.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_glue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_glue.c -L sys/vm/vm_glue.c -u -r1.2 -r1.3
--- sys/vm/vm_glue.c
+++ sys/vm/vm_glue.c
@@ -57,7 +57,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_glue.c,v 1.213.2.1 2006/03/16 00:25:32 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_glue.c,v 1.225.4.1 2008/01/19 18:15:07 kib Exp $");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
@@ -112,7 +112,8 @@
 SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)
 
 #ifndef NO_SWAPPING
-static void swapout(struct proc *);
+static int swapout(struct proc *);
+static void swapclear(struct proc *);
 #endif
 
 
@@ -272,8 +273,8 @@
 	}
 	vm_page_lock_queues();
 	vm_page_hold(m);
-	vm_page_wakeup(m);
 	vm_page_unlock_queues();
+	vm_page_wakeup(m);
 out:
 	VM_OBJECT_UNLOCK(object);
 	return (m);
@@ -320,7 +321,7 @@
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  */
-void
+int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_object_t ksobj;
@@ -337,18 +338,22 @@
 	 * Allocate an object for the kstack.
 	 */
 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
-	td->td_kstack_obj = ksobj;
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 	ks = kmem_alloc_nofault(kernel_map,
 	   (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
-	if (ks == 0)
-		panic("vm_thread_new: kstack allocation failed");
+	if (ks == 0) {
+		printf("vm_thread_new: kstack allocation failed\n");
+		vm_object_deallocate(ksobj);
+		return (0);
+	}
+	
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
+	td->td_kstack_obj = ksobj;
 	td->td_kstack = ks;
 	/*
 	 * Knowing the number of pages allocated is useful when you
@@ -371,6 +376,7 @@
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
+	return (1);
 }
 
 /*
@@ -402,6 +408,7 @@
 	vm_object_deallocate(ksobj);
 	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
+	td->td_kstack = 0;
 }
 
 /*
@@ -456,8 +463,8 @@
 		ma[i] = m;
 		vm_page_lock_queues();
 		vm_page_wire(m);
-		vm_page_wakeup(m);
 		vm_page_unlock_queues();
+		vm_page_wakeup(m);
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
@@ -467,7 +474,7 @@
 /*
  * Set up a variable-sized alternate kstack.
  */
-void
+int
 vm_thread_new_altkstack(struct thread *td, int pages)
 {
 
@@ -475,7 +482,7 @@
 	td->td_altkstack_obj = td->td_kstack_obj;
 	td->td_altkstack_pages = td->td_kstack_pages;
 
-	vm_thread_new(td, pages);
+	return (vm_thread_new(td, pages));
 }
 
 /*
@@ -503,14 +510,16 @@
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  */
-void
-vm_forkproc(td, p2, td2, flags)
+int
+vm_forkproc(td, p2, td2, vm2, flags)
 	struct thread *td;
 	struct proc *p2;
 	struct thread *td2;
+	struct vmspace *vm2;
 	int flags;
 {
 	struct proc *p1 = td->td_proc;
+	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
@@ -520,11 +529,13 @@
 		 */
 		if ((flags & RFMEM) == 0) {
 			if (p1->p_vmspace->vm_refcnt > 1) {
-				vmspace_unshare(p1);
+				error = vmspace_unshare(p1);
+				if (error)
+					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
-		return;
+		return (0);
 	}
 
 	if (flags & RFMEM) {
@@ -537,7 +548,7 @@
 	}
 
 	if ((flags & RFMEM) == 0) {
-		p2->p_vmspace = vmspace_fork(p1->p_vmspace);
+		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
@@ -547,6 +558,7 @@
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
+	return (0);
 }
 
 /*
@@ -601,7 +613,7 @@
 #ifdef NO_SWAPPING
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	if ((p->p_sflag & PS_INMEM) == 0)
+	if ((p->p_flag & P_INMEM) == 0)
 		panic("faultin: proc swapped out with NO_SWAPPING!");
 #else /* !NO_SWAPPING */
 	struct thread *td;
@@ -611,34 +623,34 @@
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 */
-	if (p->p_sflag & PS_SWAPPINGIN)
-		msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
-	else if ((p->p_sflag & PS_INMEM) == 0) {
+	if (p->p_flag & P_SWAPPINGIN) {
+		while (p->p_flag & P_SWAPPINGIN)
+			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+		return;
+	}
+	if ((p->p_flag & P_INMEM) == 0) {
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
-		mtx_lock_spin(&sched_lock);
-		p->p_sflag |= PS_SWAPPINGIN;
-		mtx_unlock_spin(&sched_lock);
+		p->p_flag |= P_SWAPPINGIN;
 		PROC_UNLOCK(p);
 
+		/*
+		 * We hold no lock here because the list of threads
+		 * can not change while all threads in the process are
+		 * swapped out.
+		 */
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td);
-
 		PROC_LOCK(p);
-		mtx_lock_spin(&sched_lock);
-		p->p_sflag &= ~PS_SWAPPINGIN;
-		p->p_sflag |= PS_INMEM;
-		FOREACH_THREAD_IN_PROC(p, td) {
-			TD_CLR_SWAPPED(td);
-			if (TD_CAN_RUN(td))
-				setrunnable(td);
-		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SLOCK(p);
+		swapclear(p);
+		p->p_swtick = ticks;
+		PROC_SUNLOCK(p);
 
-		wakeup(&p->p_sflag);
+		wakeup(&p->p_flag);
 
 		/* Allow other threads to swap p out now. */
 		--p->p_lock;
@@ -662,9 +674,11 @@
 {
 	struct proc *p;
 	struct thread *td;
-	int pri;
 	struct proc *pp;
+	int slptime;
+	int swtime;
 	int ppri;
+	int pri;
 
 	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(&Giant);
@@ -672,9 +686,9 @@
 loop:
 	if (vm_page_count_min()) {
 		VM_WAIT;
-		mtx_lock_spin(&sched_lock);
+		thread_lock(&thread0);
 		proc0_rescan = 0;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(&thread0);
 		goto loop;
 	}
 
@@ -682,26 +696,27 @@
 	ppri = INT_MIN;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
-		struct ksegrp *kg;
-		if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
+		PROC_LOCK(p);
+		if (p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
+			PROC_UNLOCK(p);
 			continue;
 		}
-		mtx_lock_spin(&sched_lock);
+		swtime = (ticks - p->p_swtick) / hz;
+		PROC_SLOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/*
 			 * An otherwise runnable thread of a process
 			 * swapped out has only the TDI_SWAPPED bit set.
 			 * 
 			 */
+			thread_lock(td);
 			if (td->td_inhibitors == TDI_SWAPPED) {
-				kg = td->td_ksegrp;
-				pri = p->p_swtime + kg->kg_slptime;
-				if ((p->p_sflag & PS_SWAPINREQ) == 0) {
+				slptime = (ticks - td->td_slptick) / hz;
+				pri = swtime + slptime;
+				if ((td->td_flags & TDF_SWAPINREQ) == 0)
 					pri -= p->p_nice * 8;
-				}
-
 				/*
-				 * if this ksegrp is higher priority
+				 * if this thread is higher priority
 				 * and there is enough space, then select
 				 * this process instead of the previous
 				 * selection.
@@ -711,8 +726,10 @@
 					ppri = pri;
 				}
 			}
+			thread_unlock(td);
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
+		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 
@@ -720,13 +737,13 @@
 	 * Nothing to do, back to sleep.
 	 */
 	if ((p = pp) == NULL) {
-		mtx_lock_spin(&sched_lock);
+		thread_lock(&thread0);
 		if (!proc0_rescan) {
 			TD_SET_IWAIT(&thread0);
 			mi_switch(SW_VOL, NULL);
 		}
 		proc0_rescan = 0;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(&thread0);
 		goto loop;
 	}
 	PROC_LOCK(p);
@@ -736,28 +753,23 @@
 	 * brought this process in while we traverse all threads.
 	 * Or, this process may even be being swapped out again.
 	 */
-	if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
+	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
 		PROC_UNLOCK(p);
-		mtx_lock_spin(&sched_lock);
+		thread_lock(&thread0);
 		proc0_rescan = 0;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(&thread0);
 		goto loop;
 	}
 
-	mtx_lock_spin(&sched_lock);
-	p->p_sflag &= ~PS_SWAPINREQ;
-	mtx_unlock_spin(&sched_lock);
-
 	/*
 	 * We would like to bring someone in. (only if there is space).
 	 * [What checks the space? ]
 	 */
 	faultin(p);
 	PROC_UNLOCK(p);
-	mtx_lock_spin(&sched_lock);
-	p->p_swtime = 0;
+	thread_lock(&thread0);
 	proc0_rescan = 0;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(&thread0);
 	goto loop;
 }
 
@@ -765,16 +777,18 @@
 {
 	struct thread *td = &thread0;
 
-		
+	/* XXX This will probably cause a LOR in some cases */
+	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
-		CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, 0);
+		CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0);
 		TD_CLR_IWAIT(td);
-		setrunqueue(td, SRQ_INTR);
+		sched_add(td, SRQ_INTR);
 	} else {
 		proc0_rescan = 1;
 		CTR2(KTR_INTR, "%s: state %d",
 		    __func__, td->td_state);
 	}
+	thread_unlock(td);
 	
 }
 
@@ -798,7 +812,7 @@
 
 /*
  * Swapout is driven by the pageout daemon.  Very simple, we find eligible
- * procs and unwire their u-areas.  We try to always "swap" at least one
+ * procs and swap out their stacks.  We try to always "swap" at least one
  * process in case we need the room for a swapin.
  * If any procs have been sleeping/stopped for at least maxslp seconds,
  * they are swapped.  Else, we swap the longest-sleeping or stopped process,
@@ -810,7 +824,6 @@
 {
 	struct proc *p;
 	struct thread *td;
-	struct ksegrp *kg;
 	int didswap = 0;
 
 retry:
@@ -818,19 +831,15 @@
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct vmspace *vm;
 		int minslptime = 100000;
+		int slptime;
 		
 		/*
 		 * Watch out for a process in
 		 * creation.  It may have no
 		 * address space or lock yet.
 		 */
-		mtx_lock_spin(&sched_lock);
-		if (p->p_state == PRS_NEW) {
-			mtx_unlock_spin(&sched_lock);
+		if (p->p_state == PRS_NEW)
 			continue;
-		}
-		mtx_unlock_spin(&sched_lock);
-
 		/*
 		 * An aio daemon switches its
 		 * address space while running.
@@ -839,7 +848,6 @@
 		 */
 		if ((p->p_flag & P_SYSTEM) != 0)
 			continue;
-
 		/*
 		 * Do not swapout a process that
 		 * is waiting for VM data
@@ -852,12 +860,9 @@
 		 * process may attempt to alter
 		 * the map.
 		 */
-		PROC_LOCK(p);
-		vm = p->p_vmspace;
-		KASSERT(vm != NULL,
-			("swapout_procs: a process has no address space"));
-		atomic_add_int(&vm->vm_refcnt, 1);
-		PROC_UNLOCK(p);
+		vm = vmspace_acquire_ref(p);
+		if (vm == NULL)
+			continue;
 		if (!vm_map_trylock(&vm->vm_map))
 			goto nextproc1;
 
@@ -872,7 +877,7 @@
 		 * skipped because of the if statement above checking 
 		 * for P_SYSTEM
 		 */
-		if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
+		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
 			goto nextproc2;
 
 		switch (p->p_state) {
@@ -882,21 +887,26 @@
 			break;
 
 		case PRS_NORMAL:
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			/*
 			 * do not swapout a realtime process
 			 * Check all the thread groups..
 			 */
-			FOREACH_KSEGRP_IN_PROC(p, kg) {
-				if (PRI_IS_REALTIME(kg->kg_pri_class))
+			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
+				if (PRI_IS_REALTIME(td->td_pri_class)) {
+					thread_unlock(td);
 					goto nextproc;
-
+				}
+				slptime = (ticks - td->td_slptick) / hz;
 				/*
 				 * Guarantee swap_idle_threshold1
 				 * time in memory.
 				 */
-				if (kg->kg_slptime < swap_idle_threshold1)
+				if (slptime < swap_idle_threshold1) {
+					thread_unlock(td);
 					goto nextproc;
+				}
 
 				/*
 				 * Do not swapout a process if it is
@@ -907,10 +917,10 @@
 				 * This could be refined to support
 				 * swapping out a thread.
 				 */
-				FOREACH_THREAD_IN_GROUP(kg, td) {
-					if ((td->td_priority) < PSOCK ||
-					    !thread_safetoswapout(td))
-						goto nextproc;
+				if ((td->td_priority) < PSOCK ||
+				    !thread_safetoswapout(td)) {
+					thread_unlock(td);
+					goto nextproc;
 				}
 				/*
 				 * If the system is under memory stress,
@@ -920,11 +930,14 @@
 				 */
 				if (((action & VM_SWAP_NORMAL) == 0) &&
 				    (((action & VM_SWAP_IDLE) == 0) ||
-				    (kg->kg_slptime < swap_idle_threshold2)))
+				    (slptime < swap_idle_threshold2))) {
+					thread_unlock(td);
 					goto nextproc;
+				}
 
-				if (minslptime > kg->kg_slptime)
-					minslptime = kg->kg_slptime;
+				if (minslptime > slptime)
+					minslptime = slptime;
+				thread_unlock(td);
 			}
 
 			/*
@@ -935,9 +948,9 @@
 			if ((action & VM_SWAP_NORMAL) ||
 				((action & VM_SWAP_IDLE) &&
 				 (minslptime > swap_idle_threshold2))) {
-				swapout(p);
-				didswap++;
-				mtx_unlock_spin(&sched_lock);
+				if (swapout(p) == 0)
+					didswap++;
+				PROC_SUNLOCK(p);
 				PROC_UNLOCK(p);
 				vm_map_unlock(&vm->vm_map);
 				vmspace_free(vm);
@@ -945,7 +958,7 @@
 				goto retry;
 			}
 nextproc:			
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 		}
 nextproc2:
 		PROC_UNLOCK(p);
@@ -964,13 +977,35 @@
 }
 
 static void
+swapclear(p)
+	struct proc *p;
+{
+	struct thread *td;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		td->td_flags |= TDF_INMEM;
+		td->td_flags &= ~TDF_SWAPINREQ;
+		TD_CLR_SWAPPED(td);
+		if (TD_CAN_RUN(td))
+			setrunnable(td);
+		thread_unlock(td);
+	}
+	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
+	p->p_flag |= P_INMEM;
+}
+
+static int
 swapout(p)
 	struct proc *p;
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED);
 #if defined(SWAP_DEBUG)
 	printf("swapping out %d\n", p->p_pid);
 #endif
@@ -980,40 +1015,46 @@
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
-	KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
+	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
 		("swapout: lost a swapout race?"));
 
-#if defined(INVARIANTS)
 	/*
-	 * Make sure that all threads are safe to be swapped out.
-	 *
-	 * Alternatively, we could swap out only safe threads.
+	 * remember the process resident count
+	 */
+	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+	/*
+	 * Check and mark all threads before we proceed.
 	 */
+	p->p_flag &= ~P_INMEM;
+	p->p_flag |= P_SWAPPINGOUT;
 	FOREACH_THREAD_IN_PROC(p, td) {
-		KASSERT(thread_safetoswapout(td),
-			("swapout: there is a thread not safe for swapout"));
+		thread_lock(td);
+		if (!thread_safetoswapout(td)) {
+			thread_unlock(td);
+			swapclear(p);
+			return (EBUSY);
+		}
+		td->td_flags &= ~TDF_INMEM;
+		TD_SET_SWAPPED(td);
+		thread_unlock(td);
 	}
-#endif /* INVARIANTS */
+	td = FIRST_THREAD_IN_PROC(p);
+	++td->td_ru.ru_nswap;
+	PROC_SUNLOCK(p);
+	PROC_UNLOCK(p);
 
-	++p->p_stats->p_ru.ru_nswap;
 	/*
-	 * remember the process resident count
+	 * This list is stable because all threads are now prevented from
+	 * running.  The list is only modified in the context of a running
+	 * thread in this process.
 	 */
-	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
-
-	p->p_sflag &= ~PS_INMEM;
-	p->p_sflag |= PS_SWAPPINGOUT;
-	PROC_UNLOCK(p);
-	FOREACH_THREAD_IN_PROC(p, td)
-		TD_SET_SWAPPED(td);
-	mtx_unlock_spin(&sched_lock);
-
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
-	p->p_sflag &= ~PS_SWAPPINGOUT;
-	p->p_swtime = 0;
+	p->p_flag &= ~P_SWAPPINGOUT;
+	PROC_SLOCK(p);
+	p->p_swtick = ticks;
+	return (0);
 }
 #endif /* !NO_SWAPPING */
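
The reworked swapout() above no longer just asserts that every thread is safe to swap; it marks threads one at a time under the thread lock, and if any thread turns out to be unsafe it calls the new swapclear() to undo the marks and returns EBUSY, so the caller only counts a swap that actually happened. What follows is a minimal user-space sketch of that mark-all-or-roll-back pattern; the fake_thread structure, the flag bits, and the function names are invented for illustration and are not the kernel interfaces.

#include <errno.h>
#include <stdio.h>

#define	TDF_INMEM	0x1	/* hypothetical flag bits for the sketch */
#define	TDF_SWAPPED	0x2

struct fake_thread {
	int	flags;
	int	safe_to_swap;
};

/* Roll every thread back to the in-memory state (cf. swapclear()). */
static void
clear_all(struct fake_thread *td, int n)
{
	for (int i = 0; i < n; i++) {
		td[i].flags |= TDF_INMEM;
		td[i].flags &= ~TDF_SWAPPED;
	}
}

/*
 * Mark every thread as swapped out, but undo the whole operation and
 * report EBUSY as soon as one thread is found to be unsafe, mirroring
 * the new swapout()/swapclear() contract.
 */
static int
swapout_all_or_nothing(struct fake_thread *td, int n)
{
	for (int i = 0; i < n; i++) {
		if (!td[i].safe_to_swap) {
			clear_all(td, n);
			return (EBUSY);
		}
		td[i].flags &= ~TDF_INMEM;
		td[i].flags |= TDF_SWAPPED;
	}
	return (0);
}

int
main(void)
{
	struct fake_thread p[3] = {
		{ TDF_INMEM, 1 }, { TDF_INMEM, 1 }, { TDF_INMEM, 0 }
	};

	if (swapout_all_or_nothing(p, 3) == 0)
		printf("swapped out\n");
	else
		printf("EBUSY, rolled back\n");
	return (0);
}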
Index: uma_dbg.c
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_dbg.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_dbg.c -L sys/vm/uma_dbg.c -u -r1.1.1.1 -r1.2
--- sys/vm/uma_dbg.c
+++ sys/vm/uma_dbg.c
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/uma_dbg.c,v 1.20.2.1 2005/08/20 13:31:05 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/uma_dbg.c,v 1.21 2005/07/16 09:51:52 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
Index: swap_pager.h
===================================================================
RCS file: /home/cvs/src/sys/vm/swap_pager.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/swap_pager.h -L sys/vm/swap_pager.h -u -r1.1.1.1 -r1.2
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -32,12 +32,44 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)swap_pager.h	7.1 (Berkeley) 12/5/90
- * $FreeBSD: src/sys/vm/swap_pager.h,v 1.50 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/swap_pager.h,v 1.52 2007/02/07 17:43:11 jhb Exp $
  */
 
 #ifndef	_VM_SWAP_PAGER_H_
 #define	_VM_SWAP_PAGER_H_ 1
 
+typedef	int32_t	swblk_t;	/*
+				 * swap offset.  This is the type used to
+				 * address the "virtual swap device" and
+				 * therefore the maximum swap space is
+				 * 2^32 pages.
+				 */
+
+struct buf;
+struct swdevt;
+typedef void sw_strategy_t(struct buf *, struct swdevt *);
+typedef void sw_close_t(struct thread *, struct swdevt *);
+
+/*
+ * Swap device table
+ */
+struct swdevt {
+	int	sw_flags;
+	int	sw_nblks;
+	int     sw_used;
+	dev_t	sw_dev;
+	struct vnode *sw_vp;
+	void	*sw_id;
+	swblk_t	sw_first;
+	swblk_t	sw_end;
+	struct blist *sw_blist;
+	TAILQ_ENTRY(swdevt)	sw_list;
+	sw_strategy_t		*sw_strategy;
+	sw_close_t		*sw_close;
+};
+
+#define	SW_CLOSING	0x04
+
 #ifdef _KERNEL
 
 extern int swap_pager_full;
@@ -50,6 +82,7 @@
 int swap_pager_isswapped(vm_object_t, struct swdevt *);
 int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
 void swap_pager_status(int *total, int *used);
+void swapoff_all(void);
 
 #endif				/* _KERNEL */
 #endif				/* _VM_SWAP_PAGER_H_ */
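
Moving struct swdevt and the sw_strategy_t/sw_close_t typedefs from swap_pager.c into this header exposes the per-device callback table: each swap device carries its own I/O strategy and close routine, and the swap pager simply calls through the pointers. The stand-alone sketch below shows that callback-table idiom; only the shape of the typedefs mirrors the header, while the mydev/disk_* names are made up.

#include <stdio.h>

struct buf;			/* opaque in this sketch */
struct mydev;

/* Same shape as the sw_strategy_t / sw_close_t pointers in swdevt. */
typedef void my_strategy_t(struct buf *, struct mydev *);
typedef void my_close_t(struct mydev *);

struct mydev {
	const char	*name;
	my_strategy_t	*strategy;	/* per-device I/O entry point */
	my_close_t	*close;		/* per-device teardown */
};

static void
disk_strategy(struct buf *bp, struct mydev *dev)
{
	(void)bp;
	printf("issuing I/O through the %s back end\n", dev->name);
}

static void
disk_close(struct mydev *dev)
{
	printf("closing %s\n", dev->name);
}

int
main(void)
{
	/* Each device supplies its own strategy/close pair. */
	struct mydev d = { "disk", disk_strategy, disk_close };

	d.strategy(NULL, &d);
	d.close(&d);
	return (0);
}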
Index: vm_page.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_page.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_page.c -L sys/vm/vm_page.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -97,7 +97,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_page.c,v 1.304.2.2 2005/11/13 08:44:25 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_page.c,v 1.357.2.3 2007/11/28 22:23:35 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -117,10 +117,13 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
+#include <machine/md_var.h>
+
 /*
  *	Associated with page of user-allocatable memory is a
  *	page structure.
@@ -156,6 +159,36 @@
 }
 
 /*
+ *	vm_page_blacklist_lookup:
+ *
+ *	See if a physical address in this page has been listed
+ *	in the blacklist tunable.  Entries in the tunable are
+ *	separated by spaces or commas.  If an invalid integer is
+ *	encountered then the rest of the string is skipped.
+ */
+static int
+vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
+{
+	vm_paddr_t bad;
+	char *cp, *pos;
+
+	for (pos = list; *pos != '\0'; pos = cp) {
+		bad = strtoq(pos, &cp, 0);
+		if (*cp != '\0') {
+			if (*cp == ' ' || *cp == ',') {
+				cp++;
+				if (cp == pos)
+					continue;
+			} else
+				break;
+		}
+		if (pa == trunc_page(bad))
+			return (1);
+	}
+	return (0);
+}
+
+/*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.
@@ -175,10 +208,12 @@
 	vm_paddr_t pa;
 	int nblocks;
 	vm_paddr_t last_pa;
+	char *list;
 
 	/* the biggest memory array is the second group of pages */
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
+	vm_paddr_t low_water, high_water;
 	int biggestone;
 
 	vm_paddr_t total;
@@ -194,6 +229,9 @@
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 
+	low_water = phys_avail[0];
+	high_water = phys_avail[1];
+
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
 
@@ -201,6 +239,10 @@
 			biggestone = i;
 			biggestsize = size;
 		}
+		if (phys_avail[i] < low_water)
+			low_water = phys_avail[i];
+		if (phys_avail[i + 1] > high_water)
+			high_water = phys_avail[i + 1];
 		++nblocks;
 		total += size;
 	}
@@ -213,7 +255,7 @@
 	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
 	    MTX_RECURSE);
 	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
-	    MTX_SPIN);
+	    MTX_DEF);
 
 	/*
 	 * Initialize the queue headers for the free queue, the active queue
@@ -232,13 +274,40 @@
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
+#if defined(__amd64__) || defined(__i386__)
+	/*
+	 * Allocate a bitmap to indicate that a random physical page
+	 * needs to be included in a minidump.
+	 *
+	 * The amd64 port needs this to indicate which direct map pages
+	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
+	 *
+	 * However, i386 still needs this workspace internally within the
+	 * minidump code.  In theory, they are not needed on i386, but are
+	 * included should the sf_buf code decide to use them.
+	 */
+	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
+	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
+	new_end -= vm_page_dump_size;
+	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
+	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
+	bzero((void *)vm_page_dump, vm_page_dump_size);
+#endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use (taking into account the overhead of a page structure per
 	 * page).
 	 */
-	first_page = phys_avail[0] / PAGE_SIZE;
-	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
+	first_page = low_water / PAGE_SIZE;
+#ifdef VM_PHYSSEG_SPARSE
+	page_range = 0;
+	for (i = 0; phys_avail[i + 1] != 0; i += 2)
+		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+#elif defined(VM_PHYSSEG_DENSE)
+	page_range = high_water / PAGE_SIZE - first_page;
+#else
+#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
+#endif
 	npages = (total - (page_range * sizeof(struct vm_page)) -
 	    (end - new_end)) / PAGE_SIZE;
 	end = new_end;
@@ -256,30 +325,61 @@
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array = (vm_page_t) mapped;
+#ifdef __amd64__
+	/*
+	 * pmap_map on amd64 comes out of the direct-map, not kvm like i386,
+	 * so the pages must be tracked for a crashdump to include this data.
+	 * This includes the vm_page_array and the early UMA bootstrap pages.
+	 */
+	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
+		dump_add_page(pa);
+#endif	
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
+	for (i = 0; i < page_range; i++)
+		vm_page_array[i].order = VM_NFREEORDER;
 	vm_page_array_size = page_range;
 
 	/*
-	 * Construct the free queue(s) in descending order (by physical
-	 * address) so that the first 16MB of physical memory is allocated
-	 * last rather than first.  On large-memory machines, this avoids
-	 * the exhaustion of low physical memory before isa_dma_init has run.
+	 * This assertion tests the hypothesis that npages and total are
+	 * redundant.  XXX
+	 */
+	page_range = 0;
+	for (i = 0; phys_avail[i + 1] != 0; i += 2)
+		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+	KASSERT(page_range == npages,
+	    ("vm_page_startup: inconsistent page counts"));
+
+	/*
+	 * Initialize the physical memory allocator.
+	 */
+	vm_phys_init();
+
+	/*
+	 * Add every available physical page that is not blacklisted to
+	 * the free lists.
 	 */
 	cnt.v_page_count = 0;
 	cnt.v_free_count = 0;
-	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
+	list = getenv("vm.blacklist");
+	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		pa = phys_avail[i];
 		last_pa = phys_avail[i + 1];
-		while (pa < last_pa && npages-- > 0) {
-			vm_pageq_add_new_page(pa);
+		while (pa < last_pa) {
+			if (list != NULL &&
+			    vm_page_blacklist_lookup(list, pa))
+				printf("Skipping page with pa 0x%jx\n",
+				    (uintmax_t)pa);
+			else
+				vm_phys_add_page(pa);
 			pa += PAGE_SIZE;
 		}
 	}
+	freeenv(list);
 	return (vaddr);
 }
 
@@ -304,9 +404,9 @@
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT((m->flags & PG_BUSY) == 0,
+	KASSERT((m->oflags & VPO_BUSY) == 0,
 	    ("vm_page_busy: page already busy!!!"));
-	vm_page_flag_set(m, PG_BUSY);
+	m->oflags |= VPO_BUSY;
 }
 
 /*
@@ -319,8 +419,8 @@
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if (m->flags & PG_WANTED) {
-		vm_page_flag_clear(m, PG_WANTED);
+	if (m->oflags & VPO_WANTED) {
+		m->oflags &= ~VPO_WANTED;
 		wakeup(m);
 	}
 }
@@ -328,7 +428,7 @@
 /*
  *      vm_page_wakeup:
  *
- *      clear the PG_BUSY flag and wakeup anyone waiting for the
+ *      clear the VPO_BUSY flag and wakeup anyone waiting for the
  *      page.
  *
  */
@@ -337,8 +437,8 @@
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
-	vm_page_flag_clear(m, PG_BUSY);
+	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
+	m->oflags &= ~VPO_BUSY;
 	vm_page_flash(m);
 }
 
@@ -355,7 +455,6 @@
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	m->busy--;
 	if (m->busy == 0)
 		vm_page_flash(m);
@@ -382,26 +481,21 @@
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	--mem->hold_count;
 	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
-	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
+	if (mem->hold_count == 0 && VM_PAGE_INQUEUE2(mem, PQ_HOLD))
 		vm_page_free_toq(mem);
 }
 
 /*
  *	vm_page_free:
  *
- *	Free a page
- *
- *	The clearing of PG_ZERO is a temporary safety until the code can be
- *	reviewed to determine that PG_ZERO is being properly cleared on
- *	write faults or maps.  PG_ZERO was previously cleared in
- *	vm_page_alloc().
+ *	Free a page.
  */
 void
 vm_page_free(vm_page_t m)
 {
-	vm_page_flag_clear(m, PG_ZERO);
+
+	m->flags &= ~PG_ZERO;
 	vm_page_free_toq(m);
-	vm_page_zero_idle_wakeup();
 }
 
 /*
@@ -412,41 +506,37 @@
 void
 vm_page_free_zero(vm_page_t m)
 {
-	vm_page_flag_set(m, PG_ZERO);
+
+	m->flags |= PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
- *	vm_page_sleep_if_busy:
+ *	vm_page_sleep:
+ *
+ *	Sleep and release the page queues lock.
  *
- *	Sleep and release the page queues lock if PG_BUSY is set or,
- *	if also_m_busy is TRUE, busy is non-zero.  Returns TRUE if the
- *	thread slept and the page queues lock was released.
- *	Otherwise, retains the page queues lock and returns FALSE.
+ *	The object containing the given page must be locked.
  */
-int
-vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+void
+vm_page_sleep(vm_page_t m, const char *msg)
 {
-	vm_object_t object;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
-		vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
-		/*
-		 * It's possible that while we sleep, the page will get
-		 * unbusied and freed.  If we are holding the object
-		 * lock, we will assume we hold a reference to the object
-		 * such that even if m->object changes, we can re-lock
-		 * it.
-		 */
-		object = m->object;
-		VM_OBJECT_UNLOCK(object);
-		msleep(m, &vm_page_queue_mtx, PDROP | PVM, msg, 0);
-		VM_OBJECT_LOCK(object);
-		return (TRUE);
-	}
-	return (FALSE);
+	if (!mtx_owned(&vm_page_queue_mtx))
+		vm_page_lock_queues();
+	vm_page_flag_set(m, PG_REFERENCED);
+	vm_page_unlock_queues();
+
+	/*
+	 * It's possible that while we sleep, the page will get
+	 * unbusied and freed.  If we are holding the object
+	 * lock, we will assume we hold a reference to the object
+	 * such that even if m->object changes, we can re-lock
+	 * it.
+	 */
+	m->oflags |= VPO_WANTED;
+	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
 }
 
 /*
@@ -457,9 +547,9 @@
 void
 vm_page_dirty(vm_page_t m)
 {
-	KASSERT(m->queue - m->pc != PQ_CACHE,
+	KASSERT((m->flags & PG_CACHED) == 0,
 	    ("vm_page_dirty: page in cache!"));
-	KASSERT(m->queue - m->pc != PQ_FREE,
+	KASSERT(!VM_PAGE_IS_FREE(m),
 	    ("vm_page_dirty: page is free!"));
 	m->dirty = VM_PAGE_BITS_ALL;
 }
@@ -588,7 +678,7 @@
 
 	/*
 	 * Since we are inserting a new and possibly dirty page,
-	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
+	 * update the object's OBJ_MIGHTBEDIRTY flag.
 	 */
 	if (m->flags & PG_WRITEABLE)
 		vm_object_set_writeable_dirty(object);
@@ -612,14 +702,14 @@
 	vm_object_t object;
 	vm_page_t root;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if (m->flags & PG_BUSY) {
-		vm_page_flag_clear(m, PG_BUSY);
+	if (m->oflags & VPO_BUSY) {
+		m->oflags &= ~VPO_BUSY;
 		vm_page_flash(m);
 	}
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * Now remove from the object's list of backed pages.
@@ -700,48 +790,201 @@
 
 	vm_page_remove(m);
 	vm_page_insert(m, new_object, new_pindex);
-	if (m->queue - m->pc == PQ_CACHE)
-		vm_page_deactivate(m);
 	vm_page_dirty(m);
 }
 
 /*
- *	vm_page_select_cache:
+ *	Convert all of the given object's cached pages that have a
+ *	pindex within the given range into free pages.  If the value
+ *	zero is given for "end", then the range's upper bound is
+ *	infinity.  If the given object is backed by a vnode and it
+ *	transitions from having one or more cached pages to none, the
+ *	vnode's hold count is reduced. 
+ */
+void
+vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+{
+	vm_page_t m, m_next;
+	boolean_t empty;
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (__predict_false(object->cache == NULL)) {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		return;
+	}
+	m = object->cache = vm_page_splay(start, object->cache);
+	if (m->pindex < start) {
+		if (m->right == NULL)
+			m = NULL;
+		else {
+			m_next = vm_page_splay(start, m->right);
+			m_next->left = m;
+			m->right = NULL;
+			m = object->cache = m_next;
+		}
+	}
+
+	/*
+	 * At this point, "m" is either (1) a reference to the page
+	 * with the least pindex that is greater than or equal to
+	 * "start" or (2) NULL.
+	 */
+	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
+		/*
+		 * Find "m"'s successor and remove "m" from the
+		 * object's cache.
+		 */
+		if (m->right == NULL) {
+			object->cache = m->left;
+			m_next = NULL;
+		} else {
+			m_next = vm_page_splay(start, m->right);
+			m_next->left = m->left;
+			object->cache = m_next;
+		}
+		/* Convert "m" to a free page. */
+		m->object = NULL;
+		m->valid = 0;
+		/* Clear PG_CACHED and set PG_FREE. */
+		m->flags ^= PG_CACHED | PG_FREE;
+		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+		    ("vm_page_cache_free: page %p has inconsistent flags", m));
+		cnt.v_cache_count--;
+		cnt.v_free_count++;
+	}
+	empty = object->cache == NULL;
+	mtx_unlock(&vm_page_queue_free_mtx);
+	if (object->type == OBJT_VNODE && empty)
+		vdrop(object->handle);
+}
+
+/*
+ *	Returns the cached page that is associated with the given
+ *	object and offset.  If, however, none exists, returns NULL.
  *
- *	Move a page of the given color from the cache queue to the free
- *	queue.  As pages might be found, but are not applicable, they are
- *	deactivated.
+ *	The free page queue must be locked.
+ */
+static inline vm_page_t
+vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
+{
+	vm_page_t m;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	if ((m = object->cache) != NULL && m->pindex != pindex) {
+		m = vm_page_splay(pindex, m);
+		if ((object->cache = m)->pindex != pindex)
+			m = NULL;
+	}
+	return (m);
+}
+
+/*
+ *	Remove the given cached page from its containing object's
+ *	collection of cached pages.
  *
- *	This routine may not block.
+ *	The free page queue must be locked.
  */
-vm_page_t
-vm_page_select_cache(int color)
+void
+vm_page_cache_remove(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t m;
-	boolean_t was_trylocked;
+	vm_page_t root;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) {
-		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
-		KASSERT(!pmap_page_is_mapped(m),
-		    ("Found mapped cache page %p", m));
-		KASSERT((m->flags & PG_UNMANAGED) == 0,
-		    ("Found unmanaged cache page %p", m));
-		KASSERT(m->wire_count == 0, ("Found wired cache page %p", m));
-		if (m->hold_count == 0 && (object = m->object,
-		    (was_trylocked = VM_OBJECT_TRYLOCK(object)) ||
-		    VM_OBJECT_LOCKED(object))) {
-			KASSERT((m->flags & PG_BUSY) == 0 && m->busy == 0,
-			    ("Found busy cache page %p", m));
-			vm_page_free(m);
-			if (was_trylocked)
-				VM_OBJECT_UNLOCK(object);
-			break;
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	KASSERT((m->flags & PG_CACHED) != 0,
+	    ("vm_page_cache_remove: page %p is not cached", m));
+	object = m->object;
+	if (m != object->cache) {
+		root = vm_page_splay(m->pindex, object->cache);
+		KASSERT(root == m,
+		    ("vm_page_cache_remove: page %p is not cached in object %p",
+		    m, object));
+	}
+	if (m->left == NULL)
+		root = m->right;
+	else if (m->right == NULL)
+		root = m->left;
+	else {
+		root = vm_page_splay(m->pindex, m->left);
+		root->right = m->right;
+	}
+	object->cache = root;
+	m->object = NULL;
+	cnt.v_cache_count--;
+}
+
+/*
+ *	Transfer all of the cached pages with offset greater than or
+ *	equal to 'offidxstart' from the original object's cache to the
+ *	new object's cache.  However, any cached pages with offset
+ *	greater than or equal to the new object's size are kept in the
+ *	original object.  Initially, the new object's cache must be
+ *	empty.  Offset 'offidxstart' in the original object must
+ *	correspond to offset zero in the new object.
+ *
+ *	The new object must be locked.
+ */
+void
+vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
+    vm_object_t new_object)
+{
+	vm_page_t m, m_next;
+
+	/*
+	 * Insertion into an object's collection of cached pages
+	 * requires the object to be locked.  In contrast, removal does
+	 * not.
+	 */
+	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
+	KASSERT(new_object->cache == NULL,
+	    ("vm_page_cache_transfer: object %p has cached pages",
+	    new_object));
+	mtx_lock(&vm_page_queue_free_mtx);
+	if ((m = orig_object->cache) != NULL) {
+		/*
+		 * Transfer all of the pages with offset greater than or
+		 * equal to 'offidxstart' from the original object's
+		 * cache to the new object's cache.
+		 */
+		m = vm_page_splay(offidxstart, m);
+		if (m->pindex < offidxstart) {
+			orig_object->cache = m;
+			new_object->cache = m->right;
+			m->right = NULL;
+		} else {
+			orig_object->cache = m->left;
+			new_object->cache = m;
+			m->left = NULL;
 		}
-		vm_page_deactivate(m);
+		while ((m = new_object->cache) != NULL) {
+			if ((m->pindex - offidxstart) >= new_object->size) {
+				/*
+				 * Return all of the cached pages with
+				 * offset greater than or equal to the
+				 * new object's size to the original
+				 * object's cache. 
+				 */
+				new_object->cache = m->left;
+				m->left = orig_object->cache;
+				orig_object->cache = m;
+				break;
+			}
+			m_next = vm_page_splay(m->pindex, m->right);
+			/* Update the page's object and offset. */
+			m->object = new_object;
+			m->pindex -= offidxstart;
+			if (m_next == NULL)
+				break;
+			m->right = NULL;
+			m_next->left = m;
+			new_object->cache = m_next;
+		}
+		KASSERT(new_object->cache == NULL ||
+		    new_object->type == OBJT_SWAP,
+		    ("vm_page_cache_transfer: object %p's type is incompatible"
+		    " with cached pages", new_object));
 	}
-	return (m);
+	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
@@ -757,16 +1000,14 @@
  *	VM_ALLOC_ZERO		zero page
  *
  *	This routine may not block.
- *
- *	Additional special handling is required when called from an
- *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
- *	the page cache in this case.
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
-	vm_page_t m = NULL;
-	int color, flags, page_req;
+	struct vnode *vp = NULL;
+	vm_object_t m_object;
+	vm_page_t m;
+	int flags, page_req;
 
 	page_req = req & VM_ALLOC_CLASS_MASK;
 	KASSERT(curthread->td_intr_nesting_level == 0 ||
@@ -777,9 +1018,7 @@
 		KASSERT(object != NULL,
 		    ("vm_page_alloc: NULL object."));
 		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-		color = (pindex + object->pg_color) & PQ_L2_MASK;
-	} else
-		color = pindex & PQ_L2_MASK;
+	}
 
 	/*
 	 * The pager is allowed to eat deeper into the free page list.
@@ -788,43 +1027,35 @@
 		page_req = VM_ALLOC_SYSTEM;
 	};
 
-loop:
-	mtx_lock_spin(&vm_page_queue_free_mtx);
-	if (cnt.v_free_count > cnt.v_free_reserved ||
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
 	    (page_req == VM_ALLOC_SYSTEM && 
-	     cnt.v_cache_count == 0 && 
-	     cnt.v_free_count > cnt.v_interrupt_free_min) ||
-	    (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) {
+	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+	    (page_req == VM_ALLOC_INTERRUPT &&
+	    cnt.v_free_count + cnt.v_cache_count > 0)) {
 		/*
 		 * Allocate from the free queue if the number of free pages
 		 * exceeds the minimum for the request class.
 		 */
-		m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0);
-	} else if (page_req != VM_ALLOC_INTERRUPT) {
-		mtx_unlock_spin(&vm_page_queue_free_mtx);
-		/*
-		 * Allocatable from cache (non-interrupt only).  On success,
-		 * we must free the page and try again, thus ensuring that
-		 * cnt.v_*_free_min counters are replenished.
-		 */
-		vm_page_lock_queues();
-		if ((m = vm_page_select_cache(color)) == NULL) {
-#if defined(DIAGNOSTIC)
-			if (cnt.v_cache_count > 0)
-				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
-#endif
-			vm_page_unlock_queues();
-			atomic_add_int(&vm_pageout_deficit, 1);
-			pagedaemon_wakeup();
+		if (object != NULL &&
+		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
+			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
+				mtx_unlock(&vm_page_queue_free_mtx);
+				return (NULL);
+			}
+			vm_phys_unfree_page(m);
+			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
+		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
+			mtx_unlock(&vm_page_queue_free_mtx);
 			return (NULL);
-		}
-		vm_page_unlock_queues();
-		goto loop;
+		} else
+			m = vm_phys_alloc_pages(object != NULL ?
+			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 	} else {
 		/*
-		 * Not allocatable from cache from interrupt, give up.
+		 * Not allocatable, give up.
 		 */
-		mtx_unlock_spin(&vm_page_queue_free_mtx);
+		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, 1);
 		pagedaemon_wakeup();
 		return (NULL);
@@ -838,24 +1069,41 @@
 	    m != NULL,
 	    ("vm_page_alloc(): missing page on free queue")
 	);
-
-	/*
-	 * Remove from free queue
-	 */
-	vm_pageq_remove_nowakeup(m);
+	if ((m->flags & PG_CACHED) != 0) {
+		KASSERT(m->valid != 0,
+		    ("vm_page_alloc: cached page %p is invalid", m));
+		if (m->object == object && m->pindex == pindex)
+	  		cnt.v_reactivated++;
+		else
+			m->valid = 0;
+		m_object = m->object;
+		vm_page_cache_remove(m);
+		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+			vp = m_object->handle;
+	} else {
+		KASSERT(VM_PAGE_IS_FREE(m),
+		    ("vm_page_alloc: page %p is not free", m));
+		KASSERT(m->valid == 0,
+		    ("vm_page_alloc: free page %p is valid", m));
+		cnt.v_free_count--;
+	}
 
 	/*
 	 * Initialize structure.  Only the PG_ZERO flag is inherited.
 	 */
-	flags = PG_BUSY;
+	flags = 0;
 	if (m->flags & PG_ZERO) {
 		vm_page_zero_count--;
 		if (req & VM_ALLOC_ZERO)
-			flags = PG_ZERO | PG_BUSY;
+			flags = PG_ZERO;
 	}
-	if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
-		flags &= ~PG_BUSY;
+	if (object == NULL || object->type == OBJT_PHYS)
+		flags |= PG_UNMANAGED;
 	m->flags = flags;
+	if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
+		m->oflags = 0;
+	else
+		m->oflags = VPO_BUSY;
 	if (req & VM_ALLOC_WIRED) {
 		atomic_add_int(&cnt.v_wire_count, 1);
 		m->wire_count = 1;
@@ -864,9 +1112,8 @@
 	m->hold_count = 0;
 	m->act_count = 0;
 	m->busy = 0;
-	m->valid = 0;
 	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
-	mtx_unlock_spin(&vm_page_queue_free_mtx);
+	mtx_unlock(&vm_page_queue_free_mtx);
 
 	if ((req & VM_ALLOC_NOOBJ) == 0)
 		vm_page_insert(m, object, pindex);
@@ -874,6 +1121,15 @@
 		m->pindex = pindex;
 
 	/*
+	 * The following call to vdrop() must come after the above call
+	 * to vm_page_insert() in case both affect the same object and
+	 * vnode.  Otherwise, the affected vnode's hold count could
+	 * temporarily become zero.
+	 */
+	if (vp != NULL)
+		vdrop(vp);
+
+	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
@@ -893,17 +1149,17 @@
 vm_wait(void)
 {
 
-	vm_page_lock_queues();
+	mtx_lock(&vm_page_queue_free_mtx);
 	if (curproc == pageproc) {
 		vm_pageout_pages_needed = 1;
-		msleep(&vm_pageout_pages_needed, &vm_page_queue_mtx,
+		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 	} else {
 		if (!vm_pages_needed) {
 			vm_pages_needed = 1;
 			wakeup(&vm_pages_needed);
 		}
-		msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PVM,
+		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 		    "vmwait", 0);
 	}
 }
@@ -922,12 +1178,12 @@
 vm_waitpfault(void)
 {
 
-	vm_page_lock_queues();
+	mtx_lock(&vm_page_queue_free_mtx);
 	if (!vm_pages_needed) {
 		vm_pages_needed = 1;
 		wakeup(&vm_pages_needed);
 	}
-	msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PUSER,
+	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
 	    "pfault", 0);
 }
 
@@ -946,9 +1202,7 @@
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if (m->queue != PQ_ACTIVE) {
-		if ((m->queue - m->pc) == PQ_CACHE)
-			cnt.v_reactivated++;
+	if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
 		vm_pageq_remove(m);
 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
@@ -971,11 +1225,11 @@
  *	The page queues must be locked.
  *	This routine may not block.
  */
-static __inline void
+static inline void
 vm_page_free_wakeup(void)
 {
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	/*
 	 * if pageout daemon needs pages, then tell it that there are
 	 * some free.
@@ -999,7 +1253,7 @@
 /*
  *	vm_page_free_toq:
  *
- *	Returns the given page to the PQ_FREE list,
+ *	Returns the given page to the free list,
  *	disassociating it with any VM object.
  *
  *	Object and page must be locked prior to entry.
@@ -1009,17 +1263,19 @@
 void
 vm_page_free_toq(vm_page_t m)
 {
-	struct vpgqueues *pq;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	cnt.v_tfree++;
+	if (VM_PAGE_GETQUEUE(m) != PQ_NONE)
+		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	KASSERT(!pmap_page_is_mapped(m),
+	    ("vm_page_free_toq: freeing mapped page %p", m));
+	PCPU_INC(cnt.v_tfree);
 
-	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
+	if (m->busy || VM_PAGE_IS_FREE(m)) {
 		printf(
-		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
-		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
+		"vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n",
+		    (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0,
 		    m->hold_count);
-		if ((m->queue - m->pc) == PQ_FREE)
+		if (VM_PAGE_IS_FREE(m))
 			panic("vm_page_free: freeing free page");
 		else
 			panic("vm_page_free: freeing busy page");
@@ -1031,7 +1287,7 @@
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
-	vm_pageq_remove_nowakeup(m);
+	vm_pageq_remove(m);
 	vm_page_remove(m);
 
 	/*
@@ -1052,66 +1308,23 @@
 		}
 		panic("vm_page_free: freeing wired page");
 	}
-
-	/*
-	 * Clear the UNMANAGED flag when freeing an unmanaged page.
-	 */
-	if (m->flags & PG_UNMANAGED) {
-		m->flags &= ~PG_UNMANAGED;
-	}
-
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
-		m->queue = PQ_HOLD;
-	} else
-		m->queue = PQ_FREE + m->pc;
-	pq = &vm_page_queues[m->queue];
-	mtx_lock_spin(&vm_page_queue_free_mtx);
-	pq->lcnt++;
-	++(*pq->cnt);
-
-	/*
-	 * Put zero'd pages on the end ( where we look for zero'd pages
-	 * first ) and non-zerod pages at the head.
-	 */
-	if (m->flags & PG_ZERO) {
-		TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
-		++vm_page_zero_count;
+		vm_pageq_enqueue(PQ_HOLD, m);
 	} else {
-		TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
-	}
-	mtx_unlock_spin(&vm_page_queue_free_mtx);
-	vm_page_free_wakeup();
-}
-
-/*
- *	vm_page_unmanage:
- *
- * 	Prevent PV management from being done on the page.  The page is
- *	removed from the paging queues as if it were wired, and as a 
- *	consequence of no longer being managed the pageout daemon will not
- *	touch it (since there is no way to locate the pte mappings for the
- *	page).  madvise() calls that mess with the pmap will also no longer
- *	operate on the page.
- *
- *	Beyond that the page is still reasonably 'normal'.  Freeing the page
- *	will clear the flag.
- *
- *	This routine is used by OBJT_PHYS objects - objects using unswappable
- *	physical memory as backing store rather then swap-backed memory and
- *	will eventually be extended to support 4MB unmanaged physical 
- *	mappings.
- */
-void
-vm_page_unmanage(vm_page_t m)
-{
-
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if ((m->flags & PG_UNMANAGED) == 0) {
-		if (m->wire_count == 0)
-			vm_pageq_remove(m);
+		m->flags |= PG_FREE;
+		mtx_lock(&vm_page_queue_free_mtx);
+		cnt.v_free_count++;
+		if ((m->flags & PG_ZERO) != 0) {
+			vm_phys_free_pages(m, 0);
+			++vm_page_zero_count;
+		} else {
+			vm_phys_free_pages(m, 0);
+			vm_page_zero_idle_wakeup();
+		}
+		vm_page_free_wakeup();
+		mtx_unlock(&vm_page_queue_free_mtx);
 	}
-	vm_page_flag_set(m, PG_UNMANAGED);
 }
 
 /*
@@ -1209,7 +1422,7 @@
  *
  * This routine may not block.
  */
-static __inline void
+static inline void
 _vm_page_deactivate(vm_page_t m, int athead)
 {
 
@@ -1218,19 +1431,16 @@
 	/*
 	 * Ignore if already inactive.
 	 */
-	if (m->queue == PQ_INACTIVE)
+	if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
 		return;
 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
-		if ((m->queue - m->pc) == PQ_CACHE)
-			cnt.v_reactivated++;
 		vm_page_flag_clear(m, PG_WINATCFLS);
 		vm_pageq_remove(m);
 		if (athead)
 			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
 		else
 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
-		m->queue = PQ_INACTIVE;
-		vm_page_queues[PQ_INACTIVE].lcnt++;
+		VM_PAGE_SETQUEUE2(m, PQ_INACTIVE);
 		cnt.v_inactive_count++;
 	}
 }
@@ -1253,7 +1463,7 @@
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
-	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+	    (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
 		return (0);
 	}
 	pmap_remove_all(m);
@@ -1277,7 +1487,7 @@
 	if (m->object != NULL)
 		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
-	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+	    (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
 		return (0);
 	}
 	pmap_remove_all(m);
@@ -1297,29 +1507,99 @@
 void
 vm_page_cache(vm_page_t m)
 {
+	vm_object_t object;
+	vm_page_t root;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
+	object = m->object;
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
 	    m->hold_count || m->wire_count) {
-		printf("vm_page_cache: attempting to cache busy page\n");
-		return;
+		panic("vm_page_cache: attempting to cache busy page");
 	}
-	if ((m->queue - m->pc) == PQ_CACHE)
+	pmap_remove_all(m);
+	if (m->dirty != 0)
+		panic("vm_page_cache: page %p is dirty", m);
+	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
+	    (object->type == OBJT_SWAP &&
+	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
+		/*
+		 * Hypothesis: A cache-eligible page belonging to a
+		 * default object or swap object but without a backing
+		 * store must be zero filled.
+		 */
+		vm_page_free(m);
 		return;
+	}
+	KASSERT((m->flags & PG_CACHED) == 0,
+	    ("vm_page_cache: page %p is already cached", m));
+	cnt.v_tcached++;
 
 	/*
-	 * Remove all pmaps and indicate that the page is not
-	 * writeable or mapped.
+	 * Remove the page from the paging queues.
 	 */
-	pmap_remove_all(m);
-	if (m->dirty != 0) {
-		panic("vm_page_cache: caching a dirty page, pindex: %ld",
-			(long)m->pindex);
+	vm_pageq_remove(m);
+
+	/*
+	 * Remove the page from the object's collection of resident
+	 * pages. 
+	 */
+	if (m != object->root)
+		vm_page_splay(m->pindex, object->root);
+	if (m->left == NULL)
+		root = m->right;
+	else {
+		root = vm_page_splay(m->pindex, m->left);
+		root->right = m->right;
+	}
+	object->root = root;
+	TAILQ_REMOVE(&object->memq, m, listq);
+	object->resident_page_count--;
+	object->generation++;
+
+	/*
+	 * Insert the page into the object's collection of cached pages
+	 * and the physical memory allocator's cache/free page queues.
+	 */
+	vm_page_flag_set(m, PG_CACHED);
+	vm_page_flag_clear(m, PG_ZERO);
+	mtx_lock(&vm_page_queue_free_mtx);
+	vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
+	cnt.v_cache_count++;
+	root = object->cache;
+	if (root == NULL) {
+		m->left = NULL;
+		m->right = NULL;
+	} else {
+		root = vm_page_splay(m->pindex, root);
+		if (m->pindex < root->pindex) {
+			m->left = root->left;
+			m->right = root;
+			root->left = NULL;
+		} else if (__predict_false(m->pindex == root->pindex))
+			panic("vm_page_cache: offset already cached");
+		else {
+			m->right = root->right;
+			m->left = root;
+			root->right = NULL;
+		}
 	}
-	vm_pageq_remove_nowakeup(m);
-	vm_pageq_enqueue(PQ_CACHE + m->pc, m);
+	object->cache = m;
+	vm_phys_free_pages(m, 0);
 	vm_page_free_wakeup();
+	mtx_unlock(&vm_page_queue_free_mtx);
+
+	/*
+	 * Increment the vnode's hold count if this is the object's only
+	 * cached page.  Decrement the vnode's hold count if this was
+	 * the object's only resident page.
+	 */
+	if (object->type == OBJT_VNODE) {
+		if (root == NULL && object->resident_page_count != 0)
+			vhold(object->handle);
+		else if (root != NULL && object->resident_page_count == 0)
+			vdrop(object->handle);
+	}
 }
 
 /*
@@ -1357,9 +1637,7 @@
 	 * occasionally leave the page alone
 	 */
 	if ((dnw & 0x01F0) == 0 ||
-	    m->queue == PQ_INACTIVE || 
-	    m->queue - m->pc == PQ_CACHE
-	) {
+	    VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) {
 		if (m->act_count >= ACT_INIT)
 			--m->act_count;
 		return;
@@ -1400,21 +1678,18 @@
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
-		vm_page_lock_queues();
-		if (m->busy || (m->flags & PG_BUSY)) {
-			vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
-			VM_OBJECT_UNLOCK(object);
-			msleep(m, &vm_page_queue_mtx, PDROP | PVM, "pgrbwt", 0);
-			VM_OBJECT_LOCK(object);
+		if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) {
 			if ((allocflags & VM_ALLOC_RETRY) == 0)
 				return (NULL);
 			goto retrylookup;
 		} else {
-			if (allocflags & VM_ALLOC_WIRED)
+			if ((allocflags & VM_ALLOC_WIRED) != 0) {
+				vm_page_lock_queues();
 				vm_page_wire(m);
+				vm_page_unlock_queues();
+			}
 			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
 				vm_page_busy(m);
-			vm_page_unlock_queues();
 			return (m);
 		}
 	}
@@ -1426,7 +1701,8 @@
 		if ((allocflags & VM_ALLOC_RETRY) == 0)
 			return (NULL);
 		goto retrylookup;
-	}
+	} else if (m->valid != 0)
+		return (m);
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
@@ -1438,7 +1714,7 @@
  *
  * Inputs are required to range within a page.
  */
-__inline int
+int
 vm_page_bits(int base, int size)
 {
 	int first_bit;
@@ -1505,7 +1781,7 @@
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
-	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
+	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
@@ -1528,7 +1804,7 @@
 	m->dirty &= ~pagebits;
 	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(m);
-		vm_page_flag_clear(m, PG_NOSYNC);
+		m->oflags &= ~VPO_NOSYNC;
 	}
 }
 
@@ -1556,6 +1832,8 @@
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	bits = vm_page_bits(base, size);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
+		pmap_remove_all(m);
 	m->valid &= ~bits;
 	m->dirty &= ~bits;
 	m->object->generation++;
@@ -1640,6 +1918,14 @@
 
 int so_zerocp_fullpage = 0;
 
+/*
+ *	Replace the given page with a copy.  The copied page assumes
+ *	the portion of the given page's "wire_count" that is not the
+ *	responsibility of this copy-on-write mechanism.
+ *
+ *	The object containing the given page must have a non-zero
+ *	paging-in-progress count and be locked.
+ */
 void
 vm_page_cowfault(vm_page_t m)
 {
@@ -1648,20 +1934,32 @@
 	vm_pindex_t pindex;
 
 	object = m->object;
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	KASSERT(object->paging_in_progress != 0,
+	    ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
+	    object)); 
 	pindex = m->pindex;
 
  retry_alloc:
 	pmap_remove_all(m);
 	vm_page_remove(m);
-	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
+	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (mnew == NULL) {
 		vm_page_insert(m, object, pindex);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_LOCK(object);
-		vm_page_lock_queues();
-		goto retry_alloc;
+		if (m == vm_page_lookup(object, pindex)) {
+			vm_page_lock_queues();
+			goto retry_alloc;
+		} else {
+			/*
+			 * Page disappeared during the wait.
+			 */
+			vm_page_lock_queues();
+			return;
+		}
 	}
 
 	if (m->cow == 0) {
@@ -1677,7 +1975,6 @@
 			pmap_copy_page(m, mnew);
 		mnew->valid = VM_PAGE_BITS_ALL;
 		vm_page_dirty(mnew);
-		vm_page_flag_clear(mnew, PG_BUSY);
 		mnew->wire_count = m->wire_count - m->cow;
 		m->wire_count = m->cow;
 	}
@@ -1705,7 +2002,7 @@
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	m->cow++;
-	pmap_page_protect(m, VM_PROT_READ);
+	pmap_remove_write(m);
 }
 
 #include "opt_ddb.h"
@@ -1730,21 +2027,17 @@
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
-	int i;
+		
 	db_printf("PQ_FREE:");
-	for (i = 0; i < PQ_L2_SIZE; i++) {
-		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
-	}
+	db_printf(" %d", cnt.v_free_count);
 	db_printf("\n");
 		
 	db_printf("PQ_CACHE:");
-	for (i = 0; i < PQ_L2_SIZE; i++) {
-		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
-	}
+	db_printf(" %d", cnt.v_cache_count);
 	db_printf("\n");
 
 	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
-		vm_page_queues[PQ_ACTIVE].lcnt,
-		vm_page_queues[PQ_INACTIVE].lcnt);
+		*vm_page_queues[PQ_ACTIVE].cnt,
+		*vm_page_queues[PQ_INACTIVE].cnt);
 }
 #endif /* DDB */
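
Among the vm_page.c changes above, vm_page_blacklist_lookup() parses the vm.blacklist tunable: physical addresses separated by spaces or commas, each compared page-aligned against the page about to be handed to the physical allocator, with the rest of the string skipped once a malformed entry is hit. Here is a rough user-space approximation of that parse loop, using strtoull() where the kernel uses strtoq() and assuming 4 KB pages; it is a sketch of the format, not the kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define	PAGE_MASK	0xfffULL		/* 4 KB pages assumed for the sketch */
#define	trunc_page(x)	((x) & ~PAGE_MASK)

/* Return 1 if pa appears (page-aligned) in a "vm.blacklist"-style string. */
static int
blacklist_lookup(const char *list, uint64_t pa)
{
	const char *pos;
	char *cp;
	uint64_t bad;

	for (pos = list; *pos != '\0'; pos = cp) {
		bad = strtoull(pos, &cp, 0);
		if (*cp != '\0') {
			if (*cp == ' ' || *cp == ',')
				cp++;		/* step over the separator */
			else
				break;		/* malformed entry: skip the rest */
		}
		if (pa == trunc_page(bad))
			return (1);
	}
	return (0);
}

int
main(void)
{
	const char *list = "0x12345000,0x2000000 0xdeadb000";

	printf("%d\n", blacklist_lookup(list, 0x12345000ULL));	/* prints 1 */
	printf("%d\n", blacklist_lookup(list, 0x99999000ULL));	/* prints 0 */
	return (0);
}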
Index: swap_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/swap_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/swap_pager.c -L sys/vm/swap_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.273.2.1 2005/08/20 06:07:55 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/swap_pager.c,v 1.295 2007/08/05 21:04:32 alc Exp $");
 
 #include "opt_mac.h"
 #include "opt_swap.h"
@@ -77,6 +77,7 @@
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
@@ -85,7 +86,6 @@
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -94,6 +94,8 @@
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -136,37 +138,6 @@
 #define SWAP_META_PAGES		(SWB_NPAGES * 2)
 #define SWAP_META_MASK		(SWAP_META_PAGES - 1)
 
-typedef	int32_t	swblk_t;	/*
-				 * swap offset.  This is the type used to
-				 * address the "virtual swap device" and
-				 * therefore the maximum swap space is
-				 * 2^32 pages.
-				 */
-
-struct swdevt;
-typedef void sw_strategy_t(struct buf *bp, struct swdevt *sw);
-typedef void sw_close_t(struct thread *td, struct swdevt *sw);
-
-/*
- * Swap device table
- */
-struct swdevt {
-	int	sw_flags;
-	int	sw_nblks;
-	int     sw_used;
-	dev_t	sw_dev;
-	struct vnode *sw_vp;
-	void	*sw_id;
-	swblk_t	sw_first;
-	swblk_t	sw_end;
-	struct blist *sw_blist;
-	TAILQ_ENTRY(swdevt)	sw_list;
-	sw_strategy_t		*sw_strategy;
-	sw_close_t		*sw_close;
-};
-
-#define	SW_CLOSING	0x04
-
 struct swblock {
 	struct swblock	*swb_hnext;
 	vm_object_t	swb_object;
@@ -266,6 +237,7 @@
 static void	swp_pager_async_iodone(struct buf *bp);
 static int	swapongeom(struct thread *, struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
+static int	swapoff_one(struct swdevt *sp, struct thread *td);
 
 /*
  * Swap bitmap functions
@@ -487,9 +459,7 @@
 		sx_xlock(&sw_alloc_sx);
 		object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 
-		if (object != NULL) {
-			vm_object_reference(object);
-		} else {
+		if (object == NULL) {
 			object = vm_object_allocate(OBJT_DEFAULT, pindex);
 			object->handle = handle;
 
@@ -1055,20 +1025,18 @@
 	bp->b_pager.pg_reqpage = reqpage - i;
 
 	VM_OBJECT_LOCK(object);
-	vm_page_lock_queues();
 	{
 		int k;
 
 		for (k = i; k < j; ++k) {
 			bp->b_pages[k - i] = m[k];
-			vm_page_flag_set(m[k], PG_SWAPINPROG);
+			m[k]->oflags |= VPO_SWAPINPROG;
 		}
 	}
-	vm_page_unlock_queues();
 	bp->b_npages = j - i;
 
-	cnt.v_swapin++;
-	cnt.v_swappgsin += bp->b_npages;
+	PCPU_INC(cnt.v_swapin);
+	PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
 
 	/*
 	 * We still hold the lock on mreq, and our automatic completion routine
@@ -1092,23 +1060,24 @@
 	swp_pager_strategy(bp);
 
 	/*
-	 * wait for the page we want to complete.  PG_SWAPINPROG is always
+	 * wait for the page we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the meta-data.
 	 */
-	vm_page_lock_queues();
-	while ((mreq->flags & PG_SWAPINPROG) != 0) {
-		vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
-		cnt.v_intrans++;
-		if (msleep(mreq, &vm_page_queue_mtx, PSWP, "swread", hz*20)) {
+	VM_OBJECT_LOCK(object);
+	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
+		mreq->oflags |= VPO_WANTED;
+		vm_page_lock_queues();
+		vm_page_flag_set(mreq, PG_REFERENCED);
+		vm_page_unlock_queues();
+		PCPU_INC(cnt.v_intrans);
+		if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
-	vm_page_unlock_queues();
 
-	VM_OBJECT_LOCK(object);
 	/*
 	 * mreq is left busied after completion, but all the other pages
 	 * are freed.  If we had an unrecoverable read error the page will
@@ -1157,7 +1126,6 @@
 	int i;
 	int n = 0;
 
-	GIANT_REQUIRED;
 	if (count && m[0]->object != object) {
 		panic("swap_pager_getpages: object mismatch %p/%p", 
 		    object, 
@@ -1281,9 +1249,7 @@
 			vm_page_dirty(mreq);
 			rtvals[i+j] = VM_PAGER_OK;
 
-			vm_page_lock_queues();
-			vm_page_flag_set(mreq, PG_SWAPINPROG);
-			vm_page_unlock_queues();
+			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
 		VM_OBJECT_UNLOCK(object);
@@ -1294,8 +1260,8 @@
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
-		cnt.v_swapout++;
-		cnt.v_swappgsout += bp->b_npages;
+		PCPU_INC(cnt.v_swapout);
+		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
 
 		/*
 		 * asynchronous
@@ -1398,7 +1364,7 @@
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
-		vm_page_flag_clear(m, PG_SWAPINPROG);
+		m->oflags &= ~VPO_SWAPINPROG;
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
@@ -1417,17 +1383,12 @@
 				 * not match anything ).
 				 *
 				 * We have to wake specifically requested pages
-				 * up too because we cleared PG_SWAPINPROG and
+				 * up too because we cleared VPO_SWAPINPROG and
 				 * someone may be waiting for that.
 				 *
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
-				 *
-				 * XXX IT IS NOT LEGAL TO FREE THE PAGE HERE
-				 * AS THIS MESSES WITH object->memq, and it is
-				 * not legal to mess with object->memq from an
-				 * interrupt.
 				 */
 				m->valid = 0;
 				if (i != bp->b_pager.pg_reqpage)
@@ -1476,7 +1437,7 @@
 
 			/*
 			 * We have to wake specifically requested pages
-			 * up too because we cleared PG_SWAPINPROG and
+			 * up too because we cleared VPO_SWAPINPROG and
 			 * could be waiting for it in getpages.  However,
 			 * be sure to not unbusy getpages specifically
 			 * requested page - getpages expects it to be 
@@ -1512,6 +1473,15 @@
 		VM_OBJECT_UNLOCK(object);
 	}
 
+	/* 
+	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 
+	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
+	 * trigger a KASSERT in relpbuf().
+	 */
+	if (bp->b_vp) {
+		    bp->b_vp = NULL;
+		    bp->b_bufobj = NULL;
+	}
 	/*
 	 * release the physical I/O buffer
 	 */
@@ -1579,7 +1549,7 @@
  *	XXX - The code to page the whole block in doesn't work, so we
  *	      revert to the one-by-one behavior for now.  Sigh.
  */
-static __inline void
+static inline void
 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
@@ -1591,8 +1561,8 @@
 		vm_page_lock_queues();
 		vm_page_activate(m);
 		vm_page_dirty(m);
-		vm_page_wakeup(m);
 		vm_page_unlock_queues();
+		vm_page_wakeup(m);
 		vm_pager_page_unswapped(m);
 		return;
 	}
@@ -1603,8 +1573,8 @@
 	vm_page_lock_queues();
 	vm_page_dirty(m);
 	vm_page_dontneed(m);
-	vm_page_wakeup(m);
 	vm_page_unlock_queues();
+	vm_page_wakeup(m);
 	vm_pager_page_unswapped(m);
 }
 
@@ -1653,7 +1623,6 @@
 	}
 	mtx_unlock(&swhash_mtx);
 	if (sp->sw_used) {
-		int dummy;
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
@@ -1665,7 +1634,7 @@
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
-		tsleep(&dummy, PVM, "swpoff", hz / 20);
+		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 }
@@ -1742,6 +1711,8 @@
 		if (swap == NULL) {
 			mtx_unlock(&swhash_mtx);
 			VM_OBJECT_UNLOCK(object);
+			if (uma_zone_exhausted(swap_zone))
+				printf("swap zone exhausted, increase kern.maxswzone\n");
 			VM_WAIT;
 			VM_OBJECT_LOCK(object);
 			goto retry;
@@ -1963,11 +1934,11 @@
 	struct nameidata nd;
 	int error;
 
-	mtx_lock(&Giant);
-	error = suser(td);
+	error = priv_check(td, PRIV_SWAPON);
 	if (error)
-		goto done2;
+		return (error);
 
+	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
 	swdev_syscall_active = 1;
@@ -1981,7 +1952,8 @@
 		goto done;
 	}
 
-	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW, UIO_USERSPACE, uap->name, td);
+	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
@@ -2006,7 +1978,6 @@
 done:
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
-done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
@@ -2100,20 +2071,19 @@
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
-	u_long nblks, dvbase;
 	int error;
 
-	mtx_lock(&Giant);
-
-	error = suser(td);
+	error = priv_check(td, PRIV_SWAPOFF);
 	if (error)
-		goto done2;
+		return (error);
 
+	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
 
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
+	    td);
 	error = namei(&nd);
 	if (error)
 		goto done;
@@ -2123,21 +2093,37 @@
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
-			goto found;
+			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
-	error = EINVAL;
-	goto done;
-found:
-	mtx_unlock(&sw_dev_mtx);
+	if (sp == NULL) {
+		error = EINVAL;
+		goto done;
+	}
+	error = swapoff_one(sp, td);
+done:
+	swdev_syscall_active = 0;
+	wakeup_one(&swdev_syscall_active);
+	mtx_unlock(&Giant);
+	return (error);
+}
+
+static int
+swapoff_one(struct swdevt *sp, struct thread *td)
+{
+	u_long nblks, dvbase;
 #ifdef MAC
-	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	error = mac_check_system_swapoff(td->td_ucred, vp);
-	(void) VOP_UNLOCK(vp, 0, td);
+	int error;
+#endif
+
+	mtx_assert(&Giant, MA_OWNED);
+#ifdef MAC
+	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, td);
+	error = mac_check_system_swapoff(td->td_ucred, sp->sw_vp);
+	(void) VOP_UNLOCK(sp->sw_vp, 0, td);
 	if (error != 0)
-		goto done;
+		return (error);
 #endif
-	
 	nblks = sp->sw_nblks;
 
 	/*
@@ -2148,8 +2134,7 @@
 	 */
 	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
 	    nblks + nswap_lowat) {
-		error = ENOMEM;
-		goto done;
+		return (ENOMEM);
 	}
 
 	/*
@@ -2182,13 +2167,42 @@
 	mtx_unlock(&sw_dev_mtx);
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
+	return (0);
+}
 
-done:
+void
+swapoff_all(void)
+{
+	struct swdevt *sp, *spt;
+	const char *devname;
+	int error;
+ 
+	mtx_lock(&Giant);
+	while (swdev_syscall_active)
+		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+	swdev_syscall_active = 1;
+ 
+	mtx_lock(&sw_dev_mtx);
+	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
+		mtx_unlock(&sw_dev_mtx);
+		if (vn_isdisk(sp->sw_vp, NULL))
+			devname = sp->sw_vp->v_rdev->si_name;
+		else
+			devname = "[file]";
+		error = swapoff_one(sp, &thread0);
+		if (error != 0) {
+			printf("Cannot remove swap device %s (error=%d), "
+			    "skipping.\n", devname, error);
+		} else if (bootverbose) {
+			printf("Swap device %s removed.\n", devname);
+		}
+		mtx_lock(&sw_dev_mtx);
+	}
+	mtx_unlock(&sw_dev_mtx);
+ 
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
-done2:
 	mtx_unlock(&Giant);
-	return (error);
 }
 
 void
@@ -2475,10 +2489,12 @@
 	vp2 = sp->sw_id;
 	vhold(vp2);
 	if (bp->b_iocmd == BIO_WRITE) {
-		if (bp->b_bufobj) /* XXX: should always be true /phk */
+		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	}
+	if (bp->b_bufobj != &vp2->v_bufobj)
+		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
@@ -2516,7 +2532,7 @@
 	error = mac_check_system_swapon(td->td_ucred, vp);
 	if (error == 0)
 #endif
-		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1);
+		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	(void) VOP_UNLOCK(vp, 0, td);
 	if (error)
 		return (error);
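
The new swapoff_all() walks the swap device list with TAILQ_FOREACH_SAFE, dropping sw_dev_mtx around each swapoff_one() call (which can sleep and can free the entry) and reacquiring it before advancing; the _SAFE variant works because the pointer to the next element is sampled while the lock is still held. The compact sketch below shows that traversal pattern with a pthread mutex standing in for sw_dev_mtx and the BSD <sys/queue.h> macros; the dev/remove_* names are invented.

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct dev {
	const char		*name;
	TAILQ_ENTRY(dev)	link;
};

static TAILQ_HEAD(, dev) devq = TAILQ_HEAD_INITIALIZER(devq);
static pthread_mutex_t devq_mtx = PTHREAD_MUTEX_INITIALIZER;

/* May block and frees 'd', so the list lock must not be held here. */
static void
remove_one(struct dev *d)
{
	printf("removing %s\n", d->name);
	pthread_mutex_lock(&devq_mtx);
	TAILQ_REMOVE(&devq, d, link);
	pthread_mutex_unlock(&devq_mtx);
	free(d);
}

static void
remove_all(void)
{
	struct dev *d, *tmp;

	pthread_mutex_lock(&devq_mtx);
	/* _SAFE keeps 'tmp', so freeing 'd' inside the body is harmless. */
	TAILQ_FOREACH_SAFE(d, &devq, link, tmp) {
		pthread_mutex_unlock(&devq_mtx);	/* drop across the blocking work */
		remove_one(d);
		pthread_mutex_lock(&devq_mtx);
	}
	pthread_mutex_unlock(&devq_mtx);
}

int
main(void)
{
	static const char *names[] = { "swap0", "swap1" };

	for (int i = 0; i < 2; i++) {
		struct dev *d = malloc(sizeof(*d));

		d->name = names[i];
		TAILQ_INSERT_TAIL(&devq, d, link);
	}
	remove_all();
	return (0);
}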
Index: vm_map.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_map.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_map.h -L sys/vm/vm_map.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm_map.h,v 1.117 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_map.h,v 1.120 2007/08/20 12:05:45 kib Exp $
  */
 
 /*
@@ -242,7 +242,6 @@
 	caddr_t vm_taddr;	/* (c) user virtual address of text */
 	caddr_t vm_daddr;	/* (c) user virtual address of data */
 	caddr_t vm_maxsaddr;	/* user VA at max stack growth */
-	int	vm_exitingcnt;	/* several processes zombied in exit1  */
 	int	vm_refcnt;	/* number of references */
 };
 
@@ -296,7 +295,6 @@
 /* XXX: number of kernel maps and entries to statically allocate */
 #define MAX_KMAP	10
 #define	MAX_KMAPENT	128
-#define	MAX_MAPENT	128
 
 /*
  * Copy-on-write flags for vm_map operations
@@ -335,6 +333,7 @@
 vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
 int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t);
 int vm_map_find (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int);
+int vm_map_fixed (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *);
 int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t);
 void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t);
--- /dev/null
+++ sys/vm/redzone.c
@@ -0,0 +1,181 @@
+/*-
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/vm/redzone.c,v 1.1 2006/01/31 11:09:20 pjd Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+
+#include <vm/redzone.h>
+
+
+SYSCTL_NODE(_vm, OID_AUTO, redzone, CTLFLAG_RW, NULL, "RedZone data");
+static u_long redzone_extra_mem = 0;
+SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem,
+    0, "Extra memory allocated by redzone");     
+static int redzone_panic = 0;
+TUNABLE_INT("vm.redzone.panic", &redzone_panic);
+SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0,
+    "Panic when buffer corruption is detected");     
+
+#define	REDZONE_CHSIZE	(16)
+#define	REDZONE_CFSIZE	(16)
+#define	REDZONE_HSIZE	(sizeof(struct stack) + sizeof(u_long) + REDZONE_CHSIZE)
+#define	REDZONE_FSIZE	(REDZONE_CFSIZE)
+
+static u_long
+redzone_roundup(u_long n)
+{
+
+	if (n <= 128)
+		return (128);
+	else if (n <= 256)
+		return (256);
+	else if (n <= 512)
+		return (512);
+	else if (n <= 1024)
+		return (1024);
+	else if (n <= 2048)
+		return (2048);
+	return (PAGE_SIZE);
+}
+
+u_long
+redzone_get_size(caddr_t naddr)
+{
+	u_long nsize;
+
+	bcopy(naddr - REDZONE_CHSIZE - sizeof(u_long), &nsize, sizeof(nsize));
+	return (nsize);
+}
+
+u_long
+redzone_size_ntor(u_long nsize)
+{
+
+	return (nsize + redzone_roundup(nsize) + REDZONE_FSIZE);
+}
+
+void *
+redzone_addr_ntor(caddr_t naddr)
+{
+
+	return (naddr - redzone_roundup(redzone_get_size(naddr)));
+}
+
+/*
+ * Set redzones and remember allocation backtrace.
+ */
+void *
+redzone_setup(caddr_t raddr, u_long nsize)
+{
+	struct stack st;
+	caddr_t haddr, faddr;
+
+	atomic_add_long(&redzone_extra_mem, redzone_size_ntor(nsize) - nsize);
+
+	haddr = raddr + redzone_roundup(nsize) - REDZONE_HSIZE;
+	faddr = haddr + REDZONE_HSIZE + nsize;
+
+	/* Redzone header. */
+	stack_save(&st);
+	bcopy(&st, haddr, sizeof(st));
+	haddr += sizeof(st);
+	bcopy(&nsize, haddr, sizeof(nsize));
+	haddr += sizeof(nsize);
+	memset(haddr, 0x42, REDZONE_CHSIZE);
+	haddr += REDZONE_CHSIZE;
+
+	/* Redzone footer. */
+	memset(faddr, 0x42, REDZONE_CFSIZE);
+
+	return (haddr);
+}
+
+/*
+ * Verify redzones.
+ * This function is called on free() and realloc().
+ */
+void
+redzone_check(caddr_t naddr)
+{
+	struct stack ast, fst;
+	caddr_t haddr, faddr;
+	u_int ncorruptions;
+	u_long nsize;
+	int i;
+
+	haddr = naddr - REDZONE_HSIZE;
+	bcopy(haddr, &ast, sizeof(ast));
+	haddr += sizeof(ast);
+	bcopy(haddr, &nsize, sizeof(nsize));
+	haddr += sizeof(nsize);
+
+	atomic_subtract_long(&redzone_extra_mem,
+	    redzone_size_ntor(nsize) - nsize);
+
+	/* Look for buffer underflow. */
+	ncorruptions = 0;
+	for (i = 0; i < REDZONE_CHSIZE; i++, haddr++) {
+		if (*(u_char *)haddr != 0x42)
+			ncorruptions++;
+	}
+	if (ncorruptions > 0) {
+		printf("REDZONE: Buffer underflow detected. %u byte%s "
+		    "corrupted before %p (%lu bytes allocated).\n",
+		    ncorruptions, ncorruptions == 1 ? "" : "s", naddr, nsize);
+		printf("Allocation backtrace:\n");
+		stack_print(&ast);
+		printf("Free backtrace:\n");
+		stack_save(&fst);
+		stack_print(&fst);
+		if (redzone_panic)
+			panic("Stopping here.");
+	}
+	faddr = naddr + nsize;
+	/* Look for buffer overflow. */
+	ncorruptions = 0;
+	for (i = 0; i < REDZONE_CFSIZE; i++, faddr++) {
+		if (*(u_char *)faddr != 0x42)
+			ncorruptions++;
+	}
+	if (ncorruptions > 0) {
+		printf("REDZONE: Buffer overflow detected. %u byte%s corrupted "
+		    "after %p (%lu bytes allocated).\n", ncorruptions,
+		    ncorruptions == 1 ? "" : "s", naddr + nsize, nsize);
+		printf("Allocation backtrace:\n");
+		stack_print(&ast);
+		printf("Free backtrace:\n");
+		stack_save(&fst);
+		stack_print(&fst);
+		if (redzone_panic)
+			panic("Stopping here.");
+	}
+}
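For reference, a minimal sketch of how an allocator wrapper would be expected to drive the API above: grow the request with redzone_size_ntor(), hand the caller the pointer returned by redzone_setup(), and on free verify the fences with redzone_check() before translating back with redzone_addr_ntor(). The real_malloc()/real_free() backends are placeholders, not part of this commit.

/* Sketch only: a guarded allocator built on the functions above. */
void *
guarded_malloc(unsigned long size)
{
	caddr_t raw;

	raw = real_malloc(redzone_size_ntor(size));	/* placeholder backend */
	if (raw == NULL)
		return (NULL);
	/* Records the allocation backtrace and size, fills both 0x42 fences. */
	return (redzone_setup(raw, size));
}

void
guarded_free(void *addr)
{
	/* Verifies both fences and prints backtraces if corruption is found. */
	redzone_check(addr);
	real_free(redzone_addr_ntor(addr));		/* placeholder backend */
}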
--- /dev/null
+++ sys/vm/vm_phys.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2002-2006 Rice University
+ * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Alan L. Cox,
+ * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/vm/vm_phys.h,v 1.3 2007/09/25 06:25:06 alc Exp $
+ */
+
+/*
+ *	Physical memory system definitions
+ */
+
+#ifndef	_VM_PHYS_H_
+#define	_VM_PHYS_H_
+
+void vm_phys_add_page(vm_paddr_t pa);
+vm_page_t vm_phys_alloc_contig(unsigned long npages,
+    vm_paddr_t low, vm_paddr_t high,
+    unsigned long alignment, unsigned long boundary);
+vm_page_t vm_phys_alloc_pages(int pool, int order);
+vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment);
+void vm_phys_free_pages(vm_page_t m, int order);
+void vm_phys_init(void);
+void vm_phys_set_pool(int pool, vm_page_t m, int order);
+void vm_phys_unfree_page(vm_page_t m);
+boolean_t vm_phys_zero_pages_idle(void);
+
+#endif	/* !_VM_PHYS_H_ */
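For reference, a minimal sketch of the buddy-style convention the declarations above imply: order n yields 2^n physically contiguous pages, which are returned as a unit with the same order. The pool argument value and any locking requirements are assumptions here, since vm_phys.c itself is not quoted in this section of the mail.

	/* Sketch only: grab and release a 4-page (order 2) contiguous run. */
	vm_page_t m;
	int order = 2;

	m = vm_phys_alloc_pages(0 /* assumed default pool */, order);
	if (m != NULL) {
		/* ... use the 1 << order pages starting at m ... */
		vm_phys_free_pages(m, order);
	}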
--- /dev/null
+++ sys/vm/redzone.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/vm/redzone.h,v 1.1 2006/01/31 11:09:20 pjd Exp $
+ */
+
+#ifndef	_VM_REDZONE_H_
+#define	_VM_REDZONE_H_
+
+u_long redzone_get_size(caddr_t naddr);
+u_long redzone_size_ntor(u_long nsize);
+void *redzone_addr_ntor(caddr_t naddr);
+void *redzone_setup(caddr_t raddr, u_long nsize);
+void redzone_check(caddr_t naddr);
+
+#endif	/* _VM_REDZONE_H_ */
Index: vnode_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vnode_pager.c -L sys/vm/vnode_pager.c -u -r1.2 -r1.3
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -51,7 +51,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vnode_pager.c,v 1.221.2.6 2006/03/13 03:08:26 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vnode_pager.c,v 1.236.2.1 2007/10/26 00:12:23 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -75,8 +75,8 @@
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
-static daddr_t vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
-					 int *run);
+static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
+    daddr_t *rtaddress, int *run);
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
@@ -95,30 +95,9 @@
 
 int vnode_pbuf_freecnt;
 
-/*
- * Compatibility function for RELENG_6, in which vnode_create_vobject()
- * takes file size as size_t due to an oversight.  The type may not just
- * change to off_t because the ABI to 3rd party modules must be preserved
- * for RELENG_6 lifetime.
- */
+/* Create the VM system backing object for this vnode */
 int
-vnode_create_vobject(struct vnode *vp, size_t isize __unused, struct thread *td)
-{
-
-	/*
-	 * Size of 0 will indicate to vnode_create_vobject_off()
-	 * VOP_GETATTR() is to be called to get the actual size.
-	 */
-	return (vnode_create_vobject_off(vp, 0, td));
-}
-
-/*
- * Create the VM system backing object for this vnode -- for RELENG_6 only.
- * In HEAD, vnode_create_vobject() has been fixed to take file size as off_t
- * and so it can be used as is.
- */
-int
-vnode_create_vobject_off(struct vnode *vp, off_t isize, struct thread *td)
+vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
 {
 	vm_object_t object;
 	vm_ooffset_t size = isize;
@@ -172,7 +151,7 @@
 	obj = vp->v_object;
 	if (obj == NULL)
 		return;
-	ASSERT_VOP_LOCKED(vp, "vnode_destroy_vobject");
+	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_LOCK(obj);
 	if (obj->ref_count == 0) {
 		/*
@@ -219,7 +198,7 @@
 
 	vp = (struct vnode *) handle;
 
-	ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
+	ASSERT_VOP_ELOCKED(vp, "vnode_pager_alloc");
 
 	/*
 	 * If the object is being terminated, wait for it to
@@ -277,7 +256,7 @@
 		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
 		wakeup(object);
 	}
-	ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc");
+	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
 	vp->v_object = NULL;
 	vp->v_vflag &= ~VV_TEXT;
 }
@@ -447,6 +426,10 @@
 			if (m->dirty != 0)
 				m->dirty = VM_PAGE_BITS_ALL;
 			vm_page_unlock_queues();
+		} else if ((nsize & PAGE_MASK) &&
+		    __predict_false(object->cache != NULL)) {
+			vm_page_cache_free(object, OFF_TO_IDX(nsize),
+			    nobjsize);
 		}
 	}
 	object->un_pager.vnp.vnp_size = nsize;
@@ -458,15 +441,11 @@
  * calculate the linear (byte) disk address of specified virtual
  * file address
  */
-static daddr_t
-vnode_pager_addr(vp, address, run)
-	struct vnode *vp;
-	vm_ooffset_t address;
-	int *run;
+static int
+vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
+    int *run)
 {
-	daddr_t rtaddress;
 	int bsize;
-	daddr_t block;
 	int err;
 	daddr_t vblock;
 	daddr_t voffset;
@@ -481,12 +460,10 @@
 	vblock = address / bsize;
 	voffset = address % bsize;
 
-	err = VOP_BMAP(vp, vblock, NULL, &block, run, NULL);
-
-	if (err || (block == -1))
-		rtaddress = -1;
-	else {
-		rtaddress = block + voffset / DEV_BSIZE;
+	err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
+	if (err == 0) {
+		if (*rtaddress != -1)
+			*rtaddress += voffset / DEV_BSIZE;
 		if (run) {
 			*run += 1;
 			*run *= bsize/PAGE_SIZE;
@@ -494,7 +471,7 @@
 		}
 	}
 
-	return rtaddress;
+	return (err);
 }
 
 /*
@@ -534,7 +511,9 @@
 		if (address >= object->un_pager.vnp.vnp_size) {
 			fileaddr = -1;
 		} else {
-			fileaddr = vnode_pager_addr(vp, address, NULL);
+			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
+			if (error)
+				break;
 		}
 		if (fileaddr != -1) {
 			bp = getpbuf(&vnode_pbuf_freecnt);
@@ -716,13 +695,13 @@
 	vm_offset_t kva;
 	off_t foff, tfoff, nextoff;
 	int i, j, size, bsize, first;
-	daddr_t firstaddr;
+	daddr_t firstaddr, reqblock;
 	struct bufobj *bo;
 	int runpg;
 	int runend;
 	struct buf *bp;
 	int count;
-	int error = 0;
+	int error;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
@@ -745,18 +724,28 @@
 	/*
 	 * if we can't bmap, use old VOP code
 	 */
-	if (VOP_BMAP(vp, 0, &bo, 0, NULL, NULL)) {
+	error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
+	if (error == EOPNOTSUPP) {
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		for (i = 0; i < count; i++)
 			if (i != reqpage)
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
-		cnt.v_vnodein++;
-		cnt.v_vnodepgsin++;
+		PCPU_INC(cnt.v_vnodein);
+		PCPU_INC(cnt.v_vnodepgsin);
 		error = vnode_pager_input_old(object, m[reqpage]);
 		VM_OBJECT_UNLOCK(object);
 		return (error);
+	} else if (error != 0) {
+		VM_OBJECT_LOCK(object);
+		vm_page_lock_queues();
+		for (i = 0; i < count; i++)
+			if (i != reqpage)
+				vm_page_free(m[i]);
+		vm_page_unlock_queues();
+		VM_OBJECT_UNLOCK(object);
+		return (VM_PAGER_ERROR);
 
 		/*
 		 * if the blocksize is smaller than a page size, then use
@@ -772,8 +761,8 @@
 				vm_page_free(m[i]);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
-		cnt.v_vnodein++;
-		cnt.v_vnodepgsin++;
+		PCPU_INC(cnt.v_vnodein);
+		PCPU_INC(cnt.v_vnodepgsin);
 		return vnode_pager_input_smlfs(object, m[reqpage]);
 	}
 
@@ -791,6 +780,17 @@
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		return VM_PAGER_OK;
+	} else if (reqblock == -1) {
+		pmap_zero_page(m[reqpage]);
+		vm_page_undirty(m[reqpage]);
+		m[reqpage]->valid = VM_PAGE_BITS_ALL;
+		vm_page_lock_queues();
+		for (i = 0; i < count; i++)
+			if (i != reqpage)
+				vm_page_free(m[i]);
+		vm_page_unlock_queues();
+		VM_OBJECT_UNLOCK(object);
+		return (VM_PAGER_OK);
 	}
 	m[reqpage]->valid = 0;
 	VM_OBJECT_UNLOCK(object);
@@ -804,8 +804,17 @@
 	 * calculate the run that includes the required page
 	 */
 	for (first = 0, i = 0; i < count; i = runend) {
-		firstaddr = vnode_pager_addr(vp,
-			IDX_TO_OFF(m[i]->pindex), &runpg);
+		if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
+		    &runpg) != 0) {
+			VM_OBJECT_LOCK(object);
+			vm_page_lock_queues();
+			for (; i < count; i++)
+				if (i != reqpage)
+					vm_page_free(m[i]);
+			vm_page_unlock_queues();
+			VM_OBJECT_UNLOCK(object);
+			return (VM_PAGER_ERROR);
+		}
 		if (firstaddr == -1) {
 			VM_OBJECT_LOCK(object);
 			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
@@ -852,9 +861,7 @@
 	 * to be zero based...
 	 */
 	if (first != 0) {
-		for (i = first; i < count; i++) {
-			m[i - first] = m[i];
-		}
+		m += first;
 		count -= first;
 		reqpage -= first;
 	}
@@ -906,8 +913,8 @@
 	bp->b_runningbufspace = bp->b_bufsize;
 	atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
-	cnt.v_vnodein++;
-	cnt.v_vnodepgsin += count;
+	PCPU_INC(cnt.v_vnodein);
+	PCPU_ADD(cnt.v_vnodepgsin, count);
 
 	/* do the input */
 	bp->b_iooffset = dbtob(bp->b_blkno);
@@ -977,7 +984,7 @@
 			 * now tell them that it is ok to use
 			 */
 			if (!error) {
-				if (mt->flags & PG_WANTED)
+				if (mt->oflags & VPO_WANTED)
 					vm_page_activate(mt);
 				else
 					vm_page_deactivate(mt);
@@ -1154,8 +1161,8 @@
 	auio.uio_resid = maxsize;
 	auio.uio_td = (struct thread *) 0;
 	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
-	cnt.v_vnodeout++;
-	cnt.v_vnodepgsout += ncount;
+	PCPU_INC(cnt.v_vnodeout);
+	PCPU_ADD(cnt.v_vnodepgsout, ncount);
 
 	if (error) {
 		if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
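For reference, the vnode_pager_addr() change above separates a VOP_BMAP() failure from "no backing block", which the old -1 return value conflated. Condensed from the hunks, the call pattern becomes:

	/* r1.2 style: -1 meant either an error or an unallocated block. */
	fileaddr = vnode_pager_addr(vp, address, NULL);

	/* r1.3 style: hard errors abort; -1 still means a hole to zero-fill. */
	error = vnode_pager_addr(vp, address, &fileaddr, NULL);
	if (error != 0)
		return (VM_PAGER_ERROR);
	if (fileaddr == -1) {
		/* unallocated block: zero-fill the page instead of doing I/O */
	}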
Index: vm_kern.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_kern.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_kern.h -L sys/vm/vm_kern.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_kern.h
+++ sys/vm/vm_kern.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm_kern.h,v 1.28 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_kern.h,v 1.29 2006/11/20 16:23:34 ru Exp $
  */
 
 #ifndef _VM_VM_KERN_H_
@@ -67,7 +67,6 @@
 extern vm_map_t buffer_map;
 extern vm_map_t kernel_map;
 extern vm_map_t kmem_map;
-extern vm_map_t clean_map;
 extern vm_map_t exec_map;
 extern vm_map_t pipe_map;
 extern u_int vm_kmem_size;
Index: uma_dbg.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_dbg.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_dbg.h -L sys/vm/uma_dbg.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma_dbg.h
+++ sys/vm/uma_dbg.h
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/vm/uma_dbg.h,v 1.8.2.1 2005/08/20 13:31:05 rwatson Exp $
+ * $FreeBSD: src/sys/vm/uma_dbg.h,v 1.9 2005/07/16 09:51:52 rwatson Exp $
  *
  */
 
Index: vm.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm.h -L sys/vm/vm.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm.h
+++ sys/vm/vm.h
@@ -55,7 +55,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm.h,v 1.26 2005/04/01 20:00:11 jhb Exp $
+ * $FreeBSD: src/sys/vm/vm.h,v 1.27 2006/07/21 23:22:49 alc Exp $
  */
 
 #ifndef VM_H
@@ -115,19 +115,6 @@
 #endif				/* _KERNEL */
 
 /*
- * Virtual memory MPSAFE temporary workarounds.
- */
-extern int debug_mpsafevm;		/* defined in vm/vm_meter.c */
-#define	VM_LOCK_GIANT() do {						\
-	if (!debug_mpsafevm)						\
-		mtx_lock(&Giant);					\
-} while (0)
-#define	VM_UNLOCK_GIANT() do {						\
-	if (!debug_mpsafevm)						\
-		mtx_unlock(&Giant);					\
-} while (0)
-
-/*
  * Information passed from the machine-independent VM initialization code
  * for use by machine-dependent code (mainly for MMU support)

  */
Index: memguard.h
===================================================================
RCS file: /home/cvs/src/sys/vm/memguard.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/memguard.h -L sys/vm/memguard.h -u -r1.1.1.1 -r1.2
--- sys/vm/memguard.h
+++ sys/vm/memguard.h
@@ -23,9 +23,12 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/vm/memguard.h,v 1.2 2005/02/16 21:45:59 bmilekic Exp $
+ * $FreeBSD: src/sys/vm/memguard.h,v 1.3 2005/12/30 11:45:07 pjd Exp $
  */
 
+extern u_int vm_memguard_divisor;
+
 void	memguard_init(vm_map_t parent_map, unsigned long size);
 void 	*memguard_alloc(unsigned long size, int flags);
 void	memguard_free(void *addr);
+int	memguard_cmp(struct malloc_type *mtp);
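For reference, memguard_cmp() reads as a policy hook that lets the kernel allocator decide, per malloc type, whether a request should be redirected into the guarded map sized by vm_memguard_divisor. A minimal sketch of the assumed consumer; the actual kern_malloc.c call sites are not part of this commit, and from_memguard() is a placeholder predicate:

	/* Sketch only: in the allocation path, before the normal UMA path. */
	if (memguard_cmp(mtp))
		return (memguard_alloc(size, flags));

	/* Sketch only: in the free path, for addresses handed out above. */
	if (from_memguard(addr)) {		/* placeholder predicate */
		memguard_free(addr);
		return;
	}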
Index: pmap.h
===================================================================
RCS file: /home/cvs/src/sys/vm/pmap.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/pmap.h -L sys/vm/pmap.h -u -r1.1.1.1 -r1.2
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/pmap.h,v 1.71.2.1 2005/11/13 21:45:48 alc Exp $
+ * $FreeBSD: src/sys/vm/pmap.h,v 1.79.4.1 2008/01/19 18:15:07 kib Exp $
  */
 
 /*
@@ -90,8 +90,6 @@
  */
 extern vm_offset_t kernel_vm_end;
 
-extern	int	 pmap_pagedaemon_waken;
-
 void		 pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
 void		 pmap_clear_modify(vm_page_t m);
 void		 pmap_clear_reference(vm_page_t m);
@@ -99,8 +97,10 @@
 void		 pmap_copy_page(vm_page_t, vm_page_t);
 void		 pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
 		    boolean_t);
-vm_page_t	 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
-		    vm_prot_t prot, vm_page_t mpte);
+void	 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
+		    vm_prot_t prot);
+void		 pmap_enter_object(pmap_t pmap, vm_offset_t start,
+		    vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
 vm_paddr_t	 pmap_extract(pmap_t pmap, vm_offset_t va);
 vm_page_t	 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va,
 		    vm_prot_t prot);
@@ -114,8 +114,7 @@
 		    vm_object_t object, vm_pindex_t pindex, vm_size_t size);
 boolean_t	 pmap_page_exists_quick(pmap_t pmap, vm_page_t m);
 void		 pmap_page_init(vm_page_t m);
-void		 pmap_page_protect(vm_page_t m, vm_prot_t prot);
-void		 pmap_pinit(pmap_t);
+int		 pmap_pinit(pmap_t);
 void		 pmap_pinit0(pmap_t);
 void		 pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
 void		 pmap_qenter(vm_offset_t, vm_page_t *, int);
@@ -123,14 +122,14 @@
 void		 pmap_release(pmap_t);
 void		 pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
 void		 pmap_remove_all(vm_page_t m);
-void		 pmap_remove_pages(pmap_t, vm_offset_t, vm_offset_t);
+void		 pmap_remove_pages(pmap_t);
+void		 pmap_remove_write(vm_page_t m);
 void		 pmap_zero_page(vm_page_t);
 void		 pmap_zero_page_area(vm_page_t, int off, int size);
 void		 pmap_zero_page_idle(vm_page_t);
 int		 pmap_mincore(pmap_t pmap, vm_offset_t addr);
 void		 pmap_activate(struct thread *td);
 vm_offset_t	 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size);
-void		 pmap_init2(void);
 
 #define	pmap_resident_count(pm)	((pm)->pm_stats.resident_count)
 #define	pmap_wired_count(pm)	((pm)->pm_stats.wired_count)
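For reference, two of the interface changes above shift work onto callers: pmap_pinit() can now report failure instead of always succeeding, and pmap_enter_quick() no longer threads an mpte page pointer through the caller. A minimal sketch of the adjusted call sites; the failure handling shown for pmap_pinit() is an assumption, while the pmap_enter_quick() pattern mirrors the vm_fault.c hunk later in this mail:

	/* pmap_pinit() now returns int; treat 0 as failure (assumed). */
	if (pmap_pinit(vmspace_pmap(vm)) == 0) {
		/* back out: not enough memory for page table pages */
	}

	/* Speculative prefault mappings: no mpte handle any more. */
	vm_page_lock_queues();
	pmap_enter_quick(pmap, addr, m, entry->protection);
	vm_page_unlock_queues();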
Index: uma_core.c
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_core.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/vm/uma_core.c -L sys/vm/uma_core.c -u -r1.1.1.2 -r1.2
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
- * Copyright (c) 2004-2005 Robert N. M. Watson
+ * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -48,7 +48,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/uma_core.c,v 1.119.2.15 2006/02/14 03:37:58 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/uma_core.c,v 1.147.2.1 2007/10/18 18:45:17 jhb Exp $");
 
 /* I should really use ktr.. */
 /*
@@ -111,6 +111,9 @@
  */
 static uma_zone_t hashzone;
 
+/* The boot-time adjusted value for cache line alignment. */
+static int uma_align_cache = 16 - 1;
+
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
 /*
@@ -238,27 +241,21 @@
 static int uma_zalloc_bucket(uma_zone_t zone, int flags);
 static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
 static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
-static void zone_drain(uma_zone_t);
 static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, u_int32_t flags);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
-static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
 #ifdef WITNESS
 static int nosleepwithlocks = 1;
-SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
-    0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
 #else
 static int nosleepwithlocks = 0;
+#endif
 SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
     0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
-#endif
-SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
-    NULL, 0, sysctl_vm_zone, "A", "Zone Info");
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
@@ -685,7 +682,7 @@
  * Returns:
  *	Nothing.
  */
-static void
+void
 zone_drain(uma_zone_t zone)
 {
 	struct slabhead freeslabs = { 0 };
@@ -1713,13 +1710,22 @@
 	args.size = size;
 	args.uminit = uminit;
 	args.fini = fini;
-	args.align = align;
+	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
 	args.flags = flags;
 	args.zone = zone;
 	return (uma_zalloc_internal(kegs, &args, M_WAITOK));
 }
 
 /* See uma.h */
+void
+uma_set_align(int align)
+{
+
+	if (align != UMA_ALIGN_CACHE)
+		uma_align_cache = align;
+}
+
+/* See uma.h */
 uma_zone_t
 uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
 		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
@@ -1777,7 +1783,6 @@
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int cpu;
-	int badness;
 
 	/* This is the fast path allocation */
 #ifdef UMA_DEBUG_ALLOC_1
@@ -1786,29 +1791,9 @@
 	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
 	    zone->uz_name, flags);
 
-	if (!(flags & M_NOWAIT)) {
-		KASSERT(curthread->td_intr_nesting_level == 0,
-		   ("malloc(M_WAITOK) in interrupt context"));
-		if (nosleepwithlocks) {
-#ifdef WITNESS
-			badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
-			    NULL,
-			    "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
-			    zone->uz_name);
-#else
-			badness = 1;
-#endif
-		} else {
-			badness = 0;
-#ifdef WITNESS
-			WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-			    "malloc(M_WAITOK) of \"%s\"", zone->uz_name);
-#endif
-		}
-		if (badness) {
-			flags &= ~M_WAITOK;
-			flags |= M_NOWAIT;
-		}
+	if (flags & M_WAITOK) {
+		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
 
 	/*
@@ -1981,7 +1966,7 @@
 	 * caller can't handle that. 
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
-		if ((zone != slabzone) && (zone != slabrefzone))
+		if (zone != slabzone && zone != slabrefzone && zone != zones)
 			return (NULL);
 
 	slab = NULL;
@@ -2417,8 +2402,7 @@
 	 * If nothing else caught this, we'll just do an internal free.
 	 */
 zfree_internal:
-	uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL |
-	    ZFREE_STATFREE);
+	uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
 
 	return;
 }
@@ -2502,8 +2486,13 @@
 		if (keg->uk_pages < keg->uk_maxpages)
 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
 
-		/* We can handle one more allocation */
-		wakeup_one(keg);
+		/* 
+		 * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
+		 * wake up all procs blocked on pages. This should be uncommon, so 
+		 * keeping this simple for now (rather than adding count of blocked 
+		 * threads etc).
+		 */
+		wakeup(keg);
 	}
 
 	ZONE_UNLOCK(zone);
@@ -2689,6 +2678,24 @@
 	bucket_zone_drain();
 }
 
+/* See uma.h */
+int
+uma_zone_exhausted(uma_zone_t zone)
+{
+	int full;
+
+	ZONE_LOCK(zone);
+	full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL);
+	ZONE_UNLOCK(zone);
+	return (full);	
+}
+
+int
+uma_zone_exhausted_nolock(uma_zone_t zone)
+{
+	return (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL);
+}
+
 void *
 uma_large_malloc(int size, int wait)
 {
@@ -2776,6 +2783,7 @@
 	}
 }
 
+#ifdef DDB
 /*
  * Generate statistics across both the zone and its per-cpu cache's.  Return
  * desired statistics if the pointer is non-NULL for that statistic.
@@ -2817,83 +2825,7 @@
 	if (freesp != NULL)
 		*freesp = frees;
 }
-
-/*
- * Sysctl handler for vm.zone
- *
- * stolen from vm_zone.c
- */
-static int
-sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
-{
-	int error, len, cnt;
-	const int linesize = 128;	/* conservative */
-	int totalfree;
-	char *tmpbuf, *offset;
-	uma_zone_t z;
-	uma_keg_t zk;
-	char *p;
-	int cachefree;
-	uma_bucket_t bucket;
-	u_int64_t allocs, frees;
-
-	cnt = 0;
-	mtx_lock(&uma_mtx);
-	LIST_FOREACH(zk, &uma_kegs, uk_link) {
-		LIST_FOREACH(z, &zk->uk_zones, uz_link)
-			cnt++;
-	}
-	mtx_unlock(&uma_mtx);
-	MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
-			M_TEMP, M_WAITOK);
-	len = snprintf(tmpbuf, linesize,
-	    "\nITEM            SIZE     LIMIT     USED    FREE  REQUESTS\n\n");
-	if (cnt == 0)
-		tmpbuf[len - 1] = '\0';
-	error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
-	if (error || cnt == 0)
-		goto out;
-	offset = tmpbuf;
-	mtx_lock(&uma_mtx);
-	LIST_FOREACH(zk, &uma_kegs, uk_link) {
-	  LIST_FOREACH(z, &zk->uk_zones, uz_link) {
-		if (cnt == 0)	/* list may have changed size */
-			break;
-		ZONE_LOCK(z);
-		cachefree = 0;
-		if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
-			uma_zone_sumstat(z, &cachefree, &allocs, &frees);
-		} else {
-			allocs = z->uz_allocs;
-			frees = z->uz_frees;
-		}
-
-		LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
-			cachefree += bucket->ub_cnt;
-		}
-		totalfree = zk->uk_free + cachefree;
-		len = snprintf(offset, linesize,
-		    "%-12.12s  %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
-		    z->uz_name, zk->uk_size,
-		    zk->uk_maxpages * zk->uk_ipers,
-		    (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
-		    totalfree,
-		    (unsigned long long)allocs);
-		ZONE_UNLOCK(z);
-		for (p = offset + 12; p > offset && *p == ' '; --p)
-			/* nothing */ ;
-		p[1] = ':';
-		cnt--;
-		offset += len;
-	  }
-	}
-	mtx_unlock(&uma_mtx);
-	*offset++ = '\0';
-	error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
-out:
-	FREE(tmpbuf, M_TEMP);
-	return (error);
-}
+#endif /* DDB */
 
 static int
 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
@@ -3055,8 +2987,8 @@
 	uma_zone_t z;
 	int cachefree;
 
-	db_printf("%18s %12s %12s %12s %8s\n", "Zone", "Allocs", "Frees",
-	    "Used", "Cache");
+	db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free",
+	    "Requests");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
@@ -3071,8 +3003,10 @@
 				cachefree += kz->uk_free;
 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
 				cachefree += bucket->ub_cnt;
-			db_printf("%18s %12ju %12ju %12ju %8d\n", z->uz_name,
-			    allocs, frees, allocs - frees, cachefree);
+			db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name,
+			    (uintmax_t)kz->uk_size,
+			    (intmax_t)(allocs - frees), cachefree,
+			    (uintmax_t)allocs);
 		}
 	}
 }
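For reference, the uma_align_cache/uma_set_align() changes above turn UMA_ALIGN_CACHE into a sentinel that is resolved at keg creation time, so machine-dependent startup code can install the real cache line mask before any zones exist. A minimal sketch; the cacheline_size variable, the boot-time call site, and the example zone are all assumptions:

	/* Sketch only: MD boot code, before VM/UMA zones are created. */
	extern int cacheline_size;		/* assumed MD-provided value */

	uma_set_align(cacheline_size - 1);

	/* Later zone creation picks the installed mask up transparently. */
	zone = uma_zcreate("example", sizeof(struct example), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_CACHE, 0);	/* placeholder zone */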
Index: vm_pageq.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_pageq.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_pageq.c -L sys/vm/vm_pageq.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_pageq.c
+++ sys/vm/vm_pageq.c
@@ -26,13 +26,15 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_pageq.c,v 1.18 2005/06/10 03:33:36 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_pageq.c,v 1.35 2007/09/25 06:25:06 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/linker_set.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
@@ -44,21 +46,16 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_extern.h>
 
-struct vpgqueues vm_page_queues[PQ_COUNT];
+struct vpgqueues vm_page_queues[PQ_MAXCOUNT];
 
 void
-vm_pageq_init(void) 
+vm_pageq_init(void)
 {
 	int i;
 
-	for (i = 0; i < PQ_L2_SIZE; i++) {
-		vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
-	}
-	for (i = 0; i < PQ_L2_SIZE; i++) {
-		vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
-	}
 	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
 	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
 	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
@@ -71,7 +68,7 @@
 void
 vm_pageq_requeue(vm_page_t m)
 {
-	int queue = m->queue;
+	int queue = VM_PAGE_GETQUEUE(m);
 	struct vpgqueues *vpq;
 
 	if (queue != PQ_NONE) {
@@ -90,84 +87,9 @@
 	struct vpgqueues *vpq;
 
 	vpq = &vm_page_queues[queue];
-	m->queue = queue;
+	VM_PAGE_SETQUEUE2(m, queue);
 	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
 	++*vpq->cnt;
-	++vpq->lcnt;
-}
-
-/*
- *	vm_add_new_page:
- *
- *	Add a new page to the freelist for use by the system.
- */
-vm_page_t
-vm_pageq_add_new_page(vm_paddr_t pa)
-{
-	vm_paddr_t bad;
-	vm_page_t m;
-	char *cp, *list, *pos;
-
-	GIANT_REQUIRED;
-
-	/*
-	 * See if a physical address in this page has been listed
-	 * in the blacklist tunable.  Entries in the tunable are
-	 * separated by spaces or commas.  If an invalid integer is
-	 * encountered then the rest of the string is skipped.
-	 */
-	if (testenv("vm.blacklist")) {
-		list = getenv("vm.blacklist");
-		for (pos = list; *pos != '\0'; pos = cp) {
-			bad = strtoq(pos, &cp, 0);
-			if (*cp != '\0') {
-				if (*cp == ' ' || *cp == ',') {
-					cp++;
-					if (cp == pos)
-						continue;
-				} else
-					break;
-			}
-			if (pa == trunc_page(bad)) {
-				printf("Skipping page with pa 0x%jx\n",
-				    (uintmax_t)pa);
-				freeenv(list);
-				return (NULL);
-			}
-		}
-		freeenv(list);
-	}
-
-	++cnt.v_page_count;
-	m = PHYS_TO_VM_PAGE(pa);
-	m->phys_addr = pa;
-	m->flags = 0;
-	m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
-	pmap_page_init(m);
-	vm_pageq_enqueue(m->pc + PQ_FREE, m);
-	return (m);
-}
-
-/*
- * vm_pageq_remove_nowakeup:
- *
- * 	vm_page_unqueue() without any wakeup
- *
- *	The queue containing the given page must be locked.
- *	This routine may not block.
- */
-void
-vm_pageq_remove_nowakeup(vm_page_t m)
-{
-	int queue = m->queue;
-	struct vpgqueues *pq;
-	if (queue != PQ_NONE) {
-		pq = &vm_page_queues[queue];
-		m->queue = PQ_NONE;
-		TAILQ_REMOVE(&pq->pl, m, pageq);
-		(*pq->cnt)--;
-		pq->lcnt--;
-	}
 }
 
 /*
@@ -181,87 +103,13 @@
 void
 vm_pageq_remove(vm_page_t m)
 {
-	int queue = m->queue;
+	int queue = VM_PAGE_GETQUEUE(m);
 	struct vpgqueues *pq;
 
 	if (queue != PQ_NONE) {
-		m->queue = PQ_NONE;
+		VM_PAGE_SETQUEUE2(m, PQ_NONE);
 		pq = &vm_page_queues[queue];
 		TAILQ_REMOVE(&pq->pl, m, pageq);
 		(*pq->cnt)--;
-		pq->lcnt--;
-		if ((queue - m->pc) == PQ_CACHE) {
-			if (vm_paging_needed())
-				pagedaemon_wakeup();
-		}
-	}
-}
-
-#if PQ_L2_SIZE > 1
-
-/*
- *	vm_pageq_find:
- *
- *	Find a page on the specified queue with color optimization.
- *
- *	The page coloring optimization attempts to locate a page
- *	that does not overload other nearby pages in the object in
- *	the cpu's L2 cache.  We need this optimization because cpu
- *	caches tend to be physical caches, while object spaces tend 
- *	to be virtual.
- *
- *	The specified queue must be locked.
- *	This routine may not block.
- *
- *	This routine may only be called from the vm_pageq_find()
- *	function in this file.
- */
-static __inline vm_page_t
-_vm_pageq_find(int basequeue, int index)
-{
-	int i;
-	vm_page_t m = NULL;
-	struct vpgqueues *pq;
-
-	pq = &vm_page_queues[basequeue];
-
-	/*
-	 * Note that for the first loop, index+i and index-i wind up at the
-	 * same place.  Even though this is not totally optimal, we've already
-	 * blown it by missing the cache case so we do not care.
-	 */
-	for (i = PQ_L2_SIZE / 2; i > 0; --i) {
-		if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
-			break;
-
-		if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
-			break;
 	}
-	return (m);
 }
-#endif		/* PQ_L2_SIZE > 1 */
-
-vm_page_t
-vm_pageq_find(int basequeue, int index, boolean_t prefer_zero)
-{
-        vm_page_t m;
-
-#if PQ_L2_SIZE > 1
-        if (prefer_zero) {
-                m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist);
-        } else {
-                m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
-        }
-        if (m == NULL) {
-                m = _vm_pageq_find(basequeue, index);
-	}
-#else
-        if (prefer_zero) {
-                m = TAILQ_LAST(&vm_page_queues[basequeue].pl, pglist);
-        } else {
-                m = TAILQ_FIRST(&vm_page_queues[basequeue].pl);
-        }
-#endif
-        return (m);
-}
-
Index: uma.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma.h -L sys/vm/uma.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/vm/uma.h,v 1.22.2.6 2005/11/13 08:44:24 alc Exp $
+ * $FreeBSD: src/sys/vm/uma.h,v 1.31 2007/02/11 20:13:52 rwatson Exp $
  *
  */
 
@@ -48,6 +48,8 @@
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
+void zone_drain(uma_zone_t);
+
 /* 
  * Item constructor
  *
@@ -234,7 +236,7 @@
 #define UMA_ALIGN_INT	(sizeof(int) - 1)	/* "" int */
 #define UMA_ALIGN_SHORT	(sizeof(short) - 1)	/* "" short */
 #define UMA_ALIGN_CHAR	(sizeof(char) - 1)	/* "" char */
-#define UMA_ALIGN_CACHE	(16 - 1)		/* Cache line size align */
+#define UMA_ALIGN_CACHE	(0 - 1)			/* Cache line size align */
 
 /*
  * Destroys an empty uma zone.  If the zone is not empty uma complains loudly.
@@ -386,6 +388,18 @@
 void uma_reclaim(void);
 
 /*
+ * Sets the alignment mask to be used for all zones requesting cache
+ * alignment.  Should be called by MD boot code prior to starting VM/UMA.
+ *
+ * Arguments:
+ *	align The alignment mask
+ *
+ * Returns:
+ *	Nothing
+ */
+void uma_set_align(int align);
+
+/*
  * Switches the backing object of a zone
  *
  * Arguments:
@@ -509,6 +523,18 @@
 u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
 
 /*
+ * Used to determine if a fixed-size zone is exhausted.
+ *
+ * Arguments:
+ *	zone    The zone to check
+ *
+ * Returns:
+ * 	Non-zero if zone is exhausted.
+ */
+int uma_zone_exhausted(uma_zone_t zone);
+int uma_zone_exhausted_nolock(uma_zone_t zone);
+
+/*
  * Exported statistics structures to be used by user space monitoring tools.
  * Statistics stream consists of a uma_stream_header, followed by a series of
  * alternative uma_type_header and uma_type_stat structures.  Statistics
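For reference, uma_zone_exhausted() is documented above as a check for fixed-size (capped) zones; a minimal sketch of the intended use, with the zone and the reaction to exhaustion as placeholders:

	/* Sketch only: back off instead of sleeping when a capped zone is full. */
	if (uma_zone_exhausted(example_zone))
		return (ENOMEM);		/* placeholder reaction */
	item = uma_zalloc(example_zone, M_NOWAIT);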
Index: vm_meter.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_meter.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_meter.c -L sys/vm/vm_meter.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_meter.c,v 1.85 2005/05/08 23:56:16 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_meter.c,v 1.96 2007/07/27 20:01:21 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -52,18 +52,6 @@
 #include <vm/vm_object.h>
 #include <sys/sysctl.h>
 
-/*
- * Virtual memory MPSAFE temporary workarounds.
- */
-#if !defined(__arm__) && !defined(__powerpc__)
-int debug_mpsafevm = 1;
-#else
-int debug_mpsafevm;
-#endif
-TUNABLE_INT("debug.mpsafevm", &debug_mpsafevm);
-SYSCTL_INT(_debug, OID_AUTO, mpsafevm, CTLFLAG_RD, &debug_mpsafevm, 0,
-    "Enable/disable MPSAFE virtual memory support");
-
 struct vmmeter cnt;
 
 int maxslp = MAXSLP;
@@ -109,15 +97,15 @@
 {
 /* XXXKSE almost completely broken */
 	struct proc *p;
-	struct vmtotal total, *totalp;
+	struct vmtotal total;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_map_t map;
 	int paging;
 	struct thread *td;
+	struct vmspace *vm;
 
-	totalp = &total;
-	bzero(totalp, sizeof *totalp);
+	bzero(&total, sizeof(total));
 	/*
 	 * Mark all objects as inactive.
 	 */
@@ -143,49 +131,58 @@
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_flag & P_SYSTEM)
 			continue;
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		switch (p->p_state) {
 		case PRS_NEW:
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			continue;
 			break;
 		default:
 			FOREACH_THREAD_IN_PROC(p, td) {
 				/* Need new statistics  XXX */
+				thread_lock(td);
 				switch (td->td_state) {
 				case TDS_INHIBITED:
+					/*
+					 * XXX stats no longer synchronized.
+					 */
 					if (TD_ON_LOCK(td) ||
 					    (td->td_inhibitors ==
 					    TDI_SWAPPED)) {
-						totalp->t_sw++;
+						total.t_sw++;
 					} else if (TD_IS_SLEEPING(td) ||
 					   TD_AWAITING_INTR(td) ||
 					   TD_IS_SUSPENDED(td)) {
 						if (td->td_priority <= PZERO)
-							totalp->t_dw++;
+							total.t_dw++;
 						else
-							totalp->t_sl++;
+							total.t_sl++;
 					}
 					break;
 
 				case TDS_CAN_RUN:
-					totalp->t_sw++;
+					total.t_sw++;
 					break;
 				case TDS_RUNQ:
 				case TDS_RUNNING:
-					totalp->t_rq++;
+					total.t_rq++;
+					thread_unlock(td);
 					continue;
 				default:
 					break;
 				}
+				thread_unlock(td);
 			}
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		/*
 		 * Note active objects.
 		 */
 		paging = 0;
-		map = &p->p_vmspace->vm_map;
+		vm = vmspace_acquire_ref(p);
+		if (vm == NULL)
+			continue;
+		map = &vm->vm_map;
 		vm_map_lock_read(map);
 		for (entry = map->header.next;
 		    entry != &map->header; entry = entry->next) {
@@ -198,8 +195,9 @@
 			VM_OBJECT_UNLOCK(object);
 		}
 		vm_map_unlock_read(map);
+		vmspace_free(vm);
 		if (paging)
-			totalp->t_pw++;
+			total.t_pw++;
 	}
 	sx_sunlock(&allproc_lock);
 	/*
@@ -219,25 +217,32 @@
 			 */
 			continue;
 		}
-		totalp->t_vm += object->size;
-		totalp->t_rm += object->resident_page_count;
+		if (object->ref_count == 0) {
+			/*
+			 * Also skip unreferenced objects, including
+			 * vnodes representing mounted file systems.
+			 */
+			continue;
+		}
+		total.t_vm += object->size;
+		total.t_rm += object->resident_page_count;
 		if (object->flags & OBJ_ACTIVE) {
-			totalp->t_avm += object->size;
-			totalp->t_arm += object->resident_page_count;
+			total.t_avm += object->size;
+			total.t_arm += object->resident_page_count;
 		}
 		if (object->shadow_count > 1) {
 			/* shared object */
-			totalp->t_vmshr += object->size;
-			totalp->t_rmshr += object->resident_page_count;
+			total.t_vmshr += object->size;
+			total.t_rmshr += object->resident_page_count;
 			if (object->flags & OBJ_ACTIVE) {
-				totalp->t_avmshr += object->size;
-				totalp->t_armshr += object->resident_page_count;
+				total.t_avmshr += object->size;
+				total.t_armshr += object->resident_page_count;
 			}
 		}
 	}
 	mtx_unlock(&vm_object_list_mtx);
-	totalp->t_free = cnt.v_free_count + cnt.v_cache_count;
-	return (sysctl_handle_opaque(oidp, totalp, sizeof total, req));
+	total.t_free = cnt.v_free_count + cnt.v_cache_count;
+	return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
 }
 
 /*
@@ -324,6 +329,8 @@
 	&cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD,
 	&cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans");
+SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tcached, CTLTYPE_UINT|CTLFLAG_RD,
+	&cnt.v_tcached, 0, vcnt, "IU", "Total pages cached");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD,
 	&cnt.v_dfree, 0, vcnt, "IU", "");
 SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD,
@@ -379,13 +386,3 @@
 
 SYSCTL_INT(_vm_stats_misc, OID_AUTO,
 	zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, "");
-#if 0
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
-	page_mask, CTLFLAG_RD, &page_mask, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
-	page_shift, CTLFLAG_RD, &page_shift, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
-	first_page, CTLFLAG_RD, &first_page, 0, "");
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
-	last_page, CTLFLAG_RD, &last_page, 0, "");
-#endif
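For reference, the vmtotal() hunks above stop dereferencing p->p_vmspace directly and instead take a counted reference, so the vmspace cannot be torn down while its map is being scanned. Condensed from the hunk, the pattern inside the process loop is:

	vm = vmspace_acquire_ref(p);	/* NULL if the vmspace is going away */
	if (vm == NULL)
		continue;
	map = &vm->vm_map;
	vm_map_lock_read(map);
	/* ... walk the map entries and mark their objects active ... */
	vm_map_unlock_read(map);
	vmspace_free(vm);		/* drop the reference taken above */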
Index: vm_fault.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_fault.c -L sys/vm/vm_fault.c -u -r1.2 -r1.3
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -72,7 +72,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_fault.c,v 1.205.2.4 2006/03/08 23:53:39 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_fault.c,v 1.237 2007/10/08 20:09:53 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -130,17 +130,17 @@
 	struct vnode *vp;
 };
 
-static __inline void
+static inline void
 release_page(struct faultstate *fs)
 {
-	vm_page_lock_queues();
 	vm_page_wakeup(fs->m);
+	vm_page_lock_queues();
 	vm_page_deactivate(fs->m);
 	vm_page_unlock_queues();
 	fs->m = NULL;
 }
 
-static __inline void
+static inline void
 unlock_map(struct faultstate *fs)
 {
 	if (fs->lookup_still_valid) {
@@ -152,7 +152,6 @@
 static void
 unlock_and_deallocate(struct faultstate *fs)
 {
-	boolean_t firstobjneedgiant;
 
 	vm_object_pip_wakeup(fs->object);
 	VM_OBJECT_UNLOCK(fs->object);
@@ -165,7 +164,6 @@
 		VM_OBJECT_UNLOCK(fs->first_object);
 		fs->first_m = NULL;
 	}
-	firstobjneedgiant = (fs->first_object->flags & OBJ_NEEDGIANT) != 0;
 	vm_object_deallocate(fs->first_object);
 	unlock_map(fs);	
 	if (fs->vp != NULL) { 
@@ -176,8 +174,6 @@
 		fs->vp = NULL;
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
-	if (firstobjneedgiant)
-		VM_UNLOCK_GIANT();
 }
 
 /*
@@ -223,7 +219,7 @@
 
 	hardfault = 0;
 	growstack = TRUE;
-	atomic_add_int(&cnt.v_vm_faults, 1);
+	PCPU_INC(cnt.v_vm_faults);
 
 RetryFault:;
 
@@ -302,7 +298,7 @@
 	KASSERT((fs.first_object->flags & OBJ_NEEDGIANT) == 0 ||
 	    !fs.map->system_map,
 	    ("vm_fault: Object requiring giant mapped by system map"));
-	if (fs.first_object->flags & OBJ_NEEDGIANT && debug_mpsafevm)
+	if (fs.first_object->flags & OBJ_NEEDGIANT)
 		mtx_unlock(&Giant);
 	vm_object_pip_add(fs.first_object, 1);
 
@@ -332,8 +328,6 @@
 		 */
 		fs.m = vm_page_lookup(fs.object, fs.pindex);
 		if (fs.m != NULL) {
-			int queue;
-
 			/* 
 			 * check for page-based copy on write.
 			 * We check fs.object == fs.first_object so
@@ -355,7 +349,7 @@
 
 			/*
 			 * Wait/Retry if the page is busy.  We have to do this
-			 * if the page is busy via either PG_BUSY or 
+			 * if the page is busy via either VPO_BUSY or 
 			 * vm_page_t->busy because the vm_pager may be using
 			 * vm_page_t->busy for pageouts ( and even pageins if
 			 * it is the vnode pager ), and we could end up trying
@@ -369,7 +363,7 @@
 			 * around with a vm_page_t->busy page except, perhaps,
 			 * to pmap it.
 			 */
-			if ((fs.m->flags & PG_BUSY) || fs.m->busy) {
+			if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(fs.object);
 				if (fs.object != fs.first_object) {
@@ -393,30 +387,17 @@
 				VM_OBJECT_LOCK(fs.object);
 				if (fs.m == vm_page_lookup(fs.object,
 				    fs.pindex)) {
-					vm_page_lock_queues();
-					if (!vm_page_sleep_if_busy(fs.m, TRUE,
-					    "vmpfw"))
-						vm_page_unlock_queues();
+					vm_page_sleep_if_busy(fs.m, TRUE,
+					    "vmpfw");
 				}
 				vm_object_pip_wakeup(fs.object);
 				VM_OBJECT_UNLOCK(fs.object);
-				atomic_add_int(&cnt.v_intrans, 1);
-				if (fs.first_object->flags & OBJ_NEEDGIANT)
-					VM_UNLOCK_GIANT();
+				PCPU_INC(cnt.v_intrans);
 				vm_object_deallocate(fs.first_object);
 				goto RetryFault;
 			}
-			queue = fs.m->queue;
-
-			vm_pageq_remove_nowakeup(fs.m);
-
-			if ((queue - fs.m->pc) == PQ_CACHE && vm_page_count_severe()) {
-				vm_page_activate(fs.m);
-				vm_page_unlock_queues();
-				unlock_and_deallocate(&fs);
-				VM_WAITPFAULT;
-				goto RetryFault;
-			}
+			vm_pageq_remove(fs.m);
+			vm_page_unlock_queues();
 
 			/*
 			 * Mark page busy for other processes, and the 
@@ -425,7 +406,6 @@
 			 * found the page ).
 			 */
 			vm_page_busy(fs.m);
-			vm_page_unlock_queues();
 			if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
 				fs.m->object != kernel_object && fs.m->object != kmem_object) {
 				goto readrest;
@@ -456,7 +436,8 @@
 				unlock_and_deallocate(&fs);
 				VM_WAITPFAULT;
 				goto RetryFault;
-			}
+			} else if ((fs.m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
+				break;
 		}
 
 readrest:
@@ -471,7 +452,7 @@
 		 */
 		if (TRYPAGER) {
 			int rv;
-			int reqpage;
+			int reqpage = 0;
 			int ahead, behind;
 			u_char behavior = vm_map_entry_behavior(fs.entry);
 
@@ -517,7 +498,8 @@
 					if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL))
 						break;
 					if (mt->busy ||
-						(mt->flags & (PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED)) ||
+					    (mt->oflags & VPO_BUSY) ||
+					    (mt->flags & (PG_FICTITIOUS | PG_UNMANAGED)) ||
 						mt->hold_count ||
 						mt->wire_count) 
 						continue;
@@ -546,7 +528,7 @@
 			 * return value is the index into the marray for the
 			 * vm_page_t passed to the routine.
 			 *
-			 * fs.m plus the additional pages are PG_BUSY'd.
+			 * fs.m plus the additional pages are VPO_BUSY'd.
 			 *
 			 * XXX vm_fault_additional_pages() can block
 			 * without releasing the map lock.
@@ -566,7 +548,7 @@
 			/*
 			 * Call the pager to retrieve the data, if any, after
 			 * releasing the lock on the map.  We hold a ref on
-			 * fs.object and the pages are PG_BUSY'd.
+			 * fs.object and the pages are VPO_BUSY'd.
 			 */
 			unlock_map(&fs);
 
@@ -674,9 +656,9 @@
 			if ((fs.m->flags & PG_ZERO) == 0) {
 				pmap_zero_page(fs.m);
 			} else {
-				atomic_add_int(&cnt.v_ozfod, 1);
+				PCPU_INC(cnt.v_ozfod);
 			}
-			atomic_add_int(&cnt.v_zfod, 1);
+			PCPU_INC(cnt.v_zfod);
 			fs.m->valid = VM_PAGE_BITS_ALL;
 			break;	/* break to PAGE HAS BEEN FOUND */
 		} else {
@@ -691,7 +673,7 @@
 		}
 	}
 
-	KASSERT((fs.m->flags & PG_BUSY) != 0,
+	KASSERT((fs.m->oflags & VPO_BUSY) != 0,
 	    ("vm_fault: not busy after main loop"));
 
 	/*
@@ -747,7 +729,6 @@
 				/*
 				 * get rid of the unnecessary page
 				 */
-				pmap_remove_all(fs.first_m);
 				vm_page_free(fs.first_m);
 				/*
 				 * grab the page and put it into the 
@@ -755,11 +736,11 @@
 				 * automatically made dirty.
 				 */
 				vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
-				vm_page_busy(fs.m);
 				vm_page_unlock_queues();
+				vm_page_busy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
-				atomic_add_int(&cnt.v_cow_optim, 1);
+				PCPU_INC(cnt.v_cow_optim);
 			} else {
 				/*
 				 * Oh, well, lets copy it.
@@ -787,7 +768,7 @@
 			fs.m = fs.first_m;
 			if (!is_first_object_locked)
 				VM_OBJECT_LOCK(fs.object);
-			atomic_add_int(&cnt.v_cow_faults, 1);
+			PCPU_INC(cnt.v_cow_faults);
 		} else {
 			prot &= ~VM_PROT_WRITE;
 		}
@@ -813,7 +794,7 @@
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
 
 			/*
-			 * If we don't need the page any longer, put it on the active
+			 * If we don't need the page any longer, put it on the inactive
 			 * list (the easiest thing to do here).  If no one needs it,
 			 * pageout will grab it eventually.
 			 */
@@ -848,16 +829,14 @@
 		}
 	}
 	if (prot & VM_PROT_WRITE) {
-		vm_page_lock_queues();
-		vm_page_flag_set(fs.m, PG_WRITEABLE);
-		vm_object_set_writeable_dirty(fs.m->object);
+		vm_object_set_writeable_dirty(fs.object);
 
 		/*
 		 * If the fault is a write, we know that this page is being
 		 * written NOW so dirty it explicitly to save on 
 		 * pmap_is_modified() calls later.
 		 *
-		 * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
+		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
 		 * if the page is already dirty to prevent data written with
 		 * the expectation of being synced from not being synced.
 		 * Likewise if this entry does not request NOSYNC then make
@@ -869,11 +848,10 @@
 		 */
 		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
 			if (fs.m->dirty == 0)
-				vm_page_flag_set(fs.m, PG_NOSYNC);
+				fs.m->oflags |= VPO_NOSYNC;
 		} else {
-			vm_page_flag_clear(fs.m, PG_NOSYNC);
+			fs.m->oflags &= ~VPO_NOSYNC;
 		}
-		vm_page_unlock_queues();
 		if (fault_flags & VM_FAULT_DIRTY) {
 			vm_page_dirty(fs.m);
 			vm_pager_page_unswapped(fs.m);
@@ -883,7 +861,7 @@
 	/*
 	 * Page had better still be busy
 	 */
-	KASSERT(fs.m->flags & PG_BUSY,
+	KASSERT(fs.m->oflags & VPO_BUSY,
 		("vm_fault: page %p not busy!", fs.m));
 	/*
 	 * Sanity check: page must be completely valid or it is not fit to
@@ -921,22 +899,17 @@
 	} else {
 		vm_page_activate(fs.m);
 	}
-	vm_page_wakeup(fs.m);
 	vm_page_unlock_queues();
+	vm_page_wakeup(fs.m);
 
 	/*
 	 * Unlock everything, and return
 	 */
 	unlock_and_deallocate(&fs);
-	PROC_LOCK(curproc);
-	if ((curproc->p_sflag & PS_INMEM) && curproc->p_stats) {
-		if (hardfault) {
-			curproc->p_stats->p_ru.ru_majflt++;
-		} else {
-			curproc->p_stats->p_ru.ru_minflt++;
-		}
-	}
-	PROC_UNLOCK(curproc);
+	if (hardfault)
+		curthread->td_ru.ru_majflt++;
+	else
+		curthread->td_ru.ru_minflt++;
 
 	return (KERN_SUCCESS);
 }
@@ -953,7 +926,7 @@
 	int i;
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
-	vm_page_t m, mpte;
+	vm_page_t m;
 	vm_object_t object;
 
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
@@ -968,7 +941,6 @@
 		starta = 0;
 	}
 
-	mpte = NULL;
 	for (i = 0; i < PAGEORDER_SIZE; i++) {
 		vm_object_t backing_object, lobject;
 
@@ -1004,13 +976,10 @@
 		}
 		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
 			(m->busy == 0) &&
-		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
+		    (m->flags & PG_FICTITIOUS) == 0) {
 
 			vm_page_lock_queues();
-			if ((m->queue - m->pc) == PQ_CACHE)
-				vm_page_deactivate(m);
-			mpte = pmap_enter_quick(pmap, addr, m,
-			    entry->protection, mpte);
+			pmap_enter_quick(pmap, addr, m, entry->protection);
 			vm_page_unlock_queues();
 		}
 		VM_OBJECT_UNLOCK(lobject);
@@ -1198,17 +1167,15 @@
 		 * Enter it in the pmap...
 		 */
 		pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE);
-		VM_OBJECT_LOCK(dst_object);
-		vm_page_lock_queues();
-		if ((prot & VM_PROT_WRITE) != 0)
-			vm_page_flag_set(dst_m, PG_WRITEABLE);
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
 		 */
+		VM_OBJECT_LOCK(dst_object);
+		vm_page_lock_queues();
 		vm_page_activate(dst_m);
-		vm_page_wakeup(dst_m);
 		vm_page_unlock_queues();
+		vm_page_wakeup(dst_m);
 	}
 	VM_OBJECT_UNLOCK(dst_object);
 }
@@ -1248,15 +1215,7 @@
 
 	object = m->object;
 	pindex = m->pindex;
-
-	/*
-	 * we don't fault-ahead for device pager
-	 */
-	if (object->type == OBJT_DEVICE) {
-		*reqpage = 0;
-		marray[0] = m;
-		return 1;
-	}
+	cbehind = cahead = 0;
 
 	/*
 	 * if the requested page is not available, then give up now
@@ -1280,17 +1239,6 @@
 	}
 
 	/*
-	 * try to do any readahead that we might have free pages for.
-	 */
-	if ((rahead + rbehind) >
-		((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) {
-		pagedaemon_wakeup();
-		marray[0] = m;
-		*reqpage = 0;
-		return 1;
-	}
-
-	/*
 	 * scan backward for the read behind pages -- in memory 
 	 */
 	if (pindex > 0) {
@@ -1301,30 +1249,29 @@
 			startpindex = pindex - rbehind;
 		}
 
-		for (tpindex = pindex - 1; tpindex >= startpindex; tpindex -= 1) {
-			if (vm_page_lookup(object, tpindex)) {
-				startpindex = tpindex + 1;
-				break;
-			}
-			if (tpindex == 0)
-				break;
-		}
-
-		for (i = 0, tpindex = startpindex; tpindex < pindex; i++, tpindex++) {
+		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
+		    rtm->pindex >= startpindex)
+			startpindex = rtm->pindex + 1;
+
+		/* tpindex is unsigned; beware of numeric underflow. */
+		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
+		    tpindex < pindex; i++, tpindex--) {
 
-			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
+			    VM_ALLOC_IFNOTCACHED);
 			if (rtm == NULL) {
-				vm_page_lock_queues();
+				/*
+				 * Shift the allocated pages to the
+				 * beginning of the array.
+				 */
 				for (j = 0; j < i; j++) {
-					vm_page_free(marray[j]);
+					marray[j] = marray[j + tpindex + 1 -
+					    startpindex];
 				}
-				vm_page_unlock_queues();
-				marray[0] = m;
-				*reqpage = 0;
-				return 1;
+				break;
 			}
 
-			marray[i] = rtm;
+			marray[tpindex - startpindex] = rtm;
 		}
 	} else {
 		startpindex = 0;
@@ -1342,16 +1289,15 @@
 	 * scan forward for the read ahead pages
 	 */
 	endpindex = tpindex + rahead;
+	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
+		endpindex = rtm->pindex;
 	if (endpindex > object->size)
 		endpindex = object->size;
 
 	for (; tpindex < endpindex; i++, tpindex++) {
 
-		if (vm_page_lookup(object, tpindex)) {
-			break;
-		}
-
-		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
+		    VM_ALLOC_IFNOTCACHED);
 		if (rtm == NULL) {
 			break;
 		}
@@ -1359,6 +1305,6 @@
 		marray[i] = rtm;
 	}
 
-	/* return number of bytes of pages */
+	/* return number of pages */
 	return i;
 }
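For reference, the counter updates in this file (and in vnode_pager.c earlier) replace atomic operations on the shared vmmeter with PCPU_INC()/PCPU_ADD(): each CPU bumps its own copy cheaply, and the totals are presumably assembled when the statistics are read, since the reading side is not quoted in this mail. The shape of the change, condensed from the hunks:

	/* Before: a shared counter, serialized by an atomic read-modify-write. */
	atomic_add_int(&cnt.v_vm_faults, 1);

	/* After: per-CPU increments, no cross-CPU contention on the hot path. */
	PCPU_INC(cnt.v_vm_faults);
	PCPU_ADD(cnt.v_vnodepgsin, count);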
Index: vm_object.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_object.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_object.c -L sys/vm/vm_object.c -u -r1.2 -r1.3
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -63,7 +63,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_object.c,v 1.349.2.4 2006/03/13 03:08:21 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_object.c,v 1.385.2.1 2007/10/19 05:48:45 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -110,6 +110,7 @@
 
 static void	vm_object_qcollapse(vm_object_t object);
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
+static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
@@ -143,20 +144,17 @@
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
+SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
+
 static long object_collapses;
-static long object_bypasses;
+SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
+    &object_collapses, 0, "VM object collapses");
 
-/*
- * next_index determines the page color that is assigned to the next
- * allocated object.  Accesses to next_index are not synchronized
- * because the effects of two or more object allocations using
- * next_index simultaneously are inconsequential.  At any given time,
- * numerous objects have the same page color.
- */
-static int next_index;
+static long object_bypasses;
+SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
+    &object_bypasses, 0, "VM object bypasses");
 
 static uma_zone_t obj_zone;
-#define VM_OBJECTS_INIT 256
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
@@ -172,6 +170,9 @@
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages",
 	    object));
+	KASSERT(object->cache == NULL,
+	    ("object %p has cached pages",
+	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
@@ -203,7 +204,6 @@
 void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
-	int incr;
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
@@ -216,15 +216,11 @@
 	object->flags = 0;
 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 		object->flags = OBJ_ONEMAPPING;
-	if (size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
-		incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
-	else
-		incr = size;
-	object->pg_color = next_index;
-	next_index = (object->pg_color + incr) & PQ_L2_MASK;
+	object->pg_color = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
+	object->cache = NULL;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -243,11 +239,11 @@
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
-	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
+	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 
 	VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
-	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
+	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 
 	/*
@@ -262,7 +258,6 @@
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
-	uma_prealloc(obj_zone, VM_OBJECTS_INIT);
 }
 
 void
@@ -393,7 +388,7 @@
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
-void
+static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
@@ -440,23 +435,37 @@
 
 	while (object != NULL) {
 		int vfslocked;
-		/*
-		 * In general, the object should be locked when working with
-		 * its type.  In this case, in order to maintain proper lock
-		 * ordering, an exception is possible because a vnode-backed
-		 * object never changes its type.
-		 */
+
 		vfslocked = 0;
-		if (object->type == OBJT_VNODE) {
-			struct vnode *vp = (struct vnode *) object->handle;
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		}
+	restart:
 		VM_OBJECT_LOCK(object);
 		if (object->type == OBJT_VNODE) {
+			struct vnode *vp = (struct vnode *) object->handle;
+
+			/*
+			 * Conditionally acquire Giant for a vnode-backed
+			 * object.  We have to be careful since the type of
+			 * a vnode object can change while the object is
+			 * unlocked.
+			 */
+			if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
+				vfslocked = 1;
+				if (!mtx_trylock(&Giant)) {
+					VM_OBJECT_UNLOCK(object);
+					mtx_lock(&Giant);
+					goto restart;
+				}
+			}
 			vm_object_vndeallocate(object);
 			VFS_UNLOCK_GIANT(vfslocked);
 			return;
-		}
+		} else
+			/*
+			 * This is to handle the case that the object
+			 * changed type while we dropped its lock to
+			 * obtain Giant.
+			 */
+			VFS_UNLOCK_GIANT(vfslocked);
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
@@ -497,7 +506,7 @@
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
-					tsleep(&proc0, PVM, "vmo_de", 1);
+					pause("vmo_de", 1);
 					continue;
 				}
 				/*
@@ -517,8 +526,11 @@
 						VM_OBJECT_UNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
-						VM_OBJECT_LOCK(object);
-						goto retry;
+						temp = robject->backing_object;
+						if (object == temp) {
+							VM_OBJECT_LOCK(object);
+							goto retry;
+						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_UNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
@@ -526,10 +538,14 @@
 						    VM_OBJECT_MTX(object),
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_LOCK(robject);
-						VM_OBJECT_LOCK(object);
-						goto retry;
-					}
-					VM_OBJECT_UNLOCK(object);
+						temp = robject->backing_object;
+						if (object == temp) {
+							VM_OBJECT_LOCK(object);
+							goto retry;
+						}
+					} else
+						VM_OBJECT_UNLOCK(object);
+
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
@@ -624,7 +640,7 @@
 	 */
 	vm_page_lock_queues();
 	while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
-		KASSERT(!p->busy && (p->flags & PG_BUSY) == 0,
+		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
 			("vm_object_terminate: freeing busy page %p "
 			"p->busy = %d, p->flags %x\n", p, p->busy, p->flags));
 		if (p->wire_count == 0) {
@@ -636,6 +652,9 @@
 	}
 	vm_page_unlock_queues();
 
+	if (__predict_false(object->cache != NULL))
+		vm_page_cache_free(object, 0, 0);
+
 	/*
 	 * Let the pager know object is dead.
 	 */
@@ -660,7 +679,7 @@
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
- *	write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
+ *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
@@ -720,8 +739,7 @@
 		while (tscan < tend) {
 			curgeneration = object->generation;
 			p = vm_page_lookup(object, tscan);
-			if (p == NULL || p->valid == 0 ||
-			    (p->queue - p->pc) == PQ_CACHE) {
+			if (p == NULL || p->valid == 0) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
@@ -738,7 +756,7 @@
 			 * If we have been asked to skip nosync pages and 
 			 * this is a nosync page, we can't continue.
 			 */
-			if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+			if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
@@ -777,17 +795,17 @@
 	 */
 	clearobjflags = 1;
 	TAILQ_FOREACH(p, &object->memq, listq) {
-		vm_page_flag_set(p, PG_CLEANCHK);
-		if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
+		p->oflags |= VPO_CLEANCHK;
+		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
 			clearobjflags = 0;
 		else
-			pmap_page_protect(p, VM_PROT_READ);
+			pmap_remove_write(p);
 	}
 
 	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
 		struct vnode *vp;
 
-		vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 		if (object->type == OBJT_VNODE &&
 		    (vp = (struct vnode *)object->handle) != NULL) {
 			VI_LOCK(vp);
@@ -807,17 +825,16 @@
 
 again:
 		pi = p->pindex;
-		if (((p->flags & PG_CLEANCHK) == 0) ||
+		if ((p->oflags & VPO_CLEANCHK) == 0 ||
 			(pi < tstart) || (pi >= tend) ||
-			(p->valid == 0) ||
-			((p->queue - p->pc) == PQ_CACHE)) {
-			vm_page_flag_clear(p, PG_CLEANCHK);
+		    p->valid == 0) {
+			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0) {
-			vm_page_flag_clear(p, PG_CLEANCHK);
+			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
@@ -826,8 +843,8 @@
 		 * nosync page, skip it.  Note that the object flags were
 		 * not cleared in this case so we do not have to set them.
 		 */
-		if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
-			vm_page_flag_clear(p, PG_CLEANCHK);
+		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
+			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
@@ -883,18 +900,14 @@
 		vm_page_t tp;
 
 		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
-			if ((tp->flags & PG_BUSY) ||
+			if ((tp->oflags & VPO_BUSY) ||
 				((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
-				 (tp->flags & PG_CLEANCHK) == 0) ||
+				 (tp->oflags & VPO_CLEANCHK) == 0) ||
 				(tp->busy != 0))
 				break;
-			if((tp->queue - tp->pc) == PQ_CACHE) {
-				vm_page_flag_clear(tp, PG_CLEANCHK);
-				break;
-			}
 			vm_page_test_dirty(tp);
 			if ((tp->dirty & tp->valid) == 0) {
-				vm_page_flag_clear(tp, PG_CLEANCHK);
+				tp->oflags &= ~VPO_CLEANCHK;
 				break;
 			}
 			maf[ i - 1 ] = tp;
@@ -911,18 +924,14 @@
 			vm_page_t tp;
 
 			if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
-				if ((tp->flags & PG_BUSY) ||
+				if ((tp->oflags & VPO_BUSY) ||
 					((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
-					 (tp->flags & PG_CLEANCHK) == 0) ||
+					 (tp->oflags & VPO_CLEANCHK) == 0) ||
 					(tp->busy != 0))
 					break;
-				if ((tp->queue - tp->pc) == PQ_CACHE) {
-					vm_page_flag_clear(tp, PG_CLEANCHK);
-					break;
-				}
 				vm_page_test_dirty(tp);
 				if ((tp->dirty & tp->valid) == 0) {
-					vm_page_flag_clear(tp, PG_CLEANCHK);
+					tp->oflags &= ~VPO_CLEANCHK;
 					break;
 				}
 				mab[ i - 1 ] = tp;
@@ -936,22 +945,22 @@
 	for(i = 0; i < maxb; i++) {
 		int index = (maxb - i) - 1;
 		ma[index] = mab[i];
-		vm_page_flag_clear(ma[index], PG_CLEANCHK);
+		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
-	vm_page_flag_clear(p, PG_CLEANCHK);
+	p->oflags &= ~VPO_CLEANCHK;
 	ma[maxb] = p;
 	for(i = 0; i < maxf; i++) {
 		int index = (maxb + i) + 1;
 		ma[index] = maf[i];
-		vm_page_flag_clear(ma[index], PG_CLEANCHK);
+		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
 	runlen = maxb + maxf + 1;
 
 	vm_pageout_flush(ma, runlen, pagerflags);
 	for (i = 0; i < runlen; i++) {
 		if (ma[i]->valid & ma[i]->dirty) {
-			pmap_page_protect(ma[i], VM_PROT_READ);
-			vm_page_flag_set(ma[i], PG_CLEANCHK);
+			pmap_remove_write(ma[i]);
+			ma[i]->oflags |= VPO_CLEANCHK;
 
 			/*
 			 * maxf will end up being the actual number of pages
@@ -1092,6 +1101,13 @@
 			}
 		}
 		m = vm_page_lookup(tobject, tpindex);
+		if (m == NULL && advise == MADV_WILLNEED) {
+			/*
+			 * If the page is cached, reactivate it.
+			 */
+			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
+			    VM_ALLOC_NOBUSY);
+		}
 		if (m == NULL) {
 			/*
 			 * There may be swap even if there is no backing page
@@ -1125,12 +1141,13 @@
 			vm_page_unlock_queues();
 			goto unlock_tobject;
 		}
-		if ((m->flags & PG_BUSY) || m->busy) {
-			vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
+		if ((m->oflags & VPO_BUSY) || m->busy) {
+			vm_page_flag_set(m, PG_REFERENCED);
+			vm_page_unlock_queues();
 			if (object != tobject)
 				VM_OBJECT_UNLOCK(object);
-			VM_OBJECT_UNLOCK(tobject);
-			msleep(m, &vm_page_queue_mtx, PDROP | PVM, "madvpo", 0);
+			m->oflags |= VPO_WANTED;
+			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0);
 			VM_OBJECT_LOCK(object);
   			goto relookup;
 		}
@@ -1231,17 +1248,8 @@
 		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
 		source->shadow_count++;
 		source->generation++;
-		if (length < source->size)
-			length = source->size;
-		if (length > PQ_L2_SIZE / 3 + PQ_PRIME1 ||
-		    source->generation > 1)
-			length = PQ_L2_SIZE / 3 + PQ_PRIME1;
-		result->pg_color = (source->pg_color +
-		    length * source->generation) & PQ_L2_MASK;
 		result->flags |= source->flags & OBJ_NEEDGIANT;
 		VM_OBJECT_UNLOCK(source);
-		next_index = (result->pg_color + PQ_L2_SIZE / 3 + PQ_PRIME1) &
-		    PQ_L2_MASK;
 	}
 
 
@@ -1262,10 +1270,10 @@
 void
 vm_object_split(vm_map_entry_t entry)
 {
-	vm_page_t m;
+	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
-	vm_pindex_t offidxstart, offidxend;
-	vm_size_t idx, size;
+	vm_pindex_t idx, offidxstart;
+	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
@@ -1275,8 +1283,7 @@
 	VM_OBJECT_UNLOCK(orig_object);
 
 	offidxstart = OFF_TO_IDX(entry->offset);
-	offidxend = offidxstart + OFF_TO_IDX(entry->end - entry->start);
-	size = offidxend - offidxstart;
+	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
@@ -1284,11 +1291,23 @@
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
+	/*
+	 * At this point, the new object is still private, so the order in
+	 * which the original and new objects are locked does not matter.
+	 */
 	VM_OBJECT_LOCK(new_object);
 	VM_OBJECT_LOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_LOCK(source);
+		if ((source->flags & OBJ_DEAD) != 0) {
+			VM_OBJECT_UNLOCK(source);
+			VM_OBJECT_UNLOCK(orig_object);
+			VM_OBJECT_UNLOCK(new_object);
+			vm_object_deallocate(new_object);
+			VM_OBJECT_LOCK(orig_object);
+			return;
+		}
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
@@ -1301,12 +1320,18 @@
 		new_object->backing_object = source;
 	}
 	new_object->flags |= orig_object->flags & OBJ_NEEDGIANT;
+retry:
+	if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
+		if (m->pindex < offidxstart) {
+			m = vm_page_splay(offidxstart, orig_object->root);
+			if ((orig_object->root = m)->pindex < offidxstart)
+				m = TAILQ_NEXT(m, listq);
+		}
+	}
 	vm_page_lock_queues();
-	for (idx = 0; idx < size; idx++) {
-	retry:
-		m = vm_page_lookup(orig_object, offidxstart + idx);
-		if (m == NULL)
-			continue;
+	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
+	    m = m_next) {
+		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
@@ -1315,14 +1340,13 @@
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
-		if ((m->flags & PG_BUSY) || m->busy) {
-			vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
-			VM_OBJECT_UNLOCK(orig_object);
+		if ((m->oflags & VPO_BUSY) || m->busy) {
+			vm_page_flag_set(m, PG_REFERENCED);
+			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(new_object);
-			msleep(m, &vm_page_queue_mtx, PDROP | PVM, "spltwt", 0);
+			m->oflags |= VPO_WANTED;
+			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
 			VM_OBJECT_LOCK(new_object);
-			VM_OBJECT_LOCK(orig_object);
-			vm_page_lock_queues();
 			goto retry;
 		}
 		vm_page_rename(m, new_object, idx);
@@ -1336,12 +1360,17 @@
 		 * and new_object's locks are released and reacquired. 
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
+
+		/*
+		 * Transfer any cached pages from orig_object to new_object.
+		 */
+		if (__predict_false(orig_object->cache != NULL))
+			vm_page_cache_transfer(orig_object, offidxstart,
+			    new_object);
 	}
 	VM_OBJECT_UNLOCK(orig_object);
-	vm_page_lock_queues();
 	TAILQ_FOREACH(m, &new_object->memq, listq)
 		vm_page_wakeup(m);
-	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(new_object);
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
@@ -1372,8 +1401,8 @@
 	 */
 	if (op & OBSC_TEST_ALL_SHADOWED) {
 		/*
-		 * We do not want to have to test for the existence of
-		 * swap pages in the backing object.  XXX but with the
+		 * We do not want to have to test for the existence of cache
+		 * or swap pages in the backing object.  XXX but with the
 		 * new swapper this would be pretty easy to do.
 		 *
 		 * XXX what about anonymous MAP_SHARED memory that hasn't
@@ -1442,20 +1471,20 @@
 			vm_page_t pp;
 
 			if (op & OBSC_COLLAPSE_NOWAIT) {
-				if ((p->flags & PG_BUSY) ||
+				if ((p->oflags & VPO_BUSY) ||
 				    !p->valid || 
 				    p->busy) {
 					p = next;
 					continue;
 				}
 			} else if (op & OBSC_COLLAPSE_WAIT) {
-				if ((p->flags & PG_BUSY) || p->busy) {
+				if ((p->oflags & VPO_BUSY) || p->busy) {
 					vm_page_lock_queues();
-					vm_page_flag_set(p,
-					    PG_WANTED | PG_REFERENCED);
-					VM_OBJECT_UNLOCK(backing_object);
+					vm_page_flag_set(p, PG_REFERENCED);
+					vm_page_unlock_queues();
 					VM_OBJECT_UNLOCK(object);
-					msleep(p, &vm_page_queue_mtx,
+					p->oflags |= VPO_WANTED;
+					msleep(p, VM_OBJECT_MTX(backing_object),
 					    PDROP | PVM, "vmocol", 0);
 					VM_OBJECT_LOCK(object);
 					VM_OBJECT_LOCK(backing_object);
@@ -1646,6 +1675,12 @@
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
+
+				/*
+				 * Free any cached pages from backing_object.
+				 */
+				if (__predict_false(backing_object->cache != NULL))
+					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
@@ -1765,14 +1800,15 @@
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->resident_page_count == 0)
-		return;
+		goto skipmemq;
 
 	/*
 	 * Since physically-backed objects do not use managed pages, we can't
 	 * remove pages from the object (we must instead remove the page
 	 * references, and then destroy the object).
 	 */
-	KASSERT(object->type != OBJT_PHYS,
+	KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
+	    object == kmem_object,
 	    ("attempt to remove pages from a physical object"));
 
 	vm_object_pip_add(object, 1);
@@ -1804,7 +1840,7 @@
 		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
 			goto again;
 		if (clean_only && p->valid) {
-			pmap_page_protect(p, VM_PROT_READ | VM_PROT_EXECUTE);
+			pmap_remove_write(p);
 			if (p->valid & p->dirty)
 				continue;
 		}
@@ -1813,6 +1849,9 @@
 	}
 	vm_page_unlock_queues();
 	vm_object_pip_wakeup(object);
+skipmemq:
+	if (__predict_false(object->cache != NULL))
+		vm_page_cache_free(object, start, end);
 }
 
 /*
@@ -1903,10 +1942,9 @@
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if ((object->flags & (OBJ_MIGHTBEDIRTY|OBJ_WRITEABLE)) ==
-	    (OBJ_MIGHTBEDIRTY|OBJ_WRITEABLE))
+	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0)
 		return;
-	vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 	if (object->type == OBJT_VNODE &&
 	    (vp = (struct vnode *)object->handle) != NULL) {
 		VI_LOCK(vp);
@@ -1968,7 +2006,7 @@
 	struct proc *p;
 
 	/* sx_slock(&allproc_lock); */
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
 			continue;
 		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
@@ -2090,7 +2128,7 @@
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		vm_pindex_t idx, fidx;
 		vm_pindex_t osize;
-		vm_paddr_t pa = -1, padiff;
+		vm_paddr_t pa = -1;
 		int rcount;
 		vm_page_t m;
 
@@ -2132,17 +2170,8 @@
 				continue;
 			}
 			if (rcount) {
-				padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
-				padiff >>= PAGE_SHIFT;
-				padiff &= PQ_L2_MASK;
-				if (padiff == 0) {
-					pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
-					++rcount;
-					continue;
-				}
-				db_printf(" index(%ld)run(%d)pa(0x%lx)",
+				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 					(long)fidx, rcount, (long)pa);
-				db_printf("pd(%ld)\n", (long)padiff);
 				if (nl > 18) {
 					c = cngetc();
 					if (c != ' ')
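
Several hunks above replace the unconditional Giant handling in vm_object_deallocate() with a conditional mtx_trylock()/restart pattern: if Giant cannot be taken while the object lock is held, the object lock is dropped, Giant is acquired the blocking way, and the sequence restarts so both locks end up held in the safe order. A rough userland analogue using two pthread mutexes; the names giant, object and need_both() are invented for illustration only:

    #include <pthread.h>

    static pthread_mutex_t giant  = PTHREAD_MUTEX_INITIALIZER; /* outer lock */
    static pthread_mutex_t object = PTHREAD_MUTEX_INITIALIZER; /* inner lock */

    /* Lock order is giant before object. */
    void
    need_both(void)
    {
            int have_giant = 0;

    restart:
            pthread_mutex_lock(&object);
            if (!have_giant) {
                    if (pthread_mutex_trylock(&giant) == 0) {
                            have_giant = 1;
                    } else {
                            /*
                             * Blocking on giant while holding object would
                             * reverse the lock order; back out and retry.
                             */
                            pthread_mutex_unlock(&object);
                            pthread_mutex_lock(&giant);
                            have_giant = 1;
                            goto restart;
                    }
            }

            /* Both locks held, acquired in the safe order; do the work. */

            pthread_mutex_unlock(&object);
            pthread_mutex_unlock(&giant);
    }
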
Index: vm_mmap.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_mmap.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/vm/vm_mmap.c -L sys/vm/vm_mmap.c -u -r1.1.1.2 -r1.2
--- sys/vm/vm_mmap.c
+++ sys/vm/vm_mmap.c
@@ -41,9 +41,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_mmap.c,v 1.200.2.2 2005/12/26 13:47:20 dds Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_mmap.c,v 1.213 2007/08/20 12:05:45 kib Exp $");
 
 #include "opt_compat.h"
+#include "opt_hwpmc_hooks.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
@@ -53,13 +54,13 @@
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
-#include <sys/mac.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/conf.h>
@@ -67,6 +68,8 @@
 #include <sys/vmmeter.h>
 #include <sys/sysctl.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
@@ -79,6 +82,10 @@
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
@@ -201,6 +208,9 @@
 	struct thread *td;
 	struct mmap_args *uap;
 {
+#ifdef HWPMC_HOOKS
+	struct pmckern_map_in pkm;
+#endif
 	struct file *fp;
 	struct vnode *vp;
 	vm_offset_t addr;
@@ -297,7 +307,7 @@
 		if ((error = fget(td, uap->fd, &fp)) != 0)
 			goto done;
 		if (fp->f_type != DTYPE_VNODE) {
-			error = EINVAL;
+			error = ENODEV;
 			goto done;
 		}
 		/*
@@ -364,6 +374,15 @@
 
 	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
 	    flags, handle_type, handle, pos);
+#ifdef HWPMC_HOOKS
+	/* inform hwpmc(4) if an executable is being mapped */
+	if (error == 0 && handle_type == OBJT_VNODE &&
+	    (prot & PROT_EXEC)) {
+		pkm.pm_file = handle;
+		pkm.pm_address = (uintptr_t) addr;
+		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
+	}
+#endif
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
@@ -373,6 +392,20 @@
 	return (error);
 }
 
+int
+freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
+{
+	struct mmap_args oargs;
+
+	oargs.addr = uap->addr;
+	oargs.len = uap->len;
+	oargs.prot = uap->prot;
+	oargs.flags = uap->flags;
+	oargs.fd = uap->fd;
+	oargs.pos = uap->pos;
+	return (mmap(td, &oargs));
+}
+
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct ommap_args {
@@ -495,6 +528,10 @@
 	struct thread *td;
 	struct munmap_args *uap;
 {
+#ifdef HWPMC_HOOKS
+	struct pmckern_map_out pkm;
+	vm_map_entry_t entry;
+#endif
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_map_t map;
@@ -525,6 +562,26 @@
 		vm_map_unlock(map);
 		return (EINVAL);
 	}
+#ifdef HWPMC_HOOKS
+	/*
+	 * Inform hwpmc if the address range being unmapped contains
+	 * an executable region.
+	 */
+	if (vm_map_lookup_entry(map, addr, &entry)) {
+		for (;
+		     entry != &map->header && entry->start < addr + size;
+		     entry = entry->next) {
+			if (vm_map_check_protection(map, entry->start,
+				entry->end, VM_PROT_EXECUTE) == TRUE) {
+				pkm.pm_address = (uintptr_t) addr;
+				pkm.pm_size = (size_t) size;
+				PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
+				    (void *) &pkm);
+				break;
+			}
+		}
+	}
+#endif
 	/* returns nothing but KERN_SUCCESS anyway */
 	vm_map_delete(map, addr, addr + size);
 	vm_map_unlock(map);
@@ -642,7 +699,7 @@
 	 * "immortal."
 	 */
 	if (uap->behav == MADV_PROTECT) {
-		error = suser(td);
+		error = priv_check(td, PRIV_VM_MADV_PROTECT);
 		if (error == 0) {
 			p = td->td_proc;
 			PROC_LOCK(p);
@@ -716,7 +773,7 @@
 	end = addr + (vm_size_t)round_page(uap->len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
-		return (EINVAL);
+		return (ENOMEM);
 
 	/*
 	 * Address of byte vector
@@ -729,8 +786,10 @@
 RestartScan:
 	timestamp = map->timestamp;
 
-	if (!vm_map_lookup_entry(map, addr, &entry))
-		entry = entry->next;
+	if (!vm_map_lookup_entry(map, addr, &entry)) {
+		vm_map_unlock_read(map);
+		return (ENOMEM);
+	}
 
 	/*
 	 * Do this on a map entry basis so that if the pages are not
@@ -743,6 +802,16 @@
 	    current = current->next) {
 
 		/*
+		 * check for contiguity
+		 */
+		if (current->end < end &&
+		    (entry->next == &map->header ||
+		     current->next->start > current->end)) {
+			vm_map_unlock_read(map);
+			return (ENOMEM);
+		}
+
+		/*
 		 * ignore submaps (for now) or null objects
 		 */
 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
@@ -897,7 +966,7 @@
 	vm_size_t npages, size;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VM_MLOCK);
 	if (error)
 		return (error);
 	addr = (vm_offset_t)uap->addr;
@@ -962,7 +1031,7 @@
 	}
 	PROC_UNLOCK(td->td_proc);
 #else
-	error = suser(td);
+	error = priv_check(td, PRIV_VM_MLOCK);
 	if (error)
 		return (error);
 #endif
@@ -1007,7 +1076,7 @@
 	int error;
 
 	map = &td->td_proc->p_vmspace->vm_map;
-	error = suser(td);
+	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 
@@ -1041,7 +1110,7 @@
 	vm_size_t size;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 	addr = (vm_offset_t)uap->addr;
@@ -1236,7 +1305,7 @@
 	vm_ooffset_t foff)
 {
 	boolean_t fitit;
-	vm_object_t object;
+	vm_object_t object = NULL;
 	int rv = KERN_SUCCESS;
 	int docow, error;
 	struct thread *td = curthread;
@@ -1272,7 +1341,6 @@
 		if (*addr != trunc_page(*addr))
 			return (EINVAL);
 		fitit = FALSE;
-		(void) vm_map_remove(map, *addr, *addr + size);
 	}
 	/*
 	 * Lookup/allocate object.
@@ -1294,6 +1362,7 @@
 		/* FALLTHROUGH */
 	default:
 		error = EINVAL;
+		break;
 	}
 	if (error)
 		return (error);
@@ -1330,8 +1399,11 @@
 	if (flags & MAP_STACK)
 		rv = vm_map_stack(map, *addr, size, prot, maxprot,
 		    docow | MAP_STACK_GROWS_DOWN);
+	else if (fitit)
+		rv = vm_map_find(map, object, foff, addr, size, TRUE,
+				 prot, maxprot, docow);
 	else
-		rv = vm_map_find(map, object, foff, addr, size, fitit,
+		rv = vm_map_fixed(map, object, foff, addr, size,
 				 prot, maxprot, docow);
 
 	if (rv != KERN_SUCCESS) {
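
The freebsd6_mmap() wrapper added above is a plain compatibility shim: it repacks the old argument layout into the current mmap_args and forwards to the single real implementation. A self-contained sketch of that shape, with invented structures and names (old_args, new_args, do_call, old_call):

    #include <stddef.h>

    struct old_args {
            void   *addr;
            size_t  len;
            int     prot;
    };

    struct new_args {
            void   *addr;
            size_t  len;
            int     prot;
            int     flags;          /* new field, defaulted by the shim */
    };

    static int
    do_call(struct new_args *nap)
    {
            (void)nap;
            return (0);             /* the one real implementation */
    }

    /*
     * Compatibility shim: translate the old layout into the new one and
     * forward, so only one copy of the real logic exists.
     */
    int
    old_call(struct old_args *oap)
    {
            struct new_args nap;

            nap.addr  = oap->addr;
            nap.len   = oap->len;
            nap.prot  = oap->prot;
            nap.flags = 0;
            return (do_call(&nap));
    }
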
Index: vm_param.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_param.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_param.h -L sys/vm/vm_param.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_param.h
+++ sys/vm/vm_param.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm_param.h,v 1.21 2005/01/07 02:29:27 imp Exp $
+ * $FreeBSD: src/sys/vm/vm_param.h,v 1.22 2007/04/19 04:52:47 alc Exp $
  */
 
 /*
@@ -79,8 +79,8 @@
 #define VM_V_FREE_TARGET	4	/* cnt.v_free_target */
 #define VM_V_FREE_RESERVED	5	/* cnt.v_free_reserved */
 #define VM_V_INACTIVE_TARGET	6	/* cnt.v_inactive_target */
-#define VM_V_CACHE_MIN		7	/* cnt.v_cache_max */
-#define VM_V_CACHE_MAX		8	/* cnt.v_cache_min */
+#define	VM_V_CACHE_MIN		7	/* cnt.v_cache_min */
+#define	VM_V_CACHE_MAX		8	/* cnt.v_cache_max */
 #define VM_V_PAGEOUT_FREE_MIN	9	/* cnt.v_pageout_free_min */
 #define	VM_PAGEOUT_ALGORITHM	10	/* pageout algorithm */
 #define VM_SWAPPING_ENABLED	11	/* swapping enabled */
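
For reference, the watermarks whose comments this hunk untangles are also visible from userland as vm.v_cache_min and vm.v_cache_max. A small sketch, assuming a FreeBSD/MidnightBSD userland of this era where sysctlbyname(3) exposes them as unsigned integers:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            u_int cmin, cmax;
            size_t len;

            len = sizeof(cmin);
            if (sysctlbyname("vm.v_cache_min", &cmin, &len, NULL, 0) == -1)
                    return (1);
            len = sizeof(cmax);
            if (sysctlbyname("vm.v_cache_max", &cmax, &len, NULL, 0) == -1)
                    return (1);
            printf("v_cache_min=%u v_cache_max=%u\n", cmin, cmax);
            return (0);
    }
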
Index: vm_contig.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_contig.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_contig.c -L sys/vm/vm_contig.c -u -r1.2 -r1.3
--- sys/vm/vm_contig.c
+++ sys/vm/vm_contig.c
@@ -60,12 +60,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_contig.c,v 1.43.2.3.2.1 2006/04/25 15:29:50 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_contig.c,v 1.63 2007/09/25 06:25:06 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
@@ -83,6 +84,7 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_extern.h>
 
 static int
@@ -92,6 +94,7 @@
 	vm_page_t m_tmp;
 	struct vnode *vp;
 	struct mount *mp;
+	int vfslocked;
 
 	object = m->object;
 	if (!VM_OBJECT_TRYLOCK(object))
@@ -115,11 +118,13 @@
 			vm_object_reference_locked(object);
 			VM_OBJECT_UNLOCK(object);
 			(void) vn_start_write(vp, &mp, V_WAIT);
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
 			VM_OBJECT_LOCK(object);
 			vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_UNLOCK(object);
 			VOP_UNLOCK(vp, 0, curthread);
+			VFS_UNLOCK_GIANT(vfslocked);
 			vm_object_deallocate(object);
 			vn_finished_write(mp);
 			vm_page_lock_queues();
@@ -150,7 +155,7 @@
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
 
-		KASSERT(m->queue == queue,
+		KASSERT(VM_PAGE_INQUEUE2(m, queue),
 		    ("vm_contig_launder: page %p's queue is not %d", m, queue));
 		error = vm_contig_launder_page(m);
 		if (error == 0)
@@ -161,191 +166,6 @@
 	return (FALSE);
 }
 
-/*
- * This interface is for merging with malloc() someday.
- * Even if we never implement compaction so that contiguous allocation
- * works after initialization time, malloc()'s data structures are good
- * for statistics and for allocations of less than a page.
- */
-static void *
-contigmalloc1(
-	unsigned long size,	/* should be size_t here and for malloc() */
-	struct malloc_type *type,
-	int flags,
-	vm_paddr_t low,
-	vm_paddr_t high,
-	unsigned long alignment,
-	unsigned long boundary,
-	vm_map_t map)
-{
-	int i, start;
-	vm_paddr_t phys;
-	vm_object_t object;
-	vm_offset_t addr, tmp_addr;
-	int pass, pqtype;
-	int inactl, actl, inactmax, actmax;
-	vm_page_t pga = vm_page_array;
-
-	size = round_page(size);
-	if (size == 0)
-		panic("contigmalloc1: size must not be 0");
-	if ((alignment & (alignment - 1)) != 0)
-		panic("contigmalloc1: alignment must be a power of 2");
-	if ((boundary & (boundary - 1)) != 0)
-		panic("contigmalloc1: boundary must be a power of 2");
-
-	start = 0;
-	for (pass = 2; pass >= 0; pass--) {
-		vm_page_lock_queues();
-again0:
-		mtx_lock_spin(&vm_page_queue_free_mtx);
-again:
-		/*
-		 * Find first page in array that is free, within range,
-		 * aligned, and such that the boundary won't be crossed.
-		 */
-		for (i = start; i < cnt.v_page_count; i++) {
-			phys = VM_PAGE_TO_PHYS(&pga[i]);
-			pqtype = pga[i].queue - pga[i].pc;
-			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
-			    (phys >= low) && (phys < high) &&
-			    ((phys & (alignment - 1)) == 0) &&
-			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
-				break;
-		}
-
-		/*
-		 * If the above failed or we will exceed the upper bound, fail.
-		 */
-		if ((i == cnt.v_page_count) ||
-			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
-			mtx_unlock_spin(&vm_page_queue_free_mtx);
-			/*
-			 * Instead of racing to empty the inactive/active
-			 * queues, give up, even with more left to free,
-			 * if we try more than the initial amount of pages.
-			 *
-			 * There's no point attempting this on the last pass.
-			 */
-			if (pass > 0) {
-				inactl = actl = 0;
-				inactmax = vm_page_queues[PQ_INACTIVE].lcnt;
-				actmax = vm_page_queues[PQ_ACTIVE].lcnt;
-again1:
-				if (inactl < inactmax &&
-				    vm_contig_launder(PQ_INACTIVE)) {
-					inactl++;
-					goto again1;
-				}
-				if (actl < actmax &&
-				    vm_contig_launder(PQ_ACTIVE)) {
-					actl++;
-					goto again1;
-				}
-			}
-			vm_page_unlock_queues();
-			continue;
-		}
-		start = i;
-
-		/*
-		 * Check successive pages for contiguous and free.
-		 */
-		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
-			pqtype = pga[i].queue - pga[i].pc;
-			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
-			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
-			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
-				start++;
-				goto again;
-			}
-		}
-		mtx_unlock_spin(&vm_page_queue_free_mtx);
-		for (i = start; i < (start + size / PAGE_SIZE); i++) {
-			vm_page_t m = &pga[i];
-
-			if ((m->queue - m->pc) == PQ_CACHE) {
-				if (m->hold_count != 0) {
-					start++;
-					goto again0;
-				}
-				object = m->object;
-				if (!VM_OBJECT_TRYLOCK(object)) {
-					start++;
-					goto again0;
-				}
-				if ((m->flags & PG_BUSY) || m->busy != 0) {
-					VM_OBJECT_UNLOCK(object);
-					start++;
-					goto again0;
-				}
-				vm_page_free(m);
-				VM_OBJECT_UNLOCK(object);
-			}
-		}
-		mtx_lock_spin(&vm_page_queue_free_mtx);
-		for (i = start; i < (start + size / PAGE_SIZE); i++) {
-			pqtype = pga[i].queue - pga[i].pc;
-			if (pqtype != PQ_FREE) {
-				start++;
-				goto again;
-			}
-		}
-		for (i = start; i < (start + size / PAGE_SIZE); i++) {
-			vm_page_t m = &pga[i];
-			vm_pageq_remove_nowakeup(m);
-			m->valid = VM_PAGE_BITS_ALL;
-			if (m->flags & PG_ZERO)
-				vm_page_zero_count--;
-			/* Don't clear the PG_ZERO flag, we'll need it later. */
-			m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
-			KASSERT(m->dirty == 0,
-			    ("contigmalloc1: page %p was dirty", m));
-			m->wire_count = 0;
-			m->busy = 0;
-		}
-		mtx_unlock_spin(&vm_page_queue_free_mtx);
-		vm_page_unlock_queues();
-		/*
-		 * We've found a contiguous chunk that meets are requirements.
-		 * Allocate kernel VM, unfree and assign the physical pages to
-		 * it and return kernel VM pointer.
-		 */
-		vm_map_lock(map);
-		if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
-		    KERN_SUCCESS) {
-			/*
-			 * XXX We almost never run out of kernel virtual
-			 * space, so we don't make the allocated memory
-			 * above available.
-			 */
-			vm_map_unlock(map);
-			return (NULL);
-		}
-		vm_object_reference(kernel_object);
-		vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
-		    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
-		vm_map_unlock(map);
-
-		tmp_addr = addr;
-		VM_OBJECT_LOCK(kernel_object);
-		for (i = start; i < (start + size / PAGE_SIZE); i++) {
-			vm_page_t m = &pga[i];
-			vm_page_insert(m, kernel_object,
-				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
-			if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
-				pmap_zero_page(m);
-			tmp_addr += PAGE_SIZE;
-		}
-		VM_OBJECT_UNLOCK(kernel_object);
-		vm_map_wire(map, addr, addr + size,
-		    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
-
-		return ((void *)addr);
-	}
-	return (NULL);
-}
-
 static void
 vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
 {
@@ -355,7 +175,7 @@
 	}
 }
 
-void
+static void
 vm_page_release_contig(vm_page_t m, vm_pindex_t count)
 {
 	vm_page_lock_queues();
@@ -363,162 +183,6 @@
 	vm_page_unlock_queues();
 }
 
-static int
-vm_contig_unqueue_free(vm_page_t m)
-{
-	int error = 0;
-
-	mtx_lock_spin(&vm_page_queue_free_mtx);
-	if ((m->queue - m->pc) == PQ_FREE)
-		vm_pageq_remove_nowakeup(m);
-	else
-		error = EAGAIN;
-	mtx_unlock_spin(&vm_page_queue_free_mtx);
-	if (error)
-		return (error);
-	m->valid = VM_PAGE_BITS_ALL;
-	if (m->flags & PG_ZERO)
-		vm_page_zero_count--;
-	/* Don't clear the PG_ZERO flag; we'll need it later. */
-	m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
-	KASSERT(m->dirty == 0,
-	    ("contigmalloc2: page %p was dirty", m));
-	m->wire_count = 0;
-	m->busy = 0;
-	return (error);
-}
-
-vm_page_t
-vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
-	    vm_offset_t alignment, vm_offset_t boundary)
-{
-	vm_object_t object;
-	vm_offset_t size;
-	vm_paddr_t phys;
-	vm_page_t pga = vm_page_array;
-	int i, pass, pqtype, start;
-
-	size = npages << PAGE_SHIFT;
-	if (size == 0)
-		panic("vm_page_alloc_contig: size must not be 0");
-	if ((alignment & (alignment - 1)) != 0)
-		panic("vm_page_alloc_contig: alignment must be a power of 2");
-	if ((boundary & (boundary - 1)) != 0)
-		panic("vm_page_alloc_contig: boundary must be a power of 2");
-
-	for (pass = 0; pass < 2; pass++) {
-		if (atop(high) < vm_page_array_size)
-			start = atop(high) - npages + 1;
-		else
-			start = vm_page_array_size - npages + 1;
-		vm_page_lock_queues();
-retry:
-		start--;
-		/*
-		 * Find last page in array that is free, within range,
-		 * aligned, and such that the boundary won't be crossed.
-		 */
-		for (i = start; i >= 0; i--) {
-			phys = VM_PAGE_TO_PHYS(&pga[i]);
-			pqtype = pga[i].queue - pga[i].pc;
-			if (pass == 0) {
-				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
-					continue;
-			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
-				    pga[i].queue != PQ_ACTIVE &&
-				    pga[i].queue != PQ_INACTIVE)
-				continue;
-			if (phys >= low && phys + size <= high &&
-			    ((phys & (alignment - 1)) == 0) &&
-			    ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
-				break;
-		}
-		/* There are no candidates at all. */
-		if (i == -1) {
-			vm_page_unlock_queues();
-			continue;
-		}
-		start = i;
-		/*
-		 * Check successive pages for contiguous and free.
-		 */
-		for (i = start + npages - 1; i > start; i--) {
-			pqtype = pga[i].queue - pga[i].pc;
-			if (VM_PAGE_TO_PHYS(&pga[i]) !=
-			    VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE) {
-				start = i - npages + 1;
-				goto retry;
-			}
-			if (pass == 0) {
-				if (pqtype != PQ_FREE && pqtype != PQ_CACHE) {
-					start = i - npages + 1;
-					goto retry;
-				}
-			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
-				    pga[i].queue != PQ_ACTIVE &&
-				    pga[i].queue != PQ_INACTIVE) {
-				start = i - npages + 1;
-				goto retry;
-			}
-		}
-		for (i = start + npages - 1; i >= start; i--) {
-			vm_page_t m = &pga[i];
-
-retry_page:
-			pqtype = m->queue - m->pc;
-			if (pass != 0 && pqtype != PQ_FREE &&
-			    pqtype != PQ_CACHE) {
-				switch (m->queue) {
-				case PQ_ACTIVE:
-				case PQ_INACTIVE:
-					if (vm_contig_launder_page(m) != 0)
-						goto cleanup_freed;
-					pqtype = m->queue - m->pc;
-					if (pqtype == PQ_FREE ||
-					    pqtype == PQ_CACHE)
-						break;
-				default:
-cleanup_freed:
-					vm_page_release_contigl(&pga[i + 1],
-					    start + npages - 1 - i);
-					start = i - npages + 1;
-					goto retry;
-				}
-			}
-			if (pqtype == PQ_CACHE) {
-				if (m->hold_count != 0) {
-					start = i - npages + 1;
-					goto retry;
-				}
-				object = m->object;
-				if (!VM_OBJECT_TRYLOCK(object)) {
-					start = i - npages + 1;
-					goto retry;
-				}
-				if ((m->flags & PG_BUSY) || m->busy != 0) {
-					VM_OBJECT_UNLOCK(object);
-					start = i - npages + 1;
-					goto retry;
-				}
-				vm_page_free(m);
-				VM_OBJECT_UNLOCK(object);
-			}
-			/*
-			 * There is no good API for freeing a page
-			 * directly to PQ_NONE on our behalf, so spin.
-			 */
-			if (vm_contig_unqueue_free(m) != 0)
-				goto retry_page;
-		}
-		vm_page_unlock_queues();
-		/*
-		 * We've found a contiguous chunk that meets are requirements.
-		 */
-		return (&pga[start]);
-	}
-	return (NULL);
-}
-
 static void *
 contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
 {
@@ -546,7 +210,7 @@
 	for (i = 0; i < npages; i++) {
 		vm_page_insert(&m[i], object,
 		    OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
-		if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
+		if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO))
 			pmap_zero_page(&m[i]);
 		tmp_addr += PAGE_SIZE;
 	}
@@ -556,11 +220,6 @@
 	return ((void *)addr);
 }
 
-static int vm_old_contigmalloc = 0;
-SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
-    CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
-TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
-
 void *
 contigmalloc(
 	unsigned long size,	/* should be size_t here and for malloc() */
@@ -573,26 +232,41 @@
 {
 	void * ret;
 	vm_page_t pages;
-	vm_pindex_t npgs;
+	unsigned long npgs;
+	int actl, actmax, inactl, inactmax, tries;
 
 	npgs = round_page(size) >> PAGE_SHIFT;
-	mtx_lock(&Giant);
-	if (vm_old_contigmalloc) {
-		ret = contigmalloc1(size, type, flags, low, high, alignment,
-		    boundary, kernel_map);
-	} else {
-		pages = vm_page_alloc_contig(npgs, low, high,
-		    alignment, boundary);
-		if (pages == NULL) {
-			ret = NULL;
-		} else {
-			ret = contigmalloc2(pages, npgs, flags);
-			if (ret == NULL)
-				vm_page_release_contig(pages, npgs);
+	tries = 0;
+retry:
+	pages = vm_phys_alloc_contig(npgs, low, high, alignment, boundary);
+	if (pages == NULL) {
+		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+			vm_page_lock_queues();
+			inactl = 0;
+			inactmax = tries < 1 ? 0 : cnt.v_inactive_count;
+			actl = 0;
+			actmax = tries < 2 ? 0 : cnt.v_active_count;
+again:
+			if (inactl < inactmax &&
+			    vm_contig_launder(PQ_INACTIVE)) {
+				inactl++;
+				goto again;
+			}
+			if (actl < actmax &&
+			    vm_contig_launder(PQ_ACTIVE)) {
+				actl++;
+				goto again;
+			}
+			vm_page_unlock_queues();
+			tries++;
+			goto retry;
 		}
-		
+		ret = NULL;
+	} else {
+		ret = contigmalloc2(pages, npgs, flags);
+		if (ret == NULL)
+			vm_page_release_contig(pages, npgs);
 	}
-	mtx_unlock(&Giant);
 	malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
 	return (ret);
 }
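
The new contigmalloc() above drops the vm.old_contigmalloc paths in favor of a bounded retry loop: each failed vm_phys_alloc_contig() attempt is allowed to launder a little harder than the last (first nothing, then the inactive queue, then the active queue as well), and M_NOWAIT callers get only a single cheap retry. A userland sketch of that escalation shape, with invented stand-ins (try_alloc, reclaim_cheap, reclaim_expensive):

    #include <stdbool.h>
    #include <stdlib.h>

    static void *try_alloc(size_t size)   { return (malloc(size)); }
    static void  reclaim_cheap(void)      { }  /* "launder inactive" stand-in */
    static void  reclaim_expensive(void)  { }  /* "launder active" stand-in */

    /*
     * Bounded retry loop: each round may work harder than the last, and a
     * caller that cannot wait gets at most one quick retry.
     */
    void *
    alloc_retry(size_t size, bool can_wait)
    {
            void *p;
            int tries, maxtries;

            maxtries = can_wait ? 3 : 1;
            for (tries = 0; ; tries++) {
                    if ((p = try_alloc(size)) != NULL)
                            return (p);
                    if (tries >= maxtries)
                            return (NULL);
                    if (tries >= 1)
                            reclaim_cheap();
                    if (tries >= 2)
                            reclaim_expensive();
            }
    }
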
Index: uma_int.h
===================================================================
RCS file: /home/cvs/src/sys/vm/uma_int.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/uma_int.h -L sys/vm/uma_int.h -u -r1.1.1.1 -r1.2
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/vm/uma_int.h,v 1.31.2.6 2005/08/15 09:01:11 rwatson Exp $
+ * $FreeBSD: src/sys/vm/uma_int.h,v 1.38 2007/05/09 22:53:34 rwatson Exp $
  *
  */
 
@@ -75,13 +75,13 @@
  * pair, as well as with its own set of small per-CPU caches, layered above
  * the Zone's general Bucket cache.
  *
- * The PCPU caches are protected by their own locks, while the Zones backed
- * by the same Keg all share a common Keg lock (to coalesce contention on
- * the backing slabs).  The backing Keg typically only serves one Zone but
- * in the case of multiple Zones, one of the Zones is considered the
- * Master Zone and all Zone-related stats from the Keg are done in the
- * Master Zone.  For an example of a Multi-Zone setup, refer to the
- * Mbuf allocation code.
+ * The PCPU caches are protected by critical sections, and may be accessed
+ * safely only from their associated CPU, while the Zones backed by the same
+ * Keg all share a common Keg lock (to coalesce contention on the backing
+ * slabs).  The backing Keg typically only serves one Zone but in the case of
+ * multiple Zones, one of the Zones is considered the Master Zone and all
+ * Zone-related stats from the Keg are done in the Master Zone.  For an
+ * example of a Multi-Zone setup, refer to the Mbuf allocation code.
  */
 
 /*
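
The updated comment above describes UMA's layering: a per-CPU cache that only its owning CPU may touch (guarded by a critical section rather than a lock), backed by a shared, Keg-locked pool. A loose userland analogue is a per-thread free list over a mutex-protected pool; all names here (item_alloc, pool_lock, cache) are invented, and __thread is the GCC/Clang thread-local extension:

    #include <pthread.h>
    #include <stdlib.h>

    struct item { struct item *next; };

    static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *pool;                /* shared, lock-protected */

    static __thread struct item *cache;      /* per-thread, no lock needed */

    struct item *
    item_alloc(void)
    {
            struct item *it;

            if ((it = cache) != NULL) {      /* fast path: private cache */
                    cache = it->next;
                    return (it);
            }
            pthread_mutex_lock(&pool_lock);  /* slow path: shared pool */
            if ((it = pool) != NULL)
                    pool = it->next;
            pthread_mutex_unlock(&pool_lock);
            if (it == NULL)
                    it = malloc(sizeof(*it));
            return (it);
    }

    void
    item_free(struct item *it)
    {
            it->next = cache;                /* return to the private cache */
            cache = it;
    }
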
Index: vm_object.h
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_object.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_object.h -L sys/vm/vm_object.h -u -r1.1.1.1 -r1.2
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -57,7 +57,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: src/sys/vm/vm_object.h,v 1.111 2005/05/03 11:11:26 jeff Exp $
+ * $FreeBSD: src/sys/vm/vm_object.h,v 1.114 2007/09/25 06:25:06 alc Exp $
  */
 
 /*
@@ -100,6 +100,7 @@
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
+	vm_page_t cache;		/* root of the cache page splay tree */
 	void *handle;
 	union {
 		/*
@@ -140,7 +141,6 @@
 #define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
-#define	OBJ_WRITEABLE	0x0080		/* object has been made writable */
 #define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty */
 #define OBJ_CLEANING	0x0200
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
@@ -201,7 +201,6 @@
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_terminate (vm_object_t);
-void vm_object_vndeallocate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t);
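
The new object->cache field above is the root of a per-object splay tree holding that object's cached pages, so they can be located and freed without scanning the resident-page list. The sketch below only shows the shape of a per-object root pointer with a keyed lookup; it uses a plain, unbalanced binary search tree for brevity, and every name in it is invented:

    #include <stddef.h>

    struct page {
            unsigned long pindex;
            struct page  *left;
            struct page  *right;
    };

    struct object {
            struct page *cache;     /* root of the cached-page tree, or NULL */
    };

    /* Find the cached page with the given index, if any. */
    struct page *
    cache_lookup(struct object *obj, unsigned long pindex)
    {
            struct page *p = obj->cache;

            while (p != NULL) {
                    if (pindex < p->pindex)
                            p = p->left;
                    else if (pindex > p->pindex)
                            p = p->right;
                    else
                            return (p);
            }
            return (NULL);
    }
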
Index: vm_zeroidle.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_zeroidle.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/vm/vm_zeroidle.c -L sys/vm/vm_zeroidle.c -u -r1.2 -r1.3
--- sys/vm/vm_zeroidle.c
+++ sys/vm/vm_zeroidle.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/vm/vm_zeroidle.c,v 1.34.2.2 2006/06/16 22:11:55 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_zeroidle.c,v 1.49 2007/07/14 19:00:44 alc Exp $");
 
 #include <opt_sched.h>
 
@@ -51,23 +51,14 @@
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
+#include <vm/vm_phys.h>
 
-SYSCTL_DECL(_vm_stats_misc);
-
-static int cnt_prezero;
-SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
-    &cnt_prezero, 0, "");
-
-static int idlezero_enable_default = 1;
+static int idlezero_enable_default = 0;
 TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default);
 /* Defer setting the enable flag until the kthread is running. */
 static int idlezero_enable = 0;
 SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, "");
 
-static int idlezero_maxrun = 16;
-SYSCTL_INT(_vm, OID_AUTO, idlezero_maxrun, CTLFLAG_RW, &idlezero_maxrun, 0, "");
-TUNABLE_INT("vm.idlezero_maxrun", &idlezero_maxrun);
-
 /*
  * Implement the pre-zeroed page mechanism.
  */
@@ -99,30 +90,16 @@
 	return (1);
 }
 
-static int
+static void
 vm_page_zero_idle(void)
 {
-	static int free_rover;
-	vm_page_t m;
 
-	mtx_lock_spin(&vm_page_queue_free_mtx);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	zero_state = 0;
-	m = vm_pageq_find(PQ_FREE, free_rover, FALSE);
-	if (m != NULL && (m->flags & PG_ZERO) == 0) {
-		vm_pageq_remove_nowakeup(m);
-		mtx_unlock_spin(&vm_page_queue_free_mtx);
-		pmap_zero_page_idle(m);
-		mtx_lock_spin(&vm_page_queue_free_mtx);
-		m->flags |= PG_ZERO;
-		vm_pageq_enqueue(PQ_FREE + m->pc, m);
-		++vm_page_zero_count;
-		++cnt_prezero;
+	if (vm_phys_zero_pages_idle()) {
 		if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
 			zero_state = 1;
 	}
-	free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK;
-	mtx_unlock_spin(&vm_page_queue_free_mtx);
-	return (1);
 }
 
 /* Called by vm_page_free to hint that a new page is available. */
@@ -130,7 +107,7 @@
 vm_page_zero_idle_wakeup(void)
 {
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (wakeup_needed && vm_page_zero_check()) {
 		wakeup_needed = FALSE;
 		wakeup(&zero_state);
@@ -143,21 +120,21 @@
 
 	idlezero_enable = idlezero_enable_default;
 
+	mtx_lock(&vm_page_queue_free_mtx);
 	for (;;) {
 		if (vm_page_zero_check()) {
 			vm_page_zero_idle();
 #ifndef PREEMPTION
 			if (sched_runnable()) {
-				mtx_lock_spin(&sched_lock);
+				thread_lock(curthread);
 				mi_switch(SW_VOL, NULL);
-				mtx_unlock_spin(&sched_lock);
+				thread_unlock(curthread);
 			}
 #endif
 		} else {
-			vm_page_lock_queues();
 			wakeup_needed = TRUE;
-			msleep(&zero_state, &vm_page_queue_mtx,
-			    PDROP, "pgzero", hz * 300);
+			msleep(&zero_state, &vm_page_queue_free_mtx, 0,
+			    "pgzero", hz * 300);
 		}
 	}
 }
@@ -180,11 +157,11 @@
 	PROC_LOCK(pagezero_proc);
 	pagezero_proc->p_flag |= P_NOLOAD;
 	PROC_UNLOCK(pagezero_proc);
-	mtx_lock_spin(&sched_lock);
 	td = FIRST_THREAD_IN_PROC(pagezero_proc);
-	sched_class(td->td_ksegrp, PRI_IDLE);
+	thread_lock(td);
+	sched_class(td, PRI_IDLE);
 	sched_prio(td, PRI_MAX_IDLE);
-	setrunqueue(td, SRQ_BORING);
-	mtx_unlock_spin(&sched_lock);
+	sched_add(td, SRQ_BORING);
+	thread_unlock(td);
 }
 SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL)
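
The vm_pagezero changes above turn the zeroing thread into a simple sleep/wakeup loop: it sleeps on the free-queue mutex with msleep() until vm_page_zero_idle_wakeup() signals it, then zeroes pages until the high watermark is reached. A userland sketch of the same hysteresis shape, using a pthread condition variable and invented names and watermarks (zero_thread, zero_hint, HI, LO):

    #include <pthread.h>
    #include <stdbool.h>

    #define HI      900             /* stop refilling here */
    #define LO      300             /* start again below here */

    static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  work_cv   = PTHREAD_COND_INITIALIZER;
    static int  nzeroed;
    static bool wakeup_needed;

    static bool want_work(void) { return (nzeroed < LO); }
    static void zero_one(void)  { nzeroed++; }

    /* The worker: sleep until hinted, then run up to the high watermark. */
    void *
    zero_thread(void *arg)
    {
            pthread_mutex_lock(&free_lock);
            for (;;) {
                    if (want_work()) {
                            while (nzeroed < HI)
                                    zero_one();
                    } else {
                            wakeup_needed = true;
                            pthread_cond_wait(&work_cv, &free_lock);
                    }
            }
            /* NOTREACHED */
            return (arg);
    }

    /* Called by a consumer that just drained the pool. */
    void
    zero_hint(void)
    {
            pthread_mutex_lock(&free_lock);
            if (wakeup_needed && want_work()) {
                    wakeup_needed = false;
                    pthread_cond_signal(&work_cv);
            }
            pthread_mutex_unlock(&free_lock);
    }
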
Index: vm_kern.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_kern.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_kern.c -L sys/vm/vm_kern.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -63,11 +63,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_kern.c,v 1.122 2005/01/07 02:29:27 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_kern.c,v 1.128.4.1 2008/01/17 14:57:50 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>		/* for ticks and hz */
+#include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
@@ -81,6 +82,7 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
+#include <vm/uma.h>
 
 vm_map_t kernel_map=0;
 vm_map_t kmem_map=0;
@@ -175,9 +177,8 @@
 		mem = vm_page_grab(kernel_object, OFF_TO_IDX(offset + i),
 		    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 		mem->valid = VM_PAGE_BITS_ALL;
-		vm_page_lock_queues();
-		vm_page_unmanage(mem);
-		vm_page_unlock_queues();
+		KASSERT((mem->flags & PG_UNMANAGED) != 0,
+		    ("kmem_alloc: page %p is managed", mem));
 	}
 	VM_OBJECT_UNLOCK(kernel_object);
 
@@ -295,10 +296,25 @@
 	vm_map_lock(map);
 	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
 		vm_map_unlock(map);
-		if ((flags & M_NOWAIT) == 0)
-			panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
-				(long)size, (long)map->size);
-		return (0);
+                if ((flags & M_NOWAIT) == 0) {
+			for (i = 0; i < 8; i++) {
+				EVENTHANDLER_INVOKE(vm_lowmem, 0);
+				uma_reclaim();
+				vm_map_lock(map);
+				if (vm_map_findspace(map, vm_map_min(map),
+				    size, &addr) == 0) {
+					break;
+				}
+				vm_map_unlock(map);
+				tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
+			}
+			if (i == 8) {
+				panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
+				    (long)size, (long)map->size);
+			}
+		} else {
+			return (0);
+		}
 	}
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	vm_object_reference(kmem_object);
@@ -364,9 +380,8 @@
 		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
 		m->valid = VM_PAGE_BITS_ALL;
-		vm_page_lock_queues();
-		vm_page_unmanage(m);
-		vm_page_unlock_queues();
+		KASSERT((m->flags & PG_UNMANAGED) != 0,
+		    ("kmem_malloc: page %p is managed", m));
 	}
 	VM_OBJECT_UNLOCK(kmem_object);
 
@@ -390,9 +405,7 @@
 	vm_map_simplify_entry(map, entry);
 
 	/*
-	 * Loop thru pages, entering them in the pmap. (We cannot add them to
-	 * the wired count without wrapping the vm_page_queue_lock in
-	 * splimp...)
+	 * Loop thru pages, entering them in the pmap.
 	 */
 	VM_OBJECT_LOCK(kmem_object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
@@ -401,10 +414,7 @@
 		 * Because this is kernel_pmap, this call will not block.
 		 */
 		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
-		vm_page_lock_queues();
-		vm_page_flag_set(m, PG_WRITEABLE | PG_REFERENCED);
 		vm_page_wakeup(m);
-		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(kmem_object);
 	vm_map_unlock(map);
@@ -492,7 +502,8 @@
 	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
 	kernel_map = m;
 	(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
-	    VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0);
+	    VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL,
+	    MAP_NOFAULT);
 	/* ... and ending with the completion of the above `insert' */
 	vm_map_unlock(m);
 }
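
The kmem_malloc() change above stops panicking on the first KVA shortage for waiting callers: it pokes the vm_lowmem eventhandlers and uma_reclaim(), sleeps with a growing delay, and only panics after eight failed rounds. A userland sketch of that retry-then-give-up shape, with invented stand-ins (find_space, lowmem_handlers, alloc_kva):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    static void lowmem_handlers(void)    { }                 /* reclaim hook */
    static void *find_space(size_t size) { return (malloc(size)); }

    /*
     * On failure, nudge the reclaimers and retry with a growing delay,
     * giving up loudly only after a fixed number of rounds.
     */
    void *
    alloc_kva(size_t size)
    {
            void *p;
            int i;

            for (i = 0; i < 8; i++) {
                    if ((p = find_space(size)) != NULL)
                            return (p);
                    lowmem_handlers();
                    sleep(i + 1);           /* backoff grows each round */
            }
            fprintf(stderr, "alloc_kva(%zu): address space exhausted\n", size);
            abort();
    }
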
Index: phys_pager.c
===================================================================
RCS file: /home/cvs/src/sys/vm/phys_pager.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/phys_pager.c -L sys/vm/phys_pager.c -u -r1.1.1.1 -r1.2
--- sys/vm/phys_pager.c
+++ sys/vm/phys_pager.c
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/phys_pager.c,v 1.23 2005/01/07 02:29:26 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/phys_pager.c,v 1.28.2.1 2007/11/10 11:21:17 remko Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,9 +42,7 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
-/* prevent concurrant creation races */
-static int phys_pager_alloc_lock;
-/* list of device pager objects */
+/* list of phys pager objects */
 static struct pagerlst phys_pager_object_list;
 /* protect access to phys_pager_object_list */
 static struct mtx phys_pager_mtx;
@@ -64,7 +62,7 @@
 phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		 vm_ooffset_t foff)
 {
-	vm_object_t object;
+	vm_object_t object, object1;
 	vm_pindex_t pindex;
 
 	/*
@@ -76,42 +74,41 @@
 	pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
 
 	if (handle != NULL) {
-		mtx_lock(&Giant);
-		/*
-		 * Lock to prevent object creation race condition.
-		 */
-		while (phys_pager_alloc_lock) {
-			phys_pager_alloc_lock = -1;
-			tsleep(&phys_pager_alloc_lock, PVM, "swpalc", 0);
-		}
-		phys_pager_alloc_lock = 1;
-
+		mtx_lock(&phys_pager_mtx);
 		/*
 		 * Look up pager, creating as necessary.
 		 */
+		object1 = NULL;
 		object = vm_pager_object_lookup(&phys_pager_object_list, handle);
 		if (object == NULL) {
 			/*
 			 * Allocate object and associate it with the pager.
 			 */
-			object = vm_object_allocate(OBJT_PHYS, pindex);
-			object->handle = handle;
-			mtx_lock(&phys_pager_mtx);
-			TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
-			    pager_object_list);
 			mtx_unlock(&phys_pager_mtx);
+			object1 = vm_object_allocate(OBJT_PHYS, pindex);
+			mtx_lock(&phys_pager_mtx);
+			object = vm_pager_object_lookup(&phys_pager_object_list,
+			    handle);
+			if (object != NULL) {
+				/*
+				 * We raced with other thread while
+				 * allocating object.
+				 */
+				if (pindex > object->size)
+					object->size = pindex;
+			} else {
+				object = object1;
+				object1 = NULL;
+				object->handle = handle;
+				TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
+				    pager_object_list);
+			}
 		} else {
-			/*
-			 * Gain a reference to the object.
-			 */
-			vm_object_reference(object);
 			if (pindex > object->size)
 				object->size = pindex;
 		}
-		if (phys_pager_alloc_lock == -1)
-			wakeup(&phys_pager_alloc_lock);
-		phys_pager_alloc_lock = 0;
-		mtx_unlock(&Giant);
+		mtx_unlock(&phys_pager_mtx);
+		vm_object_deallocate(object1);
 	} else {
 		object = vm_object_allocate(OBJT_PHYS, pindex);
 	}
@@ -127,9 +124,11 @@
 {
 
 	if (object->handle != NULL) {
+		VM_OBJECT_UNLOCK(object);
 		mtx_lock(&phys_pager_mtx);
 		TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list);
 		mtx_unlock(&phys_pager_mtx);
+		VM_OBJECT_LOCK(object);
 	}
 }
 
@@ -150,19 +149,13 @@
 		}
 		KASSERT(m[i]->valid == VM_PAGE_BITS_ALL,
 		    ("phys_pager_getpages: partially valid page %p", m[i]));
-	}
-	vm_page_lock_queues();
-	for (i = 0; i < count; i++) {
-		/* Switch off pv_entries */
-		vm_page_unmanage(m[i]);
 		m[i]->dirty = 0;
 		/* The requested page must remain busy, the others not. */
 		if (reqpage != i) {
-			vm_page_flag_clear(m[i], PG_BUSY);
+			m[i]->oflags &= ~VPO_BUSY;
 			m[i]->busy = 0;
 		}
 	}
-	vm_page_unlock_queues();
 	return (VM_PAGER_OK);
 }
 
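phys_pager_alloc() above now avoids both Giant and the hand-rolled phys_pager_alloc_lock by allocating the candidate object with no lock held and re-checking the list afterwards; if another thread raced in first, the speculative object is simply deallocated. A userland sketch of the same find-or-create pattern with a pthread mutex; obj, lookup_locked and obj_find_or_create are invented names:

    #include <pthread.h>
    #include <stdlib.h>

    struct obj {
            int         key;
            struct obj *next;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct obj *objs;

    static struct obj *
    lookup_locked(int key)
    {
            struct obj *o;

            for (o = objs; o != NULL; o = o->next)
                    if (o->key == key)
                            return (o);
            return (NULL);
    }

    /*
     * Find-or-create without allocating under the lock: drop the lock for
     * the allocation, re-check after reacquiring it, and throw away our
     * copy if another thread won the race.
     */
    struct obj *
    obj_find_or_create(int key)
    {
            struct obj *o, *o1;

            pthread_mutex_lock(&list_lock);
            o = lookup_locked(key);
            if (o == NULL) {
                    pthread_mutex_unlock(&list_lock);
                    o1 = calloc(1, sizeof(*o1));    /* may block; lock not held */
                    if (o1 == NULL)
                            return (NULL);
                    o1->key = key;
                    pthread_mutex_lock(&list_lock);
                    o = lookup_locked(key);
                    if (o == NULL) {
                            /* We won the race; publish our object. */
                            o1->next = objs;
                            objs = o1;
                            o = o1;
                            o1 = NULL;
                    }
                    pthread_mutex_unlock(&list_lock);
                    free(o1);       /* no-op unless another thread won */
            } else
                    pthread_mutex_unlock(&list_lock);
            return (o);
    }
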
--- /dev/null
+++ sys/vm/vm_phys.c
@@ -0,0 +1,750 @@
+/*-
+ * Copyright (c) 2002-2006 Rice University
+ * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Alan L. Cox,
+ * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/vm/vm_phys.c,v 1.4 2007/09/25 06:25:06 alc Exp $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+struct vm_freelist {
+	struct pglist pl;
+	int lcnt;
+};
+
+struct vm_phys_seg {
+	vm_paddr_t	start;
+	vm_paddr_t	end;
+	vm_page_t	first_page;
+	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
+};
+
+static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+
+static int vm_phys_nsegs;
+
+static struct vm_freelist
+    vm_phys_free_queues[VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+
+static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
+
+static int cnt_prezero;
+SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
+    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
+
+static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
+
+static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+
+static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
+static int vm_phys_paddr_to_segind(vm_paddr_t pa);
+static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
+    int order);
+
+/*
+ * Outputs the state of the physical memory allocator, specifically,
+ * the amount of physical memory in each free list.
+ */
+static int
+sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct vm_freelist *fl;
+	char *cbuf;
+	const int cbufsize = vm_nfreelists*(VM_NFREEORDER + 1)*81;
+	int error, flind, oind, pind;
+
+	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
+		    "\n  ORDER (SIZE)  |  NUMBER"
+		    "\n              ", flind);
+		for (pind = 0; pind < VM_NFREEPOOL; pind++)
+			sbuf_printf(&sbuf, "  |  POOL %d", pind);
+		sbuf_printf(&sbuf, "\n--            ");
+		for (pind = 0; pind < VM_NFREEPOOL; pind++)
+			sbuf_printf(&sbuf, "-- --      ");
+		sbuf_printf(&sbuf, "--\n");
+		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+			sbuf_printf(&sbuf, "  %2.2d (%6.6dK)", oind,
+			    1 << (PAGE_SHIFT - 10 + oind));
+			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[flind][pind];
+				sbuf_printf(&sbuf, "  |  %6.6d", fl[oind].lcnt);
+			}
+			sbuf_printf(&sbuf, "\n");
+		}
+	}
+	sbuf_finish(&sbuf);
+	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	sbuf_delete(&sbuf);
+	free(cbuf, M_TEMP);
+	return (error);
+}
+
+/*
+ * Outputs the set of physical memory segments.
+ */
+static int
+sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct vm_phys_seg *seg;
+	char *cbuf;
+	const int cbufsize = VM_PHYSSEG_MAX*(VM_NFREEORDER + 1)*81;
+	int error, segind;
+
+	cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+	sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
+		seg = &vm_phys_segs[segind];
+		sbuf_printf(&sbuf, "start:     %#jx\n",
+		    (uintmax_t)seg->start);
+		sbuf_printf(&sbuf, "end:       %#jx\n",
+		    (uintmax_t)seg->end);
+		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
+	}
+	sbuf_finish(&sbuf);
+	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+	sbuf_delete(&sbuf);
+	free(cbuf, M_TEMP);
+	return (error);
+}
+
+/*
+ * Create a physical memory segment.
+ */
+static void
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+{
+	struct vm_phys_seg *seg;
+#ifdef VM_PHYSSEG_SPARSE
+	long pages;
+	int segind;
+
+	pages = 0;
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		pages += atop(seg->end - seg->start);
+	}
+#endif
+	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
+	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
+	seg = &vm_phys_segs[vm_phys_nsegs++];
+	seg->start = start;
+	seg->end = end;
+#ifdef VM_PHYSSEG_SPARSE
+	seg->first_page = &vm_page_array[pages];
+#else
+	seg->first_page = PHYS_TO_VM_PAGE(start);
+#endif
+	seg->free_queues = &vm_phys_free_queues[flind];
+}
+
+/*
+ * Initialize the physical memory allocator.
+ */
+void
+vm_phys_init(void)
+{
+	struct vm_freelist *fl;
+	int flind, i, oind, pind;
+
+	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+#ifdef	VM_FREELIST_ISADMA
+		if (phys_avail[i] < 16777216) {
+			if (phys_avail[i + 1] > 16777216) {
+				vm_phys_create_seg(phys_avail[i], 16777216,
+				    VM_FREELIST_ISADMA);
+				vm_phys_create_seg(16777216, phys_avail[i + 1],
+				    VM_FREELIST_DEFAULT);
+			} else {
+				vm_phys_create_seg(phys_avail[i],
+				    phys_avail[i + 1], VM_FREELIST_ISADMA);
+			}
+			if (VM_FREELIST_ISADMA >= vm_nfreelists)
+				vm_nfreelists = VM_FREELIST_ISADMA + 1;
+		} else
+#endif
+#ifdef	VM_FREELIST_HIGHMEM
+		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
+			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
+				vm_phys_create_seg(phys_avail[i],
+				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
+				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
+				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
+			} else {
+				vm_phys_create_seg(phys_avail[i],
+				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
+			}
+			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
+				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
+		} else
+#endif
+		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
+		    VM_FREELIST_DEFAULT);
+	}
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+			fl = vm_phys_free_queues[flind][pind];
+			for (oind = 0; oind < VM_NFREEORDER; oind++)
+				TAILQ_INIT(&fl[oind].pl);
+		}
+	}
+}
+
+/*
+ * Split a contiguous, power of two-sized set of physical pages.
+ */
+static __inline void
+vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
+{
+	vm_page_t m_buddy;
+
+	while (oind > order) {
+		oind--;
+		m_buddy = &m[1 << oind];
+		KASSERT(m_buddy->order == VM_NFREEORDER,
+		    ("vm_phys_split_pages: page %p has unexpected order %d",
+		    m_buddy, m_buddy->order));
+		m_buddy->order = oind;
+		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
+		fl[oind].lcnt++;
+        }
+}
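
vm_phys_split_pages() trims an oversized free block down to the requested order by repeatedly peeling off the upper half (the "buddy") and returning it to the free list of the smaller order. A standalone sketch of the index arithmetic, assuming a flat page index space and plain counters in place of the kernel freelists:

#include <stdio.h>

#define NFREEORDER	12

static int freecnt[NFREEORDER];		/* stand-in for fl[oind].lcnt */

/*
 * Shrink a free block of 2^oind pages at page index "base" down to
 * 2^order pages; every discarded upper half goes back on the free list
 * for its own order.
 */
static void
split_pages(long base, int oind, int order)
{
	while (oind > order) {
		oind--;
		printf("buddy at pages %ld..%ld goes to order %d\n",
		    base + (1L << oind), base + (2L << oind) - 1, oind);
		freecnt[oind]++;
	}
	printf("caller keeps pages %ld..%ld\n",
	    base, base + (1L << order) - 1);
}

int
main(void)
{
	split_pages(0, 4, 1);	/* carve a 2-page block out of a 16-page one */
	return (0);
}
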
+
+/*
+ * Initialize a physical page and add it to the free lists.
+ */
+void
+vm_phys_add_page(vm_paddr_t pa)
+{
+	vm_page_t m;
+
+	cnt.v_page_count++;
+	m = vm_phys_paddr_to_vm_page(pa);
+	m->phys_addr = pa;
+	m->segind = vm_phys_paddr_to_segind(pa);
+	m->flags = PG_FREE;
+	KASSERT(m->order == VM_NFREEORDER,
+	    ("vm_phys_add_page: page %p has unexpected order %d",
+	    m, m->order));
+	m->pool = VM_FREEPOOL_DEFAULT;
+	pmap_page_init(m);
+	mtx_lock(&vm_page_queue_free_mtx);
+	cnt.v_free_count++;
+	vm_phys_free_pages(m, 0);
+	mtx_unlock(&vm_page_queue_free_mtx);
+}
+
+/*
+ * Allocate a contiguous, power of two-sized set of physical pages
+ * from the free lists.
+ *
+ * The free page queues must be locked.
+ */
+vm_page_t
+vm_phys_alloc_pages(int pool, int order)
+{
+	struct vm_freelist *fl;
+	struct vm_freelist *alt;
+	int flind, oind, pind;
+	vm_page_t m;
+
+	KASSERT(pool < VM_NFREEPOOL,
+	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
+	KASSERT(order < VM_NFREEORDER,
+	    ("vm_phys_alloc_pages: order %d is out of range", order));
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+		fl = vm_phys_free_queues[flind][pool];
+		for (oind = order; oind < VM_NFREEORDER; oind++) {
+			m = TAILQ_FIRST(&fl[oind].pl);
+			if (m != NULL) {
+				TAILQ_REMOVE(&fl[oind].pl, m, pageq);
+				fl[oind].lcnt--;
+				m->order = VM_NFREEORDER;
+				vm_phys_split_pages(m, oind, fl, order);
+				return (m);
+			}
+		}
+
+		/*
+		 * The given pool was empty.  Find the largest
+		 * contiguous, power-of-two-sized set of pages in any
+		 * pool.  Transfer these pages to the given pool, and
+		 * use them to satisfy the allocation.
+		 */
+		for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
+			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				alt = vm_phys_free_queues[flind][pind];
+				m = TAILQ_FIRST(&alt[oind].pl);
+				if (m != NULL) {
+					TAILQ_REMOVE(&alt[oind].pl, m, pageq);
+					alt[oind].lcnt--;
+					m->order = VM_NFREEORDER;
+					vm_phys_set_pool(pool, m, oind);
+					vm_phys_split_pages(m, oind, fl, order);
+					return (m);
+				}
+			}
+		}
+	}
+	return (NULL);
+}
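
The allocator tries the caller's pool first, from the requested order upward, and only then scans all pools from the largest order downward so it can re-tag and split a big block from another pool. A toy illustration of that two-stage search order, using made-up counter arrays rather than real freelists:

#include <stdio.h>

#define NORDER	4
#define NPOOL	2

/* Free-block counters per (pool, order); stand-ins for the freelists. */
static int lcnt[NPOOL][NORDER] = {
	{ 0, 0, 0, 0 },		/* requested pool: empty */
	{ 0, 0, 1, 0 },		/* other pool: one order-2 block free */
};

static int
pick_order(int pool, int order)
{
	int o, p;

	/* Stage 1: the requested pool, smallest sufficient order first. */
	for (o = order; o < NORDER; o++)
		if (lcnt[pool][o] > 0)
			return (o);
	/* Stage 2: any pool, largest blocks first. */
	for (o = NORDER - 1; o >= order; o--)
		for (p = 0; p < NPOOL; p++)
			if (lcnt[p][o] > 0)
				return (o);
	return (-1);		/* nothing left on this freelist */
}

int
main(void)
{
	printf("order-0 request satisfied from an order-%d block\n",
	    pick_order(0, 0));
	return (0);
}
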
+
+/*
+ * Allocate physical memory from phys_avail[].
+ */
+vm_paddr_t
+vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
+{
+	vm_paddr_t pa;
+	int i;
+
+	size = round_page(size);
+	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+		if (phys_avail[i + 1] - phys_avail[i] < size)
+			continue;
+		pa = phys_avail[i];
+		phys_avail[i] += size;
+		return (pa);
+	}
+	panic("vm_phys_bootstrap_alloc");
+}
+
+/*
+ * Find the vm_page corresponding to the given physical address.
+ */
+vm_page_t
+vm_phys_paddr_to_vm_page(vm_paddr_t pa)
+{
+	struct vm_phys_seg *seg;
+	int segind;
+
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		if (pa >= seg->start && pa < seg->end)
+			return (&seg->first_page[atop(pa - seg->start)]);
+	}
+	panic("vm_phys_paddr_to_vm_page: paddr %#jx is not in any segment",
+	    (uintmax_t)pa);
+}
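
With VM_PHYSSEG_SPARSE the vm_page array covers only pages that actually exist, so each segment records the array slot of its first page and a physical address is resolved by a linear scan over the segments. A userland sketch of that translation, with invented segment boundaries (the 4 KB page size is also an assumption):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4 KB pages */

struct seg {
	uint64_t	start, end;	/* physical byte range [start, end) */
	long		first_page;	/* index of the segment's first page */
};

/* Two made-up segments with a hole between them. */
static const struct seg segs[] = {
	{ 0x00000000, 0x0009f000, 0x00 },
	{ 0x00100000, 0x40000000, 0x9f },
};

static long
paddr_to_page_index(uint64_t pa)
{
	size_t i;

	for (i = 0; i < sizeof(segs) / sizeof(segs[0]); i++)
		if (pa >= segs[i].start && pa < segs[i].end)
			return (segs[i].first_page +
			    (long)((pa - segs[i].start) >> PAGE_SHIFT));
	return (-1);		/* the kernel panics on a miss instead */
}

int
main(void)
{
	printf("page index %ld\n", paddr_to_page_index(0x00200000));
	return (0);
}
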
+
+/*
+ * Find the segment containing the given physical address.
+ */
+static int
+vm_phys_paddr_to_segind(vm_paddr_t pa)
+{
+	struct vm_phys_seg *seg;
+	int segind;
+
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		if (pa >= seg->start && pa < seg->end)
+			return (segind);
+	}
+	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
+	    (uintmax_t)pa);
+}
+
+/*
+ * Free a contiguous, power of two-sized set of physical pages.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_free_pages(vm_page_t m, int order)
+{
+	struct vm_freelist *fl;
+	struct vm_phys_seg *seg;
+	vm_paddr_t pa, pa_buddy;
+	vm_page_t m_buddy;
+
+	KASSERT(m->order == VM_NFREEORDER,
+	    ("vm_phys_free_pages: page %p has unexpected order %d",
+	    m, m->order));
+	KASSERT(m->pool < VM_NFREEPOOL,
+	    ("vm_phys_free_pages: page %p has unexpected pool %d",
+	    m, m->pool));
+	KASSERT(order < VM_NFREEORDER,
+	    ("vm_phys_free_pages: order %d is out of range", order));
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	pa = VM_PAGE_TO_PHYS(m);
+	seg = &vm_phys_segs[m->segind];
+	while (order < VM_NFREEORDER - 1) {
+		pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
+		if (pa_buddy < seg->start ||
+		    pa_buddy >= seg->end)
+			break;
+		m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
+		if (m_buddy->order != order)
+			break;
+		fl = (*seg->free_queues)[m_buddy->pool];
+		TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
+		fl[m_buddy->order].lcnt--;
+		m_buddy->order = VM_NFREEORDER;
+		if (m_buddy->pool != m->pool)
+			vm_phys_set_pool(m->pool, m_buddy, order);
+		order++;
+		pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
+		m = &seg->first_page[atop(pa - seg->start)];
+	}
+	m->order = order;
+	fl = (*seg->free_queues)[m->pool];
+	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
+	fl[order].lcnt++;
+}
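
Coalescing relies on the buddy property: for a block of 2^order pages, the buddy's physical address differs only in bit (PAGE_SHIFT + order), and after a merge the combined block is aligned to the next order's boundary. The address arithmetic in isolation, assuming 4 KB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4 KB pages */

int
main(void)
{
	uint64_t pa = 0x7000;	/* start of a newly freed order-0 block */
	int order;

	for (order = 0; order < 4; order++) {
		/* The buddy differs only in bit (PAGE_SHIFT + order). */
		uint64_t buddy = pa ^ ((uint64_t)1 << (PAGE_SHIFT + order));

		printf("order %d: block %#jx, buddy %#jx\n",
		    order, (uintmax_t)pa, (uintmax_t)buddy);
		/* A successful merge aligns the block to the next order. */
		pa &= ~(((uint64_t)1 << (PAGE_SHIFT + order + 1)) - 1);
	}
	return (0);
}
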
+
+/*
+ * Set the pool for a contiguous, power of two-sized set of physical pages. 
+ */
+void
+vm_phys_set_pool(int pool, vm_page_t m, int order)
+{
+	vm_page_t m_tmp;
+
+	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
+		m_tmp->pool = pool;
+}
+
+/*
+ * Remove the given physical page "m" from the free lists.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_unfree_page(vm_page_t m)
+{
+	struct vm_freelist *fl;
+	struct vm_phys_seg *seg;
+	vm_paddr_t pa, pa_half;
+	vm_page_t m_set, m_tmp;
+	int order;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	/*
+	 * First, find the contiguous, power of two-sized set of free
+	 * physical pages containing the given physical page "m" and
+	 * assign it to "m_set".
+	 */
+	seg = &vm_phys_segs[m->segind];
+	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
+	    order < VM_NFREEORDER; ) {
+		order++;
+		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
+		KASSERT(pa >= seg->start && pa < seg->end,
+		    ("vm_phys_unfree_page: paddr %#jx is not within segment %p",
+		    (uintmax_t)pa, seg));
+		m_set = &seg->first_page[atop(pa - seg->start)];
+	}
+	KASSERT(m_set->order >= order, ("vm_phys_unfree_page: page %p's order"
+	    " (%d) is less than expected (%d)", m_set, m_set->order, order));
+	KASSERT(m_set->order < VM_NFREEORDER,
+	    ("vm_phys_unfree_page: page %p has unexpected order %d",
+	    m_set, m_set->order));
+	KASSERT(order < VM_NFREEORDER,
+	    ("vm_phys_unfree_page: order %d is out of range", order));
+
+	/*
+	 * Next, remove "m_set" from the free lists.  Finally, extract
+	 * "m" from "m_set" using an iterative algorithm: While "m_set"
+	 * is larger than a page, shrink "m_set" by returning the half
+	 * of "m_set" that does not contain "m" to the free lists.
+	 */
+	fl = (*seg->free_queues)[m_set->pool];
+	order = m_set->order;
+	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
+	fl[order].lcnt--;
+	m_set->order = VM_NFREEORDER;
+	while (order > 0) {
+		order--;
+		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
+		if (m->phys_addr < pa_half)
+			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
+		else {
+			m_tmp = m_set;
+			m_set = &seg->first_page[atop(pa_half - seg->start)];
+		}
+		m_tmp->order = order;
+		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
+		fl[order].lcnt++;
+	}
+	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
+}
+
+/*
+ * Try to zero one physical page.  Used by an idle priority thread.
+ */
+boolean_t
+vm_phys_zero_pages_idle(void)
+{
+	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
+	static int flind, oind, pind;
+	vm_page_t m, m_tmp;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	for (;;) {
+		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
+			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
+				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
+					vm_phys_unfree_page(m_tmp);
+					cnt.v_free_count--;
+					mtx_unlock(&vm_page_queue_free_mtx);
+					pmap_zero_page_idle(m_tmp);
+					m_tmp->flags |= PG_ZERO;
+					mtx_lock(&vm_page_queue_free_mtx);
+					cnt.v_free_count++;
+					vm_phys_free_pages(m_tmp, 0);
+					vm_page_zero_count++;
+					cnt_prezero++;
+					return (TRUE);
+				}
+			}
+		}
+		oind++;
+		if (oind == VM_NFREEORDER) {
+			oind = 0;
+			pind++;
+			if (pind == VM_NFREEPOOL) {
+				pind = 0;
+				flind++;
+				if (flind == vm_nfreelists)
+					flind = 0;
+			}
+			fl = vm_phys_free_queues[flind][pind];
+		}
+	}
+}
+
+/*
+ * Allocate a contiguous set of physical pages of the given size
+ * "npages" from the free lists.  All of the physical pages must be at
+ * or above the given physical address "low" and below the given
+ * physical address "high".  The given value "alignment" determines the
+ * alignment of the first physical page in the set.  If the given value
+ * "boundary" is non-zero, then the set of physical pages cannot cross
+ * any physical address boundary that is a multiple of that value.  Both
+ * "alignment" and "boundary" must be a power of two.
+ */
+vm_page_t
+vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
+    unsigned long alignment, unsigned long boundary)
+{
+	struct vm_freelist *fl;
+	struct vm_phys_seg *seg;
+	vm_object_t m_object;
+	vm_paddr_t pa, pa_last, size;
+	vm_page_t m, m_ret;
+	int flind, i, oind, order, pind;
+
+	size = npages << PAGE_SHIFT;
+	KASSERT(size != 0,
+	    ("vm_phys_alloc_contig: size must not be 0"));
+	KASSERT((alignment & (alignment - 1)) == 0,
+	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
+	KASSERT((boundary & (boundary - 1)) == 0,
+	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
+	/* Compute the queue that is the best fit for npages. */
+	for (order = 0; (1 << order) < npages; order++);
+	mtx_lock(&vm_page_queue_free_mtx);
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
+			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[flind][pind];
+				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
+					/*
+					 * A free list may contain physical pages
+					 * from one or more segments.
+					 */
+					seg = &vm_phys_segs[m_ret->segind];
+					if (seg->start > high ||
+					    low >= seg->end)
+						continue;
+
+					/*
+					 * Is the size of this allocation request
+					 * larger than the largest block size?
+					 */
+					if (order >= VM_NFREEORDER) {
+						/*
+						 * Determine if a sufficient number
+						 * of subsequent blocks to satisfy
+						 * the allocation request are free.
+						 */
+						pa = VM_PAGE_TO_PHYS(m_ret);
+						pa_last = pa + size;
+						for (;;) {
+							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
+							if (pa >= pa_last)
+								break;
+							if (pa < seg->start ||
+							    pa >= seg->end)
+								break;
+							m = &seg->first_page[atop(pa - seg->start)];
+							if (m->order != VM_NFREEORDER - 1)
+								break;
+						}
+						/* If not, continue to the next block. */
+						if (pa < pa_last)
+							continue;
+					}
+
+					/*
+					 * Determine if the blocks are within the given range,
+					 * satisfy the given alignment, and do not cross the
+					 * given boundary.
+					 */
+					pa = VM_PAGE_TO_PHYS(m_ret);
+					if (pa >= low &&
+					    pa + size <= high &&
+					    (pa & (alignment - 1)) == 0 &&
+					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
+						goto done;
+				}
+			}
+		}
+	}
+	mtx_unlock(&vm_page_queue_free_mtx);
+	return (NULL);
+done:
+	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
+		fl = (*seg->free_queues)[m->pool];
+		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
+		fl[m->order].lcnt--;
+		m->order = VM_NFREEORDER;
+	}
+	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
+		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
+	fl = (*seg->free_queues)[m_ret->pool];
+	vm_phys_split_pages(m_ret, oind, fl, order);
+	for (i = 0; i < npages; i++) {
+		m = &m_ret[i];
+		KASSERT(m->queue == PQ_NONE,
+		    ("vm_phys_alloc_contig: page %p has unexpected queue %d",
+		    m, m->queue));
+		m_object = m->object;
+		if ((m->flags & PG_CACHED) != 0)
+			vm_page_cache_remove(m);
+		else {
+			KASSERT(VM_PAGE_IS_FREE(m),
+			    ("vm_phys_alloc_contig: page %p is not free", m));
+			cnt.v_free_count--;
+		}
+		m->valid = VM_PAGE_BITS_ALL;
+		if (m->flags & PG_ZERO)
+			vm_page_zero_count--;
+		/* Don't clear the PG_ZERO flag; we'll need it later. */
+		m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
+		m->oflags = 0;
+		KASSERT(m->dirty == 0,
+		    ("vm_phys_alloc_contig: page %p was dirty", m));
+		m->wire_count = 0;
+		m->busy = 0;
+		if (m_object != NULL &&
+		    m_object->type == OBJT_VNODE &&
+		    m_object->cache == NULL) {
+			mtx_unlock(&vm_page_queue_free_mtx);
+			vdrop(m_object->handle);
+			mtx_lock(&vm_page_queue_free_mtx);
+		}
+	}
+	for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
+		m = &m_ret[i];
+		KASSERT(m->order == VM_NFREEORDER,
+		    ("vm_phys_alloc_contig: page %p has unexpected order %d",
+		    m, m->order));
+		vm_phys_free_pages(m, 0);
+	}
+	mtx_unlock(&vm_page_queue_free_mtx);
+	return (m_ret);
+}
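
The candidate test near the bottom of the search packs three power-of-two checks into straight bit operations: the start address must satisfy the alignment, the run must fit at or below "high", and the first and last byte must agree in every bit at or above the boundary, so the run never straddles a boundary multiple. That predicate, pulled out into a standalone and slightly more verbose form purely for illustration:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Does a run of "size" bytes at "pa" stay inside [low, high], start on
 * "alignment", and avoid crossing any multiple of "boundary"?  Both
 * alignment and boundary must be powers of two; boundary == 0 means no
 * boundary restriction, matching the kernel's behaviour.
 */
static bool
range_ok(uint64_t pa, uint64_t size, uint64_t low, uint64_t high,
    uint64_t alignment, uint64_t boundary)
{
	if (pa < low || pa + size > high)
		return (false);
	if ((pa & (alignment - 1)) != 0)
		return (false);
	if (boundary != 0 &&
	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (false);
	return (true);
}

int
main(void)
{
	/* 64 KB run at 1 MB: 64 KB-aligned, inside one 1 MB boundary. */
	assert(range_ok(0x100000, 0x10000, 0, ~0ULL, 0x10000, 0x100000));
	/* Shifted by 4 KB it no longer satisfies the 64 KB alignment. */
	assert(!range_ok(0x101000, 0x10000, 0, ~0ULL, 0x10000, 0x100000));
	printf("ok\n");
	return (0);
}
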
+
+#ifdef DDB
+/*
+ * Show the number of physical pages in each of the free lists.
+ */
+DB_SHOW_COMMAND(freepages, db_show_freepages)
+{
+	struct vm_freelist *fl;
+	int flind, oind, pind;
+
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+		db_printf("FREE LIST %d:\n"
+		    "\n  ORDER (SIZE)  |  NUMBER"
+		    "\n              ", flind);
+		for (pind = 0; pind < VM_NFREEPOOL; pind++)
+			db_printf("  |  POOL %d", pind);
+		db_printf("\n--            ");
+		for (pind = 0; pind < VM_NFREEPOOL; pind++)
+			db_printf("-- --      ");
+		db_printf("--\n");
+		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+			db_printf("  %2.2d (%6.6dK)", oind,
+			    1 << (PAGE_SHIFT - 10 + oind));
+			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[flind][pind];
+				db_printf("  |  %6.6d", fl[oind].lcnt);
+			}
+			db_printf("\n");
+		}
+		db_printf("\n");
+	}
+}
+#endif
Index: vm_map.c
===================================================================
RCS file: /home/cvs/src/sys/vm/vm_map.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/vm/vm_map.c -L sys/vm/vm_map.c -u -r1.1.1.1 -r1.2
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -63,7 +63,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/vm/vm_map.c,v 1.366.2.2 2005/11/13 21:45:49 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/vm/vm_map.c,v 1.388.2.1.2.2 2008/01/19 18:15:07 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -148,6 +148,29 @@
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 
+/* 
+ * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
+ * stable.
+ */
+#define PROC_VMSPACE_LOCK(p) do { } while (0)
+#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
+
+/*
+ *	VM_MAP_RANGE_CHECK:	[ internal use only ]
+ *
+ *	Asserts that the starting and ending region
+ *	addresses fall within the valid range of the map.
+ */
+#define	VM_MAP_RANGE_CHECK(map, start, end)		\
+		{					\
+		if (start < vm_map_min(map))		\
+			start = vm_map_min(map);	\
+		if (end > vm_map_max(map))		\
+			end = vm_map_max(map);		\
+		if (start > end)			\
+			start = end;			\
+		}
+
 void
 vm_map_startup(void)
 {
@@ -166,7 +189,6 @@
 	uma_prealloc(kmapentzone, MAX_KMAPENT);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	uma_prealloc(mapentzone, MAX_MAPENT);
 }
 
 static void
@@ -175,7 +197,6 @@
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
-	pmap_release(vmspace_pmap(vm));
 	vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
 }
 
@@ -186,8 +207,8 @@
 
 	vm = (struct vmspace *)mem;
 
+	vm->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
-	pmap_pinit(vmspace_pmap(vm));
 	return (0);
 }
 
@@ -250,6 +271,10 @@
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
+	if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
+		uma_zfree(vmspace_zone, vm);
+		return (NULL);
+	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, min, max);
 	vm->vm_map.pmap = vmspace_pmap(vm);		/* XXX */
@@ -262,7 +287,6 @@
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
-	vm->vm_exitingcnt = 0;
 	return (vm);
 }
 
@@ -279,10 +303,9 @@
 	    NULL,
 #endif
 	    vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	pmap_init2();
 }
 
-static __inline void
+static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
@@ -298,11 +321,15 @@
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
-	vm_map_lock(&vm->vm_map);
-	(void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
+	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
 	    vm->vm_map.max_offset);
-	vm_map_unlock(&vm->vm_map);
 
+	/*
+	 * XXX Comment out the pmap_release call for now. The
+	 * vmspace_zone is marked as UMA_ZONE_NOFREE, and bugs cause
+	 * pmap.resident_count to be != 0 on exit sometimes.
+	 */
+/* 	pmap_release(vmspace_pmap(vm)); */
 	uma_zfree(vmspace_zone, vm);
 }
 
@@ -317,7 +344,7 @@
 	do
 		refcnt = vm->vm_refcnt;
 	while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
-	if (refcnt == 1 && vm->vm_exitingcnt == 0)
+	if (refcnt == 1)
 		vmspace_dofree(vm);
 }
 
@@ -325,28 +352,93 @@
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *vm;
-	int exitingcnt;
 
+	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	p->p_vmspace = NULL;
+	PROC_VMSPACE_UNLOCK(p);
+	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
+	vmspace_free(vm);
+}
+
+void
+vmspace_exit(struct thread *td)
+{
+	int refcnt;
+	struct vmspace *vm;
+	struct proc *p;
 
 	/*
-	 * cleanup by parent process wait()ing on exiting child.  vm_refcnt
-	 * may not be 0 (e.g. fork() and child exits without exec()ing).
-	 * exitingcnt may increment above 0 and drop back down to zero
-	 * several times while vm_refcnt is held non-zero.  vm_refcnt
-	 * may also increment above 0 and drop back down to zero several
-	 * times while vm_exitingcnt is held non-zero.
+	 * Release user portion of address space.
+	 * This releases references to vnodes,
+	 * which could cause I/O if the file has been unlinked.
+	 * Need to do this early enough that we can still sleep.
 	 *
-	 * The last wait on the exiting child's vmspace will clean up
-	 * the remainder of the vmspace.
+	 * The last exiting process to reach this point releases as
+	 * much of the environment as it can. vmspace_dofree() is the
+	 * slower fallback in case another process had a temporary
+	 * reference to the vmspace.
 	 */
-	do
-		exitingcnt = vm->vm_exitingcnt;
-	while (!atomic_cmpset_int(&vm->vm_exitingcnt, exitingcnt,
-	    exitingcnt - 1));
-	if (vm->vm_refcnt == 0 && exitingcnt == 1)
+
+	p = td->td_proc;
+	vm = p->p_vmspace;
+	atomic_add_int(&vmspace0.vm_refcnt, 1);
+	do {
+		refcnt = vm->vm_refcnt;
+		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
+			/* Switch now since other proc might free vmspace */
+			PROC_VMSPACE_LOCK(p);
+			p->p_vmspace = &vmspace0;
+			PROC_VMSPACE_UNLOCK(p);
+			pmap_activate(td);
+		}
+	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
+	if (refcnt == 1) {
+		if (p->p_vmspace != vm) {
+			/* vmspace not yet freed, switch back */
+			PROC_VMSPACE_LOCK(p);
+			p->p_vmspace = vm;
+			PROC_VMSPACE_UNLOCK(p);
+			pmap_activate(td);
+		}
+		pmap_remove_pages(vmspace_pmap(vm));
+		/* Switch now since this proc will free vmspace */
+		PROC_VMSPACE_LOCK(p);
+		p->p_vmspace = &vmspace0;
+		PROC_VMSPACE_UNLOCK(p);
+		pmap_activate(td);
 		vmspace_dofree(vm);
+	}
+}
+
+/* Acquire reference to vmspace owned by another process. */
+
+struct vmspace *
+vmspace_acquire_ref(struct proc *p)
+{
+	struct vmspace *vm;
+	int refcnt;
+
+	PROC_VMSPACE_LOCK(p);
+	vm = p->p_vmspace;
+	if (vm == NULL) {
+		PROC_VMSPACE_UNLOCK(p);
+		return (NULL);
+	}
+	do {
+		refcnt = vm->vm_refcnt;
+		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
+			PROC_VMSPACE_UNLOCK(p);
+			return (NULL);
+		}
+	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
+	if (vm != p->p_vmspace) {
+		PROC_VMSPACE_UNLOCK(p);
+		vmspace_free(vm);
+		return (NULL);
+	}
+	PROC_VMSPACE_UNLOCK(p);
+	return (vm);
 }
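
vmspace_acquire_ref() only bumps vm_refcnt when it is already positive, so a vmspace whose count has reached zero and may be mid-teardown is never resurrected, and it re-checks p->p_vmspace afterwards in case the process swapped vmspaces concurrently. The refcount half of that idiom, restated with C11 atomics purely as an illustration (the kernel uses atomic_cmpset_int):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the count is already positive. */
static bool
acquire_ref(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	do {
		if (old <= 0)
			return (false);	/* already torn down; don't revive */
	} while (!atomic_compare_exchange_weak(refcnt, &old, old + 1));
	return (true);
}

int
main(void)
{
	atomic_int live = 2, dying = 0;

	printf("live:  %s\n", acquire_ref(&live) ? "acquired" : "refused");
	printf("dying: %s\n", acquire_ref(&dying) ? "acquired" : "refused");
	return (0);
}
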
 
 void
@@ -356,7 +448,7 @@
 	if (map->system_map)
 		_mtx_lock_flags(&map->system_mtx, 0, file, line);
 	else
-		_sx_xlock(&map->lock, file, line);
+		(void)_sx_xlock(&map->lock, 0, file, line);
 	map->timestamp++;
 }
 
@@ -377,7 +469,7 @@
 	if (map->system_map)
 		_mtx_lock_flags(&map->system_mtx, 0, file, line);
 	else
-		_sx_xlock(&map->lock, file, line);
+		(void)_sx_xlock(&map->lock, 0, file, line);
 }
 
 void
@@ -564,7 +656,7 @@
  *	Set the expected access behavior, either normal, random, or
  *	sequential.
  */
-static __inline void
+static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
@@ -576,7 +668,7 @@
  *
  *	Set the max_free field in a vm_map_entry.
  */
-static __inline void
+static inline void
 vm_map_entry_set_max_free(vm_map_entry_t entry)
 {
 
@@ -1078,6 +1170,25 @@
 	return (0);
 }
 
+int
+vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+    vm_offset_t *addr /* IN/OUT */, vm_size_t length, vm_prot_t prot,
+    vm_prot_t max, int cow)
+{
+	vm_offset_t start, end;
+	int result;
+
+	start = *addr;
+	vm_map_lock(map);
+	end = start + length;
+	VM_MAP_RANGE_CHECK(map, start, end);
+	(void) vm_map_delete(map, start, end);
+	result = vm_map_insert(map, object, offset, start, end, prot,
+	    max, cow);
+	vm_map_unlock(map);
+	return (result);
+}
+
 /*
  *	vm_map_find finds an unallocated region in the target address
  *	map with the given length.  The search is defined to be
@@ -1288,22 +1399,6 @@
 }
 
 /*
- *	VM_MAP_RANGE_CHECK:	[ internal use only ]
- *
- *	Asserts that the starting and ending region
- *	addresses fall within the valid range of the map.
- */
-#define	VM_MAP_RANGE_CHECK(map, start, end)		\
-		{					\
-		if (start < vm_map_min(map))		\
-			start = vm_map_min(map);	\
-		if (end > vm_map_max(map))		\
-			end = vm_map_max(map);		\
-		if (start > end)			\
-			start = end;			\
-		}
-
-/*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
@@ -1362,17 +1457,18 @@
 /*
  *	vm_map_pmap_enter:
  *
- *	Preload read-only mappings for the given object into the specified
- *	map.  This eliminates the soft faults on process startup and
- *	immediately after an mmap(2).
+ *	Preload read-only mappings for the given object's resident pages into
+ *	the given map.  This eliminates the soft faults on process startup and
+ *	immediately after an mmap(2).  Unless the given flags include
+ *	MAP_PREFAULT_MADVISE, cached pages are not reactivated and mapped.
  */
 void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
-	vm_offset_t tmpidx;
-	int psize;
-	vm_page_t p, mpte;
+	vm_offset_t start;
+	vm_page_t p, p_start;
+	vm_pindex_t psize, tmpidx;
 	boolean_t are_queues_locked;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
@@ -1398,7 +1494,8 @@
 	}
 
 	are_queues_locked = FALSE;
-	mpte = NULL;
+	start = 0;
+	p_start = NULL;
 
 	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		if (p->pindex < pindex) {
@@ -1421,20 +1518,32 @@
 		 */
 		if ((flags & MAP_PREFAULT_MADVISE) &&
 		    cnt.v_free_count < cnt.v_free_reserved) {
+			psize = tmpidx;
 			break;
 		}
 		if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
-		    (p->busy == 0) &&
-		    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
+		    (p->busy == 0)) {
+			if (p_start == NULL) {
+				start = addr + ptoa(tmpidx);
+				p_start = p;
+			}
+		} else if (p_start != NULL) {
 			if (!are_queues_locked) {
 				are_queues_locked = TRUE;
 				vm_page_lock_queues();
 			}
-			if ((p->queue - p->pc) == PQ_CACHE)
-				vm_page_deactivate(p);
-			mpte = pmap_enter_quick(map->pmap,
-			    addr + ptoa(tmpidx), p, prot, mpte);
+			pmap_enter_object(map->pmap, start, addr +
+			    ptoa(tmpidx), p_start, prot);
+			p_start = NULL;
+		}
+	}
+	if (p_start != NULL) {
+		if (!are_queues_locked) {
+			are_queues_locked = TRUE;
+			vm_page_lock_queues();
 		}
+		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
+		    p_start, prot);
 	}
 	if (are_queues_locked)
 		vm_page_unlock_queues();
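
Instead of calling pmap_enter_quick() once per page, the loop now accumulates runs of consecutive resident pages that are fully valid and not busy, and hands each run to pmap_enter_object() in a single call. A simplified sketch of that batching, using a plain array of page indices and a hypothetical enter_run() in place of the real object walk:

#include <stdio.h>

static void
enter_run(long first, long last)
{
	printf("one mapping call covers pages %ld..%ld\n", first, last);
}

int
main(void)
{
	/* Indices of the object's resident, fully valid, non-busy pages. */
	long resident[] = { 0, 1, 2, 5, 6, 9 };
	int i, n = sizeof(resident) / sizeof(resident[0]);
	long run_start = -1;

	for (i = 0; i < n; i++) {
		if (run_start == -1)
			run_start = resident[i];
		/* Flush the run when the next index is not contiguous. */
		if (i + 1 == n || resident[i + 1] != resident[i] + 1) {
			enter_run(run_start, resident[i]);
			run_start = -1;
		}
	}
	return (0);
}
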
@@ -2104,7 +2213,8 @@
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
-	for (current = entry; current->start < end; current = current->next) {
+	for (current = entry; current != &map->header && current->start < end;
+	    current = current->next) {
 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
@@ -2117,16 +2227,15 @@
 		}
 	}
 
-	if (invalidate) {
-		VM_LOCK_GIANT();
+	if (invalidate)
 		pmap_remove(map->pmap, start, end);
-		VM_UNLOCK_GIANT();
-	}
+
 	/*
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
-	for (current = entry; current->start < end; current = current->next) {
+	for (current = entry; current != &map->header && current->start < end;
+	    current = current->next) {
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2193,8 +2302,7 @@
 		VM_OBJECT_LOCK(object);
 		if (object->ref_count != 1 &&
 		    ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
-		     object == kernel_object || object == kmem_object) &&
-		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
+		    object == kernel_object || object == kmem_object)) {
 			vm_object_collapse(object);
 			vm_object_page_remove(object, offidxstart, offidxend, FALSE);
 			if (object->type == OBJT_SWAP)
@@ -2285,11 +2393,7 @@
 			vm_map_entry_unwire(map, entry);
 		}
 
-		if (!map->system_map)
-			VM_LOCK_GIANT();
 		pmap_remove(map->pmap, entry->start, entry->end);
-		if (!map->system_map)
-			VM_UNLOCK_GIANT();
 
 		/*
 		 * Delete the entry (which may delete the object) only after
@@ -2489,16 +2593,14 @@
 	vm_map_lock(old_map);
 
 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
+	if (vm2 == NULL)
+		goto unlock_and_return;
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	new_map = &vm2->vm_map;	/* XXX */
 	new_map->timestamp = 1;
 
-	/* Do not inherit the MAP_WIREFUTURE property. */
-	if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE)
-		new_map->flags &= ~MAP_WIREFUTURE;
-
 	old_entry = old_map->header.next;
 
 	while (old_entry != &old_map->header) {
@@ -2584,7 +2686,7 @@
 		}
 		old_entry = old_entry->next;
 	}
-
+unlock_and_return:
 	vm_map_unlock(old_map);
 
 	return (vm2);
@@ -2610,7 +2712,9 @@
 	cow &= ~orient;
 	KASSERT(orient != 0, ("No stack grow direction"));
 
-	if (addrbos < vm_map_min(map) || addrbos > map->max_offset)
+	if (addrbos < vm_map_min(map) ||
+	    addrbos > vm_map_max(map) ||
+	    addrbos + max_ssize < addrbos)
 		return (KERN_NO_SPACE);
 
 	init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
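
Besides the map bounds, the new test "addrbos + max_ssize < addrbos" rejects requests whose size wraps the address space, since an overflowed end address would otherwise slip past the range check. A small standalone version of the same validation, with invented map bounds:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Validate a stack region [base, base + len) against made-up map bounds;
 * the third test is the wraparound check added above.
 */
static bool
stack_range_ok(uintptr_t base, uintptr_t len, uintptr_t map_min,
    uintptr_t map_max)
{
	if (base < map_min)
		return (false);
	if (base > map_max)
		return (false);
	if (base + len < base)		/* base + len wrapped past zero */
		return (false);
	return (true);
}

int
main(void)
{
	printf("%d\n", stack_range_ok(UINTPTR_MAX - 4096, 65536,
	    4096, UINTPTR_MAX));	/* prints 0: the length wraps */
	return (0);
}
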
@@ -2912,13 +3016,15 @@
  * Unshare the specified VM space for exec.  If other processes are
  * mapped to it, then create a new one.  The new vmspace is null.
  */
-void
+int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	newvmspace = vmspace_alloc(minuser, maxuser);
+	if (newvmspace == NULL)
+		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
 	/*
 	 * This code is written like this for prototype purposes.  The
@@ -2927,29 +3033,37 @@
 	 * run it down.  Even though there is little or no chance of blocking
 	 * here, it is a good idea to keep this form for future mods.
 	 */
+	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
+	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)		/* XXXKSE ? */
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
+	return (0);
 }
 
 /*
  * Unshare the specified VM space for forcing COW.  This
  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
  */
-void
+int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	if (oldvmspace->vm_refcnt == 1)
-		return;
+		return (0);
 	newvmspace = vmspace_fork(oldvmspace);
+	if (newvmspace == NULL)
+		return (ENOMEM);
+	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
+	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)		/* XXXKSE ? */
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
+	return (0);
 }
 
 /*
Index: softdep.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/softdep.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/softdep.h -L sys/ufs/ffs/softdep.h -u -r1.2 -r1.3
--- sys/ufs/ffs/softdep.h
+++ sys/ufs/ffs/softdep.h
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)softdep.h	9.7 (McKusick) 6/21/00
- * $FreeBSD: src/sys/ufs/ffs/softdep.h,v 1.17.2.2 2006/03/13 03:08:00 jeff Exp $
+ * $FreeBSD: src/sys/ufs/ffs/softdep.h,v 1.19 2006/03/02 05:50:23 jeff Exp $
  */
 
 #include <sys/queue.h>
Index: ffs_rawread.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_rawread.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_rawread.c -L sys/ufs/ffs/ffs_rawread.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_rawread.c
+++ sys/ufs/ffs/ffs_rawread.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.25.2.2 2006/03/09 00:18:45 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.29 2007/02/04 23:42:02 tegge Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -129,8 +129,16 @@
 			upgraded = 0;
 			
 		
-		/* Attempt to msync mmap() regions to clean dirty mmap */ 
 		VI_LOCK(vp);
+		/* Check if vnode was reclaimed while unlocked. */
+		if ((vp->v_iflag & VI_DOOMED) != 0) {
+			VI_UNLOCK(vp);
+			if (upgraded != 0)
+				VOP_LOCK(vp, LK_DOWNGRADE, td);
+			vn_finished_write(mp);
+			return (EIO);
+		}
+		/* Attempt to msync mmap() regions to clean dirty mmap */ 
 		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
 			VI_UNLOCK(vp);
 			if (vp->v_object != NULL) {
@@ -150,6 +158,7 @@
 			VI_UNLOCK(vp);
 			if (upgraded != 0)
 				VOP_LOCK(vp, LK_DOWNGRADE, td);
+			vn_finished_write(mp);
 			return (error);
 		}
 		/* Flush dirty buffers */
@@ -159,6 +168,7 @@
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
 				if (upgraded != 0)
 					VOP_LOCK(vp, LK_DOWNGRADE, td);
+				vn_finished_write(mp);
 				return (error);
 			}
 			VI_LOCK(vp);
@@ -300,7 +310,7 @@
 			/* XXX: Leave some bufs for swap */
 			bp = getpbuf(&ffsrawbufcnt);
 			sa = bp->b_data;
-			bp->b_vp = vp; 
+			pbgetvp(vp, bp);
 			error = ffs_rawread_readahead(vp, udata, offset,
 						     resid, td, bp, sa);
 			if (error != 0)
@@ -314,7 +324,7 @@
 					nbp = NULL;
 				if (nbp != NULL) {
 					nsa = nbp->b_data;
-					nbp->b_vp = vp;
+					pbgetvp(vp, nbp);
 					
 					nerror = ffs_rawread_readahead(vp, 
 								       udata +
@@ -327,6 +337,7 @@
 								       nbp,
 								       nsa);
 					if (nerror) {
+						pbrelvp(nbp);
 						relpbuf(nbp, &ffsrawbufcnt);
 						nbp = NULL;
 					}
@@ -375,6 +386,7 @@
 			nsa = tsa;
 			
 			if (resid <= bp->b_bufsize) { /* No more readaheads */
+				pbrelvp(nbp);
 				relpbuf(nbp, &ffsrawbufcnt);
 				nbp = NULL;
 			} else { /* Setup next readahead */
@@ -389,6 +401,7 @@
 							       nbp,
 							       nsa);
 				if (nerror != 0) {
+					pbrelvp(nbp);
 					relpbuf(nbp, &ffsrawbufcnt);
 					nbp = NULL;
 				}
@@ -403,13 +416,16 @@
 		}
 	}
 	
-	if (bp != NULL)
+	if (bp != NULL) {
+		pbrelvp(bp);
 		relpbuf(bp, &ffsrawbufcnt);
+	}
 	if (nbp != NULL) {			/* Run down readahead buffer */
 		spl = splbio();
 		bwait(nbp, PRIBIO, "rawrd");
 		splx(spl);
 		vunmapbuf(nbp);
+		pbrelvp(nbp);
 		relpbuf(nbp, &ffsrawbufcnt);
 	}
 	
Index: ffs_inode.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_inode.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_inode.c -L sys/ufs/ffs/ffs_inode.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_inode.c
+++ sys/ufs/ffs/ffs_inode.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_inode.c,v 1.106 2005/04/05 08:49:41 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_inode.c,v 1.108 2007/06/01 01:12:45 jeff Exp $");
 
 #include "opt_quota.h"
 
@@ -66,9 +66,11 @@
  * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.  Write the inode
  * to disk if the IN_MODIFIED flag is set (it may be set initially, or by
  * the timestamp update).  The IN_LAZYMOD flag is set to force a write
- * later if not now.  If we write now, then clear both IN_MODIFIED and
- * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is
- * set, then wait for the write to complete.
+ * later if not now.  The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs
+ * is currently being suspended (or is suspended) and vnode has been accessed.
+ * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to
+ * reflect the presumably successful write, and if waitfor is set, then wait
+ * for the write to complete.
  */
 int
 ffs_update(vp, waitfor)
@@ -80,12 +82,12 @@
 	struct inode *ip;
 	int error;
 
-	ASSERT_VOP_LOCKED(vp, "ffs_update");
+	ASSERT_VOP_ELOCKED(vp, "ffs_update");
 	ufs_itimes(vp);
 	ip = VTOI(vp);
 	if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
 		return (0);
-	ip->i_flag &= ~(IN_LAZYMOD | IN_MODIFIED);
+	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
 	fs = ip->i_fs;
 	if (fs->fs_ronly)
 		return (0);
@@ -557,7 +559,7 @@
 	vp = ITOV(ip);
 	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
 	if ((bp->b_flags & B_CACHE) == 0) {
-		curproc->p_stats->p_ru.ru_inblock++;	/* pay for read */
+		curthread->td_ru.ru_inblock++;	/* pay for read */
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
Index: ffs_balloc.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_balloc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_balloc.c -L sys/ufs/ffs/ffs_balloc.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_balloc.c
+++ sys/ufs/ffs/ffs_balloc.c
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.50 2005/02/08 17:23:39 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.50.14.1 2008/01/19 18:12:25 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,6 +102,7 @@
 	ufs2_daddr_t newb;
 	ufs1_daddr_t *bap, pref;
 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
+	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
 	int unwindidx = -1;
 
 	ip = VTOI(vp);
@@ -231,6 +232,7 @@
 	nb = dp->di_ib[indirs[0].in_off];
 	allocib = NULL;
 	allocblk = allociblk;
+	lbns_remfree = lbns;
 	if (nb == 0) {
 		UFS_LOCK(ump);
 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
@@ -239,6 +241,7 @@
 			return (error);
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = indirs[1].in_lbn;
 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
 		bp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(bp);
@@ -289,6 +292,7 @@
 		}
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = indirs[i].in_lbn;
 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(nbp);
@@ -342,6 +346,7 @@
 		}
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = lbn;
 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 		if (flags & BA_CLRBUF)
@@ -403,9 +408,18 @@
 	 * have an error to return to the user.
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
-	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
-		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
+	     blkp < allocblk; blkp++, lbns_remfree++) {
+		/*
+		 * We shall not leave the freed blocks on the vnode
+		 * buffer object lists.
+		 */
+		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+		if (bp != NULL) {
+			bp->b_flags |= (B_INVAL | B_RELBUF);
+			bp->b_flags &= ~B_ASYNC;
+			brelse(bp);
+		}
 		deallocated += fs->fs_bsize;
 	}
 	if (allocib != NULL) {
@@ -441,6 +455,14 @@
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	(void) ffs_syncvnode(vp, MNT_WAIT);
+	/*
+	 * After the buffers are invalidated and on-disk pointers are
+	 * cleared, free the blocks.
+	 */
+	for (blkp = allociblk; blkp < allocblk; blkp++) {
+		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+		    ip->i_number);
+	}
 	return (error);
 }
 
@@ -464,6 +486,7 @@
 	struct indir indirs[NIADDR + 2];
 	ufs2_daddr_t nb, newb, *bap, pref;
 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
+	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
 	int deallocated, osize, nsize, num, i, error;
 	int unwindidx = -1;
 
@@ -703,6 +726,7 @@
 	nb = dp->di_ib[indirs[0].in_off];
 	allocib = NULL;
 	allocblk = allociblk;
+	lbns_remfree = lbns;
 	if (nb == 0) {
 		UFS_LOCK(ump);
 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
@@ -711,6 +735,7 @@
 			return (error);
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = indirs[1].in_lbn;
 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
 		bp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(bp);
@@ -761,6 +786,7 @@
 		}
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = indirs[i].in_lbn;
 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 		vfs_bio_clrbuf(nbp);
@@ -814,6 +840,7 @@
 		}
 		nb = newb;
 		*allocblk++ = nb;
+		*lbns_remfree++ = lbn;
 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
 		nbp->b_blkno = fsbtodb(fs, nb);
 		if (flags & BA_CLRBUF)
@@ -881,9 +908,18 @@
 	 * have an error to return to the user.
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
-	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
-		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
+	     blkp < allocblk; blkp++, lbns_remfree++) {
+		/*
+		 * We shall not leave the freed blocks on the vnode
+		 * buffer object lists.
+		 */
+		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
+		if (bp != NULL) {
+			bp->b_flags |= (B_INVAL | B_RELBUF);
+			bp->b_flags &= ~B_ASYNC;
+			brelse(bp);
+		}
 		deallocated += fs->fs_bsize;
 	}
 	if (allocib != NULL) {
@@ -919,5 +955,13 @@
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	(void) ffs_syncvnode(vp, MNT_WAIT);
+	/*
+	 * After the buffers are invalidated and on-disk pointers are
+	 * cleared, free the blocks.
+	 */
+	for (blkp = allociblk; blkp < allocblk; blkp++) {
+		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+		    ip->i_number);
+	}
 	return (error);
 }
Index: ffs_alloc.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_alloc.c -L sys/ufs/ffs/ffs_alloc.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_alloc.c
+++ sys/ufs/ffs/ffs_alloc.c
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.132.2.4 2006/03/13 03:07:32 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.147 2007/09/10 14:12:29 bz Exp $");
 
 #include "opt_quota.h"
 
@@ -71,6 +71,7 @@
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
@@ -121,7 +122,7 @@
  *   3) allocate a block in the same cylinder group.
  *   4) quadradically rehash into other cylinder groups, until an
  *      available block is located.
- * If no block preference is given the following heirarchy is used
+ * If no block preference is given the following hierarchy is used
  * to allocate a block:
  *   1) allocate a block in the cylinder group that contains the
  *      inode for the file.
@@ -142,6 +143,7 @@
 	int cg, reclaimed;
 	static struct timeval lastfail;
 	static int curfail;
+	int64_t delta;
 #ifdef QUOTA
 	int error;
 #endif
@@ -171,7 +173,7 @@
 #endif
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
-	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
+	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
 	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
 		goto nospace;
 	if (bpref >= fs->fs_size)
@@ -182,11 +184,18 @@
 		cg = dtog(fs, bpref);
 	bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
 	if (bno > 0) {
-		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
+		delta = btodb(size);
+		if (ip->i_flag & IN_SPACECOUNTED) {
+			UFS_LOCK(ump);
+			fs->fs_pendingblocks += delta;
+			UFS_UNLOCK(ump);
+		}
+		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		*bnp = bno;
 		return (0);
 	}
+nospace:
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
@@ -195,7 +204,6 @@
 	(void) chkdq(ip, -btodb(size), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
-nospace:
 	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, ITOV(ip));
@@ -236,6 +244,7 @@
 	ufs2_daddr_t bno;
 	static struct timeval lastfail;
 	static int curfail;
+	int64_t delta;
 
 	*bpp = 0;
 	vp = ITOV(ip);
@@ -259,7 +268,7 @@
 #endif /* DIAGNOSTIC */
 	reclaimed = 0;
 retry:
-	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
+	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0) {
 		goto nospace;
 	}
@@ -301,7 +310,13 @@
 	if (bno) {
 		if (bp->b_blkno != fsbtodb(fs, bno))
 			panic("ffs_realloccg: bad blockno");
-		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
+		delta = btodb(nsize - osize);
+		if (ip->i_flag & IN_SPACECOUNTED) {
+			UFS_LOCK(ump);
+			fs->fs_pendingblocks += delta;
+			UFS_UNLOCK(ump);
+		}
+		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
@@ -370,7 +385,13 @@
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    bno + numfrags(fs, nsize),
 			    (long)(request - nsize), ip->i_number);
-		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
+		delta = btodb(nsize - osize);
+		if (ip->i_flag & IN_SPACECOUNTED) {
+			UFS_LOCK(ump);
+			fs->fs_pendingblocks += delta;
+			UFS_UNLOCK(ump);
+		}
+		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
@@ -879,7 +900,7 @@
  *   2) allocate an inode in the same cylinder group.
  *   3) quadradically rehash into other cylinder groups, until an
  *      available inode is located.
- * If no inode preference is given the following heirarchy is used
+ * If no inode preference is given the following hierarchy is used
  * to allocate an inode:
  *   1) allocate an inode in cylinder group 0.
  *   2) quadradically rehash into other cylinder groups, until an
@@ -1052,7 +1073,10 @@
 	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
 	if (dirsize < curdirsize)
 		dirsize = curdirsize;
-	maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
+	if (dirsize <= 0)
+		maxcontigdirs = 0;		/* dirsize overflowed */
+	else
+		maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
 	if (fs->fs_avgfpdir > 0)
 		maxcontigdirs = min(maxcontigdirs,
 				    fs->fs_ipg / fs->fs_avgfpdir);
@@ -2131,13 +2155,13 @@
 	blksfree = cg_blksfree(cgp);
 	len = howmany(fs->fs_fpg, NBBY) - start;
 	loc = scanc((u_int)len, (u_char *)&blksfree[start],
-		(u_char *)fragtbl[fs->fs_frag],
+		fragtbl[fs->fs_frag],
 		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 	if (loc == 0) {
 		len = start + 1;
 		start = 0;
 		loc = scanc((u_int)len, (u_char *)&blksfree[0],
-			(u_char *)fragtbl[fs->fs_frag],
+			fragtbl[fs->fs_frag],
 			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 		if (loc == 0) {
 			printf("start = %d, len = %d, fs = %s\n",
@@ -2430,6 +2454,11 @@
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
+		if (ip->i_flag & IN_SPACECOUNTED) {
+			UFS_LOCK(ump);
+			fs->fs_pendingblocks += cmd.size;
+			UFS_UNLOCK(ump);
+		}
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
 		ip->i_flag |= IN_CHANGE;
 		vput(vp);
--- sys/ufs/ffs/README.softupdates
+++ /dev/null
@@ -1,58 +0,0 @@
-$FreeBSD: src/sys/ufs/ffs/README.softupdates,v 1.9 2000/07/08 02:31:21 mckusick Exp $
-
-Using Soft Updates
-
-To enable the soft updates feature in your kernel, add option
-SOFTUPDATES to your kernel configuration.
-
-Once you are running a kernel with soft update support, you need to enable
-it for whichever filesystems you wish to run with the soft update policy.
-This is done with the -n option to tunefs(8) on the UNMOUNTED filesystems,
-e.g. from single-user mode you'd do something like:
-
-	tunefs -n enable /usr
-
-To permanently enable soft updates on the /usr filesystem (or at least
-until a corresponding ``tunefs -n disable'' is done).
-
-
-Soft Updates Copyright Restrictions
-
-As of June 2000 the restrictive copyright has been removed and 
-replaced with a `Berkeley-style' copyright. The files implementing
-soft updates now reside in the sys/ufs/ffs directory and are
-compiled into the generic kernel by default.
-
-
-Soft Updates Status
-
-The soft updates code has been running in production on many
-systems for the past two years generally quite successfully.
-The two current sets of shortcomings are:
-
-1) On filesystems that are chronically full, the two minute lag
-   from the time a file is deleted until its free space shows up
-   will result in premature filesystem full failures. This
-   failure mode is most evident in small filesystems such as
-   the root. For this reason, use of soft updates is not
-   recommended on the root filesystem.
-
-2) If your system routines runs parallel processes each of which
-   remove many files, the kernel memory rate limiting code may
-   not be able to slow removal operations to a level sustainable
-   by the disk subsystem. The result is that the kernel runs out
-   of memory and hangs.
-
-Both of these problems are being addressed, but have not yet
-been resolved. There are no other known problems at this time.
-
-
-How Soft Updates Work
-
-For more general information on soft updates, please see:
-	http://www.mckusick.com/softdep/
-	http://www.ece.cmu.edu/~ganger/papers/CSE-TR-254-95/
-
---
-Marshall Kirk McKusick <mckusick at mckusick.com>
-July 2000
Index: ffs_softdep.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_softdep.c -L sys/ufs/ffs/ffs_softdep.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.181.2.8 2006/04/04 18:14:30 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.211 2007/06/22 13:22:36 kib Exp $");
 
 /*
  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
@@ -81,6 +81,7 @@
 #include <vm/vm.h>
 
 #include "opt_ffs.h"
+#include "opt_quota.h"
 
 #ifndef SOFTUPDATES
 
@@ -164,7 +165,7 @@
 	struct buf *bp;
 {
 	
-	panic("softdep_setup_allocdirect called");
+	panic("softdep_setup_allocext called");
 }
 
 void
@@ -479,7 +480,7 @@
 #define TYPENAME(type)  \
 	((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???")
 /*
- * End system adaptaion definitions.
+ * End system adaptation definitions.
  */
 
 /*
@@ -728,6 +729,7 @@
 
 	for (;;) {	
 		kthread_suspend_check(softdepproc);
+		vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
 		ACQUIRE_LOCK(&lk);
 		/*
 		 * If requested, try removing inode or removal dependencies.
@@ -743,6 +745,7 @@
 			wakeup_one(&proc_waiting);
 		}
 		FREE_LOCK(&lk);
+		VFS_UNLOCK_GIANT(vfslocked);
 		remaining = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
@@ -803,7 +806,7 @@
 	if (wk->wk_state & ONWORKLIST)
 		panic("add_to_worklist: already on list");
 	wk->wk_state |= ONWORKLIST;
-	if (LIST_FIRST(&ump->softdep_workitem_pending) == NULL)
+	if (LIST_EMPTY(&ump->softdep_workitem_pending))
 		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 	else
 		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
@@ -990,7 +993,7 @@
 {
 	struct worklist *wk, *wktail;
 
-	if (LIST_FIRST(&newbp->b_dep) != NULL)
+	if (!LIST_EMPTY(&newbp->b_dep))
 		panic("softdep_move_dependencies: need merge code");
 	wktail = 0;
 	ACQUIRE_LOCK(&lk);
@@ -1058,7 +1061,7 @@
 	error = 0;
 	if (i == 10) {
 		error = EBUSY;
-		printf("softdep_waitidle: Failed to flush worklist for %p",
+		printf("softdep_waitidle: Failed to flush worklist for %p\n",
 		    mp);
 	}
 
@@ -1423,8 +1426,14 @@
 	struct buf *bp;
 	int error, cyl;
 
-	mp->mnt_flag &= ~MNT_ASYNC;
-	mp->mnt_flag |= MNT_SOFTDEP;
+	MNT_ILOCK(mp);
+	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
+	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
+		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
+			MNTK_SOFTDEP;
+		mp->mnt_noasync++;
+	}
+	MNT_IUNLOCK(mp);
 	ump = VFSTOUFS(mp);
 	LIST_INIT(&ump->softdep_workitem_pending);
 	ump->softdep_worklist_tail = NULL;
@@ -1516,7 +1525,8 @@
 	ACQUIRE_LOCK(&lk);
 	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
 	    &inodedep)))
-		panic("softdep_setup_inomapdep: found inode");
+		panic("softdep_setup_inomapdep: dependency for new inode "
+		    "already exists");
 	inodedep->id_buf = bp;
 	inodedep->id_state &= ~DEPCOMPLETE;
 	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
@@ -1769,7 +1779,7 @@
 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
-		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
+		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 			panic("allocdirect_merge: extra newdirblk");
 		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
 	}
@@ -2215,7 +2225,7 @@
 		}
 		/*
 		 * If the file was removed, then the space being freed was
-		 * accounted for then (see softdep_filereleased()). If the
+		 * accounted for then (see softdep_releasefile()). If the
 		 * file is merely being truncated, then we account for it now.
 		 */
 		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
@@ -2497,7 +2507,7 @@
 	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
-		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
+		if (!LIST_EMPTY(&adp->ad_newdirblk))
 			panic("free_allocdirect: extra newdirblk");
 		if (delay)
 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
@@ -2540,7 +2550,7 @@
 	 * If no dependencies remain, the pagedep will be freed.
 	 */
 	for (i = 0; i < DAHASHSZ; i++)
-		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
+		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 			break;
 	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
 		LIST_REMOVE(pagedep, pd_hash);
@@ -2593,6 +2603,7 @@
 	}
 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 	FREE_LOCK(&lk);
+	ip->i_flag |= IN_MODIFIED;
 }
 
 /*
@@ -2617,13 +2628,13 @@
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
-	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
-	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
-	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
+	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
+	    !LIST_EMPTY(&inodedep->id_bufwait) ||
+	    !LIST_EMPTY(&inodedep->id_inowait) ||
+	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    inodedep->id_nlinkdelta != 0)
 		return (0);
 
@@ -2660,13 +2671,13 @@
 	mtx_assert(&lk, MA_OWNED);
 	if ((inodedep->id_state & ONWORKLIST) != 0 ||
 	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
-	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
-	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
-	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
+	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
+	    !LIST_EMPTY(&inodedep->id_bufwait) ||
+	    !LIST_EMPTY(&inodedep->id_inowait) ||
+	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
 		return (0);
 	LIST_REMOVE(inodedep, id_hash);
@@ -2733,7 +2744,7 @@
 			if ((bn = freeblks->fb_iblks[level]) == 0)
 				continue;
 			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
-			    level, baselbns[level], &blocksreleased)) == 0)
+			    level, baselbns[level], &blocksreleased)) != 0)
 				allerror = error;
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
 			    fs->fs_bsize, freeblks->fb_previousinum);
@@ -2842,7 +2853,7 @@
 			panic("indir_trunc: lost indirdep");
 		WORKLIST_REMOVE(wk);
 		WORKITEM_FREE(indirdep, D_INDIRDEP);
-		if (LIST_FIRST(&bp->b_dep) != NULL)
+		if (!LIST_EMPTY(&bp->b_dep))
 			panic("indir_trunc: dangling dep");
 		ump->um_numindirdeps -= 1;
 		FREE_LOCK(&lk);
@@ -3500,9 +3511,9 @@
 	int extblocks;
 
 	if (ip->i_effnlink > 0)
-		panic("softdep_filerelease: file still referenced");
+		panic("softdep_releasefile: file still referenced");
 	/*
-	 * We may be called several times as the real reference count
+	 * We may be called several times as the on-disk link count
 	 * drops to zero. We only want to account for the space once.
 	 */
 	if (ip->i_flag & IN_SPACECOUNTED)
@@ -3616,9 +3627,12 @@
 	dirrem->dm_oldinum = dirrem->dm_dirinum;
 	if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
 	    0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
+		if (xp != NULL)
+			add_to_worklist(&dirrem->dm_list);
 		FREE_LOCK(&lk);
 		vput(vp);
-		handle_workitem_remove(dirrem, NULL);
+		if (xp == NULL)
+			handle_workitem_remove(dirrem, NULL);
 		return;
 	}
 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
@@ -3757,7 +3771,7 @@
 			 * will be writing the real pointers, so the
 			 * dependency can be freed.
 			 */
-			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
+			if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
 				struct buf *bp;
 
 				bp = indirdep->ir_savebp;
@@ -3894,7 +3908,7 @@
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = 0;
-	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
+	if (TAILQ_EMPTY(&inodedep->id_inoupdt))
 		return;
 	/*
 	 * Set the dependencies to busy.
@@ -4037,8 +4051,8 @@
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = dp->di_extsize;
-	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
-	    TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
+	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
+	    TAILQ_EMPTY(&inodedep->id_extupdt))
 		return;
 	/*
 	 * Set the ext data dependencies to busy.
@@ -4895,10 +4909,10 @@
 	 * allocdirects that are completed by the merger.
 	 */
 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
-	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
+	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
-	if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
+	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
 	/*
 	 * Now that the inode has been pushed into the buffer, the
@@ -4995,7 +5009,7 @@
 	struct buf *bp;
 	struct fs *fs;
 	struct thread *td = curthread;
-	int error, flushparent;
+	int error, flushparent, pagedep_new_block;
 	ino_t parentino;
 	ufs_lbn_t lbn;
 
@@ -5007,12 +5021,12 @@
 		FREE_LOCK(&lk);
 		return (0);
 	}
-	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
-	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
-	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
+	if (!LIST_EMPTY(&inodedep->id_inowait) ||
+	    !LIST_EMPTY(&inodedep->id_bufwait) ||
+	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
 		panic("softdep_fsync: pending ops");
 	for (error = 0, flushparent = 0; ; ) {
 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
@@ -5073,15 +5087,36 @@
 		 * then we do the slower ffs_syncvnode of the directory.
 		 */
 		if (flushparent) {
+			int locked;
+
 			if ((error = ffs_update(pvp, 1)) != 0) {
 				vput(pvp);
 				return (error);
 			}
-			if ((pagedep->pd_state & NEWBLOCK) &&
-			    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
-				vput(pvp);
-				return (error);
+			ACQUIRE_LOCK(&lk);
+			locked = 1;
+			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
+				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
+					if (wk->wk_type != D_DIRADD)
+						panic("softdep_fsync: Unexpected type %s",
+						      TYPENAME(wk->wk_type));
+					dap = WK_DIRADD(wk);
+					if (dap->da_state & DIRCHG)
+						pagedep = dap->da_previous->dm_pagedep;
+					else
+						pagedep = dap->da_pagedep;
+					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
+					FREE_LOCK(&lk);
+					locked = 0;
+					if (pagedep_new_block &&
+					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
+						vput(pvp);
+						return (error);
+					}
+				}
 			}
+			if (locked)
+				FREE_LOCK(&lk);
 		}
 		/*
 		 * Flush directory page containing the inode's name.
@@ -5268,7 +5303,7 @@
 					goto restart;
 				FREE_LOCK(&lk);
 				if ((error = bwrite(nbp)) != 0) {
-					break;
+					goto loop_end;
 				}
 				ACQUIRE_LOCK(&lk);
 				goto restart;
@@ -5299,7 +5334,7 @@
 				    flush_pagedep_deps(vp, wk->wk_mp,
 						&pagedep->pd_diraddhd[i]))) {
 					FREE_LOCK(&lk);
-					break;
+					goto loop_end;
 				}
 			}
 			continue;
@@ -5351,6 +5386,7 @@
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
+	loop_end:
 		/* We reach here only in error and unlocked */
 		if (error == 0)
 			panic("softdep_sync_metadata: zero error");
@@ -5501,6 +5537,7 @@
 	int error = 0;
 	struct buf *bp;
 	ino_t inum;
+	struct worklist *wk;
 
 	ump = VFSTOUFS(mp);
 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
@@ -5545,8 +5582,53 @@
 			}
 			VI_LOCK(vp);
 			drain_output(vp);
+			/*
+			 * If first block is still dirty with a D_MKDIR
+			 * dependency then it needs to be written now.
+			 */
+			for (;;) {
+				error = 0;
+				bp = gbincore(&vp->v_bufobj, 0);
+				if (bp == NULL)
+					break;	/* First block not present */
+				error = BUF_LOCK(bp,
+						 LK_EXCLUSIVE |
+						 LK_SLEEPFAIL |
+						 LK_INTERLOCK,
+						 VI_MTX(vp));
+				VI_LOCK(vp);
+				if (error == ENOLCK)
+					continue;	/* Slept, retry */
+				if (error != 0)
+					break;		/* Failed */
+				if ((bp->b_flags & B_DELWRI) == 0) {
+					BUF_UNLOCK(bp);
+					break;	/* Buffer not dirty */
+				}
+				for (wk = LIST_FIRST(&bp->b_dep);
+				     wk != NULL;
+				     wk = LIST_NEXT(wk, wk_list))
+					if (wk->wk_type == D_MKDIR)
+						break;
+				if (wk == NULL)
+					BUF_UNLOCK(bp);	/* Dependency gone */
+				else {
+					/*
+					 * D_MKDIR dependency remains,
+					 * must write buffer to stable
+					 * storage.
+					 */
+					VI_UNLOCK(vp);
+					bremfree(bp);
+					error = bwrite(bp);
+					VI_LOCK(vp);
+				}
+				break;
+			}
 			VI_UNLOCK(vp);
 			vput(vp);
+			if (error != 0)
+				break;	/* Flushing of first block failed */
 			ACQUIRE_LOCK(&lk);
 			/*
 			 * If that cleared dependencies, go on to next.
@@ -5819,7 +5901,7 @@
 		if (next >= pagedep_hash)
 			next = 0;
 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
-			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
+			if (LIST_EMPTY(&pagedep->pd_dirremhd))
 				continue;
 			mp = pagedep->pd_list.wk_mp;
 			ino = pagedep->pd_ino;
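
A recurring change in ffs_softdep.c (and in the files below) replaces explicit
LIST_FIRST(...) == NULL / TAILQ_FIRST(...) == NULL tests with the LIST_EMPTY()
and TAILQ_EMPTY() macros from <sys/queue.h>. The behavior is identical; the
macros simply state the intent directly. A minimal userland sketch of the idiom
(standalone illustration, not part of the patch):

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct item {
            int value;
            LIST_ENTRY(item) link;  /* list linkage, as in the softdep work lists */
    };

    LIST_HEAD(itemhead, item);

    int
    main(void)
    {
            struct itemhead head = LIST_HEAD_INITIALIZER(head);
            struct item *it;

            /* Old style: compare the first element against NULL. */
            if (LIST_FIRST(&head) == NULL)
                    printf("list empty (LIST_FIRST test)\n");

            /* New style used by this import: LIST_EMPTY() says the same thing. */
            if (LIST_EMPTY(&head))
                    printf("list empty (LIST_EMPTY test)\n");

            it = malloc(sizeof(*it));
            if (it == NULL)
                    return (1);
            it->value = 42;
            LIST_INSERT_HEAD(&head, it, link);

            if (!LIST_EMPTY(&head))
                    printf("first element: %d\n", LIST_FIRST(&head)->value);
            free(it);
            return (0);
    }
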
Index: ffs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_vfsops.c -L sys/ufs/ffs/ffs_vfsops.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vfsops.c,v 1.290.2.9 2006/03/22 17:54:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vfsops.c,v 1.329 2007/04/04 07:29:53 delphij Exp $");
 
 #include "opt_mac.h"
 #include "opt_quota.h"
@@ -40,9 +40,9 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/kernel.h>
-#include <sys/mac.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
@@ -52,7 +52,10 @@
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <ufs/ufs/extattr.h>
+#include <ufs/ufs/gjournal.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
@@ -70,7 +73,6 @@
 
 static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
 
-static int	ffs_sbupdate(struct ufsmount *, int, int);
 static int	ffs_reload(struct mount *, struct thread *);
 static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
@@ -85,7 +87,6 @@
 static vfs_mount_t ffs_mount;
 static vfs_statfs_t ffs_statfs;
 static vfs_fhtovp_t ffs_fhtovp;
-static vfs_vptofh_t ffs_vptofh;
 static vfs_sync_t ffs_sync;
 
 static struct vfsops ufs_vfsops = {
@@ -101,10 +102,10 @@
 	.vfs_uninit =		ffs_uninit,
 	.vfs_unmount =		ffs_unmount,
 	.vfs_vget =		ffs_vget,
-	.vfs_vptofh =		ffs_vptofh,
 };
 
 VFS_SET(ufs_vfsops, ufs, 0);
+MODULE_VERSION(ufs, 1);
 
 static b_strategy_t ffs_geom_strategy;
 static b_write_t ffs_bufwrite;
@@ -114,12 +115,17 @@
 	.bop_write =	ffs_bufwrite,
 	.bop_strategy =	ffs_geom_strategy,
 	.bop_sync =	bufsync,
+#ifdef NO_FFS_SNAPSHOT
+	.bop_bdflush =	bufbdflush,
+#else
+	.bop_bdflush =	ffs_bdflush,
+#endif
 };
 
 static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",
     "clusterw", "exec", "export", "force", "from", "multilabel", 
     "snapshot", "suid", "suiddir", "symfollow", "sync",
-    "update", "union", NULL };
+    "union", NULL };
 
 static int
 ffs_mount(struct mount *mp, struct thread *td)
@@ -128,9 +134,9 @@
 	struct ufsmount *ump = 0;
 	struct fs *fs;
 	int error, flags;
+	u_int mntorflags, mntandnotflags;
 	mode_t accessmode;
 	struct nameidata ndp;
-	struct export_args export;
 	char *fspec;
 
 	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
@@ -151,6 +157,38 @@
 	if (error)
 		return (error);
 
+	mntorflags = 0;
+	mntandnotflags = 0;
+	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
+		mntorflags |= MNT_ACLS;
+
+	if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0)
+		mntorflags |= MNT_ASYNC;
+
+	if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0)
+		mntorflags |= MNT_FORCE;
+
+	if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0)
+		mntorflags |= MNT_MULTILABEL;
+
+	if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0)
+		mntandnotflags |= MNT_ASYNC;
+
+	if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0)
+		mntorflags |= MNT_NOATIME;
+
+	if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0)
+		mntorflags |= MNT_NOCLUSTERR;
+
+	if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0)
+		mntorflags |= MNT_NOCLUSTERW;
+
+	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0)
+		mntorflags |= MNT_SNAPSHOT;
+
+	MNT_ILOCK(mp);
+	mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags;
+	MNT_IUNLOCK(mp);
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
@@ -210,7 +248,9 @@
 			g_topology_unlock();
 			PICKUP_GIANT();
 			fs->fs_ronly = 1;
+			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
+			MNT_IUNLOCK(mp);
 		}
 		if ((mp->mnt_flag & MNT_RELOAD) &&
 		    (error = ffs_reload(mp, td)) != 0)
@@ -221,15 +261,16 @@
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
-			if (suser(td)) {
-				vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
-				if ((error = VOP_ACCESS(devvp, VREAD | VWRITE,
-				    td->td_ucred, td)) != 0) {
-					VOP_UNLOCK(devvp, 0, td);
-					return (error);
-				}
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
+			error = VOP_ACCESS(devvp, VREAD | VWRITE,
+			    td->td_ucred, td);
+			if (error)
+				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+			if (error) {
 				VOP_UNLOCK(devvp, 0, td);
+				return (error);
 			}
+			VOP_UNLOCK(devvp, 0, td);
 			fs->fs_flags &= ~FS_UNCLEAN;
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
@@ -262,7 +303,9 @@
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
 			fs->fs_ronly = 0;
+			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
+			MNT_IUNLOCK(mp);
 			fs->fs_clean = 0;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
 				vn_finished_write(mp);
@@ -285,19 +328,22 @@
 		 * Softdep_mount() clears it in an initial mount 
 		 * or ro->rw remount.
 		 */
-		if (mp->mnt_flag & MNT_SOFTDEP)
+		if (mp->mnt_flag & MNT_SOFTDEP) {
+			/* XXX: Reset too late ? */
+			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_ASYNC;
+			MNT_IUNLOCK(mp);
+		}
 		/*
 		 * Keep MNT_ACLS flag if it is stored in superblock.
 		 */
-		if ((fs->fs_flags & FS_ACLS) != 0)
+		if ((fs->fs_flags & FS_ACLS) != 0) {
+			/* XXX: Set too late ? */
+			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_ACLS;
-		/*
-		 * If not updating name, process export requests.
-		 */
-		error = vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof export);
-		if (error == 0 && export.ex_flags != 0)
-			return (vfs_export(mp, &export));
+			MNT_IUNLOCK(mp);
+		}
+
 		/*
 		 * If this is a snapshot request, take the snapshot.
 		 */
@@ -323,14 +369,15 @@
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
-	if (suser(td)) {
-		accessmode = VREAD;
-		if ((mp->mnt_flag & MNT_RDONLY) == 0)
-			accessmode |= VWRITE;
-		if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){
-			vput(devvp);
-			return (error);
-		}
+	accessmode = VREAD;
+	if ((mp->mnt_flag & MNT_RDONLY) == 0)
+		accessmode |= VWRITE;
+	error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td);
+	if (error)
+		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+	if (error) {
+		vput(devvp);
+		return (error);
 	}
 
 	if (mp->mnt_flag & MNT_UPDATE) {
@@ -558,6 +605,7 @@
 	int32_t *lp;
 	struct ucred *cred;
 	struct g_consumer *cp;
+	struct mount *nmp;
 
 	dev = devvp->v_rdev;
 	cred = td ? td->td_ucred : NOCRED;
@@ -594,7 +642,14 @@
 	 * Try reading the superblock in each of its possible locations.
 	 */
 	for (i = 0; sblock_try[i] != -1; i++) {
-		if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE,
+		if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
+			error = EINVAL;
+			vfs_mount_error(mp,
+			    "Invalid sectorsize %d for superblock size %d",
+			    cp->provider->sectorsize, SBLOCKSIZE);
+			goto out;
+		}
+		if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE,
 		    cred, &bp)) != 0)
 			goto out;
 		fs = (struct fs *)bp->b_data;
@@ -647,6 +702,35 @@
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
 	}
+	if ((fs->fs_flags & FS_GJOURNAL) != 0) {
+#ifdef UFS_GJOURNAL
+		/*
+		 * Get journal provider name.
+		 */
+		size = 1024;
+		mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK);
+		if (g_io_getattr("GJOURNAL::provider", cp, &size,
+		    mp->mnt_gjprovider) == 0) {
+			mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size,
+			    M_UFSMNT, M_WAITOK);
+			MNT_ILOCK(mp);
+			mp->mnt_flag |= MNT_GJOURNAL;
+			MNT_IUNLOCK(mp);
+		} else {
+			printf(
+"WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n",
+			    mp->mnt_stat.f_mntonname);
+			free(mp->mnt_gjprovider, M_UFSMNT);
+			mp->mnt_gjprovider = NULL;
+		}
+#else
+		printf(
+"WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n",
+		    mp->mnt_stat.f_mntonname);
+#endif
+	} else {
+		mp->mnt_gjprovider = NULL;
+	}
 	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
 	ump->um_cp = cp;
 	ump->um_bo = &devvp->v_bufobj;
@@ -707,27 +791,39 @@
 	mp->mnt_data = (qaddr_t)ump;
 	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
 	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
+	nmp = NULL;
 	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || 
-	    vfs_getvfs(&mp->mnt_stat.f_fsid)) 
+	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
+		if (nmp)
+			vfs_rel(nmp);
 		vfs_getnewfsid(mp);
+	}
 	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
+	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
-	if ((fs->fs_flags & FS_MULTILABEL) != 0)
+	MNT_IUNLOCK(mp);
+	if ((fs->fs_flags & FS_MULTILABEL) != 0) {
 #ifdef MAC
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_MULTILABEL;
+		MNT_IUNLOCK(mp);
 #else
 		printf(
 "WARNING: %s: multilabel flag on fs but no MAC support\n",
-		    fs->fs_fsmnt);
+		    mp->mnt_stat.f_mntonname);
 #endif
-	if ((fs->fs_flags & FS_ACLS) != 0)
+	}
+	if ((fs->fs_flags & FS_ACLS) != 0) {
 #ifdef UFS_ACL
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_ACLS;
+		MNT_IUNLOCK(mp);
 #else
 		printf(
 "WARNING: %s: ACLs flag on fs but no ACLs support\n",
-		    fs->fs_fsmnt);
+		    mp->mnt_stat.f_mntonname);
 #endif
+	}
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
 	ump->um_devvp = devvp;
@@ -784,9 +880,9 @@
 	(void) ufs_extattr_autostart(mp, td);
 #endif /* !UFS_EXTATTR_AUTOSTART */
 #endif /* !UFS_EXTATTR */
-#ifndef QUOTA
+	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_MPSAFE;
-#endif
+	MNT_IUNLOCK(mp);
 	return (0);
 out:
 	if (bp)
@@ -800,6 +896,10 @@
 	}
 	if (ump) {
 		mtx_destroy(UFS_MTX(ump));
+		if (mp->mnt_gjprovider != NULL) {
+			free(mp->mnt_gjprovider, M_UFSMNT);
+			mp->mnt_gjprovider = NULL;
+		}
 		free(ump->um_fs, M_UFSMNT);
 		free(ump, M_UFSMNT);
 		mp->mnt_data = (qaddr_t)0;
@@ -850,13 +950,13 @@
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC &&
 	    fs->fs_old_inodefmt < FS_44INODEFMT) {
-		fs->fs_maxfilesize = (u_quad_t) 1LL << 39;
+		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
 		fs->fs_qbmask = ~fs->fs_bmask;
 		fs->fs_qfmask = ~fs->fs_fmask;
 	}
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
-		maxfilesize = (u_int64_t)0x40000000 * fs->fs_bsize - 1;
+		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
 		if (fs->fs_maxfilesize > maxfilesize)
 			fs->fs_maxfilesize = maxfilesize;
 	}
@@ -959,11 +1059,17 @@
 	PICKUP_GIANT();
 	vrele(ump->um_devvp);
 	mtx_destroy(UFS_MTX(ump));
+	if (mp->mnt_gjprovider != NULL) {
+		free(mp->mnt_gjprovider, M_UFSMNT);
+		mp->mnt_gjprovider = NULL;
+	}
 	free(fs->fs_csp, M_UFSMNT);
 	free(fs, M_UFSMNT);
 	free(ump, M_UFSMNT);
 	mp->mnt_data = (qaddr_t)0;
+	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
+	MNT_IUNLOCK(mp);
 	return (error);
 }
 
@@ -987,8 +1093,6 @@
 		if (error)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
-			if (ump->um_quotas[i] == NULLVP)
-				continue;
 			quotaoff(td, mp, i);
 		}
 		/*
@@ -1211,6 +1315,7 @@
 	struct vnode *vp;
 	struct cdev *dev;
 	int error;
+	struct thread *td;
 
 	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
@@ -1275,7 +1380,15 @@
 	}
 #endif
 
-	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
+	td = curthread;
+	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td);
+	error = insmntque(vp, mp);
+	if (error != 0) {
+		uma_zfree(uma_inode, ip);
+		*vpp = NULL;
+		return (error);
+	}
+	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
 	if (error || *vpp != NULL)
 		return (error);
 
@@ -1393,26 +1506,6 @@
 }
 
 /*
- * Vnode pointer to File handle
- */
-/* ARGSUSED */
-static int
-ffs_vptofh(vp, fhp)
-	struct vnode *vp;
-	struct fid *fhp;
-{
-	struct inode *ip;
-	struct ufid *ufhp;
-
-	ip = VTOI(vp);
-	ufhp = (struct ufid *)fhp;
-	ufhp->ufid_len = sizeof(struct ufid);
-	ufhp->ufid_ino = ip->i_number;
-	ufhp->ufid_gen = ip->i_gen;
-	return (0);
-}
-
-/*
  * Initialize the filesystem.
  */
 static int
@@ -1441,7 +1534,7 @@
 /*
  * Write a superblock and associated information back to disk.
  */
-static int
+int
 ffs_sbupdate(mp, waitfor, suspended)
 	struct ufsmount *mp;
 	int waitfor;
@@ -1569,10 +1662,10 @@
 	/*
 	 * Process dependencies then return any unfinished ones.
 	 */
-	if (LIST_FIRST(&bp->b_dep) != NULL)
+	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 #ifdef SOFTUPDATES
-	if (LIST_FIRST(&bp->b_dep) != NULL)
+	if (!LIST_EMPTY(&bp->b_dep))
 		softdep_move_dependencies(bp, origbp);
 #endif
 	/*
@@ -1690,7 +1783,7 @@
 
 #ifdef SOFTUPDATES
 		/* move over the dependencies */
-		if (LIST_FIRST(&bp->b_dep) != NULL)
+		if (!LIST_EMPTY(&bp->b_dep))
 			softdep_move_dependencies(bp, newbp);
 #endif 
 
@@ -1728,6 +1821,7 @@
 		if ((vp->v_vflag & VV_COPYONWRITE) &&
 		    vp->v_rdev->si_snapdata != NULL) {
 			if ((bp->b_flags & B_CLUSTER) != 0) {
+				runningbufwakeup(bp);
 				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 					      b_cluster.cluster_entry) {
 					error = ffs_copyonwrite(vp, tbp);
@@ -1739,6 +1833,9 @@
 						return;
 					}
 				}
+				bp->b_runningbufspace = bp->b_bufsize;
+				atomic_add_int(&runningbufspace,
+					       bp->b_runningbufspace);
 			} else {
 				error = ffs_copyonwrite(vp, bp);
 				if (error != 0 && error != EOPNOTSUPP) {
@@ -1753,11 +1850,11 @@
 		if ((bp->b_flags & B_CLUSTER) != 0) {
 			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 				      b_cluster.cluster_entry) {
-				if (LIST_FIRST(&tbp->b_dep) != NULL)
+				if (!LIST_EMPTY(&tbp->b_dep))
 					buf_start(tbp);
 			}
 		} else {
-			if (LIST_FIRST(&bp->b_dep) != NULL)
+			if (!LIST_EMPTY(&bp->b_dep))
 				buf_start(bp);
 		}
 
Index: fs.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/fs.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/fs.h -L sys/ufs/ffs/fs.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/fs.h
+++ sys/ufs/ffs/fs.h
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)fs.h	8.13 (Berkeley) 3/21/95
- * $FreeBSD: src/sys/ufs/ffs/fs.h,v 1.48 2005/02/20 08:02:15 delphij Exp $
+ * $FreeBSD: src/sys/ufs/ffs/fs.h,v 1.49 2006/10/31 21:48:53 pjd Exp $
  */
 
 #ifndef _UFS_FFS_FS_H_
@@ -323,7 +323,8 @@
 	u_int	*fs_active;		/* (u) used by snapshots to track fs */
 	int32_t	 fs_old_cpc;		/* cyl per cycle in postbl */
 	int32_t	 fs_maxbsize;		/* maximum blocking factor permitted */
-	int64_t	 fs_sparecon64[17];	/* old rotation block list head */
+	int64_t	 fs_unrefs;		/* number of unreferenced inodes */
+	int64_t	 fs_sparecon64[16];	/* old rotation block list head */
 	int64_t	 fs_sblockloc;		/* byte offset of standard superblock */
 	struct	csum_total fs_cstotal;	/* (u) cylinder summary information */
 	ufs_time_t fs_time;		/* last time written */
@@ -406,6 +407,7 @@
 #define FS_INDEXDIRS  0x08	/* kernel supports indexed directories */
 #define FS_ACLS       0x10	/* file system has ACLs enabled */
 #define FS_MULTILABEL 0x20	/* file system is MAC multi-label */
+#define FS_GJOURNAL   0x40	/* gjournaled file system */
 #define FS_FLAGS_UPDATED 0x80	/* flags have been moved to new location */
 
 /*
@@ -475,7 +477,8 @@
 	int32_t	 cg_nclusterblks;	/* number of clusters this cg */
 	int32_t  cg_niblk;		/* number of inode blocks this cg */
 	int32_t	 cg_initediblk;		/* last initialized inode */
-	int32_t	 cg_sparecon32[3];	/* reserved for future use */
+	int32_t	 cg_unrefs;		/* number of unreferenced inodes */
+	int32_t	 cg_sparecon32[2];	/* reserved for future use */
 	ufs_time_t cg_time;		/* time last written */
 	int64_t	 cg_sparecon64[3];	/* reserved for future use */
 	u_int8_t cg_space[1];		/* space for cylinder group maps */
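
The fs.h hunks add the fs_unrefs and cg_unrefs counters for gjournal by
shrinking the adjacent spare arrays (fs_sparecon64[17] becomes fs_unrefs plus
fs_sparecon64[16], cg_sparecon32[3] becomes cg_unrefs plus cg_sparecon32[2]),
so the on-disk superblock and cylinder-group layouts keep their sizes. A tiny
compile-time sketch of why carving a field out of a spare area is safe; the
structs here are hypothetical, not the real superblock:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Old layout: 17 spare 64-bit slots. */
    struct layout_old {
            int64_t sparecon64[17];
    };

    /* New layout: one slot repurposed as a named field, 16 spares remain. */
    struct layout_new {
            int64_t unrefs;
            int64_t sparecon64[16];
    };

    /* Repurposing a spare slot must not change the on-disk size. */
    static_assert(sizeof(struct layout_old) == sizeof(struct layout_new),
        "on-disk layout size changed");

    int
    main(void)
    {
            printf("both layouts are %zu bytes\n", sizeof(struct layout_new));
            return (0);
    }
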
Index: ffs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ffs/ffs_vnops.c -L sys/ufs/ffs/ffs_vnops.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ffs/ffs_vnops.c
+++ sys/ufs/ffs/ffs_vnops.c
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vnops.c,v 1.157.2.1 2005/10/29 06:43:55 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vnops.c,v 1.173 2007/07/13 18:51:08 rodrigc Exp $");
 
 #include <sys/param.h>
 #include <sys/bio.h>
@@ -74,6 +74,7 @@
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
@@ -103,7 +104,7 @@
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
 static vop_fsync_t	ffs_fsync;
-static vop_lock_t	ffs_lock;
+static vop_lock1_t	ffs_lock;
 static vop_getpages_t	ffs_getpages;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
@@ -117,6 +118,7 @@
 static vop_listextattr_t	ffs_listextattr;
 static vop_openextattr_t	ffs_openextattr;
 static vop_setextattr_t	ffs_setextattr;
+static vop_vptofh_t	ffs_vptofh;
 
 
 /* Global vfs data structures for ufs. */
@@ -124,16 +126,18 @@
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_getpages =		ffs_getpages,
-	.vop_lock =		ffs_lock,
+	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
+	.vop_vptofh =		ffs_vptofh,
 };
 
 struct vop_vector ffs_fifoops1 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
 	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
+	.vop_vptofh =		ffs_vptofh,
 };
 
 /* Global vfs data structures for ufs. */
@@ -141,7 +145,7 @@
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_getpages =		ffs_getpages,
-	.vop_lock =		ffs_lock,
+	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
@@ -151,12 +155,13 @@
 	.vop_listextattr =	ffs_listextattr,
 	.vop_openextattr =	ffs_openextattr,
 	.vop_setextattr =	ffs_setextattr,
+	.vop_vptofh =		ffs_vptofh,
 };
 
 struct vop_vector ffs_fifoops2 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
-	.vop_lock =		ffs_lock,
+	.vop_lock1 =		ffs_lock,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_strategy =		ffsext_strategy,
 	.vop_closeextattr =	ffs_closeextattr,
@@ -165,6 +170,7 @@
 	.vop_listextattr =	ffs_listextattr,
 	.vop_openextattr =	ffs_openextattr,
 	.vop_setextattr =	ffs_setextattr,
+	.vop_vptofh =		ffs_vptofh,
 };
 
 /*
@@ -226,7 +232,7 @@
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 			continue;
 		VI_UNLOCK(vp);
-		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
+		if (!wait && !LIST_EMPTY(&bp->b_dep) &&
 		    (bp->b_flags & B_DEFERRED) == 0 &&
 		    buf_countdeps(bp, 0)) {
 			bp->b_flags |= B_DEFERRED;
@@ -332,13 +338,62 @@
 
 static int
 ffs_lock(ap)
-	struct vop_lock_args /* {
+	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		struct thread *a_td;
+		char *file;
+		int line;
 	} */ *ap;
 {
-	return (VOP_LOCK_APV(&ufs_vnodeops, ap));
+#ifndef NO_FFS_SNAPSHOT
+	struct vnode *vp;
+	int flags;
+	struct lock *lkp;
+	int result;
+	
+	switch (ap->a_flags & LK_TYPE_MASK) {
+	case LK_SHARED:
+	case LK_UPGRADE:
+	case LK_EXCLUSIVE:
+		vp = ap->a_vp;
+		flags = ap->a_flags;
+		for (;;) {
+			/*
+			 * vnode interlock must be held to ensure that
+			 * the possibly external lock isn't freed,
+			 * e.g. when mutating from snapshot file vnode
+			 * to regular file vnode.
+			 */
+			if ((flags & LK_INTERLOCK) == 0) {
+				VI_LOCK(vp);
+				flags |= LK_INTERLOCK;
+			}
+			lkp = vp->v_vnlock;
+			result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
+			if (lkp == vp->v_vnlock || result != 0)
+				break;
+			/*
+			 * Apparent success, except that the vnode
+			 * mutated between snapshot file vnode and
+			 * regular file vnode while this process
+			 * slept.  The lock currently held is not the
+			 * right lock.  Release it, and try to get the
+			 * new lock.
+			 */
+			(void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line);
+			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
+				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
+			flags &= ~LK_INTERLOCK;
+		}
+		break;
+	default:
+		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
+	}
+	return (result);
+#else
+	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
+#endif
 }
 
 /*
@@ -510,7 +565,7 @@
 			break;
 
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			/*
 			 * If there are no dependencies, and it's VMIO,
 			 * then we don't need the buf, mark it available
@@ -537,7 +592,7 @@
 	 */
 	if (bp != NULL) {
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else {
@@ -546,8 +601,11 @@
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
-	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+		VI_LOCK(vp);
 		ip->i_flag |= IN_ACCESS;
+		VI_UNLOCK(vp);
+	}
 	return (error);
 }
 
@@ -689,7 +747,7 @@
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			bp->b_flags |= B_RELBUF;
 		}
 
@@ -730,10 +788,12 @@
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
-	if (resid > uio->uio_resid && ap->a_cred && 
-	    suser_cred(ap->a_cred, SUSER_ALLOWJAIL)) {
-		ip->i_mode &= ~(ISUID | ISGID);
-		DIP_SET(ip, i_mode, ip->i_mode);
+	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
+	    ap->a_cred) {
+		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
+			ip->i_mode &= ~(ISUID | ISGID);
+			DIP_SET(ip, i_mode, ip->i_mode);
+		}
 	}
 	if (error) {
 		if (ioflag & IO_UNIT) {
@@ -906,7 +966,7 @@
 			break;
 
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			/*
 			 * If there are no dependencies, and it's VMIO,
 			 * then we don't need the buf, mark it available
@@ -933,7 +993,7 @@
 	 */
 	if (bp != NULL) {
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			bp->b_flags |= B_RELBUF;
 			brelse(bp);
 		} else {
@@ -942,8 +1002,11 @@
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
-	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+		VI_LOCK(vp);
 		ip->i_flag |= IN_ACCESS;
+		VI_UNLOCK(vp);
+	}
 	return (error);
 }
 
@@ -965,6 +1028,9 @@
 	fs = ip->i_fs;
 	dp = ip->i_din2;
 
+	KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead",
+	    ip->i_number));
+
 #ifdef DIAGNOSTIC
 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
 		panic("ffs_extwrite: mode");
@@ -1024,7 +1090,7 @@
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_FIRST(&bp->b_dep) == NULL)) {
+		   (LIST_EMPTY(&bp->b_dep))) {
 			bp->b_flags |= B_RELBUF;
 		}
 
@@ -1053,10 +1119,11 @@
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
-	if (resid > uio->uio_resid && ucred && 
-	    suser_cred(ucred, SUSER_ALLOWJAIL)) {
-		ip->i_mode &= ~(ISUID | ISGID);
-		dp->di_mode = ip->i_mode;
+	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
+		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
+			ip->i_mode &= ~(ISUID | ISGID);
+			dp->di_mode = ip->i_mode;
+		}
 	}
 	if (error) {
 		if (ioflag & IO_UNIT) {
@@ -1125,14 +1192,18 @@
 {
 	struct inode *ip;
 	struct ufs2_dinode *dp;
+	struct fs *fs;
 	struct uio luio;
 	struct iovec liovec;
 	int easize, error;
 	u_char *eae;
 
 	ip = VTOI(vp);
+	fs = ip->i_fs;
 	dp = ip->i_din2;
 	easize = dp->di_extsize;
+	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
+		return (EFBIG);
 
 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
 
@@ -1296,6 +1367,9 @@
 	if (ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
+	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
+		return (EROFS);
+
 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
 }
 
@@ -1330,6 +1404,9 @@
 	if (strlen(ap->a_name) == 0)
 		return (EINVAL);
 
+	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
+		return (EROFS);
+
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, IWRITE);
 	if (error) {
@@ -1551,6 +1628,9 @@
 	if (ap->a_uio == NULL)
 		return (EOPNOTSUPP);
 
+	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
+		return (EROFS);
+
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, IWRITE);
 	if (error) {
@@ -1633,3 +1713,26 @@
 		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
 	return(error);
 }
+
+/*
+ * Vnode pointer to File handle
+ */
+static int
+ffs_vptofh(struct vop_vptofh_args *ap)
+/*
+vop_vptofh {
+	IN struct vnode *a_vp;
+	IN struct fid *a_fhp;
+};
+*/
+{
+	struct inode *ip;
+	struct ufid *ufhp;
+
+	ip = VTOI(ap->a_vp);
+	ufhp = (struct ufid *)ap->a_fhp;
+	ufhp->ufid_len = sizeof(struct ufid);
+	ufhp->ufid_ino = ip->i_number;
+	ufhp->ufid_gen = ip->i_gen;
+	return (0);
+}
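
The new ffs_lock() above deals with vnodes whose lock pointer (v_vnlock) can be
redirected while the caller sleeps, e.g. when a file mutates between snapshot
and regular vnode: it acquires whatever lock the pointer names, re-checks the
pointer afterwards, and retries if it changed. A minimal userland sketch of
that lock-then-verify-then-retry pattern using pthreads; the struct and names
are illustrative, and the vnode interlock that the kernel holds to keep the old
lock from being freed is deliberately omitted here:

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
            pthread_mutex_t private_lock;   /* always exists */
            pthread_mutex_t *current_lock;  /* may be redirected elsewhere */
    };

    /*
     * Acquire whatever lock the object currently points at.  If the pointer
     * changed while we slept on the old lock, drop it and try again, in the
     * spirit of the ffs_lock() loop above.
     */
    static pthread_mutex_t *
    obj_lock(struct obj *o)
    {
            pthread_mutex_t *lkp;

            for (;;) {
                    lkp = o->current_lock;
                    pthread_mutex_lock(lkp);
                    if (lkp == o->current_lock)
                            return (lkp);           /* still the right lock */
                    pthread_mutex_unlock(lkp);      /* stale: release and retry */
            }
    }

    int
    main(void)
    {
            struct obj o = { PTHREAD_MUTEX_INITIALIZER, NULL };
            pthread_mutex_t *held;

            o.current_lock = &o.private_lock;
            held = obj_lock(&o);
            printf("acquired %p\n", (void *)held);
            pthread_mutex_unlock(held);
            return (0);
    }
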
Index: ffs_snapshot.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_snapshot.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_snapshot.c -L sys/ufs/ffs/ffs_snapshot.c -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_snapshot.c
+++ sys/ufs/ffs/ffs_snapshot.c
@@ -34,7 +34,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_snapshot.c,v 1.103.2.5 2006/03/22 17:42:31 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_snapshot.c,v 1.136 2007/06/05 00:00:56 jeff Exp $");
+
+#include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -66,6 +68,62 @@
 #define KERNCRED thread0.td_ucred
 #define DEBUG 1
 
+#include "opt_ffs.h"
+
+#ifdef NO_FFS_SNAPSHOT
+int
+ffs_snapshot(mp, snapfile)
+	struct mount *mp;
+	char *snapfile;
+{
+	return (EINVAL);
+}
+
+int
+ffs_snapblkfree(fs, devvp, bno, size, inum)
+	struct fs *fs;
+	struct vnode *devvp;
+	ufs2_daddr_t bno;
+	long size;
+	ino_t inum;
+{
+	return (EINVAL);
+}
+
+void
+ffs_snapremove(vp)
+	struct vnode *vp;
+{
+}
+
+void
+ffs_snapshot_mount(mp)
+	struct mount *mp;
+{
+}
+
+void
+ffs_snapshot_unmount(mp)
+	struct mount *mp;
+{
+}
+
+void
+ffs_snapgone(ip)
+	struct inode *ip;
+{
+}
+
+int
+ffs_copyonwrite(devvp, bp)
+	struct vnode *devvp;
+	struct buf *bp;
+{
+	return (EINVAL);
+}
+
+#else
+
 TAILQ_HEAD(snaphead, inode);
 
 struct snapdata {
@@ -104,6 +162,8 @@
     struct fs *, ufs_lbn_t, int);
 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 static void process_deferred_inactive(struct mount *);
+static void try_free_snapdata(struct vnode *devvp, struct thread *td);
+static int ffs_bp_snapblk(struct vnode *, struct buf *);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
@@ -135,7 +195,7 @@
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
-	int flag = mp->mnt_flag;
+	int flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0, snaplistsize = 0;
@@ -156,10 +216,10 @@
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
-	/*
-	 * XXX: make sure we don't go to out1 before we setup sn
-	 */
-	sn = (void *)0xdeadbeef;
+	sn = NULL;
+	MNT_ILOCK(mp);
+	flag = mp->mnt_flag;
+	MNT_IUNLOCK(mp);
 
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
@@ -203,6 +263,7 @@
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
+	vfs_rel(wrtmp);
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
@@ -221,6 +282,7 @@
 		return (error);
 	}
 	vp = nd.ni_vp;
+	vp->v_vflag |= VV_SYSTEM;
 	ip = VTOI(vp);
 	devvp = ip->i_devvp;
 	/*
@@ -235,9 +297,10 @@
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP_SET(ip, i_size, ip->i_size);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
-	if ((error = readblock(vp, bp, numblks - 1)) != 0)
-		goto out;
+	error = readblock(vp, bp, numblks - 1);
 	bawrite(bp);
+	if (error != 0)
+		goto out;
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
@@ -326,12 +389,15 @@
 	 * Recind nice scheduling while running with the filesystem suspended.
 	 */
 	if (td->td_proc->p_nice > 0) {
-		PROC_LOCK(td->td_proc);
-		mtx_lock_spin(&sched_lock);
-		saved_nice = td->td_proc->p_nice;
-		sched_nice(td->td_proc, 0);
-		mtx_unlock_spin(&sched_lock);
-		PROC_UNLOCK(td->td_proc);
+		struct proc *p;
+
+		p = td->td_proc;
+		PROC_LOCK(p);
+		PROC_SLOCK(p);
+		saved_nice = p->p_nice;
+		sched_nice(p, 0);
+		PROC_SUNLOCK(p);
+		PROC_UNLOCK(p);
 	}
 	/*
 	 * Suspend operation on filesystem.
@@ -348,8 +414,23 @@
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (ip->i_effnlink == 0) {
+		error = ENOENT;		/* Snapshot file unlinked */
+		goto out1;
+	}
 	if (collectsnapstats)
 		nanotime(&starttime);
+
+	/* The last block might have changed.  Copy it again to be sure. */
+	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
+	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
+	if (error != 0)
+		goto out1;
+	error = readblock(vp, bp, numblks - 1);
+	bp->b_flags |= B_VALIDSUSPWRT;
+	bawrite(bp);
+	if (error != 0)
+		goto out1;
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
@@ -551,7 +632,6 @@
 	}
 	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 	    VI_MTX(vp), td);
-	transferlockers(&vp->v_lock, vp->v_vnlock);
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
@@ -595,9 +675,10 @@
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
-	vp->v_vflag |= VV_SYSTEM;
 out1:
-	KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@"));
+	KASSERT((sn != NULL && sbp != NULL && error == 0) ||
+		(sn == NULL && sbp == NULL && error != 0),
+		("email phk@ and mckusick@"));
 	/*
 	 * Resume operation on filesystem.
 	 */
@@ -625,6 +706,13 @@
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP);
+		if (error == 0 && xp->i_effnlink == 0) {
+			error = ffs_freefile(ump,
+					     copy_fs,
+					     vp,
+					     xp->i_number,
+					     xp->i_mode);
+		}
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
@@ -707,21 +795,30 @@
 	 * the inode for this snapshot then a deadlock can occur. Drop
 	 * the snapshot lock until the buffer has been written.
 	 */
+	VREF(vp);	/* Protect against ffs_snapgone() */
 	VOP_UNLOCK(vp, 0, td);
 	(void) bread(ip->i_devvp,
 		     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		     (int) fs->fs_bsize, NOCRED, &nbp);
 	brelse(nbp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (ip->i_effnlink == 0)
+		error = ENOENT;		/* Snapshot file unlinked */
+	else
+		vrele(vp);		/* Drop extra reference */
 done:
 	FREE(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
 out:
+	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (saved_nice > 0) {
-		PROC_LOCK(td->td_proc);
-		mtx_lock_spin(&sched_lock);
+		struct proc *p;
+
+		p = td->td_proc;
+		PROC_LOCK(p);
+		PROC_SLOCK(p);
 		sched_nice(td->td_proc, saved_nice);
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(td->td_proc);
 	}
 	UFS_LOCK(ump);
@@ -730,7 +827,9 @@
 		fs->fs_active = 0;
 	}
 	UFS_UNLOCK(ump);
-	mp->mnt_flag = flag;
+	MNT_ILOCK(mp);
+	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
+	MNT_IUNLOCK(mp);
 	if (error)
 		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 	(void) ffs_syncvnode(vp, MNT_WAIT);
@@ -908,11 +1007,11 @@
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
-	 * to be completely unallocated.
+	 * or unlinked snapshots to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
-	if (expungetype == BLK_NOCOPY)
+	if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
@@ -1469,18 +1568,16 @@
 {
 	struct inode *ip;
 	struct vnode *devvp;
-	struct lock *lkp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
-	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
+	ufs2_daddr_t numblks, blkno, dblk;
 	int error, loc, last;
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	devvp = ip->i_devvp;
-	sn = devvp->v_rdev->si_snapdata;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
@@ -1488,29 +1585,23 @@
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
+	VI_LOCK(devvp);
 	if (ip->i_nextsnap.tqe_prev != 0) {
-		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
-		VI_LOCK(devvp);
+		sn = devvp->v_rdev->si_snapdata;
 		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
-		lkp = vp->v_vnlock;
+		VI_UNLOCK(devvp);
+		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
+		VI_LOCK(vp);
+		KASSERT(vp->v_vnlock == &sn->sn_lock,
+			("ffs_snapremove: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
-		lockmgr(lkp, LK_RELEASE, NULL, td);
-		if (TAILQ_FIRST(&sn->sn_head) != 0) {
-			VI_UNLOCK(devvp);
-		} else {
-			snapblklist = sn->sn_blklist;
-			sn->sn_blklist = 0;
-			sn->sn_listsize = 0;
-			devvp->v_rdev->si_snapdata = NULL;
-			devvp->v_vflag &= ~VV_COPYONWRITE;
-			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
-			lockmgr(lkp, LK_RELEASE, NULL, td);
-			lockdestroy(lkp);
-			free(sn, M_UFSMNT);
-			FREE(snapblklist, M_UFSMNT);
-		}
-	}
+		VI_UNLOCK(vp);
+		VI_LOCK(devvp);
+		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+		try_free_snapdata(devvp, td);
+	} else
+		VI_UNLOCK(devvp);
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
@@ -1575,6 +1666,13 @@
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+#ifdef QUOTA
+	/*
+	 * Reenable disk quotas for ex-snapshot file.
+	 */
+	if (!getinoquota(ip))
+		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
+#endif
 }
 
 /*
@@ -1792,6 +1890,7 @@
 	struct thread *td = curthread;
 	struct snapdata *sn;
 	struct vnode *vp;
+	struct vnode *lastvp;
 	struct inode *ip;
 	struct uio auio;
 	struct iovec aiov;
@@ -1809,6 +1908,7 @@
 	 * Process each snapshot listed in the superblock.
 	 */
 	vp = NULL;
+	lastvp = NULL;
 	sn = devvp->v_rdev->si_snapdata;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		if (fs->fs_snapinum[snaploc] == 0)
@@ -1866,7 +1966,6 @@
 		}
 		lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 		    VI_MTX(vp), td);
-		transferlockers(&vp->v_lock, vp->v_vnlock);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Link it onto the active snapshot list.
@@ -1880,7 +1979,9 @@
 		vp->v_vflag |= VV_SYSTEM;
 		VI_UNLOCK(devvp);
 		VOP_UNLOCK(vp, 0, td);
+		lastvp = vp;
 	}
+	vp = lastvp;
 	/*
 	 * No usable snapshots found.
 	 */
@@ -1939,31 +2040,149 @@
 	struct snapdata *sn;
 	struct inode *xp;
 	struct vnode *vp;
+	struct thread *td = curthread;
 
-	sn = devvp->v_rdev->si_snapdata;
 	VI_LOCK(devvp);
-	while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
+	sn = devvp->v_rdev->si_snapdata;
+	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 		vp = ITOV(xp);
-		vp->v_vnlock = &vp->v_lock;
 		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 		xp->i_nextsnap.tqe_prev = 0;
-		if (xp->i_effnlink > 0) {
-			VI_UNLOCK(devvp);
+		lockmgr(&sn->sn_lock, 
+			LK_INTERLOCK | LK_EXCLUSIVE,
+			VI_MTX(devvp),
+			td);
+		VI_LOCK(vp);
+		lockmgr(&vp->v_lock,
+			LK_INTERLOCK | LK_EXCLUSIVE,
+			VI_MTX(vp), td);
+		VI_LOCK(vp);
+		KASSERT(vp->v_vnlock == &sn->sn_lock,
+		("ffs_snapshot_unmount: lost lock mutation")); 
+		vp->v_vnlock = &vp->v_lock;
+		VI_UNLOCK(vp);
+		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
+		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+		if (xp->i_effnlink > 0)
 			vrele(vp);
-			VI_LOCK(devvp);
-		}
+		VI_LOCK(devvp);
+		sn = devvp->v_rdev->si_snapdata;
 	}
-	devvp->v_rdev->si_snapdata = NULL;
-	devvp->v_vflag &= ~VV_COPYONWRITE;
+	try_free_snapdata(devvp, td);
+	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
+}
+
+/*
+ * Check the buffer block to be belong to device buffer that shall be
+ * locked after snaplk. devvp shall be locked on entry, and will be
+ * leaved locked upon exit.
+ */
+static int
+ffs_bp_snapblk(devvp, bp)
+	struct vnode *devvp;
+	struct buf *bp;
+{
+	struct snapdata *sn;
+	struct fs *fs;
+	ufs2_daddr_t lbn, *snapblklist;
+	int lower, upper, mid;
+
+	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
+	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
+	sn = devvp->v_rdev->si_snapdata;
+	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
+		return (0);
+	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
+	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+	snapblklist = sn->sn_blklist;
+	upper = sn->sn_listsize - 1;
+	lower = 1;
+	while (lower <= upper) {
+		mid = (lower + upper) / 2;
+		if (snapblklist[mid] == lbn)
+			break;
+		if (snapblklist[mid] < lbn)
+			lower = mid + 1;
+		else
+			upper = mid - 1;
+	}
+	if (lower <= upper)
+		return (1);
+	return (0);
+}
+
+void
+ffs_bdflush(bo, bp)
+	struct bufobj *bo;
+	struct buf *bp;
+{
+	struct thread *td;
+	struct vnode *vp, *devvp;
+	struct buf *nbp;
+	int bp_bdskip;
+
+	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
+		return;
+
+	td = curthread;
+	vp = bp->b_vp;
+	devvp = bo->__bo_vnode;
+	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
+
+	VI_LOCK(devvp);
+	bp_bdskip = ffs_bp_snapblk(devvp, bp);
+	if (bp_bdskip)
+		bdwriteskip++;
 	VI_UNLOCK(devvp);
-	if (sn->sn_blklist != NULL) {
-		FREE(sn->sn_blklist, M_UFSMNT);
-		sn->sn_blklist = NULL;
-		sn->sn_listsize = 0;
+	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
+		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
+		altbufferflushes++;
+	} else {
+		BO_LOCK(bo);
+		/*
+		 * Try to find a buffer to flush.
+		 */
+		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+			    BUF_LOCK(nbp,
+				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
+				continue;
+			if (bp == nbp)
+				panic("bdwrite: found ourselves");
+			BO_UNLOCK(bo);
+			/*
+			 * Don't countdeps with the bo lock
+			 * held.
+			 */
+			if (buf_countdeps(nbp, 0)) {
+				BO_LOCK(bo);
+				BUF_UNLOCK(nbp);
+				continue;
+			}
+			if (bp_bdskip) {
+				VI_LOCK(devvp);
+				if (!ffs_bp_snapblk(vp, nbp)) {
+					if (BO_MTX(bo) != VI_MTX(vp)) {
+						VI_UNLOCK(devvp);
+						BO_LOCK(bo);
+					}
+					BUF_UNLOCK(nbp);
+					continue;
+				}
+				VI_UNLOCK(devvp);
+			}
+			if (nbp->b_flags & B_CLUSTEROK) {
+				vfs_bio_awrite(nbp);
+			} else {
+				bremfree(nbp);
+				bawrite(nbp);
+			}
+			dirtybufferflushes++;
+			break;
+		}
+		if (nbp == NULL)
+			BO_UNLOCK(bo);
 	}
-	lockdestroy(&sn->sn_lock);
-	free(sn, M_UFSMNT);
-	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 }
 
 /*
@@ -1984,8 +2203,9 @@
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, error = 0;
 	int launched_async_io, prev_norunningbuf;
+	long saved_runningbufspace;
 
-	if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
+	if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 		return (0);		/* Update on a snapshot file */
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
@@ -1996,7 +2216,7 @@
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL ||
-	    TAILQ_FIRST(&sn->sn_head) == NULL) {
+	    TAILQ_EMPTY(&sn->sn_head)) {
 		VI_UNLOCK(devvp);
 		return (0);		/* No snapshot */
 	}
@@ -2026,7 +2246,9 @@
 	 * for a long time waiting on snaplk, back it out of
 	 * runningbufspace, possibly waking other threads waiting for space.
 	 */
-	runningbufwakeup(bp);
+	saved_runningbufspace = bp->b_runningbufspace;
+	if (saved_runningbufspace != 0)
+		runningbufwakeup(bp);
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 */
@@ -2036,11 +2258,13 @@
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL ||
-		    TAILQ_FIRST(&sn->sn_head) == NULL) {
+		    TAILQ_EMPTY(&sn->sn_head)) {
 			VI_UNLOCK(devvp);
-			if (bp->b_runningbufspace)
+			if (saved_runningbufspace != 0) {
+				bp->b_runningbufspace = saved_runningbufspace;
 				atomic_add_int(&runningbufspace,
 					       bp->b_runningbufspace);
+			}
 			return (0);		/* Snapshot gone */
 		}
 	}
@@ -2161,8 +2385,10 @@
 	/*
 	 * I/O on bp will now be started, so count it in runningbufspace.
 	 */
-	if (bp->b_runningbufspace)
+	if (saved_runningbufspace != 0) {
+		bp->b_runningbufspace = saved_runningbufspace;
 		atomic_add_int(&runningbufspace, bp->b_runningbufspace);
+	}
 	return (error);
 }
 
@@ -2184,25 +2410,24 @@
 	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 	bip->bio_data = bp->b_data;
 	bip->bio_length = bp->b_bcount;
+	bip->bio_done = NULL;
 
 	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
-
-	do 
-		msleep(bip, NULL, PRIBIO, "snaprdb", hz/10);
-	while (!(bip->bio_flags & BIO_DONE));
-	bp->b_error = bip->bio_error;
+	bp->b_error = biowait(bip, "snaprdb");
 	g_destroy_bio(bip);
 	return (bp->b_error);
 }
 
 /*
  * Process file deletes that were deferred by ufs_inactive() due to
- * the file system being suspended.
+ * the file system being suspended. Transfer IN_LAZYACCESS into
+ * IN_MODIFIED for vnodes that were accessed during suspension.
  */
 static void
 process_deferred_inactive(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
+	struct inode *ip;
 	struct thread *td;
 	int error;
 
@@ -2212,9 +2437,15 @@
  loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
-		if ((vp->v_iflag & (VI_DOOMED | VI_OWEINACT)) != VI_OWEINACT ||
-		    vp->v_usecount > 0 ||
-		    vp->v_type == VNON) {
+		/*
+		 * IN_LAZYACCESS is checked here without holding any
+		 * vnode lock, but this flag is set only while holding
+		 * vnode interlock.
+		 */
+		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
+		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
+			((vp->v_iflag & VI_OWEINACT) == 0 ||
+			vp->v_usecount > 0))) {
 			VI_UNLOCK(vp);
 			continue;
 		}
@@ -2229,8 +2460,13 @@
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
+		ip = VTOI(vp);
+		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
+			ip->i_flag &= ~IN_LAZYACCESS;
+			ip->i_flag |= IN_MODIFIED;
+		}
 		VI_LOCK(vp);
-		if ((vp->v_iflag & VI_OWEINACT) == 0) {
+		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 			VI_UNLOCK(vp);
 			VOP_UNLOCK(vp, 0, td);
 			vdrop(vp);
@@ -2259,3 +2495,33 @@
 	MNT_IUNLOCK(mp);
 	vn_finished_secondary_write(mp);
 }
+
+/* Try to free snapdata associated with devvp */
+static void
+try_free_snapdata(struct vnode *devvp,
+		  struct thread *td)
+{
+	struct snapdata *sn;
+	ufs2_daddr_t *snapblklist;
+
+	sn = devvp->v_rdev->si_snapdata;
+
+	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
+	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
+		VI_UNLOCK(devvp);
+		return;
+	}
+
+	devvp->v_rdev->si_snapdata = NULL;
+	devvp->v_vflag &= ~VV_COPYONWRITE;
+	snapblklist = sn->sn_blklist;
+	sn->sn_blklist = NULL;
+	sn->sn_listsize = 0;
+	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
+	lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+	lockdestroy(&sn->sn_lock);
+	free(sn, M_UFSMNT);
+	if (snapblklist != NULL)
+		FREE(snapblklist, M_UFSMNT);
+}
+#endif
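
The new ffs_bp_snapblk() above decides whether a device buffer falls inside a
snapshot by binary-searching the sorted preallocated block list, with the
search starting at index 1 (the first slot of the kernel list appears to be
reserved for bookkeeping). A standalone sketch of that search; the type alias
and the sample data are made up for illustration:

    #include <stdio.h>

    typedef long long blkno_t;      /* stands in for ufs2_daddr_t */

    /*
     * Return 1 if lbn is present in the sorted list, else 0.  Slot 0 is
     * skipped, so the search runs over [1, listsize - 1], as in
     * ffs_bp_snapblk() above.
     */
    static int
    snapblk_present(const blkno_t *list, int listsize, blkno_t lbn)
    {
            int lower = 1, upper = listsize - 1, mid;

            while (lower <= upper) {
                    mid = (lower + upper) / 2;
                    if (list[mid] == lbn)
                            return (1);
                    if (list[mid] < lbn)
                            lower = mid + 1;
                    else
                            upper = mid - 1;
            }
            return (0);
    }

    int
    main(void)
    {
            /* Slot 0 is bookkeeping; the remaining entries are sorted. */
            blkno_t list[] = { 6, 8, 16, 23, 42, 99 };

            printf("23 present: %d\n", snapblk_present(list, 6, 23));
            printf("24 present: %d\n", snapblk_present(list, 6, 24));
            return (0);
    }
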
Index: ffs_extern.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ffs/ffs_extern.h -L sys/ufs/ffs/ffs_extern.h -u -r1.2 -r1.3
--- sys/ufs/ffs/ffs_extern.h
+++ sys/ufs/ffs/ffs_extern.h
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_extern.h	8.6 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/ufs/ffs/ffs_extern.h,v 1.69.2.1 2006/03/13 03:07:37 jeff Exp $
+ * $FreeBSD: src/sys/ufs/ffs/ffs_extern.h,v 1.74 2007/02/17 08:25:43 mckusick Exp $
  */
 
 #ifndef _UFS_FFS_EXTERN_H
@@ -61,6 +61,7 @@
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
 int	ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
 void	ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void	ffs_bdflush(struct bufobj *, struct buf *);
 int	ffs_copyonwrite(struct vnode *, struct buf *);
 int	ffs_flushfiles(struct mount *, int, struct thread *);
 void	ffs_fragacct(struct fs *, int, int32_t [], int);
@@ -72,6 +73,7 @@
 int	ffs_reallocblks(struct vop_reallocblks_args *);
 int	ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
 	    ufs2_daddr_t, int, int, struct ucred *, struct buf **);
+int	ffs_sbupdate(struct ufsmount *, int, int);
 void	ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
 int	ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t);
 void	ffs_snapremove(struct vnode *vp);
Index: ufs_vfsops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vfsops.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_vfsops.c -L sys/ufs/ufs/ufs_vfsops.c -u -r1.1.1.2 -r1.2
--- sys/ufs/ufs/ufs_vfsops.c
+++ sys/ufs/ufs/ufs_vfsops.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vfsops.c,v 1.45.2.1 2006/02/20 00:53:15 yar Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vfsops.c,v 1.48 2007/02/01 02:13:53 mpp Exp $");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
@@ -60,7 +60,7 @@
 #include <ufs/ufs/dirhash.h>
 #endif
 
-MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
+MALLOC_DEFINE(M_UFSMNT, "ufs_mount", "UFS mount structure");
 
 /*
  * Return the root of a filesystem.
@@ -86,11 +86,11 @@
  * Do operations associated with quotas
  */
 int
-ufs_quotactl(mp, cmds, uid, arg, td)
+ufs_quotactl(mp, cmds, id, arg, td)
 	struct mount *mp;
 	int cmds;
-	uid_t uid;
-	caddr_t arg;
+	uid_t id;
+	void *arg;
 	struct thread *td;
 {
 #ifndef QUOTA
@@ -98,10 +98,23 @@
 #else
 	int cmd, type, error;
 
-	if (uid == -1)
-		uid = td->td_ucred->cr_ruid;
 	cmd = cmds >> SUBCMDSHIFT;
 	type = cmds & SUBCMDMASK;
+	if (id == -1) {
+		switch (type) {
+
+		case USRQUOTA:
+			id = td->td_ucred->cr_ruid;
+			break;
+
+		case GRPQUOTA:
+			id = td->td_ucred->cr_rgid;
+			break;
+
+		default:
+			return (EINVAL);
+		}
+	}
 	if ((u_int)type >= MAXQUOTAS)
 		return (EINVAL);
 
@@ -118,15 +131,15 @@
 		break;
 
 	case Q_SETQUOTA:
-		error = setquota(td, mp, uid, type, arg);
+		error = setquota(td, mp, id, type, arg);
 		break;
 
 	case Q_SETUSE:
-		error = setuse(td, mp, uid, type, arg);
+		error = setuse(td, mp, id, type, arg);
 		break;
 
 	case Q_GETQUOTA:
-		error = getquota(td, mp, uid, type, arg);
+		error = getquota(td, mp, id, type, arg);
 		break;
 
 	case Q_SYNC:
@@ -205,6 +218,6 @@
 		return (ESTALE);
 	}
 	*vpp = nvp;
-	vnode_create_vobject_off(*vpp, DIP(ip, i_size), curthread);
+	vnode_create_vobject(*vpp, DIP(ip, i_size), curthread);
 	return (0);
 }
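
For the id defaulting added to ufs_quotactl() above, here is a userland sketch (USRQUOTA/GRPQUOTA values as in quota.h; getuid()/getgid() stand in for cr_ruid/cr_rgid) of how an id of -1 now resolves per quota type instead of always falling back to the uid:

/*
 * Standalone sketch, not kernel code: resolve a default quota id the way
 * the reworked ufs_quotactl() does.
 */
#include <stdio.h>
#include <unistd.h>

#define USRQUOTA	0
#define GRPQUOTA	1

static long
resolve_quota_id(long id, int type)
{
	if (id != -1)
		return (id);
	switch (type) {
	case USRQUOTA:
		return ((long)getuid());	/* cr_ruid analogue */
	case GRPQUOTA:
		return ((long)getgid());	/* cr_rgid analogue */
	default:
		return (-1);			/* kernel returns EINVAL */
	}
}

int
main(void)
{
	printf("user quota id:  %ld\n", resolve_quota_id(-1, USRQUOTA));
	printf("group quota id: %ld\n", resolve_quota_id(-1, GRPQUOTA));
	return (0);
}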
Index: ufs_dirhash.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_dirhash.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_dirhash.c -L sys/ufs/ufs/ufs_dirhash.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_dirhash.c
+++ sys/ufs/ufs/ufs_dirhash.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.21.2.1 2005/08/20 04:27:15 iedowse Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.23 2005/10/31 15:41:28 rwatson Exp $");
 
 #include "opt_ufs.h"
 
@@ -62,7 +62,7 @@
 #define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
 #define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
 
-static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
+static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
 
--- /dev/null
+++ sys/ufs/ufs/ufs_gjournal.c
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_gjournal.c,v 1.2 2007/05/28 00:28:15 pjd Exp $");
+
+#include "opt_ufs.h"
+
+#ifdef UFS_GJOURNAL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/gjournal.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+/*
+ * Change the number of unreferenced inodes.
+ */
+static int
+ufs_gjournal_modref(struct vnode *vp, int count)
+{
+	struct cg *cgp;
+	struct buf *bp;
+	ufs2_daddr_t cgbno;
+	int error, cg;
+	struct cdev *dev;
+	struct inode *ip;
+	struct ufsmount *ump;
+	struct fs *fs;
+	struct vnode *devvp;
+	ino_t ino;
+
+	ip = VTOI(vp);
+	ump = ip->i_ump;
+	fs = ip->i_fs;
+	devvp = ip->i_devvp;
+	ino = ip->i_number;
+
+	cg = ino_to_cg(fs, ino);
+	if (devvp->v_type != VCHR) {
+		/* devvp is a snapshot */
+		dev = VTOI(devvp)->i_devvp->v_rdev;
+		cgbno = fragstoblks(fs, cgtod(fs, cg));
+	} else {
+		/* devvp is a normal disk device */
+		dev = devvp->v_rdev;
+		cgbno = fsbtodb(fs, cgtod(fs, cg));
+	}
+	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+		panic("ffs_freefile: range: dev = %s, ino = %lu, fs = %s",
+		    devtoname(dev), (u_long)ino, fs->fs_fsmnt);
+	if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
+		brelse(bp);
+		return (error);
+	}
+	cgp = (struct cg *)bp->b_data;
+	if (!cg_chkmagic(cgp)) {
+		brelse(bp);
+		return (0);
+	}
+	bp->b_xflags |= BX_BKGRDWRITE;
+	cgp->cg_unrefs += count;
+	UFS_LOCK(ump);
+	fs->fs_unrefs += count;
+	fs->fs_fmod = 1;
+	ACTIVECLEAR(fs, cg);
+	UFS_UNLOCK(ump);
+	bdwrite(bp);
+	return (0);
+}
+
+void
+ufs_gjournal_orphan(struct vnode *vp)
+{
+	struct inode *ip;
+
+	if (vp->v_mount->mnt_gjprovider == NULL)
+		return;
+	if (vp->v_usecount < 2 || (vp->v_vflag & VV_DELETED))
+		return;
+	ip = VTOI(vp);
+	if ((vp->v_type == VDIR && ip->i_nlink > 2) ||
+	    (vp->v_type != VDIR && ip->i_nlink > 1)) {
+		return;
+	}
+	vp->v_vflag |= VV_DELETED;
+
+	ufs_gjournal_modref(vp, 1);
+}
+
+void
+ufs_gjournal_close(struct vnode *vp)
+{
+	struct inode *ip;
+
+	if (vp->v_mount->mnt_gjprovider == NULL)
+		return;
+	if (!(vp->v_vflag & VV_DELETED))
+		return;
+	ip = VTOI(vp);
+	if (ip->i_nlink > 0)
+		return;
+	ufs_gjournal_modref(vp, -1);
+}
+
+#endif /* UFS_GJOURNAL */
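
The new ufs_gjournal.c keeps a count of removed-but-still-open inodes per cylinder group so a post-replay fsck over a gjournaled volume knows what to reclaim. A userland model of the orphan/close accounting (counters and flags are stand-ins, and only the non-directory case is shown):

/*
 * Standalone model of ufs_gjournal_orphan()/ufs_gjournal_close(): unlink
 * of the last link to an open file bumps the "unreferenced" counter, and
 * the final close of that file with a zero link count drops it again.
 */
#include <stdio.h>

struct gj_inode {
	int	nlink;
	int	deleted;	/* VV_DELETED analogue */
};

static int cg_unrefs;		/* cg_unrefs / fs_unrefs analogue */

static void
gj_orphan(struct gj_inode *ip, int usecount)
{
	if (usecount < 2 || ip->deleted || ip->nlink > 1)
		return;
	ip->deleted = 1;
	cg_unrefs++;		/* ufs_gjournal_modref(vp, 1) */
}

static void
gj_close(struct gj_inode *ip)
{
	if (!ip->deleted || ip->nlink > 0)
		return;
	cg_unrefs--;		/* ufs_gjournal_modref(vp, -1) */
}

int
main(void)
{
	struct gj_inode ip = { 1, 0 };

	gj_orphan(&ip, 2);	/* unlink while the file is still open */
	printf("after unlink: unrefs = %d\n", cg_unrefs);
	ip.nlink = 0;
	gj_close(&ip);		/* last close of the removed file */
	printf("after close:  unrefs = %d\n", cg_unrefs);
	return (0);
}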
Index: dir.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/dir.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/dir.h -L sys/ufs/ufs/dir.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/dir.h
+++ sys/ufs/ufs/dir.h
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)dir.h	8.2 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/ufs/ufs/dir.h,v 1.11 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/dir.h,v 1.12 2007/07/02 01:31:43 peter Exp $
  */
 
 #ifndef _UFS_UFS_DIR_H_
@@ -110,7 +110,7 @@
  * 
  */
 #define	DIRECTSIZ(namlen)						\
-	(((int)&((struct direct *)0)->d_name +				\
+	(((uintptr_t)&((struct direct *)0)->d_name +			\
 	  ((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3)
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 #define DIRSIZ(oldfmt, dp) \
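
The DIRECTSIZ() change above replaces an (int) cast of a pointer-derived offset with uintptr_t. A standalone model of the same record-length arithmetic, using offsetof() and a stand-in struct layout:

/*
 * Userland model of DIRECTSIZ(): the fixed 8-byte header plus the
 * NUL-terminated name, rounded up to a 4-byte boundary.  struct
 * direct_model is a stand-in for struct direct.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct direct_model {
	uint32_t d_ino;
	uint16_t d_reclen;
	uint8_t	 d_type;
	uint8_t	 d_namlen;
	char	 d_name[256];
};

#define DIRECTSIZ_MODEL(namlen) \
	((offsetof(struct direct_model, d_name) + (namlen) + 1 + 3) & \
	    ~(size_t)3)

int
main(void)
{
	int n;

	for (n = 1; n <= 13; n += 4)
		printf("namlen %2d -> record length %zu\n", n,
		    DIRECTSIZ_MODEL(n));
	return (0);
}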
Index: quota.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/quota.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/quota.h -L sys/ufs/ufs/quota.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/quota.h
+++ sys/ufs/ufs/quota.h
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)quota.h	8.3 (Berkeley) 8/19/94
- * $FreeBSD: src/sys/ufs/ufs/quota.h,v 1.27 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/quota.h,v 1.30 2007/03/14 08:54:07 kib Exp $
  */
 
 #ifndef _UFS_UFS_QUOTA_H_
@@ -113,15 +113,18 @@
  * filesystem. There is one allocated for each quota that exists on any
  * filesystem for the current user or group. A cache is kept of recently
  * used entries.
+ * (h) protected by dqhlock
  */
 struct dquot {
-	LIST_ENTRY(dquot) dq_hash;	/* hash list */
-	TAILQ_ENTRY(dquot) dq_freelist;	/* free list */
+	LIST_ENTRY(dquot) dq_hash;	/* (h) hash list */
+	TAILQ_ENTRY(dquot) dq_freelist;	/* (h) free list */
+	struct mtx dq_lock;		/* lock for concurrency */
 	u_int16_t dq_flags;		/* flags, see below */
 	u_int16_t dq_type;		/* quota type of this dquot */
-	u_int32_t dq_cnt;		/* count of active references */
+	u_int32_t dq_cnt;		/* (h) count of active references */
 	u_int32_t dq_id;		/* identifier this applies to */
-	struct	ufsmount *dq_ump;	/* filesystem that this is taken from */
+	struct	ufsmount *dq_ump;	/* (h) filesystem that this is
+					   taken from */
 	struct	dqblk dq_dqb;		/* actual usage & quotas */
 };
 /*
@@ -167,6 +170,23 @@
 #define	DQREF(dq)	(dq)->dq_cnt++
 #endif
 
+#define	DQI_LOCK(dq)	mtx_lock(&(dq)->dq_lock)
+#define	DQI_UNLOCK(dq)	mtx_unlock(&(dq)->dq_lock)
+
+#define	DQI_WAIT(dq, prio, msg) do {		\
+	while ((dq)->dq_flags & DQ_LOCK) {	\
+		(dq)->dq_flags |= DQ_WANT;	\
+		(void) msleep((dq),		\
+		    &(dq)->dq_lock, (prio), (msg), 0); \
+	}					\
+} while (0)
+
+#define	DQI_WAKEUP(dq) do {			\
+	if ((dq)->dq_flags & DQ_WANT)		\
+		wakeup((dq));			\
+	(dq)->dq_flags &= ~(DQ_WANT|DQ_LOCK);	\
+} while (0)
+
 struct inode;
 struct mount;
 struct thread;
@@ -174,17 +194,17 @@
 struct vnode;
 
 int	chkdq(struct inode *, int64_t, struct ucred *, int);
-int	chkiq(struct inode *, ino_t, struct ucred *, int);
+int	chkiq(struct inode *, int, struct ucred *, int);
 void	dqinit(void);
 void	dqrele(struct vnode *, struct dquot *);
 void	dquninit(void);
 int	getinoquota(struct inode *);
-int	getquota(struct thread *, struct mount *, u_long, int, caddr_t);
+int	getquota(struct thread *, struct mount *, u_long, int, void *);
 int	qsync(struct mount *mp);
 int	quotaoff(struct thread *td, struct mount *, int);
-int	quotaon(struct thread *td, struct mount *, int, caddr_t);
-int	setquota(struct thread *, struct mount *, u_long, int, caddr_t);
-int	setuse(struct thread *, struct mount *, u_long, int, caddr_t);
+int	quotaon(struct thread *td, struct mount *, int, void *);
+int	setquota(struct thread *, struct mount *, u_long, int, void *);
+int	setuse(struct thread *, struct mount *, u_long, int, void *);
 vfs_quotactl_t ufs_quotactl;
 
 #else /* !_KERNEL */
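
quota.h now gives each dquot its own mutex, with the DQI_WAIT/DQI_WAKEUP macros above replacing the old tsleep() loop on DQ_LOCK/DQ_WANT. Below is a userland analogue of that handshake, with POSIX condition variables standing in for the kernel's msleep()/wakeup(); it is a sketch of the protocol, not the kernel mechanism:

/*
 * Userland analogue of DQI_WAIT/DQI_WAKEUP: a holder marks the dquot
 * DQ_LOCK, waiters set DQ_WANT and sleep on the per-dquot lock, and the
 * holder clears both flags and signals when it is done.
 */
#include <pthread.h>
#include <stdio.h>

#define DQ_LOCK	0x01
#define DQ_WANT	0x02

struct dquot_model {
	pthread_mutex_t	dq_lock;
	pthread_cond_t	dq_cv;
	int		dq_flags;
	long		dq_curblocks;
};

static void
dqi_wait(struct dquot_model *dq)
{
	while (dq->dq_flags & DQ_LOCK) {
		dq->dq_flags |= DQ_WANT;
		pthread_cond_wait(&dq->dq_cv, &dq->dq_lock);
	}
}

static void
dqi_wakeup(struct dquot_model *dq)
{
	if (dq->dq_flags & DQ_WANT)
		pthread_cond_broadcast(&dq->dq_cv);
	dq->dq_flags &= ~(DQ_WANT | DQ_LOCK);
}

static void *
writer(void *arg)
{
	struct dquot_model *dq = arg;

	pthread_mutex_lock(&dq->dq_lock);
	dqi_wait(dq);			/* wait for the in-progress holder */
	dq->dq_curblocks += 8;		/* account a block allocation */
	pthread_mutex_unlock(&dq->dq_lock);
	return (NULL);
}

int
main(void)
{
	struct dquot_model dq = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, DQ_LOCK, 0 };
	pthread_t t;

	pthread_create(&t, NULL, writer, &dq);
	pthread_mutex_lock(&dq.dq_lock);
	dqi_wakeup(&dq);		/* release the "busy" mark */
	pthread_mutex_unlock(&dq.dq_lock);
	pthread_join(t, NULL);
	printf("dq_curblocks = %ld\n", dq.dq_curblocks);
	return (0);
}

Build with cc -pthread; the writer either finds the dquot already released or sleeps until the wakeup, and either way ends with dq_curblocks = 8.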
Index: ufs_lookup.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_lookup.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_lookup.c -L sys/ufs/ufs/ufs_lookup.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_lookup.c
+++ sys/ufs/ufs/ufs_lookup.c
@@ -35,10 +35,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_lookup.c,v 1.77.2.2 2006/03/09 00:21:23 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_lookup.c,v 1.83 2007/03/14 08:50:27 kib Exp $");
 
 #include "opt_ffs_broken_fixme.h"
 #include "opt_ufs.h"
+#include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -593,10 +594,12 @@
 	struct mount *mp;
 
 	mp = ITOV(ip)->v_mount;
-	(void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
-	    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
-		panic("ufs_dirbad: bad dir");
+		panic("ufs_dirbad: %s: bad dir ino %lu at offset %ld: %s",
+		    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
+	else
+		(void)printf("%s: bad dir ino %lu at offset %ld: %s\n",
+		    mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how);
 }
 
 /*
@@ -700,7 +703,7 @@
 	struct buf *bp;
 	u_int dsize;
 	struct direct *ep, *nep;
-	int error, ret, blkoff, loc, spacefree, flags;
+	int error, ret, blkoff, loc, spacefree, flags, namlen;
 	char *dirbuf;
 
 	td = curthread;	/* XXX */
@@ -721,6 +724,13 @@
 		flags = BA_CLRBUF;
 		if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp))
 			flags |= IO_SYNC;
+#ifdef QUOTA
+		if ((error = getinoquota(dp)) != 0) {
+			if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
+				bdwrite(newdirbp);
+			return (error);
+		}
+#endif
 		if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ,
 		    cr, flags, &bp)) != 0) {
 			if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
@@ -875,8 +885,16 @@
 	 * Update the pointer fields in the previous entry (if any),
 	 * copy in the new entry, and write out the block.
 	 */
+#	if (BYTE_ORDER == LITTLE_ENDIAN)
+		if (OFSFMT(dvp))
+			namlen = ep->d_type;
+		else
+			namlen = ep->d_namlen;
+#	else
+		namlen = ep->d_namlen;
+#	endif
 	if (ep->d_ino == 0 ||
-	    (ep->d_ino == WINO &&
+	    (ep->d_ino == WINO && namlen == dirp->d_namlen &&
 	     bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
 		if (spacefree + dsize < newentrysize)
 			panic("ufs_direnter: compact1");
Index: ufs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_vnops.c -L sys/ufs/ufs/ufs_vnops.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vnops.c,v 1.271.2.4 2006/03/22 17:46:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vnops.c,v 1.291 2007/06/12 00:12:01 rwatson Exp $");
 
 #include "opt_mac.h"
 #include "opt_quota.h"
@@ -53,17 +53,20 @@
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/refcount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/conf.h>
 #include <sys/acl.h>
-#include <sys/mac.h>
 #include <sys/jail.h>
 
 #include <machine/mutex.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <sys/file.h>		/* XXX */
 
 #include <vm/vm.h>
@@ -81,6 +84,9 @@
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
+#ifdef UFS_GJOURNAL
+#include <ufs/ufs/gjournal.h>
+#endif
 
 #include <ufs/ffs/ffs_extern.h>
 
@@ -121,39 +127,56 @@
 	0, DIRBLKSIZ - 12, 2, ".."
 };
 
-void
-ufs_itimes(vp)
-	struct vnode *vp;
+static void
+ufs_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
+	ASSERT_VI_LOCKED(vp, __func__);
+
 	ip = VTOI(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
+		goto out;
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
+
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
-	else
+	else if (((vp->v_mount->mnt_kern_flag &
+		    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
+		    (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
-	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
-		vfs_timestamp(&ts);
-		if (ip->i_flag & IN_ACCESS) {
-			DIP_SET(ip, i_atime, ts.tv_sec);
-			DIP_SET(ip, i_atimensec, ts.tv_nsec);
-		}
-		if (ip->i_flag & IN_UPDATE) {
-			DIP_SET(ip, i_mtime, ts.tv_sec);
-			DIP_SET(ip, i_mtimensec, ts.tv_nsec);
-			ip->i_modrev++;
-		}
-		if (ip->i_flag & IN_CHANGE) {
-			DIP_SET(ip, i_ctime, ts.tv_sec);
-			DIP_SET(ip, i_ctimensec, ts.tv_nsec);
-		}
+	else if (ip->i_flag & IN_ACCESS)
+		ip->i_flag |= IN_LAZYACCESS;
+	vfs_timestamp(&ts);
+	if (ip->i_flag & IN_ACCESS) {
+		DIP_SET(ip, i_atime, ts.tv_sec);
+		DIP_SET(ip, i_atimensec, ts.tv_nsec);
+	}
+	if (ip->i_flag & IN_UPDATE) {
+		DIP_SET(ip, i_mtime, ts.tv_sec);
+		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
+		ip->i_modrev++;
+	}
+	if (ip->i_flag & IN_CHANGE) {
+		DIP_SET(ip, i_ctime, ts.tv_sec);
+		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 	}
+
+ out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
+void
+ufs_itimes(struct vnode *vp)
+{
+
+	VI_LOCK(vp);
+	ufs_itimes_locked(vp);
+	VI_UNLOCK(vp);
+}
+
 /*
  * Create a regular file
  */
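
The reworked ufs_itimes_locked() above defers a pure access-time update on a suspending or suspended filesystem with the new IN_LAZYACCESS flag instead of dirtying the inode. A userland model of that marking policy (flag values are stand-ins, not the kernel's):

/*
 * Standalone model of the ufs_itimes_locked() policy: read-only mounts
 * mark nothing, device nodes without soft updates defer with IN_LAZYMOD,
 * a suspended filesystem defers a pure atime update with IN_LAZYACCESS,
 * and everything else gets IN_MODIFIED.
 */
#include <stdio.h>

#define IN_ACCESS	0x01
#define IN_CHANGE	0x02
#define IN_UPDATE	0x04
#define IN_MODIFIED	0x08
#define IN_LAZYMOD	0x10
#define IN_LAZYACCESS	0x20
#define MNT_RDONLY	0x01
#define MNTK_SUSPEND	0x02
#define MNTK_SUSPENDED	0x04

static int
itimes_mark(int mnt_flag, int mnt_kern_flag, int is_dev, int softdep,
    int i_flag)
{
	if (mnt_flag & MNT_RDONLY)
		return (0);
	if ((i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
		return (0);
	if (is_dev && !softdep)
		return (IN_LAZYMOD);
	if ((mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0 ||
	    (i_flag & (IN_CHANGE | IN_UPDATE)))
		return (IN_MODIFIED);
	if (i_flag & IN_ACCESS)
		return (IN_LAZYACCESS);
	return (0);
}

int
main(void)
{
	printf("read on live fs:       0x%02x\n",
	    itimes_mark(0, 0, 0, 0, IN_ACCESS));
	printf("read on suspended fs:  0x%02x\n",
	    itimes_mark(0, MNTK_SUSPENDED, 0, 0, IN_ACCESS));
	printf("write on suspended fs: 0x%02x\n",
	    itimes_mark(0, MNTK_SUSPENDED, 0, 0, IN_UPDATE | IN_CHANGE));
	return (0);
}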
@@ -245,7 +268,7 @@
 	if ((ip->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
-	vnode_create_vobject_off(vp, DIP(ip, i_size), ap->a_td);
+	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
 	return (0);
 }
 
@@ -265,10 +288,12 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	int usecount;
 
 	VI_LOCK(vp);
-	if (vp->v_usecount > 1)
-		ufs_itimes(vp);
+	usecount = vp->v_usecount;
+	if (usecount > 1)
+		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
@@ -302,10 +327,6 @@
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
-#ifdef QUOTA
-			if ((error = getinoquota(ip)) != 0)
-				return (error);
-#endif
 			break;
 		default:
 			break;
@@ -364,7 +385,16 @@
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
-	ufs_itimes(vp);
+	VI_LOCK(vp);
+	ufs_itimes_locked(vp);
+	if (ip->i_ump->um_fstype == UFS1) {
+		vap->va_atime.tv_sec = ip->i_din1->di_atime;
+		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
+	} else {
+		vap->va_atime.tv_sec = ip->i_din2->di_atime;
+		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
+	}
+	VI_UNLOCK(vp);
 	/*
 	 * Copy from inode table
 	 */
@@ -377,8 +407,6 @@
 	if (ip->i_ump->um_fstype == UFS1) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
-		vap->va_atime.tv_sec = ip->i_din1->di_atime;
-		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
@@ -389,8 +417,6 @@
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
-		vap->va_atime.tv_sec = ip->i_din2->di_atime;
-		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
@@ -465,8 +491,7 @@
 		 * is non-zero; otherwise, they behave like unprivileged
 		 * processes.
 		 */
-		if (!suser_cred(cred,
-		    jail_chflags_allowed ? SUSER_ALLOWJAIL : 0)) {
+		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
 			if (ip->i_flags
 			    & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
@@ -508,22 +533,35 @@
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
-		 * Disallow write attempts on read-only filesystems;
-		 * unless the file is a socket, fifo, or a block or
-		 * character device resident on the filesystem.
+		 * XXX most of the following special cases should be in
+		 * callers instead of in N filesystems.  The VDIR check
+		 * mostly already is.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
+			/*
+			 * Truncation should have an effect in these cases.
+			 * Disallow it if the filesystem is read-only or
+			 * the file is being snapshotted.
+			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((ip->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
-			break;
+			/*
+			 * According to POSIX, the result is unspecified
+			 * for file types other than regular files,
+			 * directories and shared memory objects.  We
+			 * don't support shared memory objects in the file
+			 * system, and have dubious support for truncating
+			 * symlinks.  Just ignore the request in other cases.
+			 */
+			return (0);
 		}
 		if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL,
 		    cred, td)) != 0)
@@ -543,10 +581,19 @@
 		 * super-user.
 		 * If times is non-NULL, ... The caller must be the owner of
 		 * the file or be the super-user.
+		 *
+		 * Possibly for historical reasons, try to use VADMIN in
+		 * preference to VWRITE for a NULL timestamp.  This means we
+		 * will return EACCES in preference to EPERM if neither
+		 * check succeeds.
 		 */
-		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
-		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
-		    (error = VOP_ACCESS(vp, VWRITE, cred, td))))
+		if (vap->va_vaflags & VA_UTIMES_NULL) {
+			error = VOP_ACCESS(vp, VADMIN, cred, td);
+			if (error)
+				error = VOP_ACCESS(vp, VWRITE, cred, td);
+		} else
+			error = VOP_ACCESS(vp, VADMIN, cred, td);
+		if (error)
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL)
 			ip->i_flag |= IN_ACCESS;
@@ -612,11 +659,11 @@
 	 * jail(8).
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
-		if (suser_cred(cred, SUSER_ALLOWJAIL))
+		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
-		error = suser_cred(cred, SUSER_ALLOWJAIL);
+		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
 		if (error)
 			return (error);
 	}
@@ -653,19 +700,19 @@
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
-	 * To modify the ownership of a file, must possess VADMIN
-	 * for that file.
+	 * To modify the ownership of a file, must possess VADMIN for that
+	 * file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
-	 * To change the owner of a file, or change the group of a file
-	 * to a group of which we are not a member, the caller must
-	 * have privilege.
+	 * To change the owner of a file, or change the group of a file to a
+	 * group of which we are not a member, the caller must have
+	 * privilege.
 	 */
 	if ((uid != ip->i_uid || 
 	    (gid != ip->i_gid && !groupmember(gid, cred))) &&
-	    (error = suser_cred(cred, SUSER_ALLOWJAIL)))
+	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
 		return (error);
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
@@ -736,9 +783,11 @@
 		panic("ufs_chown: lost quota");
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
-	if (suser_cred(cred, SUSER_ALLOWJAIL) && (ouid != uid || ogid != gid)) {
-		ip->i_mode &= ~(ISUID | ISGID);
-		DIP_SET(ip, i_mode, ip->i_mode);
+	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
+		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
+			ip->i_mode &= ~(ISUID | ISGID);
+			DIP_SET(ip, i_mode, ip->i_mode);
+		}
 	}
 	return (0);
 }
@@ -764,6 +813,9 @@
 		error = EPERM;
 		goto out;
 	}
+#ifdef UFS_GJOURNAL
+	ufs_gjournal_orphan(vp);
+#endif
 	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	if (ip->i_nlink <= 0)
 		vp->v_vflag |= VV_NOSYNC;
@@ -1047,7 +1099,7 @@
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
-	 * directory heirarchy above the target, as this would
+	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
@@ -1200,7 +1252,7 @@
 			DIP_SET(xp, i_nlink, xp->i_nlink);
 			xp->i_flag |= IN_CHANGE;
 			ioflag = IO_NORMAL;
-			if (DOINGASYNC(tvp))
+			if (!DOINGASYNC(tvp))
 				ioflag |= IO_SYNC;
 			if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
 			    tcnp->cn_cred, tcnp->cn_thread)) != 0)
@@ -1380,7 +1432,7 @@
 				 * XXX This seems to never be accessed out of
 				 * our context so a stack variable is ok.
 				 */
-				ucred.cr_ref = 1;
+				refcount_init(&ucred.cr_ref, 1);
 				ucred.cr_uid = ip->i_uid;
 				ucred.cr_ngroups = 1;
 				ucred.cr_groups[0] = dp->i_gid;
@@ -1670,6 +1722,9 @@
 		error = EINVAL;
 		goto out;
 	}
+#ifdef UFS_GJOURNAL
+	ufs_gjournal_orphan(vp);
+#endif
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
@@ -1707,7 +1762,7 @@
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		ioflag = IO_NORMAL;
-		if (DOINGASYNC(vp))
+		if (!DOINGASYNC(vp))
 			ioflag |= IO_SYNC;
 		error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
 		    cnp->cn_thread);
@@ -1776,7 +1831,7 @@
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
-		int *ncookies;
+		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
@@ -1978,10 +2033,12 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	int usecount;
 
 	VI_LOCK(vp);
-	if (vp->v_usecount > 1)
-		ufs_itimes(vp);
+	usecount = vp->v_usecount;
+	if (usecount > 1)
+		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (fifo_specops.vop_close(ap));
 }
@@ -2211,7 +2268,7 @@
 			 * XXX This seems to never be accessed out of our
 			 * context so a stack variable is ok.
 			 */
-			ucred.cr_ref = 1;
+			refcount_init(&ucred.cr_ref, 1);
 			ucred.cr_uid = ip->i_uid;
 			ucred.cr_ngroups = 1;
 			ucred.cr_groups[0] = pdir->i_gid;
@@ -2307,7 +2364,7 @@
 	if (DOINGSOFTDEP(tvp))
 		softdep_change_linkcnt(ip);
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
-	    suser_cred(cnp->cn_cred, SUSER_ALLOWJAIL)) {
+	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
 		ip->i_mode &= ~ISGID;
 		DIP_SET(ip, i_mode, ip->i_mode);
 	}
Index: ufsmount.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufsmount.h,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufsmount.h -L sys/ufs/ufs/ufsmount.h -u -r1.2 -r1.3
--- sys/ufs/ufs/ufsmount.h
+++ sys/ufs/ufs/ufsmount.h
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufsmount.h	8.6 (Berkeley) 3/30/95
- * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.34.2.2 2006/04/04 18:14:31 tegge Exp $
+ * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.37 2006/04/03 22:23:23 tegge Exp $
  */
 
 #ifndef _UFS_UFS_UFSMOUNT_H_
Index: ufs_acl.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_acl.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_acl.c -L sys/ufs/ufs/ufs_acl.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_acl.c
+++ sys/ufs/ufs/ufs_acl.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 1999-2001, 2003 Robert N. M. Watson
+ * Copyright (c) 1999-2003 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert Watson for the TrustedBSD Project.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_acl.c,v 1.20 2004/08/15 06:24:42 jmg Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_acl.c,v 1.21 2007/01/08 17:55:32 rwatson Exp $");
 
 #include "opt_ufs.h"
 #include "opt_quota.h"
Index: ufs_bmap.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_bmap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/ufs_bmap.c -L sys/ufs/ufs/ufs_bmap.c -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/ufs_bmap.c
+++ sys/ufs/ufs/ufs_bmap.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.64.2.1 2005/11/26 21:19:20 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.66 2007/06/01 01:12:45 jeff Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -226,7 +226,7 @@
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
-			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
+			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {
 				brelse(bp);
--- /dev/null
+++ sys/ufs/ufs/gjournal.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/ufs/ufs/gjournal.h,v 1.1 2006/10/31 21:48:54 pjd Exp $
+ */
+
+#ifndef _UFS_UFS_GJOURNAL_H_
+#define _UFS_UFS_GJOURNAL_H_
+
+/*
+ * GEOM journal function prototypes.
+ */
+void	ufs_gjournal_orphan(struct vnode *fvp);
+void	ufs_gjournal_close(struct vnode *vp);
+#endif /* !_UFS_UFS_GJOURNAL_H_ */
Index: extattr.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/extattr.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/extattr.h -L sys/ufs/ufs/extattr.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/extattr.h
+++ sys/ufs/ufs/extattr.h
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/ufs/ufs/extattr.h,v 1.20 2005/01/31 08:16:45 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/extattr.h,v 1.21 2007/03/06 08:13:20 mckusick Exp $
  */
 /*
  * Developed by the TrustedBSD Project.
@@ -69,6 +69,48 @@
 	/* data follows the header */
 };
 
+/*
+ * This structure defines the required fields of an extended-attribute header.
+ */
+struct extattr {
+	int32_t	ea_length;	    /* length of this attribute */
+	int8_t	ea_namespace;	    /* name space of this attribute */
+	int8_t	ea_contentpadlen;   /* bytes of padding at end of attribute */
+	int8_t	ea_namelength;	    /* length of attribute name */
+	char	ea_name[1];	    /* null-terminated attribute name */
+	/* extended attribute content follows */
+};
+
+/*
+ * These macros are used to access and manipulate an extended attribute:
+ *
+ * EXTATTR_NEXT(eap) returns a pointer to the next extended attribute
+ *	following eap.
+ * EXTATTR_CONTENT(eap) returns a pointer to the extended attribute
+ *	content referenced by eap.
+ * EXTATTR_CONTENT_SIZE(eap) returns the size of the extended attribute
+ *	content referenced by eap.
+ * EXTATTR_SET_LENGTHS(eap, contentsize) called after initializing the
+ *	attribute name to calculate and set the ea_length, ea_namelength,
+ *	and ea_contentpadlen fields of the extended attribute structure.
+ */
+#define EXTATTR_NEXT(eap) \
+	((struct extattr *)(((void *)(eap)) + (eap)->ea_length))
+#define EXTATTR_CONTENT(eap) (((void *)(eap)) + EXTATTR_BASE_LENGTH(eap))
+#define EXTATTR_CONTENT_SIZE(eap) \
+	((eap)->ea_length - EXTATTR_BASE_LENGTH(eap) - (eap)->ea_contentpadlen)
+#define EXTATTR_BASE_LENGTH(eap) \
+	((sizeof(struct extattr) + (eap)->ea_namelength + 7) & ~7)
+#define EXTATTR_SET_LENGTHS(eap, contentsize) do { \
+	KASSERT(((eap)->ea_name[0] != 0), \
+		("Must initialize name before setting lengths")); \
+	(eap)->ea_namelength = strlen((eap)->ea_name); \
+	(eap)->ea_contentpadlen = ((contentsize) % 8) ? \
+		8 - ((contentsize) % 8) : 0; \
+	(eap)->ea_length = EXTATTR_BASE_LENGTH(eap) + \
+		(contentsize) + (eap)->ea_contentpadlen; \
+} while (0)
+
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
@@ -106,6 +148,13 @@
 int	ufs_setextattr(struct vop_setextattr_args *ap);
 void	ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td);
 
+#else
+
+/* User-level definition of KASSERT for macros above */
+#define KASSERT(cond, str) do { \
+        if (!(cond)) { printf("panic: "); printf(str); printf("\n"); exit(1); }\
+} while (0)
+
 #endif /* !_KERNEL */
 
 #endif /* !_UFS_UFS_EXTATTR_H_ */
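
The new struct extattr and its macros pack each attribute as an 8-byte-aligned record. A standalone sketch of the length arithmetic behind EXTATTR_BASE_LENGTH() and EXTATTR_SET_LENGTHS(); the fixed 8-byte header constant matches sizeof(struct extattr) as declared above:

/*
 * Userland sketch: header plus name rounded up to 8 bytes, plus the
 * content, plus padding out to the next 8-byte boundary.
 */
#include <stdio.h>
#include <string.h>

#define EA_HDR	8	/* sizeof(struct extattr): 4 + 1 + 1 + 1 + 1 */

static int
ea_base_length(int namelength)
{
	return ((EA_HDR + namelength + 7) & ~7);
}

int
main(void)
{
	const char *name = "user.md5";
	int namelength = (int)strlen(name);
	int contentsize = 21;
	int contentpadlen = (contentsize % 8) ? 8 - (contentsize % 8) : 0;
	int ea_length = ea_base_length(namelength) + contentsize +
	    contentpadlen;

	printf("name \"%s\": base %d + content %d + pad %d = ea_length %d\n",
	    name, ea_base_length(namelength), contentsize, contentpadlen,
	    ea_length);
	return (0);
}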
Index: ufs_inode.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_inode.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_inode.c -L sys/ufs/ufs/ufs_inode.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_inode.c
+++ sys/ufs/ufs/ufs_inode.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_inode.c,v 1.63.2.2 2006/03/13 03:08:12 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_inode.c,v 1.69 2007/06/22 13:22:37 kib Exp $");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
@@ -57,6 +57,9 @@
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/dirhash.h>
 #endif
+#ifdef UFS_GJOURNAL
+#include <ufs/ufs/gjournal.h>
+#endif
 
 /*
  * Last reference to an inode.  If necessary, write or delete it.
@@ -83,9 +86,12 @@
 	 */
 	if (ip->i_mode == 0)
 		goto out;
-	if (ip->i_effnlink == 0 && DOINGSOFTDEP(vp))
-		softdep_releasefile(ip);
-	if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+#ifdef UFS_GJOURNAL
+	ufs_gjournal_close(vp);
+#endif
+	if ((ip->i_effnlink == 0 && DOINGSOFTDEP(vp)) ||
+	    (ip->i_nlink <= 0 &&
+	     (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)) {
 	loop:
 		if (vn_start_secondary_write(vp, &mp, V_NOWAIT) != 0) {
 			/* Cannot delete file while file system is suspended */
@@ -112,6 +118,10 @@
 				return (0);
 			}
 		}
+	}
+	if (ip->i_effnlink == 0 && DOINGSOFTDEP(vp))
+		softdep_releasefile(ip);
+	if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 #ifdef QUOTA
 		if (!getinoquota(ip))
 			(void)chkiq(ip, -1, NOCRED, FORCE);
@@ -184,10 +194,9 @@
 	 * Destroy the vm object and flush associated pages.
 	 */
 	vnode_destroy_vobject(vp);
-	if (ip->i_flag & IN_LAZYMOD) {
+	if (ip->i_flag & IN_LAZYMOD)
 		ip->i_flag |= IN_MODIFIED;
-		UFS_UPDATE(vp, 0);
-	}
+	UFS_UPDATE(vp, 0);
 	/*
 	 * Remove the inode from its hash chain.
 	 */
Index: ufs_quota.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_quota.c -L sys/ufs/ufs/ufs_quota.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_quota.c
+++ sys/ufs/ufs/ufs_quota.c
@@ -33,7 +33,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_quota.c,v 1.74.2.2.2.1 2006/04/26 01:23:59 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_quota.c,v 1.95 2007/06/12 00:12:01 rwatson Exp $");
+
+#include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -44,8 +46,10 @@
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
@@ -55,26 +59,26 @@
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 
-SYSCTL_DECL(_security_bsd);
-
 static int unprivileged_get_quota = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW,
     &unprivileged_get_quota, 0,
     "Unprivileged processes may retrieve quotas for other uids and gids");
 
-static MALLOC_DEFINE(M_DQUOT, "UFS quota", "UFS quota entries");
+static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries");
 
 /*
  * Quota name to error message mapping.
  */
 static char *quotatypes[] = INITQFNAMES;
 
-static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int);
-static int chkiqchg(struct inode *, ino_t, struct ucred *, int);
+static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *);
+static int chkiqchg(struct inode *, int, struct ucred *, int, int *);
 static int dqget(struct vnode *,
-		u_long, struct ufsmount *, int, struct dquot **);
+	u_long, struct ufsmount *, int, struct dquot **);
 static int dqsync(struct vnode *, struct dquot *);
 static void dqflush(struct vnode *);
+static int quotaoff1(struct thread *td, struct mount *mp, int type);
+static int quotaoff_inchange(struct thread *td, struct mount *mp, int type);
 
 #ifdef DIAGNOSTIC
 static void dqref(struct dquot *);
@@ -94,16 +98,29 @@
 	struct inode *ip;
 {
 	struct ufsmount *ump;
-	struct vnode *vp = ITOV(ip);
+	struct vnode *vp;
 	int error;
 
+	vp = ITOV(ip);
+
+	/*
+	 * Disk quotas must be turned off for system files.  Currently
+	 * snapshot and quota files.
+	 */
+	if ((vp->v_vflag & VV_SYSTEM) != 0)
+		return (0);
+	/*
+	 * XXX: Turn off quotas for files with a negative UID or GID.
+	 * This prevents the creation of 100GB+ quota files.
+	 */
+	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+		return (0);
 	ump = VFSTOUFS(vp->v_mount);
 	/*
 	 * Set up the user quota based on file uid.
 	 * EINVAL means that quotas are not enabled.
 	 */
-	if (ip->i_dquot[USRQUOTA] == NODQUOT &&
-	    (error =
+	if ((error =
 		dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
 	    error != EINVAL)
 		return (error);
@@ -111,8 +128,7 @@
 	 * Set up the group quota based on file gid.
 	 * EINVAL means that quotas are not enabled.
 	 */
-	if (ip->i_dquot[GRPQUOTA] == NODQUOT &&
-	    (error =
+	if ((error =
 		dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
 	    error != EINVAL)
 		return (error);
@@ -131,8 +147,21 @@
 {
 	struct dquot *dq;
 	ufs2_daddr_t ncurblocks;
-	int i, error;
+	struct vnode *vp = ITOV(ip);
+	int i, error, warn, do_check;
 
+	/*
+	 * Disk quotas must be turned off for system files.  Currently
+	 * snapshot and quota files.
+	 */
+	if ((vp->v_vflag & VV_SYSTEM) != 0)
+		return (0);
+	/*
+	 * XXX: Turn off quotas for files with a negative UID or GID.
+	 * This prevents the creation of 100GB+ quota files.
+	 */
+	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+		return (0);
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
 		chkdquot(ip);
@@ -143,10 +172,8 @@
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
-			while (dq->dq_flags & DQ_LOCK) {
-				dq->dq_flags |= DQ_WANT;
-				(void) tsleep(dq, PINOD+1, "chkdq1", 0);
-			}
+			DQI_LOCK(dq);
+			DQI_WAIT(dq, PINOD+1, "chkdq1");
 			ncurblocks = dq->dq_curblocks + change;
 			if (ncurblocks >= 0)
 				dq->dq_curblocks = ncurblocks;
@@ -154,24 +181,46 @@
 				dq->dq_curblocks = 0;
 			dq->dq_flags &= ~DQ_BLKS;
 			dq->dq_flags |= DQ_MOD;
+			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
-	if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
-		for (i = 0; i < MAXQUOTAS; i++) {
-			if ((dq = ip->i_dquot[i]) == NODQUOT)
-				continue;
-			error = chkdqchg(ip, change, cred, i);
-			if (error)
-				return (error);
-		}
-	}
+	if ((flags & FORCE) == 0 &&
+	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
+		do_check = 1;
+	else
+		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
-		while (dq->dq_flags & DQ_LOCK) {
-			dq->dq_flags |= DQ_WANT;
-			(void) tsleep(dq, PINOD+1, "chkdq2", 0);
+		warn = 0;
+		DQI_LOCK(dq);
+		DQI_WAIT(dq, PINOD+1, "chkdq2");
+		if (do_check) {
+			error = chkdqchg(ip, change, cred, i, &warn);
+			if (error) {
+				/*
+				 * Roll back user quota changes when
+				 * group quota failed.
+				 */
+				while (i > 0) {
+					--i;
+					dq = ip->i_dquot[i];
+					if (dq == NODQUOT)
+						continue;
+					DQI_LOCK(dq);
+					DQI_WAIT(dq, PINOD+1, "chkdq3");
+					ncurblocks = dq->dq_curblocks - change;
+					if (ncurblocks >= 0)
+						dq->dq_curblocks = ncurblocks;
+					else
+						dq->dq_curblocks = 0;
+					dq->dq_flags &= ~DQ_BLKS;
+					dq->dq_flags |= DQ_MOD;
+					DQI_UNLOCK(dq);
+				}
+				return (error);
+			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
@@ -180,6 +229,11 @@
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i];
 		dq->dq_curblocks += change;
 		dq->dq_flags |= DQ_MOD;
+		DQI_UNLOCK(dq);
+		if (warn)
+			uprintf("\n%s: warning, %s %s\n",
+				ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				quotatypes[i], "disk quota exceeded");
 	}
 	return (0);
 }
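
chkdq() above now charges each quota type in turn and, if a later type (typically the group quota) rejects the change, backs the charge out of the types already updated before returning EDQUOT. A userland model of that rollback, with hypothetical limits and without the per-dquot locking:

/*
 * Standalone model of the chkdq() rollback: apply the block change per
 * quota type in order, undoing earlier charges if a later type fails.
 */
#include <stdio.h>

#define MAXQUOTAS	2	/* USRQUOTA, GRPQUOTA */
#define EDQUOT		69

struct dq_model {
	long	dq_curblocks;
	long	dq_bhardlimit;	/* 0 means no limit */
};

static int
chkdq_model(struct dq_model *dq, long change)
{
	int i, j;

	for (i = 0; i < MAXQUOTAS; i++) {
		if (dq[i].dq_bhardlimit != 0 &&
		    dq[i].dq_curblocks + change >= dq[i].dq_bhardlimit) {
			/* Roll back the types already charged. */
			for (j = 0; j < i; j++)
				dq[j].dq_curblocks -= change;
			return (EDQUOT);
		}
		dq[i].dq_curblocks += change;
	}
	return (0);
}

int
main(void)
{
	struct dq_model dq[MAXQUOTAS] = {
		{ 100, 0 },	/* user quota: no limit */
		{ 90, 100 },	/* group quota: close to its hard limit */
	};
	int error = chkdq_model(dq, 16);

	printf("error %d, user %ld, group %ld blocks\n",
	    error, dq[0].dq_curblocks, dq[1].dq_curblocks);
	return (0);
}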
@@ -189,11 +243,12 @@
  * Issue an error message if appropriate.
  */
 static int
-chkdqchg(ip, change, cred, type)
+chkdqchg(ip, change, cred, type, warn)
 	struct inode *ip;
 	ufs2_daddr_t change;
 	struct ucred *cred;
 	int type;
+	int *warn;
 {
 	struct dquot *dq = ip->i_dquot[type];
 	ufs2_daddr_t ncurblocks = dq->dq_curblocks + change;
@@ -204,11 +259,14 @@
 	if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
 		if ((dq->dq_flags & DQ_BLKS) == 0 &&
 		    ip->i_uid == cred->cr_uid) {
+			dq->dq_flags |= DQ_BLKS;
+			DQI_UNLOCK(dq);
 			uprintf("\n%s: write failed, %s disk limit reached\n",
 			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 			    quotatypes[type]);
-			dq->dq_flags |= DQ_BLKS;
+			return (EDQUOT);
 		}
+		DQI_UNLOCK(dq);
 		return (EDQUOT);
 	}
 	/*
@@ -220,20 +278,21 @@
 			dq->dq_btime = time_second +
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type];
 			if (ip->i_uid == cred->cr_uid)
-				uprintf("\n%s: warning, %s %s\n",
-				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
-				    quotatypes[type], "disk quota exceeded");
+				*warn = 1;
 			return (0);
 		}
 		if (time_second > dq->dq_btime) {
 			if ((dq->dq_flags & DQ_BLKS) == 0 &&
 			    ip->i_uid == cred->cr_uid) {
+				dq->dq_flags |= DQ_BLKS;
+				DQI_UNLOCK(dq);
 				uprintf("\n%s: write failed, %s %s\n",
 				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 				    quotatypes[type],
 				    "disk quota exceeded for too long");
-				dq->dq_flags |= DQ_BLKS;
+				return (EDQUOT);
 			}
+			DQI_UNLOCK(dq);
 			return (EDQUOT);
 		}
 	}
@@ -246,13 +305,13 @@
 int
 chkiq(ip, change, cred, flags)
 	struct inode *ip;
-	ino_t change;
+	int change;
 	struct ucred *cred;
 	int flags;
 {
 	struct dquot *dq;
 	ino_t ncurinodes;
-	int i, error;
+	int i, error, warn, do_check;
 
 #ifdef DIAGNOSTIC
 	if ((flags & CHOWN) == 0)
@@ -260,41 +319,62 @@
 #endif
 	if (change == 0)
 		return (0);
-	/* XXX: change is unsigned */
 	if (change < 0) {
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if ((dq = ip->i_dquot[i]) == NODQUOT)
 				continue;
-			while (dq->dq_flags & DQ_LOCK) {
-				dq->dq_flags |= DQ_WANT;
-				(void) tsleep(dq, PINOD+1, "chkiq1", 0);
-			}
+			DQI_LOCK(dq);
+			DQI_WAIT(dq, PINOD+1, "chkiq1");
 			ncurinodes = dq->dq_curinodes + change;
 			/* XXX: ncurinodes is unsigned */
-			if (ncurinodes >= 0)
+			if (dq->dq_curinodes != 0 && ncurinodes >= 0)
 				dq->dq_curinodes = ncurinodes;
 			else
 				dq->dq_curinodes = 0;
 			dq->dq_flags &= ~DQ_INODS;
 			dq->dq_flags |= DQ_MOD;
+			DQI_UNLOCK(dq);
 		}
 		return (0);
 	}
-	if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
-		for (i = 0; i < MAXQUOTAS; i++) {
-			if ((dq = ip->i_dquot[i]) == NODQUOT)
-				continue;
-			error = chkiqchg(ip, change, cred, i);
-			if (error)
-				return (error);
-		}
-	}
+	if ((flags & FORCE) == 0 &&
+	    priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0))
+		do_check = 1;
+	else
+		do_check = 0;
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if ((dq = ip->i_dquot[i]) == NODQUOT)
 			continue;
-		while (dq->dq_flags & DQ_LOCK) {
-			dq->dq_flags |= DQ_WANT;
-			(void) tsleep(dq, PINOD+1, "chkiq2", 0);
+		warn = 0;
+		DQI_LOCK(dq);
+		DQI_WAIT(dq, PINOD+1, "chkiq2");
+		if (do_check) {
+			error = chkiqchg(ip, change, cred, i, &warn);
+			if (error) {
+				/*
+				 * Roll back user quota changes when
+				 * group quota failed.
+				 */
+				while (i > 0) {
+					--i;
+					dq = ip->i_dquot[i];
+					if (dq == NODQUOT)
+						continue;
+					DQI_LOCK(dq);
+					DQI_WAIT(dq, PINOD+1, "chkiq3");
+					ncurinodes = dq->dq_curinodes - change;
+					/* XXX: ncurinodes is unsigned */
+					if (dq->dq_curinodes != 0 &&
+					    ncurinodes >= 0)
+						dq->dq_curinodes = ncurinodes;
+					else
+						dq->dq_curinodes = 0;
+					dq->dq_flags &= ~DQ_INODS;
+					dq->dq_flags |= DQ_MOD;
+					DQI_UNLOCK(dq);
+				}
+				return (error);
+			}
 		}
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
@@ -303,6 +383,11 @@
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i];
 		dq->dq_curinodes += change;
 		dq->dq_flags |= DQ_MOD;
+		DQI_UNLOCK(dq);
+		if (warn)
+			uprintf("\n%s: warning, %s %s\n",
+				ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				quotatypes[i], "inode quota exceeded");
 	}
 	return (0);
 }
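
chkiq()'s change argument becomes a plain int above; with the old unsigned ino_t, the release path guarded by "if (change < 0)" (the removed "XXX: change is unsigned" comment) could never run. A minimal standalone illustration of the comparison:

/*
 * Standalone illustration, not kernel code: an unsigned "change" can
 * never test as negative, a signed one can.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int old_style = (unsigned int)-1;	/* ino_t-like */
	int new_style = -1;				/* new int argument */

	printf("unsigned change < 0: %s\n", old_style < 0 ? "yes" : "no");
	printf("     int change < 0: %s\n", new_style < 0 ? "yes" : "no");
	return (0);
}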
@@ -312,11 +397,12 @@
  * Issue an error message if appropriate.
  */
 static int
-chkiqchg(ip, change, cred, type)
+chkiqchg(ip, change, cred, type, warn)
 	struct inode *ip;
-	ino_t change;
+	int change;
 	struct ucred *cred;
 	int type;
+	int *warn;
 {
 	struct dquot *dq = ip->i_dquot[type];
 	ino_t ncurinodes = dq->dq_curinodes + change;
@@ -327,11 +413,14 @@
 	if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
 		if ((dq->dq_flags & DQ_INODS) == 0 &&
 		    ip->i_uid == cred->cr_uid) {
+			dq->dq_flags |= DQ_INODS;
+			DQI_UNLOCK(dq);
 			uprintf("\n%s: write failed, %s inode limit reached\n",
 			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
 			    quotatypes[type]);
-			dq->dq_flags |= DQ_INODS;
+			return (EDQUOT);
 		}
+		DQI_UNLOCK(dq);
 		return (EDQUOT);
 	}
 	/*
@@ -343,20 +432,21 @@
 			dq->dq_itime = time_second +
 			    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type];
 			if (ip->i_uid == cred->cr_uid)
-				uprintf("\n%s: warning, %s %s\n",
-				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
-				    quotatypes[type], "inode quota exceeded");
+				*warn = 1;
 			return (0);
 		}
 		if (time_second > dq->dq_itime) {
 			if ((dq->dq_flags & DQ_INODS) == 0 &&
 			    ip->i_uid == cred->cr_uid) {
-				uprintf("\n%s: write failed, %s %s\n",
-				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
-				    quotatypes[type],
-				    "inode quota exceeded for too long");
 				dq->dq_flags |= DQ_INODS;
+				DQI_UNLOCK(dq);
+				uprintf("\n%s: write failed, %s %s\n",
+					ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+					quotatypes[type],
+					"inode quota exceeded for too long");
+				return (EDQUOT);
 			}
+			DQI_UNLOCK(dq);
 			return (EDQUOT);
 		}
 	}
@@ -373,17 +463,34 @@
 	struct inode *ip;
 {
 	struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount);
+	struct vnode *vp = ITOV(ip);
 	int i;
 
+	/*
+	 * Disk quotas must be turned off for system files.  Currently
+	 * these are snapshots and quota files.
+	 */
+	if ((vp->v_vflag & VV_SYSTEM) != 0)
+		return;
+	/*
+	 * XXX: Turn off quotas for files with a negative UID or GID.
+	 * This prevents the creation of 100GB+ quota files.
+	 */
+	if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0)
+		return;
+
+	UFS_LOCK(ump);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (ump->um_quotas[i] == NULLVP ||
 		    (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING)))
 			continue;
 		if (ip->i_dquot[i] == NODQUOT) {
+			UFS_UNLOCK(ump);
 			vprint("chkdquot: missing dquot", ITOV(ip));
 			panic("chkdquot: missing dquot");
 		}
 	}
+	UFS_UNLOCK(ump);
 }
 #endif
 
@@ -399,40 +506,59 @@
 	struct thread *td;
 	struct mount *mp;
 	int type;
-	caddr_t fname;
+	void *fname;
 {
-	struct ufsmount *ump = VFSTOUFS(mp);
+	struct ufsmount *ump;
 	struct vnode *vp, **vpp;
 	struct vnode *mvp;
 	struct dquot *dq;
-	int error, flags;
+	int error, flags, vfslocked;
 	struct nameidata nd;
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	error = priv_check(td, PRIV_UFS_QUOTAON);
 	if (error)
 		return (error);
 
-	vpp = &ump->um_quotas[type];
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td);
+	ump = VFSTOUFS(mp);
+	dq = NODQUOT;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, fname, td);
 	flags = FREAD | FWRITE;
-	error = vn_open(&nd, &flags, 0, -1);
+	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return (error);
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	VOP_UNLOCK(vp, 0, td);
 	if (vp->v_type != VREG) {
 		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+		VFS_UNLOCK_GIANT(vfslocked);
 		return (EACCES);
 	}
-	if (*vpp != vp)
-		quotaoff(td, mp, type);
-	ump->um_qflags[type] |= QTF_OPENING;
+
+	UFS_LOCK(ump);
+	if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
+		UFS_UNLOCK(ump);
+		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (EALREADY);
+	}
+	ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING;
+	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_QUOTA;
+	MNT_IUNLOCK(mp);
+	UFS_UNLOCK(ump);
+
+	vpp = &ump->um_quotas[type];
+	if (*vpp != vp)
+		quotaoff1(td, mp, type);
+
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	vp->v_vflag |= VV_SYSTEM;
 	VOP_UNLOCK(vp, 0, td);
 	*vpp = vp;
+	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * Save the credential of the process that turned on quotas.
 	 * Set up the time limits for this quota.
@@ -448,6 +574,13 @@
 		dqrele(NULLVP, dq);
 	}
 	/*
+	 * Allow the getdq from getinoquota below to read the quota
+	 * from file.
+	 */
+	UFS_LOCK(ump);
+	ump->um_qflags[type] &= ~QTF_CLOSING;
+	UFS_UNLOCK(ump);
+	/*
 	 * Search vnodes associated with this mount point,
 	 * adding references to quota file being opened.
 	 * NB: only need to add dquot's for inodes being modified.
@@ -478,35 +611,49 @@
 		}
 	}
 	MNT_IUNLOCK(mp);
+
+        if (error)
+		quotaoff_inchange(td, mp, type);
+	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_OPENING;
-	if (error)
-		quotaoff(td, mp, type);
+	KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0,
+		("quotaon: leaking flags"));
+	UFS_UNLOCK(ump);
+
 	return (error);
 }
 
 /*
- * Q_QUOTAOFF - turn off disk quotas for a filesystem.
+ * Main code to turn off disk quotas for a filesystem. Does not change
+ * flags.
  */
-int
-quotaoff(td, mp, type)
+static int
+quotaoff1(td, mp, type)
 	struct thread *td;
 	struct mount *mp;
 	int type;
 {
 	struct vnode *vp;
 	struct vnode *qvp, *mvp;
-	struct ufsmount *ump = VFSTOUFS(mp);
+	struct ufsmount *ump;
 	struct dquot *dq;
 	struct inode *ip;
+	struct ucred *cr;
+	int vfslocked;
 	int error;
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
-	if (error)
-		return (error);
+	ump = VFSTOUFS(mp);
 
-	if ((qvp = ump->um_quotas[type]) == NULLVP)
+	UFS_LOCK(ump);
+	KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0,
+		("quotaoff1: flags are invalid"));
+	if ((qvp = ump->um_quotas[type]) == NULLVP) {
+		UFS_UNLOCK(ump);
 		return (0);
-	ump->um_qflags[type] |= QTF_CLOSING;
+	}
+	cr = ump->um_cred[type];
+	UFS_UNLOCK(ump);
+	
 	/*
 	 * Search vnodes associated with this mount point,
 	 * deleting any references to quota file being closed.
@@ -535,24 +682,88 @@
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
+
 	dqflush(qvp);
+	/* Clear um_quotas before closing the quota vnode to prevent
+	 * access to the closed vnode from dqget/dqsync
+	 */
+	UFS_LOCK(ump);
+	ump->um_quotas[type] = NULLVP;
+	ump->um_cred[type] = NOCRED;
+	UFS_UNLOCK(ump);
+
+	vfslocked = VFS_LOCK_GIANT(qvp->v_mount);
 	vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY, td);
 	qvp->v_vflag &= ~VV_SYSTEM;
 	VOP_UNLOCK(qvp, 0, td);
 	error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
-	ump->um_quotas[type] = NULLVP;
-	crfree(ump->um_cred[type]);
-	ump->um_cred[type] = NOCRED;
+	VFS_UNLOCK_GIANT(vfslocked);
+	crfree(cr);
+
+	return (error);
+}
+
+/*
+ * Turns off quotas, assumes that ump->um_qflags are already checked
+ * and QTF_CLOSING is set to indicate operation in progress. Fixes
+ * ump->um_qflags and mp->mnt_flag after.
+ */
+int
+quotaoff_inchange(td, mp, type)
+	struct thread *td;
+	struct mount *mp;
+	int type;
+{
+	struct ufsmount *ump;
+	int i;
+	int error;
+
+	error = quotaoff1(td, mp, type);
+
+	ump = VFSTOUFS(mp);
+	UFS_LOCK(ump);
 	ump->um_qflags[type] &= ~QTF_CLOSING;
-	for (type = 0; type < MAXQUOTAS; type++)
-		if (ump->um_quotas[type] != NULLVP)
+	for (i = 0; i < MAXQUOTAS; i++)
+		if (ump->um_quotas[i] != NULLVP)
 			break;
-	if (type == MAXQUOTAS)
+	if (i == MAXQUOTAS) {
+		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~MNT_QUOTA;
+		MNT_IUNLOCK(mp);
+	}
+	UFS_UNLOCK(ump);
 	return (error);
 }
 
 /*
+ * Q_QUOTAOFF - turn off disk quotas for a filesystem.
+ */
+int
+quotaoff(td, mp, type)
+	struct thread *td;
+	struct mount *mp;
+	int type;
+{
+	struct ufsmount *ump;
+	int error;
+
+	error = priv_check(td, PRIV_UFS_QUOTAOFF);
+	if (error)
+		return (error);
+
+	ump = VFSTOUFS(mp);
+	UFS_LOCK(ump);
+	if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) {
+		UFS_UNLOCK(ump);
+		return (EALREADY);
+	}
+	ump->um_qflags[type] |= QTF_CLOSING;
+	UFS_UNLOCK(ump);
+
+	return (quotaoff_inchange(td, mp, type));
+}
+
+/*
  * Q_GETQUOTA - return current values in a dqblk structure.
  */
 int
@@ -561,7 +772,7 @@
 	struct mount *mp;
 	u_long id;
 	int type;
-	caddr_t addr;
+	void *addr;
 {
 	struct dquot *dq;
 	int error;
@@ -569,15 +780,16 @@
 	switch (type) {
 	case USRQUOTA:
 		if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) {
-			error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+			error = priv_check(td, PRIV_VFS_GETQUOTA);
 			if (error)
 				return (error);
 		}
 		break;  
 
 	case GRPQUOTA:
-		if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) {
-			error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+		if (!groupmember(id, td->td_ucred) &&
+		    !unprivileged_get_quota) {
+			error = priv_check(td, PRIV_VFS_GETQUOTA);
 			if (error)
 				return (error);
 		}
@@ -587,10 +799,11 @@
 		return (EINVAL);
 	}
 
+	dq = NODQUOT;
 	error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq);
 	if (error)
 		return (error);
-	error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk));
+	error = copyout(&dq->dq_dqb, addr, sizeof (struct dqblk));
 	dqrele(NULLVP, dq);
 	return (error);
 }
@@ -604,29 +817,32 @@
 	struct mount *mp;
 	u_long id;
 	int type;
-	caddr_t addr;
+	void *addr;
 {
 	struct dquot *dq;
 	struct dquot *ndq;
-	struct ufsmount *ump = VFSTOUFS(mp);
+	struct ufsmount *ump;
 	struct dqblk newlim;
 	int error;
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	error = priv_check(td, PRIV_VFS_SETQUOTA);
 	if (error)
 		return (error);
 
-	error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk));
+	ump = VFSTOUFS(mp);
+	error = copyin(addr, &newlim, sizeof (struct dqblk));
 	if (error)
 		return (error);
+
+	ndq = NODQUOT;
+	ump = VFSTOUFS(mp);
+
 	error = dqget(NULLVP, id, ump, type, &ndq);
 	if (error)
 		return (error);
 	dq = ndq;
-	while (dq->dq_flags & DQ_LOCK) {
-		dq->dq_flags |= DQ_WANT;
-		(void) tsleep(dq, PINOD+1, "setqta", 0);
-	}
+	DQI_LOCK(dq);
+	DQI_WAIT(dq, PINOD+1, "setqta");
 	/*
 	 * Copy all but the current values.
 	 * Reset time limit if previously had no soft limit or were
@@ -657,6 +873,7 @@
 	else
 		dq->dq_flags &= ~DQ_FAKE;
 	dq->dq_flags |= DQ_MOD;
+	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
@@ -670,29 +887,32 @@
 	struct mount *mp;
 	u_long id;
 	int type;
-	caddr_t addr;
+	void *addr;
 {
 	struct dquot *dq;
-	struct ufsmount *ump = VFSTOUFS(mp);
+	struct ufsmount *ump;
 	struct dquot *ndq;
 	struct dqblk usage;
 	int error;
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	error = priv_check(td, PRIV_UFS_SETUSE);
 	if (error)
 		return (error);
 
-	error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk));
+	ump = VFSTOUFS(mp);
+	error = copyin(addr, &usage, sizeof (struct dqblk));
 	if (error)
 		return (error);
+
+	ump = VFSTOUFS(mp);
+	ndq = NODQUOT;
+
 	error = dqget(NULLVP, id, ump, type, &ndq);
 	if (error)
 		return (error);
 	dq = ndq;
-	while (dq->dq_flags & DQ_LOCK) {
-		dq->dq_flags |= DQ_WANT;
-		(void) tsleep(dq, PINOD+1, "setuse", 0);
-	}
+	DQI_LOCK(dq);
+	DQI_WAIT(dq, PINOD+1, "setuse");
 	/*
 	 * Reset time limit if have a soft limit and were
 	 * previously under it, but are now over it.
@@ -710,6 +930,7 @@
 	if (dq->dq_curinodes < dq->dq_isoftlimit)
 		dq->dq_flags &= ~DQ_INODS;
 	dq->dq_flags |= DQ_MOD;
+	DQI_UNLOCK(dq);
 	dqrele(NULLVP, dq);
 	return (0);
 }
@@ -731,9 +952,11 @@
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
+	UFS_LOCK(ump);
 	for (i = 0; i < MAXQUOTAS; i++)
 		if (ump->um_quotas[i] != NULLVP)
 			break;
+	UFS_UNLOCK(ump);
 	if (i == MAXQUOTAS)
 		return (0);
 	/*
@@ -761,7 +984,7 @@
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dq = VTOI(vp)->i_dquot[i];
-			if (dq != NODQUOT && (dq->dq_flags & DQ_MOD))
+			if (dq != NODQUOT)
 				dqsync(vp, dq);
 		}
 		vput(vp);
@@ -786,6 +1009,18 @@
 static TAILQ_HEAD(dqfreelist, dquot) dqfreelist;
 static long numdquot, desireddquot = DQUOTINC;
 
+/* 
+ * Lock to protect quota hash, dq free list and dq_cnt ref counters of
+ * _all_ dqs.
+ */
+struct mtx dqhlock;
+
+#define	DQH_LOCK()	mtx_lock(&dqhlock)
+#define	DQH_UNLOCK()	mtx_unlock(&dqhlock)
+
+static struct dquot *dqhashfind(struct dqhash *dqh, u_long id,
+	struct vnode *dqvp);
+
 /*
  * Initialize the quota system.
  */
@@ -793,6 +1028,7 @@
 dqinit()
 {
 
+	mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF);
 	dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
 	TAILQ_INIT(&dqfreelist);
 }
@@ -808,8 +1044,35 @@
 	hashdestroy(dqhashtbl, M_DQUOT, dqhash);
 	while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) {
 		TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
+		mtx_destroy(&dq->dq_lock);
 		free(dq, M_DQUOT);
 	}
+	mtx_destroy(&dqhlock);
+}
+
+static struct dquot *
+dqhashfind(dqh, id, dqvp)
+	struct dqhash *dqh;
+	u_long id;
+	struct vnode *dqvp;
+{
+	struct dquot *dq;
+
+	mtx_assert(&dqhlock, MA_OWNED);
+	LIST_FOREACH(dq, dqh, dq_hash) {
+		if (dq->dq_id != id ||
+		    dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+			continue;
+		/*
+		 * Cache hit with no references.  Take
+		 * the structure off the free list.
+		 */
+		if (dq->dq_cnt == 0)
+			TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
+		DQREF(dq);
+		return (dq);
+	}
+	return (NODQUOT);
 }
 
 /*
@@ -825,50 +1088,122 @@
 	struct dquot **dqp;
 {
 	struct thread *td = curthread;		/* XXX */
-	struct dquot *dq;
+	struct dquot *dq, *dq1;
 	struct dqhash *dqh;
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
-	int error;
+	int vfslocked, dqvplocked, error;
 
+#ifdef DEBUG_VFS_LOCKS
+	if (vp != NULLVP)
+		ASSERT_VOP_ELOCKED(vp, "dqget");
+#endif
+
+	if (vp != NULLVP && *dqp != NODQUOT) {
+		return (0);
+	}
+
+	/* XXX: Disallow negative id values to prevent the
+	* creation of 100GB+ quota data files.
+	*/
+	if ((int)id < 0)
+		return (EINVAL);
+
+	UFS_LOCK(ump);
 	dqvp = ump->um_quotas[type];
 	if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) {
 		*dqp = NODQUOT;
+		UFS_UNLOCK(ump);
 		return (EINVAL);
 	}
+	vref(dqvp);
+	UFS_UNLOCK(ump);
+	error = 0;
+	dqvplocked = 0;
+
 	/*
 	 * Check the cache first.
 	 */
 	dqh = DQHASH(dqvp, id);
-	LIST_FOREACH(dq, dqh, dq_hash) {
-		if (dq->dq_id != id ||
-		    dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
-			continue;
+	DQH_LOCK();
+	dq = dqhashfind(dqh, id, dqvp);
+	if (dq != NULL) {
+		DQH_UNLOCK();
+hfound:		DQI_LOCK(dq);
+		DQI_WAIT(dq, PINOD+1, "dqget");
+		DQI_UNLOCK(dq);
+		if (dq->dq_ump == NULL) {
+			dqrele(vp, dq);
+			dq = NODQUOT;
+			error = EIO;
+		}
+		*dqp = dq;
+		vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+		if (dqvplocked)
+			vput(dqvp);
+		else
+			vrele(dqvp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (error);
+	}
+
+	/*
+	 * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there
+	 * since new dq will appear on the hash chain DQ_LOCKed.
+	 */
+	if (vp != dqvp) {
+		DQH_UNLOCK();
+		vn_lock(dqvp, LK_SHARED | LK_RETRY, td);
+		dqvplocked = 1;
+		DQH_LOCK();
 		/*
-		 * Cache hit with no references.  Take
-		 * the structure off the free list.
+		 * Recheck the cache after sleep for quota vnode lock.
 		 */
-		if (dq->dq_cnt == 0)
-			TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
-		DQREF(dq);
-		*dqp = dq;
-		return (0);
+		dq = dqhashfind(dqh, id, dqvp);
+		if (dq != NULL) {
+			DQH_UNLOCK();
+			goto hfound;
+		}
 	}
+
 	/*
-	 * Not in cache, allocate a new one.
+	 * Not in cache, allocate a new one or take it from the
+	 * free list.
 	 */
 	if (TAILQ_FIRST(&dqfreelist) == NODQUOT &&
 	    numdquot < MAXQUOTAS * desiredvnodes)
 		desireddquot += DQUOTINC;
 	if (numdquot < desireddquot) {
-		dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT,
-		    M_WAITOK | M_ZERO);
 		numdquot++;
+		DQH_UNLOCK();
+		dq1 = (struct dquot *)malloc(sizeof *dq, M_DQUOT,
+		    M_WAITOK | M_ZERO);
+		mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF);
+		DQH_LOCK();
+		/*
+		 * Recheck the cache after sleep for memory.
+		 */
+		dq = dqhashfind(dqh, id, dqvp);
+		if (dq != NULL) {
+			numdquot--;
+			DQH_UNLOCK();
+			mtx_destroy(&dq1->dq_lock);
+			free(dq1, M_DQUOT);
+			goto hfound;
+		}
+		dq = dq1;
 	} else {
 		if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) {
+			DQH_UNLOCK();
 			tablefull("dquot");
 			*dqp = NODQUOT;
+			vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+			if (dqvplocked)
+				vput(dqvp);
+			else
+				vrele(dqvp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			return (EUSERS);
 		}
 		if (dq->dq_cnt || (dq->dq_flags & DQ_MOD))
@@ -877,44 +1212,57 @@
 		if (dq->dq_ump != NULL)
 			LIST_REMOVE(dq, dq_hash);
 	}
+
 	/*
-	 * Initialize the contents of the dquot structure.
+	 * Dq is put into hash already locked to prevent parallel
+	 * usage while it is being read from file.
 	 */
-	if (vp != dqvp)
-		vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td);
-	LIST_INSERT_HEAD(dqh, dq, dq_hash);
-	DQREF(dq);
 	dq->dq_flags = DQ_LOCK;
 	dq->dq_id = id;
-	dq->dq_ump = ump;
 	dq->dq_type = type;
+	dq->dq_ump = ump;
+	LIST_INSERT_HEAD(dqh, dq, dq_hash);
+	DQREF(dq);
+	DQH_UNLOCK();
+
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
-	aiov.iov_base = (caddr_t)&dq->dq_dqb;
+	aiov.iov_base = &dq->dq_dqb;
 	aiov.iov_len = sizeof (struct dqblk);
 	auio.uio_resid = sizeof (struct dqblk);
-	auio.uio_offset = (off_t)(id * sizeof (struct dqblk));
+	auio.uio_offset = (off_t)id * sizeof (struct dqblk);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = (struct thread *)0;
+
+	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
 	if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
-		bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk));
-	if (vp != dqvp)
-		VOP_UNLOCK(dqvp, 0, td);
-	if (dq->dq_flags & DQ_WANT)
-		wakeup(dq);
-	dq->dq_flags = 0;
+		bzero(&dq->dq_dqb, sizeof(struct dqblk));
+	if (dqvplocked)
+		vput(dqvp);
+	else
+		vrele(dqvp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * I/O error in reading quota file, release
 	 * quota structure and reflect problem to caller.
 	 */
 	if (error) {
+		DQH_LOCK();
+		dq->dq_ump = NULL;
 		LIST_REMOVE(dq, dq_hash);
+		DQH_UNLOCK();
+		DQI_LOCK(dq);
+		if (dq->dq_flags & DQ_WANT)
+			wakeup(dq);
+		dq->dq_flags = 0;
+		DQI_UNLOCK(dq);
 		dqrele(vp, dq);
 		*dqp = NODQUOT;
 		return (error);
 	}
+	DQI_LOCK(dq);
 	/*
 	 * Check for no limit to enforce.
 	 * Initialize time values if necessary.
@@ -923,11 +1271,21 @@
 	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
 		dq->dq_flags |= DQ_FAKE;
 	if (dq->dq_id != 0) {
-		if (dq->dq_btime == 0)
+		if (dq->dq_btime == 0) {
 			dq->dq_btime = time_second + ump->um_btime[type];
-		if (dq->dq_itime == 0)
+			if (dq->dq_bsoftlimit &&
+			    dq->dq_curblocks >= dq->dq_bsoftlimit)
+				dq->dq_flags |= DQ_MOD;
+		}
+		if (dq->dq_itime == 0) {
 			dq->dq_itime = time_second + ump->um_itime[type];
+			if (dq->dq_isoftlimit &&
+			    dq->dq_curinodes >= dq->dq_isoftlimit)
+				dq->dq_flags |= DQ_MOD;
+		}
 	}
+	DQI_WAKEUP(dq);
+	DQI_UNLOCK(dq);
 	*dqp = dq;
 	return (0);
 }
@@ -956,15 +1314,24 @@
 
 	if (dq == NODQUOT)
 		return;
+	DQH_LOCK();
 	if (dq->dq_cnt > 1) {
 		dq->dq_cnt--;
+		DQH_UNLOCK();
 		return;
 	}
-	if (dq->dq_flags & DQ_MOD)
-		(void) dqsync(vp, dq);
+	DQH_UNLOCK();
+
+	(void) dqsync(vp, dq);
+
+	DQH_LOCK();
 	if (--dq->dq_cnt > 0)
+	{
+		DQH_UNLOCK();
 		return;
+	}
 	TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist);
+	DQH_UNLOCK();
 }
 
 /*
@@ -979,48 +1346,75 @@
 	struct vnode *dqvp;
 	struct iovec aiov;
 	struct uio auio;
-	int error;
+	int vfslocked, error;
 	struct mount *mp;
+	struct ufsmount *ump;
+
+#ifdef DEBUG_VFS_LOCKS
+	if (vp != NULL)
+		ASSERT_VOP_ELOCKED(vp, "dqsync");
+#endif
 
 	mp = NULL;
+	error = 0;
 	if (dq == NODQUOT)
 		panic("dqsync: dquot");
-	if ((dq->dq_flags & DQ_MOD) == 0)
+	if ((ump = dq->dq_ump) == NULL)
 		return (0);
-	if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
+	UFS_LOCK(ump);
+	if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP)
 		panic("dqsync: file");
+	vref(dqvp);
+	UFS_UNLOCK(ump);
+
+	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
+	DQI_LOCK(dq);
+	if ((dq->dq_flags & DQ_MOD) == 0) {
+		DQI_UNLOCK(dq);
+		vrele(dqvp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (0);
+	}
+	DQI_UNLOCK(dq);
+
 	(void) vn_start_secondary_write(dqvp, &mp, V_WAIT);
 	if (vp != dqvp)
 		vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td);
-	while (dq->dq_flags & DQ_LOCK) {
-		dq->dq_flags |= DQ_WANT;
-		(void) tsleep(dq, PINOD+2, "dqsync", 0);
-		if ((dq->dq_flags & DQ_MOD) == 0) {
-			if (vp != dqvp)
-				VOP_UNLOCK(dqvp, 0, td);
-			vn_finished_secondary_write(mp);
-			return (0);
-		}
-	}
+
+	VFS_UNLOCK_GIANT(vfslocked);
+	DQI_LOCK(dq);
+	DQI_WAIT(dq, PINOD+2, "dqsync");
+	if ((dq->dq_flags & DQ_MOD) == 0)
+		goto out;
 	dq->dq_flags |= DQ_LOCK;
+	DQI_UNLOCK(dq);
+
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
-	aiov.iov_base = (caddr_t)&dq->dq_dqb;
+	aiov.iov_base = &dq->dq_dqb;
 	aiov.iov_len = sizeof (struct dqblk);
 	auio.uio_resid = sizeof (struct dqblk);
-	auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk));
+	auio.uio_offset = (off_t)dq->dq_id * sizeof (struct dqblk);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = (struct thread *)0;
+	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
+	VFS_UNLOCK_GIANT(vfslocked);
 	if (auio.uio_resid && error == 0)
 		error = EIO;
-	if (dq->dq_flags & DQ_WANT)
-		wakeup(dq);
-	dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT);
+
+	DQI_LOCK(dq);
+	DQI_WAKEUP(dq);
+	dq->dq_flags &= ~DQ_MOD;
+out:	DQI_UNLOCK(dq);
+	vfslocked = VFS_LOCK_GIANT(dqvp->v_mount);
 	if (vp != dqvp)
-		VOP_UNLOCK(dqvp, 0, td);
+		vput(dqvp);
+	else
+		vrele(dqvp);
 	vn_finished_secondary_write(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1039,6 +1433,7 @@
 	 * file off their hash chains (they will eventually
 	 * fall off the head of the free list and be re-used).
 	 */
+	DQH_LOCK();
 	for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) {
 		for (dq = LIST_FIRST(dqh); dq; dq = nextdq) {
 			nextdq = LIST_NEXT(dq, dq_hash);
@@ -1050,4 +1445,5 @@
 			dq->dq_ump = (struct ufsmount *)0;
 		}
 	}
+	DQH_UNLOCK();
 }
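
The ufs_quota.c hunks above replace the old open-coded sleep protocol on dq_flags (set DQ_WANT, then tsleep on the dquot) with per-dquot mutex macros, and introduce a global dqhlock that protects the quota hash chains, the dquot free list, and every dq_cnt reference count. The DQI_* macro bodies live in ufs/ufs/quota.h, which this commit also updates but which is not included in this part of the mail; the following is a plausible reconstruction, consistent with the call sites above rather than a verbatim copy of the committed header:

/*
 * Sketch only: assumed definitions of the per-dquot lock macros used
 * in the hunks above.  dq_lock is the mutex initialized with
 * mtx_init(&dq->dq_lock, "dqlock", NULL, MTX_DEF) in dqget().
 */
#define	DQI_LOCK(dq)	mtx_lock(&(dq)->dq_lock)
#define	DQI_UNLOCK(dq)	mtx_unlock(&(dq)->dq_lock)

/* Sleep until no other thread holds the dquot busy via DQ_LOCK. */
#define	DQI_WAIT(dq, prio, msg) do {					\
	while ((dq)->dq_flags & DQ_LOCK) {				\
		(dq)->dq_flags |= DQ_WANT;				\
		(void) msleep((dq), &(dq)->dq_lock, (prio), (msg), 0);	\
	}								\
} while (0)

/* Release the busy state and wake any thread recorded in DQ_WANT. */
#define	DQI_WAKEUP(dq) do {						\
	if ((dq)->dq_flags & DQ_WANT)					\
		wakeup((dq));						\
	(dq)->dq_flags &= ~(DQ_WANT | DQ_LOCK);				\
} while (0)

Because msleep() drops and reacquires dq_lock while sleeping, DQI_WAIT must be entered with DQI_LOCK held, which is exactly how dqget(), setquota(), setuse() and dqsync() use it in the hunks above.
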
Index: inode.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/inode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/inode.h -L sys/ufs/ufs/inode.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/inode.h
+++ sys/ufs/ufs/inode.h
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)inode.h	8.9 (Berkeley) 5/14/95
- * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.49 2005/03/14 10:21:16 phk Exp $
+ * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.51 2006/10/10 09:20:54 kib Exp $
  */
 
 #ifndef _UFS_UFS_INODE_H_
@@ -55,6 +55,13 @@
  * is the permanent meta-data associated with the file which is read in
  * from the permanent dinode from long term storage when the file becomes
  * active, and is put back when the file is no longer being used.
+ *
+ * An inode may only be changed while holding either the exclusive
+ * vnode lock or the shared vnode lock and the vnode interlock. We use
+ * the latter only for "read" and "get" operations that require
+ * changing i_flag, or a timestamp. This locking protocol allows executing
+ * those operations without having to upgrade the vnode lock from shared to
+ * exclusive.
  */
 struct inode {
 	TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */
@@ -119,6 +126,8 @@
 #define	IN_RENAME	0x0010		/* Inode is being renamed. */
 #define	IN_LAZYMOD	0x0040		/* Modified, but don't write yet. */
 #define	IN_SPACECOUNTED	0x0080		/* Blocks to be freed in free count. */
+#define	IN_LAZYACCESS	0x0100		/* Process IN_ACCESS after the
+					   suspension finished */
 
 #define i_devvp i_ump->um_devvp
 #define i_umbufobj i_ump->um_bo
@@ -166,7 +175,7 @@
 
 /* Determine if soft dependencies are being done */
 #define DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
-#define DOINGASYNC(vp)		((vp)->v_mount->mnt_flag & MNT_ASYNC)
+#define DOINGASYNC(vp)		((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
 
 /* This overlays the fid structure (see mount.h). */
 struct ufid {
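
The inode.h change above documents a relaxed locking protocol: i_flag and the timestamps may be updated while holding either the exclusive vnode lock, or the shared vnode lock together with the vnode interlock, which is what lets read-side flags such as the new IN_LAZYACCESS be set without upgrading the lock. A minimal sketch of that idiom follows; it assumes the standard VI_LOCK()/VI_UNLOCK() interlock macros and VTOI() from the kernel headers, the include list is approximate, and the helper name is invented for illustration, not taken from this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

/*
 * Illustrative helper: request an access-time update while the
 * caller holds only the shared vnode lock.
 */
static void
example_mark_access(struct vnode *vp)
{
	struct inode *ip = VTOI(vp);

	/*
	 * With only a shared vnode lock, i_flag may not be written
	 * directly; per the comment above, take the vnode interlock.
	 */
	VI_LOCK(vp);
	ip->i_flag |= IN_ACCESS;
	VI_UNLOCK(vp);
}
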
Index: ufs_extattr.c
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/ufs_extattr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/ufs/ufs/ufs_extattr.c -L sys/ufs/ufs/ufs_extattr.c -u -r1.2 -r1.3
--- sys/ufs/ufs/ufs_extattr.c
+++ sys/ufs/ufs/ufs_extattr.c
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_extattr.c,v 1.81.2.3 2006/03/13 03:08:08 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_extattr.c,v 1.86 2007/06/01 14:33:11 kib Exp $");
 
 #include "opt_ufs.h"
 
@@ -48,6 +48,7 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
@@ -320,7 +321,7 @@
 {
 	int error;
 
-	error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, -1);
+	error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL);
 	if (error) {
 		printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed "
 		    "with %d\n", error);
@@ -699,7 +700,8 @@
 	 * Processes with privilege, but in jail, are not allowed to
 	 * configure extended attributes.
 	 */
-	if ((error = suser(td))) {
+	error = priv_check(td, PRIV_UFS_EXTATTRCTL);
+	if (error) {
 		if (filename_vp != NULL)
 			VOP_UNLOCK(filename_vp, 0, td);
 		return (error);
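
Like the quota hunks earlier in this mail (PRIV_VFS_SETQUOTA, PRIV_UFS_SETUSE), the ufs_extattr.c change drops the blanket suser()/suser_cred() test in favor of a named privilege checked through priv_check(9). A minimal sketch of the calling pattern, with an invented function name but the real priv_check() signature and the PRIV_UFS_EXTATTRCTL constant from <sys/priv.h>:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>

/*
 * Illustrative only: one named privilege per operation, instead of a
 * generic superuser check, so jail and MAC policies can decide per
 * privilege whether to grant it.
 */
static int
example_extattrctl_check(struct thread *td)
{
	int error;

	error = priv_check(td, PRIV_UFS_EXTATTRCTL);
	if (error)
		return (error);		/* typically EPERM */

	/* ... privileged work would go here ... */
	return (0);
}
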
Index: dinode.h
===================================================================
RCS file: /home/cvs/src/sys/ufs/ufs/dinode.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/ufs/ufs/dinode.h -L sys/ufs/ufs/dinode.h -u -r1.1.1.1 -r1.2
--- sys/ufs/ufs/dinode.h
+++ sys/ufs/ufs/dinode.h
@@ -62,7 +62,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)dinode.h	8.3 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.15 2005/01/07 02:29:26 imp Exp $
+ * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.17 2006/05/21 21:55:29 maxim Exp $
  */
 
 #ifndef _UFS_UFS_DINODE_H_
@@ -79,7 +79,7 @@
 /*
  * The Whiteout inode# is a dummy non-zero inode number which will
  * never be allocated to a real file.  It is used as a place holder
- * in the directory entry which has been tagged as a DT_W entry.
+ * in the directory entry which has been tagged as a DT_WHT entry.
  * See the comments about ROOTINO above.
  */
 #define	WINO	((ino_t)1)
@@ -129,7 +129,7 @@
 	u_int32_t	di_gid;		/*   8: File group. */
 	u_int32_t	di_blksize;	/*  12: Inode blocksize. */
 	u_int64_t	di_size;	/*  16: File byte count. */
-	u_int64_t	di_blocks;	/*  24: Bytes actually held. */
+	u_int64_t	di_blocks;	/*  24: Blocks actually held. */
 	ufs_time_t	di_atime;	/*  32: Last access time. */
 	ufs_time_t	di_mtime;	/*  40: Last modified time. */
 	ufs_time_t	di_ctime;	/*  48: Last inode change time. */

