[Midnightbsd-cvs] src: i386/i386:

laffer1 at midnightbsd.org
Sat Sep 27 22:12:57 EDT 2008


Log Message:
-----------


Modified Files:
--------------
    src/sys/i386/i386:
        apic_vector.s (r1.1.1.1 -> r1.2)
        bios.c (r1.1.1.1 -> r1.2)
        busdma_machdep.c (r1.2 -> r1.3)
        db_trace.c (r1.2 -> r1.3)
        dump_machdep.c (r1.1.1.1 -> r1.2)
        elan-mmcr.c (r1.1.1.1 -> r1.2)
        elf_machdep.c (r1.1.1.1 -> r1.2)
        exception.s (r1.1.1.1 -> r1.2)
        genassym.c (r1.1.1.1 -> r1.2)
        geode.c (r1.1.1.1 -> r1.2)
        identcpu.c (r1.4 -> r1.5)
        in_cksum.c (r1.1.1.1 -> r1.2)
        initcpu.c (r1.3 -> r1.4)
        intr_machdep.c (r1.2 -> r1.3)
        io.c (r1.1.1.1 -> r1.2)
        io_apic.c (r1.2 -> r1.3)
        legacy.c (r1.1.1.1 -> r1.2)
        local_apic.c (r1.2 -> r1.3)
        locore.s (r1.1.1.1 -> r1.2)
        machdep.c (r1.4 -> r1.5)
        mem.c (r1.2 -> r1.3)
        mp_clock.c (r1.1.1.1 -> r1.2)
        mp_machdep.c (r1.2 -> r1.3)
        mp_watchdog.c (r1.1.1.1 -> r1.2)
        mptable.c (r1.1.1.1 -> r1.2)
        mptable_pci.c (r1.1.1.1 -> r1.2)
        nexus.c (r1.1.1.1 -> r1.2)
        pmap.c (r1.2 -> r1.3)
        ptrace_machdep.c (r1.3 -> r1.4)
        support.s (r1.1.1.1 -> r1.2)
        swtch.s (r1.1.1.1 -> r1.2)
        sys_machdep.c (r1.1.1.1 -> r1.2)
        trap.c (r1.1.1.1 -> r1.2)
        tsc.c (r1.1.1.1 -> r1.2)
        vm86.c (r1.1.1.1 -> r1.2)
        vm86bios.s (r1.1.1.1 -> r1.2)
        vm_machdep.c (r1.2 -> r1.3)

Added Files:
-----------
    src/sys/i386/i386:
        bpf_jit_machdep.c (r1.1)
        bpf_jit_machdep.h (r1.1)
        minidump_machdep.c (r1.1)
        msi.c (r1.1)

-------------- next part --------------
Index: intr_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/intr_machdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/intr_machdep.c -L sys/i386/i386/intr_machdep.c -u -r1.2 -r1.3
--- sys/i386/i386/intr_machdep.c
+++ sys/i386/i386/intr_machdep.c
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/intr_machdep.c,v 1.14.2.2 2006/03/10 19:37:33 jhb Exp $
+ * $FreeBSD: src/sys/i386/i386/intr_machdep.c,v 1.29.2.1 2007/11/26 15:06:49 scottl Exp $
  */
 
 /*
@@ -42,15 +42,17 @@
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
-#include <sys/lock.h>
 #include <sys/ktr.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/sx.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
+#include <machine/smp.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
@@ -61,7 +63,15 @@
 
 static int intrcnt_index;
 static struct intsrc *interrupt_sources[NUM_IO_INTS];
-static struct mtx intr_table_lock;
+static struct sx intr_table_lock;
+static struct mtx intrcnt_lock;
+static STAILQ_HEAD(, pic) pics;
+
+#ifdef INTR_FILTER
+static void intr_eoi_src(void *arg);
+static void intr_disab_eoi_src(void *arg);
+static void intr_event_stray(void *cookie);
+#endif
 
 #ifdef SMP
 static int assign_cpu;
@@ -70,10 +80,45 @@
 #endif
 
 static void	intr_init(void *__dummy);
+static int	intr_pic_registered(struct pic *pic);
 static void	intrcnt_setname(const char *name, int index);
 static void	intrcnt_updatename(struct intsrc *is);
 static void	intrcnt_register(struct intsrc *is);
 
+static int
+intr_pic_registered(struct pic *pic)
+{
+	struct pic *p;
+
+	STAILQ_FOREACH(p, &pics, pics) {
+		if (p == pic)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Register a new interrupt controller (PIC).  This is to support suspend
+ * and resume where we suspend/resume controllers rather than individual
+ * sources.  This also allows controllers with no active sources (such as
+ * 8259As in a system using the APICs) to participate in suspend and resume.
+ */
+int
+intr_register_pic(struct pic *pic)
+{
+	int error;
+
+	sx_xlock(&intr_table_lock);
+	if (intr_pic_registered(pic))
+		error = EBUSY;
+	else {
+		STAILQ_INSERT_TAIL(&pics, pic, pics);
+		error = 0;
+	}
+	sx_xunlock(&intr_table_lock);
+	return (error);
+}
+
 /*
  * Register a new interrupt source with the global interrupt system.
  * The global interrupts need to be disabled when this function is
@@ -84,23 +129,30 @@
 {
 	int error, vector;
 
+	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
 	vector = isrc->is_pic->pic_vector(isrc);
 	if (interrupt_sources[vector] != NULL)
 		return (EEXIST);
+#ifdef INTR_FILTER
+	error = intr_event_create(&isrc->is_event, isrc, 0,
+	    (mask_fn)isrc->is_pic->pic_enable_source,
+	    intr_eoi_src, intr_disab_eoi_src, "irq%d:", vector);
+#else
 	error = intr_event_create(&isrc->is_event, isrc, 0,
 	    (mask_fn)isrc->is_pic->pic_enable_source, "irq%d:", vector);
+#endif
 	if (error)
 		return (error);
-	mtx_lock_spin(&intr_table_lock);
+	sx_xlock(&intr_table_lock);
 	if (interrupt_sources[vector] != NULL) {
-		mtx_unlock_spin(&intr_table_lock);
+		sx_xunlock(&intr_table_lock);
 		intr_event_destroy(isrc->is_event);
 		return (EEXIST);
 	}
 	intrcnt_register(isrc);
 	interrupt_sources[vector] = isrc;
-	isrc->is_enabled = 0;
-	mtx_unlock_spin(&intr_table_lock);
+	isrc->is_handlers = 0;
+	sx_xunlock(&intr_table_lock);
 	return (0);
 }
 
@@ -112,8 +164,8 @@
 }
 
 int
-intr_add_handler(const char *name, int vector, driver_intr_t handler,
-    void *arg, enum intr_type flags, void **cookiep)
+intr_add_handler(const char *name, int vector, driver_filter_t filter,
+    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep)
 {
 	struct intsrc *isrc;
 	int error;
@@ -121,22 +173,21 @@
 	isrc = intr_lookup_source(vector);
 	if (isrc == NULL)
 		return (EINVAL);
-	error = intr_event_add_handler(isrc->is_event, name, handler, arg,
-	    intr_priority(flags), flags, cookiep);
+	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
+	    arg, intr_priority(flags), flags, cookiep);
 	if (error == 0) {
+		sx_xlock(&intr_table_lock);
 		intrcnt_updatename(isrc);
-		mtx_lock_spin(&intr_table_lock);
-		if (!isrc->is_enabled) {
-			isrc->is_enabled = 1;
+		isrc->is_handlers++;
+		if (isrc->is_handlers == 1) {
 #ifdef SMP
 			if (assign_cpu)
 				intr_assign_next_cpu(isrc);
 #endif
-			mtx_unlock_spin(&intr_table_lock);
 			isrc->is_pic->pic_enable_intr(isrc);
-		} else
-			mtx_unlock_spin(&intr_table_lock);
-		isrc->is_pic->pic_enable_source(isrc);
+			isrc->is_pic->pic_enable_source(isrc);
+		}
+		sx_xunlock(&intr_table_lock);
 	}
 	return (error);
 }
@@ -144,13 +195,21 @@
 int
 intr_remove_handler(void *cookie)
 {
+	struct intsrc *isrc;
 	int error;
 
+	isrc = intr_handler_source(cookie);
 	error = intr_event_remove_handler(cookie);
-#ifdef XXX
-	if (error == 0)
-		intrcnt_updatename(/* XXX */);
-#endif
+	if (error == 0) {
+		sx_xlock(&intr_table_lock);
+		isrc->is_handlers--;
+		if (isrc->is_handlers == 0) {
+			isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
+			isrc->is_pic->pic_disable_intr(isrc);
+		}
+		intrcnt_updatename(isrc);
+		sx_xunlock(&intr_table_lock);
+	}
 	return (error);
 }
 
@@ -165,13 +224,84 @@
 	return (isrc->is_pic->pic_config_intr(isrc, trig, pol));
 }
 
+#ifdef INTR_FILTER
+void
+intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
+{
+	struct thread *td;
+	struct intr_event *ie;
+	int vector;
+
+	td = curthread;
+
+	/*
+	 * We count software interrupts when we process them.  The
+	 * code here follows previous practice, but there's an
+	 * argument for counting hardware interrupts when they're
+	 * processed too.
+	 */
+	(*isrc->is_count)++;
+	PCPU_INC(cnt.v_intr);
+
+	ie = isrc->is_event;
+
+	/*
+	 * XXX: We assume that IRQ 0 is only used for the ISA timer
+	 * device (clk).
+	 */
+	vector = isrc->is_pic->pic_vector(isrc);
+	if (vector == 0)
+		clkintr_pending = 1;
+
+	if (intr_event_handle(ie, frame) != 0)
+		intr_event_stray(isrc);		
+}
+
+static void
+intr_event_stray(void *cookie)
+{
+	struct intsrc *isrc;
+
+	isrc = cookie;
+	/*
+	 * For stray interrupts, mask and EOI the source, bump the
+	 * stray count, and log the condition.
+	 */
+	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
+	(*isrc->is_straycount)++;
+	if (*isrc->is_straycount < MAX_STRAY_LOG)
+		log(LOG_ERR, "stray irq%d\n", isrc->is_pic->pic_vector(isrc));
+	else if (*isrc->is_straycount == MAX_STRAY_LOG)
+		log(LOG_CRIT,
+		    "too many stray irq %d's: not logging anymore\n",
+		    isrc->is_pic->pic_vector(isrc));
+}
+
+static void
+intr_eoi_src(void *arg)
+{
+	struct intsrc *isrc;
+
+	isrc = arg;
+	isrc->is_pic->pic_eoi_source(isrc);
+}
+
+static void
+intr_disab_eoi_src(void *arg)
+{
+	struct intsrc *isrc;
+
+	isrc = arg;
+	isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
+}
+#else
 void
-intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe)
+intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
 {
 	struct thread *td;
 	struct intr_event *ie;
 	struct intr_handler *ih;
-	int error, vector, thread;
+	int error, vector, thread, ret;
 
 	td = curthread;
 
@@ -182,7 +312,7 @@
 	 * processed too.
 	 */
 	(*isrc->is_count)++;
-	PCPU_LAZY_INC(cnt.v_intr);
+	PCPU_INC(cnt.v_intr);
 
 	ie = isrc->is_event;
 
@@ -214,23 +344,42 @@
 	 * Execute fast interrupt handlers directly.
 	 * To support clock handlers, if a handler registers
 	 * with a NULL argument, then we pass it a pointer to
-	 * an intrframe as its argument.
+	 * a trapframe as its argument.
 	 */
 	td->td_intr_nesting_level++;
+	ret = 0;
 	thread = 0;
 	critical_enter();
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
-		if (!(ih->ih_flags & IH_FAST)) {
+		if (ih->ih_filter == NULL) {
 			thread = 1;
 			continue;
 		}
 		CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
-		    ih->ih_handler, ih->ih_argument == NULL ? iframe :
+		    ih->ih_filter, ih->ih_argument == NULL ? frame :
 		    ih->ih_argument, ih->ih_name);
 		if (ih->ih_argument == NULL)
-			ih->ih_handler(iframe);
+			ret = ih->ih_filter(frame);
 		else
-			ih->ih_handler(ih->ih_argument);
+			ret = ih->ih_filter(ih->ih_argument);
+		/* 
+		 * Wrapper handler special handling:
+		 *
+		 * in some particular cases (like pccard and pccbb), 
+		 * the _real_ device handler is wrapped in a couple of
+		 * functions - a filter wrapper and an ithread wrapper.
+		 * In this case (and just in this case), the filter wrapper 
+		 * could ask the system to schedule the ithread and mask
+		 * the interrupt source if the wrapped handler is composed
+		 * of just an ithread handler.
+		 *
+		 * TODO: write a generic wrapper to avoid people rolling 
+		 * their own
+		 */
+		if (!thread) {
+			if (ret == FILTER_SCHEDULE_THREAD)
+				thread = 1;
+		}
 	}
 
 	/*
@@ -242,40 +391,41 @@
 		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
 	else
 		isrc->is_pic->pic_eoi_source(isrc);
-	critical_exit();
 
 	/* Schedule the ithread if needed. */
 	if (thread) {
 		error = intr_event_schedule_thread(ie);
 		KASSERT(error == 0, ("bad stray interrupt"));
 	}
+	critical_exit();
 	td->td_intr_nesting_level--;
 }
+#endif
 
 void
 intr_resume(void)
 {
-	struct intsrc **isrc;
-	int i;
+	struct pic *pic;
 
-	mtx_lock_spin(&intr_table_lock);
-	for (i = 0, isrc = interrupt_sources; i < NUM_IO_INTS; i++, isrc++)
-		if (*isrc != NULL && (*isrc)->is_pic->pic_resume != NULL)
-			(*isrc)->is_pic->pic_resume(*isrc);
-	mtx_unlock_spin(&intr_table_lock);
+	sx_xlock(&intr_table_lock);
+	STAILQ_FOREACH(pic, &pics, pics) {
+		if (pic->pic_resume != NULL)
+			pic->pic_resume(pic);
+	}
+	sx_xunlock(&intr_table_lock);
 }
 
 void
 intr_suspend(void)
 {
-	struct intsrc **isrc;
-	int i;
+	struct pic *pic;
 
-	mtx_lock_spin(&intr_table_lock);
-	for (i = 0, isrc = interrupt_sources; i < NUM_IO_INTS; i++, isrc++)
-		if (*isrc != NULL && (*isrc)->is_pic->pic_suspend != NULL)
-			(*isrc)->is_pic->pic_suspend(*isrc);
-	mtx_unlock_spin(&intr_table_lock);
+	sx_xlock(&intr_table_lock);
+	STAILQ_FOREACH(pic, &pics, pics) {
+		if (pic->pic_suspend != NULL)
+			pic->pic_suspend(pic);
+	}
+	sx_xunlock(&intr_table_lock);
 }
 
 static void
@@ -298,8 +448,8 @@
 {
 	char straystr[MAXCOMLEN + 1];
 
-	/* mtx_assert(&intr_table_lock, MA_OWNED); */
 	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
+	mtx_lock_spin(&intrcnt_lock);
 	is->is_index = intrcnt_index;
 	intrcnt_index += 2;
 	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
@@ -308,17 +458,18 @@
 	is->is_count = &intrcnt[is->is_index];
 	intrcnt_setname(straystr, is->is_index + 1);
 	is->is_straycount = &intrcnt[is->is_index + 1];
+	mtx_unlock_spin(&intrcnt_lock);
 }
 
 void
 intrcnt_add(const char *name, u_long **countp)
 {
 
-	mtx_lock_spin(&intr_table_lock);
+	mtx_lock_spin(&intrcnt_lock);
 	*countp = &intrcnt[intrcnt_index];
 	intrcnt_setname(name, intrcnt_index);
 	intrcnt_index++;
-	mtx_unlock_spin(&intr_table_lock);
+	mtx_unlock_spin(&intrcnt_lock);
 }
 
 static void
@@ -327,7 +478,9 @@
 
 	intrcnt_setname("???", 0);
 	intrcnt_index = 1;
-	mtx_init(&intr_table_lock, "intr table", NULL, MTX_SPIN);
+	STAILQ_INIT(&pics);
+	sx_init(&intr_table_lock, "intr sources");
+	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
 }
 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL)
 
@@ -338,16 +491,14 @@
 DB_SHOW_COMMAND(irqs, db_show_irqs)
 {
 	struct intsrc **isrc;
-	int i, quit, verbose;
+	int i, verbose;
 
-	quit = 0;
 	if (strcmp(modif, "v") == 0)
 		verbose = 1;
 	else
 		verbose = 0;
 	isrc = interrupt_sources;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-	for (i = 0; i < NUM_IO_INTS && !quit; i++, isrc++)
+	for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++)
 		if (*isrc != NULL)
 			db_dump_intr_event((*isrc)->is_event, verbose);
 }
@@ -359,8 +510,9 @@
  * allocate CPUs round-robin.
  */
 
-static u_int cpu_apic_ids[MAXCPU];
-static int current_cpu, num_cpus;
+/* The BSP is always a valid target. */
+static cpumask_t intr_cpus = (1 << 0);
+static int current_cpu, num_cpus = 1;
 
 static void
 intr_assign_next_cpu(struct intsrc *isrc)
@@ -373,29 +525,29 @@
 	 */
 	pic = isrc->is_pic;
 	apic_id = cpu_apic_ids[current_cpu];
-	current_cpu++;
-	if (current_cpu >= num_cpus)
-		current_cpu = 0;
-	if (bootverbose) {
-		printf("INTR: Assigning IRQ %d", pic->pic_vector(isrc));
-		printf(" to local APIC %u\n", apic_id);
-	}
 	pic->pic_assign_cpu(isrc, apic_id);
+	do {
+		current_cpu++;
+		if (current_cpu >= num_cpus)
+			current_cpu = 0;
+	} while (!(intr_cpus & (1 << current_cpu)));
 }
 
 /*
- * Add a local APIC ID to our list of valid local APIC IDs that can
- * be destinations of interrupts.
+ * Add a CPU to our mask of valid CPUs that can be destinations of
+ * interrupts.
  */
 void
-intr_add_cpu(u_int apic_id)
+intr_add_cpu(u_int cpu)
 {
 
+	if (cpu >= MAXCPU)
+		panic("%s: Invalid CPU ID", __func__);
 	if (bootverbose)
-		printf("INTR: Adding local APIC %d as a target\n", apic_id);
-	if (num_cpus >= MAXCPU)
-		panic("WARNING: Local APIC IDs exhausted!");
-	cpu_apic_ids[num_cpus] = apic_id;
+		printf("INTR: Adding local APIC %d as a target\n",
+		    cpu_apic_ids[cpu]);
+
+	intr_cpus |= (1 << cpu);
 	num_cpus++;
 }
 
@@ -413,15 +565,15 @@
 	if (num_cpus <= 1)
 		return;
 
-	/* Round-robin assign each enabled source a CPU. */
-	mtx_lock_spin(&intr_table_lock);
+	/* Round-robin assign a CPU to each enabled source. */
+	sx_xlock(&intr_table_lock);
 	assign_cpu = 1;
 	for (i = 0; i < NUM_IO_INTS; i++) {
 		isrc = interrupt_sources[i];
-		if (isrc != NULL && isrc->is_enabled)
+		if (isrc != NULL && isrc->is_handlers > 0)
 			intr_assign_next_cpu(isrc);
 	}
-	mtx_unlock_spin(&intr_table_lock);
+	sx_xunlock(&intr_table_lock);
 }
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL)
 #endif
Index: in_cksum.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/in_cksum.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/in_cksum.c -L sys/i386/i386/in_cksum.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/in_cksum.c
+++ sys/i386/i386/in_cksum.c
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.28 2005/03/02 21:33:25 joerg Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.28.10.1 2007/10/26 07:15:04 bz Exp $");
 
 /*
  * MPsafe: alfred
@@ -260,17 +260,6 @@
  * reorder operations, this will generally take place in parallel with
  * other calculations.
  */
-#define ADD(n)	__asm __volatile \
-		("addl %1, %0" : "+r" (sum) : \
-		"g" (((const u_int32_t *)w)[n / 4]))
-#define ADDC(n)	__asm __volatile \
-		("adcl %1, %0" : "+r" (sum) : \
-		"g" (((const u_int32_t *)w)[n / 4]))
-#define LOAD(n)	__asm __volatile \
-		("" : : "r" (((const u_int32_t *)w)[n / 4]))
-#define MOP	__asm __volatile \
-		("adcl         $0, %0" : "+r" (sum))
-
 u_short
 in_cksum_skip(m, len, skip)
 	struct mbuf *m;
@@ -341,15 +330,24 @@
 		 * Advance to a 486 cache line boundary.
 		 */
 		if (4 & (int) w && mlen >= 4) {
-			ADD(0);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[0])
+			);
 			w += 2;
 			mlen -= 4;
 		}
 		if (8 & (int) w && mlen >= 8) {
-			ADD(0);
-			ADDC(4);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl %2, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[0]),
+				  "g" (((const u_int32_t *)w)[1])
+			);
 			w += 4;
 			mlen -= 8;
 		}
@@ -379,45 +377,81 @@
 			 * is initially 33 (not 32) to guaranteed that
 			 * the LOAD(32) is within bounds.
 			 */
-			ADD(16);
-			ADDC(0);
-			ADDC(4);
-			ADDC(8);
-			ADDC(12);
-			LOAD(32);
-			ADDC(20);
-			ADDC(24);
-			ADDC(28);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl %2, %0\n"
+				"adcl %3, %0\n"
+				"adcl %4, %0\n"
+				"adcl %5, %0\n"
+				"mov  %6, %%eax\n"
+				"adcl %7, %0\n"
+				"adcl %8, %0\n"
+				"adcl %9, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[4]),
+				  "g" (((const u_int32_t *)w)[0]),
+				  "g" (((const u_int32_t *)w)[1]),
+				  "g" (((const u_int32_t *)w)[2]),
+				  "g" (((const u_int32_t *)w)[3]),
+				  "g" (((const u_int32_t *)w)[8]),
+				  "g" (((const u_int32_t *)w)[5]),
+				  "g" (((const u_int32_t *)w)[6]),
+				  "g" (((const u_int32_t *)w)[7])
+				: "eax"
+			);
 			w += 16;
 		}
 		mlen += 32 + 1;
 		if (mlen >= 32) {
-			ADD(16);
-			ADDC(0);
-			ADDC(4);
-			ADDC(8);
-			ADDC(12);
-			ADDC(20);
-			ADDC(24);
-			ADDC(28);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl %2, %0\n"
+				"adcl %3, %0\n"
+				"adcl %4, %0\n"
+				"adcl %5, %0\n"
+				"adcl %6, %0\n"
+				"adcl %7, %0\n"
+				"adcl %8, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[4]),
+				  "g" (((const u_int32_t *)w)[0]),
+				  "g" (((const u_int32_t *)w)[1]),
+				  "g" (((const u_int32_t *)w)[2]),
+				  "g" (((const u_int32_t *)w)[3]),
+				  "g" (((const u_int32_t *)w)[5]),
+				  "g" (((const u_int32_t *)w)[6]),
+				  "g" (((const u_int32_t *)w)[7])
+			);
 			w += 16;
 			mlen -= 32;
 		}
 		if (mlen >= 16) {
-			ADD(0);
-			ADDC(4);
-			ADDC(8);
-			ADDC(12);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl %2, %0\n"
+				"adcl %3, %0\n"
+				"adcl %4, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[0]),
+				  "g" (((const u_int32_t *)w)[1]),
+				  "g" (((const u_int32_t *)w)[2]),
+				  "g" (((const u_int32_t *)w)[3])
+			);
 			w += 8;
 			mlen -= 16;
 		}
 		if (mlen >= 8) {
-			ADD(0);
-			ADDC(4);
-			MOP;
+			__asm __volatile (
+				"addl %1, %0\n"
+				"adcl %2, %0\n"
+				"adcl $0, %0"
+				: "+r" (sum)
+				: "g" (((const u_int32_t *)w)[0]),
+				  "g" (((const u_int32_t *)w)[1])
+			);
 			w += 4;
 			mlen -= 8;
 		}
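
[The in_cksum.c change folds the old ADD/ADDC/MOP macros, each of which was a
separate __asm statement, into single asm blocks.  The carry flag is not
guaranteed to survive between separate asm statements, so the whole addl/adcl
chain has to live in one block for the checksum to stay correct.  A small
x86-only userland sketch of the idiom, assuming GCC-style inline assembly;
the sample data is arbitrary.]

#include <stdint.h>
#include <stdio.h>

/* Sum four 32-bit words with an intact add-with-carry chain, then fold
 * to a 16-bit one's-complement checksum. */
static uint16_t
cksum4(const uint32_t w[4])
{
	uint32_t sum = 0;

	__asm__ __volatile__ (
		"addl %1, %0\n\t"
		"adcl %2, %0\n\t"
		"adcl %3, %0\n\t"
		"adcl %4, %0\n\t"
		"adcl $0, %0"		/* fold the final carry back in */
		: "+r" (sum)
		: "g" (w[0]), "g" (w[1]), "g" (w[2]), "g" (w[3]));
	sum = (sum & 0xffff) + (sum >> 16);	/* fold 32 bits to 16 */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb any new carry */
	return ((uint16_t)~sum);
}

int
main(void)
{
	uint32_t words[4] = { 0x45000054, 0x00004000, 0x40010000,
	    0x0a000001 };

	printf("checksum: 0x%04x\n", cksum4(words));
	return (0);
}
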
Index: db_trace.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/db_trace.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/db_trace.c -L sys/i386/i386/db_trace.c -u -r1.2 -r1.3
--- sys/i386/i386/db_trace.c
+++ sys/i386/i386/db_trace.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/db_trace.c,v 1.66.2.1 2006/03/13 03:05:33 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/db_trace.c,v 1.79 2007/02/19 10:57:47 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -80,6 +80,7 @@
 	{ "edi",	DB_OFFSET(tf_edi),	db_frame },
 	{ "eip",	DB_OFFSET(tf_eip),	db_frame },
 	{ "efl",	DB_OFFSET(tf_eflags),	db_frame },
+#define	DB_N_SHOW_REGS	15	/* Don't show registers after here. */
 	{ "dr0",	NULL,			db_dr0 },
 	{ "dr1",	NULL,			db_dr1 },
 	{ "dr2",	NULL,			db_dr2 },
@@ -89,7 +90,7 @@
 	{ "dr6",	NULL,			db_dr6 },
 	{ "dr7",	NULL,			db_dr7 },
 };
-struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]);
+struct db_variable *db_eregs = db_regs + DB_N_SHOW_REGS;
 
 #define DB_DRX_FUNC(reg)		\
 static int				\
@@ -182,19 +183,17 @@
 #define	INTERRUPT	2
 #define	SYSCALL		3
 #define	DOUBLE_FAULT	4
+#define	TRAP_INTERRUPT	5
 
 static void db_nextframe(struct i386_frame **, db_addr_t *, struct thread *);
 static int db_numargs(struct i386_frame *);
 static void db_print_stack_entry(const char *, int, char **, int *, db_addr_t);
 static void decode_syscall(int, struct thread *);
 
-static char * watchtype_str(int type);
+static const char * watchtype_str(int type);
 int  i386_set_watch(int watchnum, unsigned int watchaddr, int size, int access,
-		    struct dbreg * d);
-int  i386_clr_watch(int watchnum, struct dbreg * d);
-int  db_md_set_watchpoint(db_expr_t addr, db_expr_t size);
-int  db_md_clr_watchpoint(db_expr_t addr, db_expr_t size);
-void db_md_list_watchpoints(void);
+		    struct dbreg *d);
+int  i386_clr_watch(int watchnum, struct dbreg *d);
 
 /*
  * Figure out how many arguments were passed into the frame at "fp".
@@ -203,26 +202,30 @@
 db_numargs(fp)
 	struct i386_frame *fp;
 {
-	int	*argp;
+	char   *argp;
 	int	inst;
 	int	args;
 
-	argp = (int *)db_get_value((int)&fp->f_retaddr, 4, FALSE);
+	argp = (char *)db_get_value((int)&fp->f_retaddr, 4, FALSE);
 	/*
 	 * XXX etext is wrong for LKMs.  We should attempt to interpret
 	 * the instruction at the return address in all cases.  This
 	 * may require better fault handling.
 	 */
-	if (argp < (int *)btext || argp >= (int *)etext) {
-		args = 5;
+	if (argp < btext || argp >= etext) {
+		args = -1;
 	} else {
+retry:
 		inst = db_get_value((int)argp, 4, FALSE);
 		if ((inst & 0xff) == 0x59)	/* popl %ecx */
 			args = 1;
 		else if ((inst & 0xffff) == 0xc483)	/* addl $Ibs, %esp */
 			args = ((inst >> 16) & 0xff) / 4;
-		else
-			args = 5;
+		else if ((inst & 0xf8ff) == 0xc089) {	/* movl %eax, %Reg */
+			argp += 2;
+			goto retry;
+		} else
+			args = -1;
 	}
 	return (args);
 }
@@ -235,15 +238,19 @@
 	int *argp;
 	db_addr_t callpc;
 {
+	int n = narg >= 0 ? narg : 5;
+
 	db_printf("%s(", name);
-	while (narg) {
+	while (n) {
 		if (argnp)
 			db_printf("%s=", *argnp++);
 		db_printf("%r", db_get_value((int)argp, 4, FALSE));
 		argp++;
-		if (--narg != 0)
+		if (--n != 0)
 			db_printf(",");
 	}
+	if (narg < 0)
+		db_printf(",...");
 	db_printf(") at ");
 	db_printsym(callpc, DB_STGY_PROC);
 	db_printf("\n");
@@ -311,6 +318,13 @@
 			frame_type = SYSCALL;
 		else if (strcmp(name, "dblfault_handler") == 0)
 			frame_type = DOUBLE_FAULT;
+		/* XXX: These are interrupts with trap frames. */
+		else if (strcmp(name, "Xtimerint") == 0 ||
+		    strcmp(name, "Xcpustop") == 0 ||
+		    strcmp(name, "Xrendezvous") == 0 ||
+		    strcmp(name, "Xipi_intr_bitmap_handler") == 0 ||
+		    strcmp(name, "Xlazypmap") == 0)
+			frame_type = TRAP_INTERRUPT;
 	}
 
 	/*
@@ -346,9 +360,9 @@
 	 * current frame.
 	 */
 	if (frame_type == INTERRUPT)
-		tf = (struct trapframe *)((int)*fp + 12);
+		tf = (struct trapframe *)((int)*fp + 16);
 	else
-		tf = (struct trapframe *)((int)*fp + 8);
+		tf = (struct trapframe *)((int)*fp + 12);
 
 	if (INKERNEL((int) tf)) {
 		esp = get_esp(tf);
@@ -362,6 +376,7 @@
 			db_printf("--- syscall");
 			decode_syscall(tf->tf_eax, td);
 			break;
+		case TRAP_INTERRUPT:
 		case INTERRUPT:
 			db_printf("--- interrupt");
 			break;
@@ -387,16 +402,38 @@
 	int *argp;
 	db_expr_t offset;
 	c_db_sym_t sym;
-	int narg, quit;
+	int instr, narg;
 	boolean_t first;
 
+	/*
+	 * If an indirect call via an invalid pointer caused a trap,
+	 * %pc contains the invalid address while the return address
+	 * of the unlucky caller has been saved by CPU on the stack
+	 * just before the trap frame.  In this case, try to recover
+	 * the caller's address so that the first frame is assigned
+	 * to the right spot in the right function, for that is where
+	 * the failure actually happened.
+	 *
+	 * This trick depends on the fault address stashed in tf_err
+	 * by trap_fatal() before entering KDB.
+	 */
+	if (kdb_frame && pc == kdb_frame->tf_err) {
+		/*
+		 * Find where the trap frame actually ends.
+		 * It won't contain tf_esp or tf_ss unless crossing rings.
+		 */
+		if (ISPL(kdb_frame->tf_cs))
+			instr = (int)(kdb_frame + 1);
+		else
+			instr = (int)&kdb_frame->tf_esp;
+		pc = db_get_value(instr, 4, FALSE);
+	}
+
 	if (count == -1)
 		count = 1024;
 
 	first = TRUE;
-	quit = 0;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-	while (count-- && !quit) {
+	while (count-- && !db_pager_quit) {
 		sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
 		db_symbol_values(sym, &name, NULL);
 
@@ -414,8 +451,6 @@
 		actframe = frame;
 		if (first) {
 			if (tf != NULL) {
-				int instr;
-
 				instr = db_get_value(pc, 4, FALSE);
 				if ((instr & 0xffffff) == 0x00e58955) {
 					/* pushl %ebp; movl %esp, %ebp */
@@ -534,21 +569,20 @@
 	unsigned int watchaddr;
 	int size;
 	int access;
-	struct dbreg * d;
+	struct dbreg *d;
 {
-	int i;
-	unsigned int mask;
-	
+	int i, len;
+
 	if (watchnum == -1) {
-		for (i = 0, mask = 0x3; i < 4; i++, mask <<= 2)
-			if ((d->dr[7] & mask) == 0)
+		for (i = 0; i < 4; i++)
+			if (!DBREG_DR7_ENABLED(d->dr[7], i))
 				break;
 		if (i < 4)
 			watchnum = i;
 		else
 			return (-1);
 	}
-	
+
 	switch (access) {
 	case DBREG_DR7_EXEC:
 		size = 1; /* size must be 1 for an execution breakpoint */
@@ -556,29 +590,36 @@
 	case DBREG_DR7_WRONLY:
 	case DBREG_DR7_RDWR:
 		break;
-	default : return (-1);
+	default:
+		return (-1);
 	}
-	
+
 	/*
 	 * we can watch a 1, 2, or 4 byte sized location
 	 */
 	switch (size) {
-	case 1	: mask = 0x00; break;
-	case 2	: mask = 0x01 << 2; break;
-	case 4	: mask = 0x03 << 2; break;
-	default : return (-1);
+	case 1:
+		len = DBREG_DR7_LEN_1;
+		break;
+	case 2:
+		len = DBREG_DR7_LEN_2;
+		break;
+	case 4:
+		len = DBREG_DR7_LEN_4;
+		break;
+	default:
+		return (-1);
 	}
 
-	mask |= access;
-
 	/* clear the bits we are about to affect */
-	d->dr[7] &= ~((0x3 << (watchnum*2)) | (0x0f << (watchnum*4+16)));
+	d->dr[7] &= ~DBREG_DR7_MASK(watchnum);
 
 	/* set drN register to the address, N=watchnum */
-	DBREG_DRX(d,watchnum) = watchaddr;
+	DBREG_DRX(d, watchnum) = watchaddr;
 
 	/* enable the watchpoint */
-	d->dr[7] |= (0x2 << (watchnum*2)) | (mask << (watchnum*4+16));
+	d->dr[7] |= DBREG_DR7_SET(watchnum, len, access,
+	    DBREG_DR7_GLOBAL_ENABLE);
 
 	return (watchnum);
 }
@@ -587,15 +628,15 @@
 int
 i386_clr_watch(watchnum, d)
 	int watchnum;
-	struct dbreg * d;
+	struct dbreg *d;
 {
 
 	if (watchnum < 0 || watchnum >= 4)
 		return (-1);
-	
-	d->dr[7] = d->dr[7] & ~((0x3 << (watchnum*2)) | (0x0f << (watchnum*4+16)));
-	DBREG_DRX(d,watchnum) = 0;
-	
+
+	d->dr[7] &= ~DBREG_DR7_MASK(watchnum);
+	DBREG_DRX(d, watchnum) = 0;
+
 	return (0);
 }
 
@@ -605,38 +646,35 @@
 	db_expr_t addr;
 	db_expr_t size;
 {
-	int avail, wsize;
-	int i;
 	struct dbreg d;
-	
+	int avail, i, wsize;
+
 	fill_dbregs(NULL, &d);
-	
+
 	avail = 0;
-	for(i=0; i<4; i++) {
-		if ((d.dr[7] & (3 << (i*2))) == 0)
+	for(i = 0; i < 4; i++) {
+		if (!DBREG_DR7_ENABLED(d.dr[7], i))
 			avail++;
 	}
-	
-	if (avail*4 < size)
+
+	if (avail * 4 < size)
 		return (-1);
-	
-	for (i=0; i<4 && (size != 0); i++) {
-		if ((d.dr[7] & (3<<(i*2))) == 0) {
-			if (size > 4)
+
+	for (i = 0; i < 4 && (size > 0); i++) {
+		if (!DBREG_DR7_ENABLED(d.dr[7], i)) {
+			if (size > 2)
 				wsize = 4;
 			else
 				wsize = size;
-			if (wsize == 3)
-				wsize++;
-			i386_set_watch(i, addr, wsize, 
+			i386_set_watch(i, addr, wsize,
 				       DBREG_DR7_WRONLY, &d);
 			addr += wsize;
 			size -= wsize;
 		}
 	}
-	
+
 	set_dbregs(NULL, &d);
-	
+
 	return(0);
 }
 
@@ -646,28 +684,27 @@
 	db_expr_t addr;
 	db_expr_t size;
 {
-	int i;
 	struct dbreg d;
+	int i;
 
 	fill_dbregs(NULL, &d);
 
-	for(i=0; i<4; i++) {
-		if (d.dr[7] & (3 << (i*2))) {
-			if ((DBREG_DRX((&d), i) >= addr) && 
+	for(i = 0; i < 4; i++) {
+		if (DBREG_DR7_ENABLED(d.dr[7], i)) {
+			if ((DBREG_DRX((&d), i) >= addr) &&
 			    (DBREG_DRX((&d), i) < addr+size))
 				i386_clr_watch(i, &d);
-			
+
 		}
 	}
-	
+
 	set_dbregs(NULL, &d);
-	
+
 	return(0);
 }
 
 
-static 
-char *
+static const char *
 watchtype_str(type)
 	int type;
 {
@@ -683,31 +720,30 @@
 void
 db_md_list_watchpoints()
 {
-	int i;
 	struct dbreg d;
+	int i, len, type;
 
 	fill_dbregs(NULL, &d);
 
 	db_printf("\nhardware watchpoints:\n");
 	db_printf("  watch    status        type  len     address\n");
 	db_printf("  -----  --------  ----------  ---  ----------\n");
-	for (i=0; i<4; i++) {
-		if (d.dr[7] & (0x03 << (i*2))) {
-			unsigned type, len;
-			type = (d.dr[7] >> (16+(i*4))) & 3;
-			len =  (d.dr[7] >> (16+(i*4)+2)) & 3;
-			db_printf("  %-5d  %-8s  %10s  %3d  0x%08x\n",
-				  i, "enabled", watchtype_str(type), 
-				  len+1, DBREG_DRX((&d),i));
-		}
-		else {
+	for (i = 0; i < 4; i++) {
+		if (DBREG_DR7_ENABLED(d.dr[7], i)) {
+			type = DBREG_DR7_ACCESS(d.dr[7], i);
+			len = DBREG_DR7_LEN(d.dr[7], i);
+			db_printf("  %-5d  %-8s  %10s  %3d  ",
+			    i, "enabled", watchtype_str(type), len + 1);
+			db_printsym((db_addr_t)DBREG_DRX((&d), i), DB_STGY_ANY);
+			db_printf("\n");
+		} else {
 			db_printf("  %-5d  disabled\n", i);
 		}
 	}
-	
+
 	db_printf("\ndebug register values:\n");
-	for (i=0; i<8; i++) {
-		db_printf("  dr%d 0x%08x\n", i, DBREG_DRX((&d),i));
+	for (i = 0; i < 8; i++) {
+		db_printf("  dr%d 0x%08x\n", i, DBREG_DRX((&d), i));
 	}
 	db_printf("\n");
 }
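
[The db_trace.c watchpoint code now uses DBREG_DR7_* macros instead of
open-coded shifts.  The layout those macros encode, per the removed code, is
two enable bits per watchpoint at bit 2n and a four-bit access/length field at
bit 16+4n, access in the low two bits and length above it.  A userland sketch
with local stand-in macros (not the <machine/reg.h> definitions):]

#include <stdint.h>
#include <stdio.h>

#define DR7_ENABLED(d7, n)	((((d7) >> ((n) * 2)) & 0x3) != 0)
#define DR7_MASK(n)		((0x3u << ((n) * 2)) | (0xfu << ((n) * 4 + 16)))
#define DR7_GLOBAL_ENABLE	0x2u
#define DR7_SET(n, len, access, enable)					\
	(((enable) << ((n) * 2)) |					\
	 ((((uint32_t)(len) << 2) | (access)) << ((n) * 4 + 16)))
#define DR7_ACCESS(d7, n)	(((d7) >> (16 + (n) * 4)) & 0x3)
#define DR7_LEN(d7, n)		(((d7) >> (16 + (n) * 4 + 2)) & 0x3)

int
main(void)
{
	uint32_t dr7 = 0;
	int n = 1;

	/* Program watchpoint 1: 4-byte (length encoding 3), write-only
	 * (access encoding 1), globally enabled. */
	dr7 &= ~DR7_MASK(n);
	dr7 |= DR7_SET(n, 0x3, 0x1, DR7_GLOBAL_ENABLE);

	printf("dr7 = 0x%08x enabled=%d access=%u len=%u\n", dr7,
	    DR7_ENABLED(dr7, n), DR7_ACCESS(dr7, n), DR7_LEN(dr7, n));
	return (0);
}
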
Index: support.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/support.s -L sys/i386/i386/support.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/support.s
+++ sys/i386/i386/support.s
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/support.s,v 1.107 2005/04/21 23:07:20 alc Exp $
+ * $FreeBSD: src/sys/i386/i386/support.s,v 1.119 2007/08/22 05:06:14 jkoshy Exp $
  */
 
 #include "opt_npx.h"
@@ -80,6 +80,7 @@
 ENTRY(bzero)
 	MEXITCOUNT
 	jmp	*bzero_vector
+END(bzero)
 
 ENTRY(generic_bzero)
 	pushl	%edi
@@ -96,7 +97,8 @@
 	stosb
 	popl	%edi
 	ret
-
+END(generic_bzero)	
+	
 #ifdef I486_CPU
 ENTRY(i486_bzero)
 	movl	4(%esp),%edx
@@ -197,6 +199,7 @@
 	SUPERALIGN_TEXT
 do0:
 	ret
+END(i486_bzero)
 #endif
 
 #if defined(I586_CPU) && defined(DEV_NPX)
@@ -355,6 +358,7 @@
 	stosb
 	popl	%edi
 	ret
+END(i586_bzero)
 #endif /* I586_CPU && defined(DEV_NPX) */
 
 ENTRY(sse2_pagezero)
@@ -371,18 +375,19 @@
 	sfence
 	popl	%ebx
 	ret
+END(sse2_pagezero)
 
 ENTRY(i686_pagezero)
 	pushl	%edi
 	pushl	%ebx
 
-	movl	12(%esp), %edi
-	movl	$1024, %ecx
+	movl	12(%esp),%edi
+	movl	$1024,%ecx
 	cld
 
 	ALIGN_TEXT
 1:
-	xorl	%eax, %eax
+	xorl	%eax,%eax
 	repe
 	scasl
 	jnz	2f
@@ -395,32 +400,33 @@
 
 2:
 	incl	%ecx
-	subl	$4, %edi
+	subl	$4,%edi
 
-	movl	%ecx, %edx
-	cmpl	$16, %ecx
+	movl	%ecx,%edx
+	cmpl	$16,%ecx
 
 	jge	3f
 
-	movl	%edi, %ebx
-	andl	$0x3f, %ebx
+	movl	%edi,%ebx
+	andl	$0x3f,%ebx
 	shrl	%ebx
 	shrl	%ebx
-	movl	$16, %ecx
-	subl	%ebx, %ecx
+	movl	$16,%ecx
+	subl	%ebx,%ecx
 
 3:
-	subl	%ecx, %edx
+	subl	%ecx,%edx
 	rep
 	stosl
 
-	movl	%edx, %ecx
-	testl	%edx, %edx
+	movl	%edx,%ecx
+	testl	%edx,%edx
 	jnz	1b
 
 	popl	%ebx
 	popl	%edi
 	ret
+END(i686_pagezero)
 
 /* fillw(pat, base, cnt) */
 ENTRY(fillw)
@@ -433,6 +439,7 @@
 	stosw
 	popl	%edi
 	ret
+END(fillw)
 
 ENTRY(bcopyb)
 	pushl	%esi
@@ -464,10 +471,12 @@
 	popl	%esi
 	cld
 	ret
+END(bcopyb)
 
 ENTRY(bcopy)
 	MEXITCOUNT
 	jmp	*bcopy_vector
+END(bcopy)
 
 /*
  * generic_bcopy(src, dst, cnt)
@@ -517,6 +526,7 @@
 	popl	%esi
 	cld
 	ret
+END(generic_bcopy)
 
 #if defined(I586_CPU) && defined(DEV_NPX)
 ENTRY(i586_bcopy)
@@ -665,6 +675,7 @@
 	popl	%esi
 	cld
 	ret
+END(i586_bcopy)
 #endif /* I586_CPU && defined(DEV_NPX) */
 
 /*
@@ -688,7 +699,7 @@
 	popl	%esi
 	popl	%edi
 	ret
-
+END(memcpy)
 
 /*****************************************************************************/
 /* copyout and fubyte family                                                 */
@@ -714,6 +725,7 @@
 ENTRY(copyout)
 	MEXITCOUNT
 	jmp	*copyout_vector
+END(copyout)
 
 ENTRY(generic_copyout)
 	movl	PCPU(CURPCB),%eax
@@ -773,6 +785,7 @@
 	movl	PCPU(CURPCB),%edx
 	movl	%eax,PCB_ONFAULT(%edx)
 	ret
+END(generic_copyout)
 
 	ALIGN_TEXT
 copyout_fault:
@@ -836,6 +849,7 @@
 	call	fastmove
 	addl	$4,%esp
 	jmp	done_copyout
+END(i586_copyout)
 #endif /* I586_CPU && defined(DEV_NPX) */
 
 /*
@@ -844,6 +858,7 @@
 ENTRY(copyin)
 	MEXITCOUNT
 	jmp	*copyin_vector
+END(copyin)
 
 ENTRY(generic_copyin)
 	movl	PCPU(CURPCB),%eax
@@ -887,6 +902,7 @@
 	movl	PCPU(CURPCB),%edx
 	movl	%eax,PCB_ONFAULT(%edx)
 	ret
+END(generic_copyin)
 
 	ALIGN_TEXT
 copyin_fault:
@@ -930,6 +946,7 @@
 	call	fastmove
 	addl	$8,%esp
 	jmp	done_copyin
+END(i586_copyin)
 #endif /* I586_CPU && defined(DEV_NPX) */
 
 #if defined(I586_CPU) && defined(DEV_NPX)
@@ -1137,12 +1154,15 @@
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
 	ret
+END(fastmove)
 #endif /* I586_CPU && defined(DEV_NPX) */
 
 /*
- * casuptr.  Compare and set user pointer.  Returns -1 or the current value.
+ * casuword.  Compare and set user word.  Returns -1 or the current value.
  */
-ENTRY(casuptr)
+
+ALTENTRY(casuword32)
+ENTRY(casuword)
 	movl	PCPU(CURPCB),%ecx
 	movl	$fusufault,PCB_ONFAULT(%ecx)
 	movl	4(%esp),%edx			/* dst */
@@ -1155,7 +1175,7 @@
 #ifdef SMP
 	lock
 #endif
-	cmpxchgl %ecx, (%edx)			/* Compare and set. */
+	cmpxchgl %ecx,(%edx)			/* Compare and set. */
 
 	/*
 	 * The old value is in %eax.  If the store succeeded it will be the
@@ -1167,6 +1187,8 @@
 	movl	$fusufault,PCB_ONFAULT(%ecx)
 	movl	$0,PCB_ONFAULT(%ecx)
 	ret
+END(casuword32)
+END(casuword)
 
 /*
  * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user
@@ -1185,6 +1207,8 @@
 	movl	(%edx),%eax
 	movl	$0,PCB_ONFAULT(%ecx)
 	ret
+END(fuword32)
+END(fuword)
 
 /*
  * fuswintr() and suswintr() are specialized variants of fuword16() and
@@ -1197,6 +1221,8 @@
 ENTRY(fuswintr)
 	movl	$-1,%eax
 	ret
+END(suswintr)
+END(fuswintr)
 
 ENTRY(fuword16)
 	movl	PCPU(CURPCB),%ecx
@@ -1209,6 +1235,7 @@
 	movzwl	(%edx),%eax
 	movl	$0,PCB_ONFAULT(%ecx)
 	ret
+END(fuword16)
 
 ENTRY(fubyte)
 	movl	PCPU(CURPCB),%ecx
@@ -1221,6 +1248,7 @@
 	movzbl	(%edx),%eax
 	movl	$0,PCB_ONFAULT(%ecx)
 	ret
+END(fubyte)
 
 	ALIGN_TEXT
 fusufault:
@@ -1250,6 +1278,8 @@
 	movl	PCPU(CURPCB),%ecx
 	movl	%eax,PCB_ONFAULT(%ecx)
 	ret
+END(suword32)
+END(suword)
 
 ENTRY(suword16)
 	movl	PCPU(CURPCB),%ecx
@@ -1265,6 +1295,7 @@
 	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
 	movl	%eax,PCB_ONFAULT(%ecx)
 	ret
+END(suword16)
 
 ENTRY(subyte)
 	movl	PCPU(CURPCB),%ecx
@@ -1280,6 +1311,7 @@
 	movl	PCPU(CURPCB),%ecx		/* restore trashed register */
 	movl	%eax,PCB_ONFAULT(%ecx)
 	ret
+END(subyte)
 
 /*
  * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
@@ -1352,7 +1384,7 @@
 	popl	%edi
 	popl	%esi
 	ret
-
+END(copyinstr)
 
 /*
  * copystr(from, to, maxlen, int *lencopied) - MP SAFE
@@ -1394,6 +1426,7 @@
 	popl	%edi
 	popl	%esi
 	ret
+END(copystr)
 
 ENTRY(bcmp)
 	pushl	%edi
@@ -1419,7 +1452,7 @@
 	popl	%esi
 	popl	%edi
 	ret
-
+END(bcmp)
 
 /*
  * Handling of special 386 registers and descriptor tables etc
@@ -1449,6 +1482,7 @@
 	movl	$KCSEL,4(%esp)
 	MEXITCOUNT
 	lret
+END(lgdt)
 
 /* ssdtosd(*ssdp,*sdp) */
 ENTRY(ssdtosd)
@@ -1470,6 +1504,7 @@
 	movl	%ebx,4(%ecx)
 	popl	%ebx
 	ret
+END(ssdtosd)
 
 /* void reset_dbregs() */
 ENTRY(reset_dbregs)
@@ -1481,6 +1516,7 @@
 	movl    %eax,%dr3
 	movl    %eax,%dr6
 	ret
+END(reset_dbregs)
 
 /*****************************************************************************/
 /* setjump, longjump                                                         */
@@ -1497,6 +1533,7 @@
 	movl	%edx,20(%eax)			/* save eip */
 	xorl	%eax,%eax			/* return(0); */
 	ret
+END(setjmp)
 
 ENTRY(longjmp)
 	movl	4(%esp),%eax
@@ -1510,6 +1547,7 @@
 	xorl	%eax,%eax			/* return(1); */
 	incl	%eax
 	ret
+END(longjmp)
 
 /*
  * Support for BB-profiling (gcc -a).  The kernbb program will extract
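
[The support.s change is mechanical: every ENTRY() gains a matching END()
marker, which in FreeBSD's <machine/asm.h> of this era emits a .size
directive so each symbol gets a proper extent in the ELF symbol table,
something unwinders and profiling tools rely on.  A self-contained sketch of
the equivalent directives written by hand; GNU as on an ELF target is
assumed and the function name is hypothetical.]

/* What an ENTRY()/END() pair roughly expands to, spelled out. */
__asm__(
	".text\n"
	".globl	demo_ret42\n"
	".type	demo_ret42, @function\n"
	"demo_ret42:\n"
	"	movl	$42, %eax\n"
	"	ret\n"
	".size	demo_ret42, . - demo_ret42\n");	/* the END() part */

int demo_ret42(void);

int
main(void)
{
	return (demo_ret42() == 42 ? 0 : 1);
}
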
Index: mp_watchdog.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mp_watchdog.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/mp_watchdog.c -L sys/i386/i386/mp_watchdog.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/mp_watchdog.c
+++ sys/i386/i386/mp_watchdog.c
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/mp_watchdog.c,v 1.4 2005/02/27 22:34:07 pjd Exp $
+ * $FreeBSD: src/sys/i386/i386/mp_watchdog.c,v 1.5 2007/06/04 23:56:33 jeff Exp $
  */
 
 #include "opt_mp_watchdog.h"
@@ -105,9 +105,7 @@
 	 * locks to make sure.  Then reset the timer.
 	 */
 	mtx_lock(&Giant);
-	mtx_lock_spin(&sched_lock);
 	watchdog_timer = WATCHDOG_THRESHOLD;
-	mtx_unlock_spin(&sched_lock);
 	mtx_unlock(&Giant);
 	callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL);
 }
@@ -156,34 +154,6 @@
     sysctl_watchdog, "I", "");
 
 /*
- * A badly behaved sysctl that leaks the sched lock when written to.  Then
- * spin holding it just to make matters worse.  This can be used to test the
- * effectiveness of the watchdog by generating a fairly hard and nast hang.
- * Note that Giant is also held in the current world order when we get here.
- */
-static int
-sysctl_leak_schedlock(SYSCTL_HANDLER_ARGS)
-{
-	int error, temp;
-
-	temp = 0;
-	error = sysctl_handle_int(oidp, &temp, 0, req);
-	if (error)
-		return (error);
-
-	if (req->newptr != NULL) {
-		if (temp) {
-			printf("Leaking the sched lock...\n");
-			mtx_lock_spin(&sched_lock);
-			while (1);
-		}
-	}
-	return (0);
-}
-SYSCTL_PROC(_debug, OID_AUTO, leak_schedlock, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
-    sysctl_leak_schedlock, "IU", "");
-
-/*
  * Drop into the debugger by sending an IPI NMI to the boot processor.
  */
 static void
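
[mp_watchdog.c drops its sched_lock use, along with the deliberately
misbehaving leak_schedlock sysctl, now that resetting the timer needs no
scheduler lock.  The underlying pattern is a self-rearming callout that
proves liveness every second while an independent tick source counts down.
A kernel-style sketch under stated assumptions: EXAMPLE_THRESHOLD and both
function names are hypothetical, and only the reset side appears in the
diff above.]

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

#define	EXAMPLE_THRESHOLD	10	/* seconds of slack; hypothetical */

static struct callout example_callout;	/* callout_init() at attach time */
static volatile int example_timer = EXAMPLE_THRESHOLD;

/* Liveness side: take a contended lock to prove we can, then rearm. */
static void
example_function(void *arg)
{
	mtx_lock(&Giant);
	example_timer = EXAMPLE_THRESHOLD;
	mtx_unlock(&Giant);
	callout_reset(&example_callout, 1 * hz, example_function, NULL);
}

/* Independent tick source: if the countdown ever expires, the callout
 * side has wedged. */
static void
example_tick(void)
{
	if (--example_timer <= 0)
		panic("watchdog: system appears hung");
}
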
Index: mp_clock.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mp_clock.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/mp_clock.c -L sys/i386/i386/mp_clock.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/mp_clock.c
+++ sys/i386/i386/mp_clock.c
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/mp_clock.c,v 1.19 2004/05/30 20:34:57 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/mp_clock.c,v 1.20 2007/06/04 18:25:06 dwmalone Exp $");
 
 /*-
  * Just when we thought life were beautiful, reality pops its grim face over
@@ -71,7 +71,7 @@
 	if (piix_timecounter.tc_frequency == 0)
 		return (EOPNOTSUPP);
 	freq = piix_freq;
-	error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+	error = sysctl_handle_int(oidp, &freq, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		piix_freq = freq;
 		piix_timecounter.tc_frequency = piix_freq;
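
[The mp_clock.c fix passes 0 instead of sizeof(freq) as sysctl_handle_int()'s
third argument: that parameter is arg2, which the handler only consults when
the pointer argument is NULL, so a byte count there was misleading.  The
corrected handler shape, as a kernel-style sketch with a hypothetical OID and
variable:]

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int example_freq = 14318182;	/* hypothetical tunable */

static int
sysctl_example_freq(SYSCTL_HANDLER_ARGS)
{
	int error, freq;

	freq = example_freq;
	error = sysctl_handle_int(oidp, &freq, 0, req);
	if (error == 0 && req->newptr != NULL)
		example_freq = freq;	/* commit only after a good write */
	return (error);
}
SYSCTL_PROC(_debug, OID_AUTO, example_freq, CTLTYPE_INT | CTLFLAG_RW,
    0, 0, sysctl_example_freq, "I", "example frequency");
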
Index: mp_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mp_machdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/mp_machdep.c -L sys/i386/i386/mp_machdep.c -u -r1.2 -r1.3
--- sys/i386/i386/mp_machdep.c
+++ sys/i386/i386/mp_machdep.c
@@ -24,14 +24,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.252.2.5.2.1 2006/04/28 06:54:34 cperciva Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/mp_machdep.c,v 1.281.2.2 2007/11/28 23:24:07 cperciva Exp $");
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
-#include "opt_kdb.h"
 #include "opt_kstack_pages.h"
 #include "opt_mp_watchdog.h"
 #include "opt_sched.h"
+#include "opt_smp.h"
 
 #if !defined(lint)
 #if !defined(SMP)
@@ -61,6 +61,7 @@
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
@@ -71,12 +72,11 @@
 #include <vm/vm_extern.h>
 
 #include <machine/apicreg.h>
-#include <machine/clock.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pcb.h>
+#include <machine/psl.h>
 #include <machine/smp.h>
-#include <machine/smptests.h>	/** COUNT_XINVLTLB_HITS */
 #include <machine/specialreg.h>
 #include <machine/privatespace.h>
 
@@ -127,29 +127,9 @@
 
 #endif				/* CHECK_POINTS */
 
-/*
- * Values to send to the POST hardware.
- */
-#define MP_BOOTADDRESS_POST	0x10
-#define MP_PROBE_POST		0x11
-#define MPTABLE_PASS1_POST	0x12
-
-#define MP_START_POST		0x13
-#define MP_ENABLE_POST		0x14
-#define MPTABLE_PASS2_POST	0x15
-
-#define START_ALL_APS_POST	0x16
-#define INSTALL_AP_TRAMP_POST	0x17
-#define START_AP_POST		0x18
-
-#define MP_ANNOUNCE_POST	0x19
-
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
-/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
-int	current_postcode;
-
 int	mp_naps;		/* # of Applications processors */
 int	boot_cpu_id = -1;	/* designated BSP */
 extern	int nkpt;
@@ -177,19 +157,20 @@
 vm_offset_t smp_tlb_addr2;
 volatile int smp_tlb_wait;
 
-#ifdef KDB_STOP_NMI
+#ifdef STOP_NMI
 volatile cpumask_t ipi_nmi_pending;
+
+static void	ipi_nmi_selected(u_int32_t cpus);
 #endif 
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
-#ifdef IPI_PREEMPTION
 static u_long *ipi_preempt_counts[MAXCPU];
-#endif
 static u_long *ipi_ast_counts[MAXCPU];
 u_long *ipi_invltlb_counts[MAXCPU];
 u_long *ipi_invlrng_counts[MAXCPU];
 u_long *ipi_invlpg_counts[MAXCPU];
+u_long *ipi_invlcache_counts[MAXCPU];
 u_long *ipi_rendezvous_counts[MAXCPU];
 u_long *ipi_lazypmap_counts[MAXCPU];
 #endif
@@ -198,6 +179,20 @@
  * Local data and functions.
  */
 
+#ifdef STOP_NMI
+/* 
+ * Provide an alternate method of stopping other CPUs. If another CPU has
+ * disabled interrupts the conventional STOP IPI will be blocked. This 
+ * NMI-based stop should get through in that case.
+ */
+static int stop_cpus_with_nmi = 1;
+SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
+    &stop_cpus_with_nmi, 0, "");
+TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
+#else
+#define	stop_cpus_with_nmi	0
+#endif
+
 static u_int logical_cpus;
 
 /* used to hold the AP's until we are ready to release them */
@@ -214,24 +209,25 @@
 	int	cpu_present:1;
 	int	cpu_bsp:1;
 	int	cpu_disabled:1;
-} static cpu_info[MAXCPU];
-static int cpu_apic_ids[MAXCPU];
+} static cpu_info[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
 
 /* Holds pending bitmap based IPIs per CPU */
 static volatile u_int cpu_ipi_pending[MAXCPU];
 
 static u_int boot_address;
 
+static void	assign_cpu_ids(void);
+static void	install_ap_tramp(void);
 static void	set_interrupt_apic_ids(void);
 static int	start_all_aps(void);
-static void	install_ap_tramp(void);
 static int	start_ap(int apic_id);
 static void	release_aps(void *dummy);
 
 static int	hlt_logical_cpus;
 static u_int	hyperthreading_cpus;
 static cpumask_t	hyperthreading_cpus_mask;
-static int	hyperthreading_allowed;
+static int	hyperthreading_allowed = 1;
 static struct	sysctl_ctx_list logical_cpu_clist;
 
 static void
@@ -245,28 +241,25 @@
 mp_topology(void)
 {
 	struct cpu_group *group;
-	int logical_cpus;
 	int apic_id;
 	int groups;
 	int cpu;
 
 	/* Build the smp_topology map. */
 	/* Nothing to do if there is no HTT support. */
-	if ((cpu_feature & CPUID_HTT) == 0)
-		return;
-	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
-	if (logical_cpus <= 1)
+	if (hyperthreading_cpus <= 1)
 		return;
 	group = &mp_groups[0];
 	groups = 1;
-	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
+	for (cpu = 0, apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) {
 		if (!cpu_info[apic_id].cpu_present)
 			continue;
 		/*
 		 * If the current group has members and we're not a logical
 		 * cpu, create a new group.
 		 */
-		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
+		if (group->cg_count != 0 &&
+		    (apic_id % hyperthreading_cpus) == 0) {
 			group++;
 			groups++;
 		}
@@ -287,7 +280,6 @@
 u_int
 mp_bootaddress(u_int basemem)
 {
-	POSTCODE(MP_BOOTADDRESS_POST);
 
 	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
 	if ((basemem - boot_address) < bootMP_size)
@@ -300,9 +292,8 @@
 cpu_add(u_int apic_id, char boot_cpu)
 {
 
-	if (apic_id >= MAXCPU) {
-		printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
-		    apic_id, MAXCPU - 1);
+	if (apic_id > MAX_APIC_ID) {
+		panic("SMP: APIC ID %d too high", apic_id);
 		return;
 	}
 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
@@ -315,11 +306,11 @@
 		boot_cpu_id = apic_id;
 		cpu_info[apic_id].cpu_bsp = 1;
 	}
-	mp_ncpus++;
+	if (mp_ncpus < MAXCPU)
+		mp_ncpus++;
 	if (bootverbose)
 		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
 		    "AP");
-	
 }
 
 void
@@ -370,8 +361,6 @@
 	int i;
 	u_int threads_per_cache, p[4];
 
-	POSTCODE(MP_START_POST);
-
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
@@ -385,7 +374,11 @@
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
-	
+
+	/* Install an inter-CPU IPI for cache invalidation. */
+	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
 	/* Install an inter-CPU IPI for lazy pmap release */
 	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
@@ -412,6 +405,8 @@
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 	cpu_apic_ids[0] = boot_cpu_id;
 
+	assign_cpu_ids();
+
 	/* Start each Application Processor */
 	start_all_aps();
 
@@ -463,6 +458,9 @@
 	}
 
 	set_interrupt_apic_ids();
+
+	/* Last, setup the cpu topology now that we have probed CPUs */
+	mp_topology();
 }
 
 
@@ -474,11 +472,9 @@
 {
 	int i, x;
 
-	POSTCODE(MP_ANNOUNCE_POST);
-
 	/* List CPUs */
 	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
-	for (i = 1, x = 0; x < MAXCPU; x++) {
+	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
 		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
 			continue;
 		if (cpu_info[x].cpu_disabled)
@@ -564,6 +560,9 @@
 	lidt(&r_idt);
 #endif
 
+	/* Initialize the PAT MSR if present. */
+	pmap_init_pat();
+
 	/* set up CPU registers and state */
 	cpu_setregs();
 
@@ -573,6 +572,16 @@
 	/* set up SSE registers */
 	enable_sse();
 
+#ifdef PAE
+	/* Enable the PTE no-execute bit. */
+	if ((amd_feature & AMDID_NX) != 0) {
+		uint64_t msr;
+
+		msr = rdmsr(MSR_EFER) | EFER_NXE;
+		wrmsr(MSR_EFER, msr);
+	}
+#endif
+
 	/* A quick check from sanity claus */
 	if (PCPU_GET(apic_id) != lapic_id()) {
 		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
@@ -589,7 +598,7 @@
 	mtx_lock_spin(&ap_boot_mtx);
 
 	/* Init local apic for irq's */
-	lapic_setup();
+	lapic_setup(1);
 
 	/* Set memory range attributes for this CPU to match the BSP */
 	mem_range_AP_init();
@@ -626,25 +635,8 @@
 	while (smp_started == 0)
 		ia32_pause();
 
-	/* ok, now grab sched_lock and enter the scheduler */
-	mtx_lock_spin(&sched_lock);
-
-	/*
-	 * Correct spinlock nesting.  The idle thread context that we are
-	 * borrowing was created so that it would start out with a single
-	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
-	 * explicitly acquired locks in this function, the nesting count
-	 * is now 2 rather than 1.  Since we are nested, calling
-	 * spinlock_exit() will simply adjust the counts without allowing
-	 * spin lock using code to interrupt us.
-	 */
-	spinlock_exit();
-	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
-
-	binuptime(PCPU_PTR(switchtime));
-	PCPU_SET(switchticks, ticks);
-
-	cpu_throw(NULL, choosethread());	/* doesn't return */
+	/* enter the scheduler */
+	sched_throw(NULL);
 
 	panic("scheduler returned us to %s", __func__);
 	/* NOTREACHED */
@@ -664,24 +656,69 @@
 static void
 set_interrupt_apic_ids(void)
 {
-	u_int apic_id;
+	u_int i, apic_id;
 
-	for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
-		if (!cpu_info[apic_id].cpu_present)
+	for (i = 0; i < MAXCPU; i++) {
+		apic_id = cpu_apic_ids[i];
+		if (apic_id == -1)
 			continue;
 		if (cpu_info[apic_id].cpu_bsp)
 			continue;
+		if (cpu_info[apic_id].cpu_disabled)
+			continue;
 
 		/* Don't let hyperthreads service interrupts. */
 		if (hyperthreading_cpus > 1 &&
 		    apic_id % hyperthreading_cpus != 0)
 			continue;
 
-		intr_add_cpu(apic_id);
+		intr_add_cpu(i);
 	}
 }
 
 /*
+ * Assign logical CPU IDs to local APICs.
+ */
+static void
+assign_cpu_ids(void)
+{
+	u_int i;
+
+	/* Check for explicitly disabled CPUs. */
+	for (i = 0; i <= MAX_APIC_ID; i++) {
+		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
+			continue;
+
+		/* Don't use this CPU if it has been disabled by a tunable. */
+		if (resource_disabled("lapic", i)) {
+			cpu_info[i].cpu_disabled = 1;
+			continue;
+		}
+	}
+
+	/*
+	 * Assign CPU IDs to local APIC IDs and disable any CPUs
+	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
+	 * so we only have to assign IDs for APs.
+	 */
+	mp_ncpus = 1;
+	for (i = 0; i <= MAX_APIC_ID; i++) {
+		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
+		    cpu_info[i].cpu_disabled)
+			continue;
+
+		if (mp_ncpus < MAXCPU) {
+			cpu_apic_ids[mp_ncpus] = i;
+			mp_ncpus++;
+		} else
+			cpu_info[i].cpu_disabled = 1;
+	}
+	KASSERT(mp_maxid >= mp_ncpus - 1,
+	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+	    mp_ncpus));		
+}
+
+/*
  * start each AP in our list
  */
 static int
@@ -696,8 +733,6 @@
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu, i, pg;
 
-	POSTCODE(START_ALL_APS_POST);
-
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	/* install the AP 1st level boot code */
@@ -719,24 +754,8 @@
 	invltlb();
 
 	/* start each AP */
-	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
-
-		/* Ignore non-existent CPUs and the BSP. */
-		if (!cpu_info[apic_id].cpu_present ||
-		    cpu_info[apic_id].cpu_bsp)
-			continue;
-
-		/* Don't use this CPU if it has been disabled by a tunable. */
-		if (resource_disabled("lapic", apic_id)) {
-			cpu_info[apic_id].cpu_disabled = 1;
-			mp_ncpus--;
-			continue;
-		}
-
-		cpu++;
-
-		/* save APIC ID for this logical ID */
-		cpu_apic_ids[cpu] = apic_id;
+	for (cpu = 1; cpu < mp_ncpus; cpu++) {
+		apic_id = cpu_apic_ids[cpu];
 
 		/* first page of AP's private space */
 		pg = cpu * i386_btop(sizeof(struct privatespace));
@@ -841,8 +860,6 @@
 	u_int16_t *dst16;
 	u_int32_t *dst32;
 
-	POSTCODE(INSTALL_AP_TRAMP_POST);
-
 	KASSERT (size <= PAGE_SIZE,
 	    ("'size' do not fit into PAGE_SIZE, as expected."));
 	pmap_kenter(va, boot_address);
@@ -893,8 +910,6 @@
 	int vector, ms;
 	int cpus;
 
-	POSTCODE(START_AP_POST);
-
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
@@ -1008,13 +1023,16 @@
 	ncpu = mp_ncpus - 1;	/* does not shootdown self */
 	if (ncpu < 1)
 		return;		/* no other cpus */
-	mtx_assert(&smp_ipi_mtx, MA_OWNED);
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	ipi_all_but_self(vector);
 	while (smp_tlb_wait < ncpu)
 		ia32_pause();
+	mtx_unlock_spin(&smp_ipi_mtx);
 }
 
 static void
@@ -1042,7 +1060,9 @@
 		if (ncpu < 1)
 			return;
 	}
-	mtx_assert(&smp_ipi_mtx, MA_OWNED);
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
@@ -1052,6 +1072,15 @@
 		ipi_selected(mask, vector);
 	while (smp_tlb_wait < ncpu)
 		ia32_pause();
+	mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+void
+smp_cache_flush(void)
+{
+
+	if (smp_started)
+		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
 }
 
 void
@@ -1128,36 +1157,30 @@
 	}
 }
 
-
 void
-ipi_bitmap_handler(struct clockframe frame)
+ipi_bitmap_handler(struct trapframe frame)
 {
 	int cpu = PCPU_GET(cpuid);
 	u_int ipi_bitmap;
 
 	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
 
-#ifdef IPI_PREEMPTION
-	if (ipi_bitmap & IPI_PREEMPT) {
+	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+		struct thread *running_thread = curthread;
 #ifdef COUNT_IPIS
-		*ipi_preempt_counts[cpu]++;
+		(*ipi_preempt_counts[cpu])++;
 #endif
-		mtx_lock_spin(&sched_lock);
-		/* Don't preempt the idle thread */
-		if (curthread->td_priority <  PRI_MIN_IDLE) {
-			struct thread *running_thread = curthread;
-			if (running_thread->td_critnest > 1) 
-				running_thread->td_owepreempt = 1;
-			else 		
-				mi_switch(SW_INVOL | SW_PREEMPT, NULL);
-		}
-		mtx_unlock_spin(&sched_lock);
+		thread_lock(running_thread);
+		if (running_thread->td_critnest > 1) 
+			running_thread->td_owepreempt = 1;
+		else 		
+			mi_switch(SW_INVOL | SW_PREEMPT, NULL);
+		thread_unlock(running_thread);
 	}
-#endif
 
-	if (ipi_bitmap & IPI_AST) {
+	if (ipi_bitmap & (1 << IPI_AST)) {
 #ifdef COUNT_IPIS
-		*ipi_ast_counts[cpu]++;
+		(*ipi_ast_counts[cpu])++;
 #endif
 		/* Nothing to do for AST */
 	}
@@ -1179,6 +1202,12 @@
 		ipi = IPI_BITMAP_VECTOR;
 	}
 
+#ifdef STOP_NMI
+	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
+		ipi_nmi_selected(cpus);
+		return;
+	}
+#endif
 	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
 	while ((cpu = ffs(cpus)) != 0) {
 		cpu--;
@@ -1209,6 +1238,10 @@
 ipi_all(u_int ipi)
 {
 
+	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
+		ipi_selected(all_cpus, ipi);
+		return;
+	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
 }
@@ -1220,6 +1253,10 @@
 ipi_all_but_self(u_int ipi)
 {
 
+	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
+		ipi_selected(PCPU_GET(other_cpus), ipi);
+		return;
+	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 }
@@ -1231,11 +1268,15 @@
 ipi_self(u_int ipi)
 {
 
+	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
+		ipi_selected(PCPU_GET(cpumask), ipi);
+		return;
+	}
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
 }
 
-#ifdef KDB_STOP_NMI
+#ifdef STOP_NMI
 /*
  * send NMI IPI to selected CPUs
  */
@@ -1245,7 +1286,6 @@
 void
 ipi_nmi_selected(u_int32_t cpus)
 {
-
 	int cpu;
 	register_t icrlo;
 
@@ -1254,10 +1294,8 @@
 	
 	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
 
-
 	atomic_set_int(&ipi_nmi_pending, cpus);
 
-
 	while ((cpu = ffs(cpus)) != 0) {
 		cpu--;
 		cpus &= ~(1 << cpu);
@@ -1269,41 +1307,52 @@
 		if (!lapic_ipi_wait(BEFORE_SPIN))
 			panic("ipi_nmi_selected: previous IPI has not cleared");
 
-		lapic_ipi_raw(icrlo,cpu_apic_ids[cpu]);
+		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
 	}
 }
 
-
 int
-ipi_nmi_handler()
+ipi_nmi_handler(void)
 {
-	int cpu  = PCPU_GET(cpuid);
+	int cpumask = PCPU_GET(cpumask);
 
-	if(!(atomic_load_acq_int(&ipi_nmi_pending) & (1 << cpu)))
+	if (!(ipi_nmi_pending & cpumask))
 		return 1;
 
-	atomic_clear_int(&ipi_nmi_pending,1 << cpu);
+	atomic_clear_int(&ipi_nmi_pending, cpumask);
+	cpustop_handler();
+	return 0;
+}
+
+#endif /* STOP_NMI */
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+	int cpu = PCPU_GET(cpuid);
+	int cpumask = PCPU_GET(cpumask);
 
 	savectx(&stoppcbs[cpu]);
 
 	/* Indicate that we are stopped */
-	atomic_set_int(&stopped_cpus,1 << cpu);
-
+	atomic_set_int(&stopped_cpus, cpumask);
 
 	/* Wait for restart */
-	while(!(atomic_load_acq_int(&started_cpus) & (1 << cpu)))
+	while (!(started_cpus & cpumask))
 	    ia32_pause();
 
-	atomic_clear_int(&started_cpus,1 << cpu);
-	atomic_clear_int(&stopped_cpus,1 << cpu);
+	atomic_clear_int(&started_cpus, cpumask);
+	atomic_clear_int(&stopped_cpus, cpumask);
 
-	if(cpu == 0 && cpustop_restartfunc != NULL)
+	if (cpu == 0 && cpustop_restartfunc != NULL) {
 		cpustop_restartfunc();
-
-	return 0;
+		cpustop_restartfunc = NULL;
+	}
 }
-     
-#endif /* KDB_STOP_NMI */
 
 /*
  * This is called once the rest of the system is up and running and we're
@@ -1315,11 +1364,9 @@
 
 	if (mp_ncpus == 1) 
 		return;
-	mtx_lock_spin(&sched_lock);
 	atomic_store_rel_int(&aps_ready, 1);
 	while (smp_started == 0)
 		ia32_pause();
-	mtx_unlock_spin(&sched_lock);
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
@@ -1482,10 +1529,8 @@
 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: invlpg", i);
 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
-#ifdef IPI_PREEMPTION
 		snprintf(buf, sizeof(buf), "cpu%d: preempt", i);
 		intrcnt_add(buf, &ipi_preempt_counts[i]);
-#endif
 		snprintf(buf, sizeof(buf), "cpu%d: ast", i);
 		intrcnt_add(buf, &ipi_ast_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i);
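
Several routines above walk a CPU mask with the same idiom: ffs(3) returns the
1-based index of the lowest set bit, the bit is cleared, and the loop repeats until
the mask is empty. A userland sketch of that loop, with a hypothetical send_ipi()
standing in for lapic_ipi_vectored():

    #include <stdio.h>
    #include <strings.h>        /* ffs() */

    static void
    send_ipi(int cpu)
    {
        /* Stand-in for lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]). */
        printf("IPI -> cpu%d\n", cpu);
    }

    int
    main(void)
    {
        unsigned int cpus = 0x2d;       /* CPUs 0, 2, 3, 5 */
        int cpu;

        while ((cpu = ffs(cpus)) != 0) {
            cpu--;                      /* ffs() is 1-based */
            cpus &= ~(1u << cpu);
            send_ipi(cpu);
        }
        return (0);
    }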
Index: io_apic.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/io_apic.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/io_apic.c -L sys/i386/i386/io_apic.c -u -r1.2 -r1.3
--- sys/i386/i386/io_apic.c
+++ sys/i386/i386/io_apic.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/io_apic.c,v 1.20.2.4 2006/03/07 18:33:21 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/io_apic.c,v 1.35 2007/06/05 18:57:48 jhb Exp $");
 
 #include "opt_isa.h"
 
@@ -36,11 +36,15 @@
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
-#include <sys/malloc.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
@@ -48,6 +52,7 @@
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/apicvar.h>
+#include <machine/resource.h>
 #include <machine/segments.h>
 
 #define IOAPIC_ISA_INTS		16
@@ -60,9 +65,7 @@
 #define	IRQ_SMI			(NUM_IO_INTS + 3)
 #define	IRQ_DISABLED		(NUM_IO_INTS + 4)
 
-#define	TODO		printf("%s: not implemented!\n", __func__)
-
-static MALLOC_DEFINE(M_IOAPIC, "I/O APIC", "I/O APIC structures");
+static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
 
 /*
  * I/O APIC interrupt source driver.  Each pin is assigned an IRQ cookie
@@ -72,6 +75,10 @@
  * IRQs behave as PCI IRQs by default.  We also assume that the pin for
  * IRQ 0 is actually an ExtINT pin.  The apic enumerators override the
  * configuration of individual pins as indicated by their tables.
+ *
+ * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable
+ * Interrupt Controller (IOAPIC)", May 1996, Intel Corp.
+ * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf
  */
 
 struct ioapic_intsrc {
@@ -84,6 +91,7 @@
 	u_int io_edgetrigger:1;
 	u_int io_masked:1;
 	int io_bus:4;
+	uint32_t io_lowreg;
 };
 
 struct ioapic {
@@ -93,6 +101,7 @@
 	u_int io_intbase:8;		/* System Interrupt base */
 	u_int io_numintr:8;
 	volatile ioapic_t *io_addr;	/* XXX: should use bus_space */
+	vm_paddr_t io_paddr;
 	STAILQ_ENTRY(ioapic) io_next;
 	struct ioapic_intsrc io_pins[0];
 };
@@ -105,20 +114,20 @@
 static void	ioapic_disable_source(struct intsrc *isrc, int eoi);
 static void	ioapic_eoi_source(struct intsrc *isrc);
 static void	ioapic_enable_intr(struct intsrc *isrc);
+static void	ioapic_disable_intr(struct intsrc *isrc);
 static int	ioapic_vector(struct intsrc *isrc);
 static int	ioapic_source_pending(struct intsrc *isrc);
 static int	ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
 		    enum intr_polarity pol);
-static void	ioapic_suspend(struct intsrc *isrc);
-static void	ioapic_resume(struct intsrc *isrc);
+static void	ioapic_resume(struct pic *pic);
 static void	ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void	ioapic_program_intpin(struct ioapic_intsrc *intpin);
 
 static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
 struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source,
 			       ioapic_eoi_source, ioapic_enable_intr,
-			       ioapic_vector, ioapic_source_pending,
-			       ioapic_suspend, ioapic_resume,
+			       ioapic_disable_intr, ioapic_vector,
+			       ioapic_source_pending, NULL, ioapic_resume,
 			       ioapic_config_intr, ioapic_assign_cpu };
 
 static int next_ioapic_base;
@@ -202,9 +211,7 @@
 
 	mtx_lock_spin(&icu_lock);
 	if (intpin->io_masked) {
-		flags = ioapic_read(io->io_addr,
-		    IOAPIC_REDTBL_LO(intpin->io_intpin));
-		flags &= ~(IOART_INTMASK);
+		flags = intpin->io_lowreg & ~IOART_INTMASK;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 0;
@@ -221,9 +228,7 @@
 
 	mtx_lock_spin(&icu_lock);
 	if (!intpin->io_masked && !intpin->io_edgetrigger) {
-		flags = ioapic_read(io->io_addr,
-		    IOAPIC_REDTBL_LO(intpin->io_intpin));
-		flags |= IOART_INTMSET;
+		flags = intpin->io_lowreg | IOART_INTMSET;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 1;
@@ -308,6 +313,7 @@
 
 	/* Write the values to the APIC. */
 	mtx_lock_spin(&icu_lock);
+	intpin->io_lowreg = low;
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
 	value = ioapic_read(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin));
 	value &= ~IOART_DEST;
@@ -354,6 +360,23 @@
 	}
 }
 
+static void
+ioapic_disable_intr(struct intsrc *isrc)
+{
+	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
+	u_int vector;
+
+	if (intpin->io_vector != 0) {
+		/* Mask this interrupt pin and free its APIC vector. */
+		vector = intpin->io_vector;
+		apic_disable_vector(vector);
+		intpin->io_masked = 1;
+		intpin->io_vector = 0;
+		ioapic_program_intpin(intpin);
+		apic_free_vector(vector, intpin->io_irq);
+	}
+}
+
 static int
 ioapic_vector(struct intsrc *isrc)
 {
@@ -416,24 +439,20 @@
 }
 
 static void
-ioapic_suspend(struct intsrc *isrc)
-{
-
-	TODO;
-}
-
-static void
-ioapic_resume(struct intsrc *isrc)
+ioapic_resume(struct pic *pic)
 {
+	struct ioapic *io = (struct ioapic *)pic;
+	int i;
 
-	ioapic_program_intpin((struct ioapic_intsrc *)isrc);
+	for (i = 0; i < io->io_numintr; i++)
+		ioapic_program_intpin(&io->io_pins[i]);
 }
 
 /*
  * Create a plain I/O APIC object.
  */
 void *
-ioapic_create(uintptr_t addr, int32_t apic_id, int intbase)
+ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 {
 	struct ioapic *io;
 	struct ioapic_intsrc *intpin;
@@ -442,7 +461,7 @@
 	uint32_t value;
 
 	/* Map the register window so we can access the device. */
-	apic = (ioapic_t *)pmap_mapdev(addr, IOAPIC_MEM_REGION);
+	apic = pmap_mapdev(addr, IOAPIC_MEM_REGION);
 	mtx_lock_spin(&icu_lock);
 	value = ioapic_read(apic, IOAPIC_VER);
 	mtx_unlock_spin(&icu_lock);
@@ -473,13 +492,14 @@
 		intbase = next_ioapic_base;
 		printf("ioapic%u: Assuming intbase of %d\n", io->io_id,
 		    intbase);
-	} else if (intbase != next_ioapic_base)
+	} else if (intbase != next_ioapic_base && bootverbose)
 		printf("ioapic%u: WARNING: intbase %d != expected base %d\n",
 		    io->io_id, intbase, next_ioapic_base);
 	io->io_intbase = intbase;
 	next_ioapic_base = intbase + numintr;
 	io->io_numintr = numintr;
 	io->io_addr = apic;
+	io->io_paddr = addr;
 
 	/*
 	 * Initialize pins.  Start off with interrupts disabled.  Default
@@ -517,13 +537,6 @@
 		 * be routed to other CPUs later after they are enabled.
 		 */
 		intpin->io_cpu = PCPU_GET(apic_id);
-		if (bootverbose && intpin->io_irq != IRQ_DISABLED) {
-			printf("ioapic%u: intpin %d -> ",  io->io_id, i);
-			ioapic_print_irq(intpin);
-			printf(" (%s, %s)\n", intpin->io_edgetrigger ?
-			    "edge" : "level", intpin->io_activehi ? "high" :
-			    "low");
-		}
 		value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
 		ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
 	}
@@ -588,6 +601,8 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
+	if (io->io_pins[pin].io_bus == bus_type)
+		return (0);
 	io->io_pins[pin].io_bus = bus_type;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin,
@@ -671,13 +686,17 @@
 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
 {
 	struct ioapic *io;
+	int activehi;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
-	io->io_pins[pin].io_activehi = (pol == INTR_POLARITY_HIGH);
+	activehi = (pol == INTR_POLARITY_HIGH);
+	if (io->io_pins[pin].io_activehi == activehi)
+		return (0);
+	io->io_pins[pin].io_activehi = activehi;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin,
 		    pol == INTR_POLARITY_HIGH ? "high" : "low");
@@ -688,13 +707,17 @@
 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
 {
 	struct ioapic *io;
+	int edgetrigger;
 
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
 		return (EINVAL);
-	io->io_pins[pin].io_edgetrigger = (trigger == INTR_TRIGGER_EDGE);
+	edgetrigger = (trigger == INTR_TRIGGER_EDGE);
+	if (io->io_pins[pin].io_edgetrigger == edgetrigger)
+		return (0);
+	io->io_pins[pin].io_edgetrigger = edgetrigger;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin,
 		    trigger == INTR_TRIGGER_EDGE ? "edge" : "level");
@@ -724,7 +747,129 @@
 	    io->io_intbase + io->io_numintr - 1);
 
 	/* Register valid pins as interrupt sources. */
+	intr_register_pic(&io->io_pic);
 	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++)
 		if (pin->io_irq < NUM_IO_INTS)
 			intr_register_source(&pin->io_intsrc);
 }
+
+/* A simple new-bus driver to consume PCI I/O APIC devices. */
+static int
+ioapic_pci_probe(device_t dev)
+{
+
+	if (pci_get_class(dev) == PCIC_BASEPERIPH &&
+	    pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) {
+		switch (pci_get_progif(dev)) {
+		case PCIP_BASEPERIPH_PIC_IO_APIC:
+			device_set_desc(dev, "IO APIC");
+			break;
+		case PCIP_BASEPERIPH_PIC_IOX_APIC:
+			device_set_desc(dev, "IO(x) APIC");
+			break;
+		default:
+			return (ENXIO);
+		}
+		device_quiet(dev);
+		return (-10000);
+	}
+	return (ENXIO);
+}
+
+static int
+ioapic_pci_attach(device_t dev)
+{
+
+	return (0);
+}
+
+static device_method_t ioapic_pci_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		ioapic_pci_probe),
+	DEVMETHOD(device_attach,	ioapic_pci_attach),
+
+	{ 0, 0 }
+};
+
+DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0);
+
+static devclass_t ioapic_devclass;
+DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
+
+/*
+ * A new-bus driver to consume the memory resources associated with
+ * the APICs in the system.  On some systems ACPI or PnPBIOS system
+ * resource devices may already claim these resources.  To keep from
+ * breaking those devices, we attach ourselves to the nexus device after
+ * legacy0 and acpi0 and ignore any allocation failures.
+ */
+static void
+apic_identify(driver_t *driver, device_t parent)
+{
+
+	/*
+	 * Add at order 12.  acpi0 is probed at order 10 and legacy0
+	 * is probed at order 11.
+	 */
+	if (lapic_paddr != 0)
+		BUS_ADD_CHILD(parent, 12, "apic", 0);
+}
+
+static int
+apic_probe(device_t dev)
+{
+
+	device_set_desc(dev, "APIC resources");
+	device_quiet(dev);
+	return (0);
+}
+
+static void
+apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length)
+{
+	int error;
+
+#ifdef PAE
+	/*
+	 * The resource code uses longs to track address ranges, so
+	 * we can't include memory regions above 4GB.
+	 */
+	if (base >= ~0ul)
+		return;
+#endif
+	error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
+	if (error)
+		panic("apic_add_resource: resource %d failed set with %d", rid,
+		    error);
+	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
+}
+
+static int
+apic_attach(device_t dev)
+{
+	struct ioapic *io;
+	int i;
+
+	/* Reserve the local APIC. */
+	apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t));
+	i = 1;
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION);
+		i++;
+	}
+	return (0);
+}
+
+static device_method_t apic_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_identify,	apic_identify),
+	DEVMETHOD(device_probe,		apic_probe),
+	DEVMETHOD(device_attach,	apic_attach),
+
+	{ 0, 0 }
+};
+
+DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0);
+
+static devclass_t apic_devclass;
+DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0);
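
A recurring change in io_apic.c is the new io_lowreg field: the low 32 bits of
each redirection entry are cached at programming time, so the mask/unmask paths
can compute the new value without a device read inside the icu spin lock. A
minimal model of that pattern, with the register file faked by an array (all
names here are illustrative, not the kernel's):

    #include <stdint.h>
    #include <stdio.h>

    #define IOART_INTMASK   0x00010000u     /* mask bit in the low word */

    static uint32_t redtbl_lo[24];          /* fake I/O APIC register file */

    struct intpin {
        int      pin;
        int      masked;
        uint32_t lowreg;                    /* cached copy of redtbl_lo[pin] */
    };

    static void
    pin_write(struct intpin *ip, uint32_t low)
    {
        ip->lowreg = low;                   /* keep the cache coherent ... */
        redtbl_lo[ip->pin] = low;           /* ... on every programming write */
    }

    static void
    pin_mask(struct intpin *ip, int mask)
    {
        /* No device read needed: start from the cached low word. */
        pin_write(ip, mask ? ip->lowreg | IOART_INTMASK
                           : ip->lowreg & ~IOART_INTMASK);
        ip->masked = mask;
    }

    int
    main(void)
    {
        struct intpin ip = { .pin = 4 };

        pin_write(&ip, 0x0000a030);         /* program vector etc. */
        pin_mask(&ip, 1);
        pin_mask(&ip, 0);
        printf("redtbl_lo[4] = %#x, masked = %d\n",
            (unsigned)redtbl_lo[4], ip.masked);
        return (0);
    }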
Index: trap.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/trap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/trap.c -L sys/i386/i386/trap.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/trap.c
+++ sys/i386/i386/trap.c
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/trap.c,v 1.277.2.3 2005/11/28 20:03:04 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/trap.c,v 1.307.2.1 2007/12/06 14:20:24 kib Exp $");
 
 /*
  * 386 Trap and System call handling
@@ -77,6 +77,7 @@
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
+#include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -101,8 +102,8 @@
 #include <machine/clock.h>
 #endif
 
-extern void trap(struct trapframe frame);
-extern void syscall(struct trapframe frame);
+extern void trap(struct trapframe *frame);
+extern void syscall(struct trapframe *frame);
 
 static int trap_pfault(struct trapframe *, int, vm_offset_t);
 static void trap_fatal(struct trapframe *, vm_offset_t);
@@ -157,10 +158,11 @@
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 	&panic_on_nmi, 0, "Panic on NMI");
+static int prot_fault_translation = 0;
+SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
+	&prot_fault_translation, 0, "Select signal to deliver on protection fault");
 
-#ifdef WITNESS
 extern char *syscallnames[];
-#endif
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
@@ -170,28 +172,31 @@
  */
 
 void
-trap(frame)
-	struct trapframe frame;
+trap(struct trapframe *frame)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
-	u_int sticks = 0;
-	int i = 0, ucode = 0, type, code;
+	int i = 0, ucode = 0, code;
+	u_int type;
+	register_t addr = 0;
 	vm_offset_t eva;
+	ksiginfo_t ksi;
 #ifdef POWERFAIL_NMI
 	static int lastalert = 0;
 #endif
 
-	PCPU_LAZY_INC(cnt.v_trap);
-	type = frame.tf_trapno;
+	PCPU_INC(cnt.v_trap);
+	type = frame->tf_trapno;
 
-#ifdef KDB_STOP_NMI
-	/* Handler for NMI IPIs used for debugging */
+#ifdef SMP
+#ifdef STOP_NMI
+	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
-#endif /* KDB_STOP_NMI */
+#endif /* STOP_NMI */
+#endif /* SMP */
 
 #ifdef KDB
 	if (kdb_active) {
@@ -209,12 +214,12 @@
 	 * return immediately.
 	 */
 	if (type == T_NMI && pmc_intr &&
-	    (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame.tf_eip,
-		TRAPF_USERMODE(&frame)))
+	    (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame->tf_eip,
+		TRAPF_USERMODE(frame)))
 	    goto out;
 #endif
 
-	if ((frame.tf_eflags & PSL_I) == 0) {
+	if ((frame->tf_eflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
@@ -222,12 +227,12 @@
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
-		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
+		if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
 			printf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curproc->p_comm, type);
 		else if (type != T_BPTFLT && type != T_TRCTRAP &&
-			 frame.tf_eip != (int)cpu_switch_load_gs) {
+			 frame->tf_eip != (int)cpu_switch_load_gs) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
@@ -236,17 +241,17 @@
 			    type);
 			/*
 			 * Page faults need interrupts disabled until later,
-			 * and we shouldn't enable interrupts while in a
-			 * critical section or if servicing an NMI.
+			 * and we shouldn't enable interrupts while holding
+			 * a spin lock or if servicing an NMI.
 			 */
 			if (type != T_NMI && type != T_PAGEFLT &&
-			    td->td_critnest == 0)
+			    td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	eva = 0;
-	code = frame.tf_err;
+	code = frame->tf_err;
 	if (type == T_PAGEFLT) {
 		/*
 		 * For some Cyrix CPUs, %cr2 is clobbered by
@@ -261,35 +266,45 @@
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
+		 *
+		 * If we get a page fault while holding a non-sleepable
+		 * lock, then it is most likely a fatal kernel page fault.
+		 * If WITNESS is enabled, then it's going to whine about
+		 * bogus LORs with various VM locks, so just skip to the
+		 * fatal trap handling directly.
 		 */
 		eva = rcr2();
-		if (td->td_critnest == 0)
-			enable_intr();
+		if (td->td_critnest != 0 ||
+		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
+		    "Kernel page fault") != 0)
+			trap_fatal(frame, eva);
 		else
-			trap_fatal(&frame, eva);
+			enable_intr();
 	}
 
-        if ((ISPL(frame.tf_cs) == SEL_UPL) ||
-	    ((frame.tf_eflags & PSL_VM) && 
+        if ((ISPL(frame->tf_cs) == SEL_UPL) ||
+	    ((frame->tf_eflags & PSL_VM) && 
 		!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) {
 		/* user trap */
 
-		sticks = td->td_sticks;
-		td->td_frame = &frame;
+		td->td_pticks = 0;
+		td->td_frame = frame;
+		addr = frame->tf_eip;
 		if (td->td_ucred != p->p_ucred) 
 			cred_update_thread(td);
 
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
-			ucode = type;
 			i = SIGILL;
+			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
-			frame.tf_eflags &= ~PSL_T;
+			frame->tf_eflags &= ~PSL_T;
 			i = SIGTRAP;
+			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
@@ -298,7 +313,7 @@
 			if (ucode == -1)
 				goto userout;
 #else
-			ucode = code;
+			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
@@ -310,27 +325,36 @@
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
-			if (frame.tf_eflags & PSL_VM) {
-				i = vm86_emulate((struct vm86frame *)&frame);
+			if (frame->tf_eflags & PSL_VM) {
+				i = vm86_emulate((struct vm86frame *)frame);
 				if (i == 0)
 					goto user;
 				break;
 			}
-			/* FALLTHROUGH */
-
+			i = SIGBUS;
+			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
+			break;
 		case T_SEGNPFLT:	/* segment not present fault */
+			i = SIGBUS;
+			ucode = BUS_ADRERR;
+			break;
 		case T_TSSFLT:		/* invalid TSS fault */
+			i = SIGBUS;
+			ucode = BUS_OBJERR;
+			break;
 		case T_DOUBLEFLT:	/* double fault */
 		default:
-			ucode = code + BUS_SEGM_FAULT ;
 			i = SIGBUS;
+			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
+#ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				thread_user_enter(td);
+#endif
 
-			i = trap_pfault(&frame, TRUE, eva);
+			i = trap_pfault(frame, TRUE, eva);
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 			if (i == -2) {
 				/*
@@ -338,10 +362,10 @@
 				 * treat the fault as an illegal instruction 
 				 * (T_PRIVINFLT) instead of a page fault.
 				 */
-				type = frame.tf_trapno = T_PRIVINFLT;
+				type = frame->tf_trapno = T_PRIVINFLT;
 
 				/* Proceed as in that case. */
-				ucode = type;
+				ucode = ILL_PRVOPC;
 				i = SIGILL;
 				break;
 			}
@@ -351,7 +375,37 @@
 			if (i == 0)
 				goto user;
 
-			ucode = T_PAGEFLT;
+			if (i == SIGSEGV)
+				ucode = SEGV_MAPERR;
+			else {
+				if (prot_fault_translation == 0) {
+					/*
+					 * Autodetect.
+					 * This check also covers the images
+					 * without the ABI-tag ELF note.
+					 */
+					if (p->p_osrel >= 700004) {
+						i = SIGSEGV;
+						ucode = SEGV_ACCERR;
+					} else {
+						i = SIGBUS;
+						ucode = BUS_PAGE_FAULT;
+					}
+				} else if (prot_fault_translation == 1) {
+					/*
+					 * Always compat mode.
+					 */
+					i = SIGBUS;
+					ucode = BUS_PAGE_FAULT;
+				} else {
+					/*
+					 * Always SIGSEGV mode.
+					 */
+					i = SIGSEGV;
+					ucode = SEGV_ACCERR;
+				}
+			}
+			addr = eva;
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
@@ -384,7 +438,7 @@
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
-					kdb_trap(type, 0, &frame);
+					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto userout;
@@ -410,12 +464,14 @@
 			if (npxdna())
 				goto userout;
 #endif
-			i = SIGFPE;
-			ucode = FPE_FPU_NP_TRAP;
+			printf("pid %d killed due to lack of floating point\n",
+				p->p_pid);
+			i = SIGKILL;
+			ucode = 0;
 			break;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
-			ucode = T_FPOPFLT;
+			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
@@ -431,7 +487,7 @@
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
-			(void) trap_pfault(&frame, FALSE, eva);
+			(void) trap_pfault(frame, FALSE, eva);
 			goto out;
 
 		case T_DNA:
@@ -453,13 +509,13 @@
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
-			if (frame.tf_eflags & PSL_VM) {
-				i = vm86_emulate((struct vm86frame *)&frame);
+			if (frame->tf_eflags & PSL_VM) {
+				i = vm86_emulate((struct vm86frame *)frame);
 				if (i != 0)
 					/*
 					 * returns to original process
 					 */
-					vm86_trap((struct vm86frame *)&frame);
+					vm86_trap((struct vm86frame *)frame);
 				goto out;
 			}
 			if (type == T_STKFLT)
@@ -480,7 +536,7 @@
 			 * (XXX) so that we can continue, and generate
 			 * a signal.
 			 */
-			if (frame.tf_eip == (int)cpu_switch_load_gs) {
+			if (frame->tf_eip == (int)cpu_switch_load_gs) {
 				PCPU_GET(curpcb)->pcb_gs = 0;
 #if 0				
 				PROC_LOCK(p);
@@ -503,24 +559,24 @@
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
-			if (frame.tf_eip == (int)doreti_iret) {
-				frame.tf_eip = (int)doreti_iret_fault;
+			if (frame->tf_eip == (int)doreti_iret) {
+				frame->tf_eip = (int)doreti_iret_fault;
 				goto out;
 			}
-			if (frame.tf_eip == (int)doreti_popl_ds) {
-				frame.tf_eip = (int)doreti_popl_ds_fault;
+			if (frame->tf_eip == (int)doreti_popl_ds) {
+				frame->tf_eip = (int)doreti_popl_ds_fault;
 				goto out;
 			}
-			if (frame.tf_eip == (int)doreti_popl_es) {
-				frame.tf_eip = (int)doreti_popl_es_fault;
+			if (frame->tf_eip == (int)doreti_popl_es) {
+				frame->tf_eip = (int)doreti_popl_es_fault;
 				goto out;
 			}
-			if (frame.tf_eip == (int)doreti_popl_fs) {
-				frame.tf_eip = (int)doreti_popl_fs_fault;
+			if (frame->tf_eip == (int)doreti_popl_fs) {
+				frame->tf_eip = (int)doreti_popl_fs_fault;
 				goto out;
 			}
 			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
-				frame.tf_eip =
+				frame->tf_eip =
 				    (int)PCPU_GET(curpcb)->pcb_onfault;
 				goto out;
 			}
@@ -536,14 +592,14 @@
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
-			if (frame.tf_eflags & PSL_NT) {
-				frame.tf_eflags &= ~PSL_NT;
+			if (frame->tf_eflags & PSL_NT) {
+				frame->tf_eflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
-			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
+			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
 				/*
 				 * We've just entered system mode via the
 				 * syscall lcall.  Continue single stepping
@@ -552,12 +608,12 @@
 				 */
 				goto out;
 			}
-			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
+			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
 				/*
 				 * The syscall handler has now saved the
 				 * flags.  Stop single stepping it.
 				 */
-				frame.tf_eflags &= ~PSL_T;
+				frame->tf_eflags &= ~PSL_T;
 				goto out;
 			}
 			/*
@@ -589,8 +645,7 @@
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
-			/* XXX Giant */
-			if (kdb_trap(type, 0, &frame))
+			if (kdb_trap(type, 0, frame))
 				goto out;
 #endif
 			break;
@@ -617,7 +672,7 @@
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
-					kdb_trap(type, 0, &frame);
+					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto out;
@@ -628,7 +683,7 @@
 #endif /* DEV_ISA */
 		}
 
-		trap_fatal(&frame, eva);
+		trap_fatal(frame, eva);
 		goto out;
 	}
 
@@ -636,7 +691,12 @@
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
-	trapsignal(td, i, ucode);
+	ksiginfo_init_trap(&ksi);
+	ksi.ksi_signo = i;
+	ksi.ksi_code = ucode;
+	ksi.ksi_addr = (void *)addr;
+	ksi.ksi_trapno = type;
+	trapsignal(td, &ksi);
 
 #ifdef DEBUG
 	if (type <= MAX_TRAP_MSG) {
@@ -649,7 +709,7 @@
 #endif
 
 user:
-	userret(td, &frame, sticks);
+	userret(td, frame);
 	mtx_assert(&Giant, MA_NOTOWNED);
 userout:
 out:
@@ -664,7 +724,7 @@
 {
 	vm_offset_t va;
 	struct vmspace *vm = NULL;
-	vm_map_t map = 0;
+	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
@@ -703,8 +763,16 @@
 		map = &vm->vm_map;
 	}
 
+	/*
+	 * PGEX_I is defined only if the execute disable bit capability is
+	 * supported and enabled.
+	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
+#ifdef PAE
+	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
+		ftype = VM_PROT_EXECUTE;
+#endif
 	else
 		ftype = VM_PROT_READ;
 
@@ -745,9 +813,6 @@
 		return (-1);
 	}
 
-	/* kludge to pass faulting virtual address to sendsig */
-	frame->tf_err = eva;
-
 	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
@@ -756,7 +821,8 @@
 	struct trapframe *frame;
 	vm_offset_t eva;
 {
-	int code, type, ss, esp;
+	int code, ss, esp;
+	u_int type;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 
@@ -813,22 +879,19 @@
 	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
 	printf("current process		= ");
 	if (curproc) {
-		printf("%lu (%s)\n",
-		    (u_long)curproc->p_pid, curproc->p_comm ?
-		    curproc->p_comm : "");
+		printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm);
 	} else {
 		printf("Idle\n");
 	}
 
 #ifdef KDB
 	if (debugger_on_panic || kdb_active) {
-		register_t eflags;
-		eflags = intr_disable();
+		frame->tf_err = eva;	/* smuggle fault address to ddb */
 		if (kdb_trap(type, 0, frame)) {
-			intr_restore(eflags);
+			frame->tf_err = code;	/* restore error code */
 			return;
 		}
-		intr_restore(eflags);
+		frame->tf_err = code;		/* restore error code */
 	}
 #endif
 	printf("trap number		= %d\n", type);
@@ -871,50 +934,45 @@
  *	A system call is essentially treated as a trap.
  */
 void
-syscall(frame)
-	struct trapframe frame;
+syscall(struct trapframe *frame)
 {
 	caddr_t params;
 	struct sysent *callp;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	register_t orig_tf_eflags;
-	u_int sticks;
 	int error;
 	int narg;
 	int args[8];
 	u_int code;
+	ksiginfo_t ksi;
 
-	/*
-	 * note: PCPU_LAZY_INC() can only be used if we can afford
-	 * occassional inaccuracy in the count.
-	 */
-	PCPU_LAZY_INC(cnt.v_syscall);
+	PCPU_INC(cnt.v_syscall);
 
 #ifdef DIAGNOSTIC
-	if (ISPL(frame.tf_cs) != SEL_UPL) {
-		mtx_lock(&Giant);	/* try to stabilize the system XXX */
+	if (ISPL(frame->tf_cs) != SEL_UPL) {
 		panic("syscall");
 		/* NOT REACHED */
-		mtx_unlock(&Giant);
 	}
 #endif
 
-	sticks = td->td_sticks;
-	td->td_frame = &frame;
+	td->td_pticks = 0;
+	td->td_frame = frame;
 	if (td->td_ucred != p->p_ucred) 
 		cred_update_thread(td);
+#ifdef KSE
 	if (p->p_flag & P_SA)
 		thread_user_enter(td);
-	params = (caddr_t)frame.tf_esp + sizeof(int);
-	code = frame.tf_eax;
-	orig_tf_eflags = frame.tf_eflags;
+#endif
+	params = (caddr_t)frame->tf_esp + sizeof(int);
+	code = frame->tf_eax;
+	orig_tf_eflags = frame->tf_eflags;
 
 	if (p->p_sysent->sv_prepsyscall) {
 		/*
 		 * The prep code is MP aware.
 		 */
-		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
+		(*p->p_sysent->sv_prepsyscall)(frame, args, &code, &params);
 	} else {
 		/*
 		 * Need to check if this is a 32 bit or 64 bit syscall.
@@ -944,7 +1002,7 @@
   	else
  		callp = &p->p_sysent->sv_table[code];
 
-	narg = callp->sy_narg & SYF_ARGMASK;
+	narg = callp->sy_narg;
 
 	/*
 	 * copyin and the ktrsyscall()/ktrsysret() code is MP-aware
@@ -963,29 +1021,26 @@
 	CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
-	/*
-	 * Try to run the syscall without Giant if the syscall
-	 * is MP safe.
-	 */
-	if ((callp->sy_narg & SYF_MPSAFE) == 0)
-		mtx_lock(&Giant);
+	td->td_syscalls++;
 
 	if (error == 0) {
 		td->td_retval[0] = 0;
-		td->td_retval[1] = frame.tf_edx;
+		td->td_retval[1] = frame->tf_edx;
 
 		STOPEVENT(p, S_SCE, narg);
 
 		PTRACESTOP_SC(p, td, S_PT_SCE);
 
+		AUDIT_SYSCALL_ENTER(code, td);
 		error = (*callp->sy_call)(td, args);
+		AUDIT_SYSCALL_EXIT(error, td);
 	}
 
 	switch (error) {
 	case 0:
-		frame.tf_eax = td->td_retval[0];
-		frame.tf_edx = td->td_retval[1];
-		frame.tf_eflags &= ~PSL_C;
+		frame->tf_eax = td->td_retval[0];
+		frame->tf_edx = td->td_retval[1];
+		frame->tf_eflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
@@ -993,7 +1048,7 @@
 		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
 		 * int 0x80 is 2 bytes. We saved this in tf_err.
 		 */
-		frame.tf_eip -= frame.tf_err;
+		frame->tf_eip -= frame->tf_err;
 		break;
 
 	case EJUSTRETURN:
@@ -1006,29 +1061,40 @@
    			else
   				error = p->p_sysent->sv_errtbl[error];
 		}
-		frame.tf_eax = error;
-		frame.tf_eflags |= PSL_C;
+		frame->tf_eax = error;
+		frame->tf_eflags |= PSL_C;
 		break;
 	}
 
 	/*
-	 * Release Giant if we previously set it.
-	 */
-	if ((callp->sy_narg & SYF_MPSAFE) == 0)
-		mtx_unlock(&Giant);
-
-	/*
 	 * Traced syscall.
 	 */
 	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
-		frame.tf_eflags &= ~PSL_T;
-		trapsignal(td, SIGTRAP, 0);
+		frame->tf_eflags &= ~PSL_T;
+		ksiginfo_init_trap(&ksi);
+		ksi.ksi_signo = SIGTRAP;
+		ksi.ksi_code = TRAP_TRACE;
+		ksi.ksi_addr = (void *)frame->tf_eip;
+		trapsignal(td, &ksi);
 	}
 
 	/*
+	 * Check for misbehavior.
+	 */
+	WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning",
+	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???");
+	KASSERT(td->td_critnest == 0,
+	    ("System call %s returning in a critical section",
+	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"));
+	KASSERT(td->td_locks == 0,
+	    ("System call %s returning with %d locks held",
+	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???",
+	    td->td_locks));
+
+	/*
 	 * Handle reschedule and other end-of-syscall issues
 	 */
-	userret(td, &frame, sticks);
+	userret(td, frame);
 
 	CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
@@ -1046,10 +1112,5 @@
 	STOPEVENT(p, S_SCX, code);
 
 	PTRACESTOP_SC(p, td, S_PT_SCX);
-
-	WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning",
-	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???");
-	mtx_assert(&sched_lock, MA_NOTOWNED);
-	mtx_assert(&Giant, MA_NOTOWNED);
 }
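
The biggest behavioral change in trap.c is how protection faults become signals:
instead of a flat SIGBUS, the new code consults the machdep.prot_fault_translation
sysctl and, in autodetect mode, the binary's recorded osrel. The decision tree
restates cleanly as a pure function (a sketch; the numeric si_code stand-ins are
placeholders, not the real SEGV_ACCERR/BUS_PAGE_FAULT values):

    #include <signal.h>
    #include <stdio.h>

    #define SEGV_ACCERR_UCODE       2       /* stands in for SEGV_ACCERR */
    #define BUS_PAGE_FAULT_UCODE    12      /* stands in for BUS_PAGE_FAULT */

    /*
     * Map a protection fault to (signal, si_code) the way the new trap()
     * does: mode 0 autodetects from the binary's osrel, mode 1 always
     * gives the old SIGBUS, anything else always gives SIGSEGV.
     */
    static void
    prot_fault_signal(int prot_fault_translation, int p_osrel,
        int *sig, int *ucode)
    {
        if (prot_fault_translation == 0) {
            if (p_osrel >= 700004) {        /* new enough binary */
                *sig = SIGSEGV;
                *ucode = SEGV_ACCERR_UCODE;
            } else {
                *sig = SIGBUS;
                *ucode = BUS_PAGE_FAULT_UCODE;
            }
        } else if (prot_fault_translation == 1) {
            *sig = SIGBUS;
            *ucode = BUS_PAGE_FAULT_UCODE;
        } else {
            *sig = SIGSEGV;
            *ucode = SEGV_ACCERR_UCODE;
        }
    }

    int
    main(void)
    {
        int sig, ucode;

        prot_fault_signal(0, 700004, &sig, &ucode);
        printf("auto, new binary: sig %d code %d\n", sig, ucode);
        prot_fault_signal(0, 600000, &sig, &ucode);
        printf("auto, old binary: sig %d code %d\n", sig, ucode);
        return (0);
    }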
 
Index: elf_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/elf_machdep.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/elf_machdep.c -L sys/i386/i386/elf_machdep.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/elf_machdep.c
+++ sys/i386/i386/elf_machdep.c
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/elf_machdep.c,v 1.20 2004/08/11 02:35:05 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/elf_machdep.c,v 1.22 2007/05/22 02:22:58 kan Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -82,6 +82,7 @@
 						"/libexec/ld-elf.so.1",
 						&elf32_freebsd_sysvec,
 						NULL,
+						BI_CAN_EXEC_DYN,
 					  };
 
 SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY,
@@ -96,6 +97,7 @@
 						"/usr/libexec/ld-elf.so.1",
 						&elf32_freebsd_sysvec,
 						NULL,
+						BI_CAN_EXEC_DYN,
 					  };
 
 SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY,
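
The elf_machdep.c hunks append a new member (BI_CAN_EXEC_DYN) to two positionally
initialized Elf32_Brandinfo structures, which is exactly the situation where
positional initializers get fragile. A toy comparison against designated
initializers (toy struct, not the kernel's):

    #include <stdio.h>

    struct brandinfo {
        int         machine;
        const char *brand;
        const char *interp_path;
        int         flags;      /* the newly appended member */
    };

    /* Positional: every earlier member must be listed, in order. */
    static struct brandinfo pos = { 3, "FreeBSD", "/libexec/ld-elf.so.1", 0x1 };

    /* Designated: order-independent; unnamed members default to zero. */
    static struct brandinfo des = {
        .machine     = 3,
        .brand       = "FreeBSD",
        .interp_path = "/libexec/ld-elf.so.1",
        .flags       = 0x1,     /* e.g. a BI_CAN_EXEC_DYN-style bit */
    };

    int
    main(void)
    {
        printf("%s %s flags=%#x\n", pos.brand, des.interp_path,
            (unsigned)des.flags);
        return (0);
    }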
Index: mem.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mem.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/mem.c -L sys/i386/i386/mem.c -u -r1.2 -r1.3
--- sys/i386/i386/mem.c
+++ sys/i386/i386/mem.c
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/mem.c,v 1.116.8.1 2006/04/04 19:46:44 ups Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/mem.c,v 1.117 2006/01/23 15:46:09 ups Exp $");
 
 /*
  * Memory special file
Index: busdma_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/busdma_machdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/busdma_machdep.c -L sys/i386/i386/busdma_machdep.c -u -r1.2 -r1.3
--- sys/i386/i386/busdma_machdep.c
+++ sys/i386/i386/busdma_machdep.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/busdma_machdep.c,v 1.74.2.2 2006/03/28 06:28:37 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/busdma_machdep.c,v 1.89 2007/06/11 17:57:24 mjacob Exp $");
 
 #include <sys/param.h>
 #include <sys/kdb.h>
@@ -51,8 +51,11 @@
 #include <machine/atomic.h>
 #include <machine/bus.h>
 #include <machine/md_var.h>
+#include <machine/specialreg.h>
 
 #define MAX_BPAGES 512
+#define BUS_DMA_COULD_BOUNCE	BUS_DMA_BUS3
+#define BUS_DMA_MIN_ALLOC_COMP	BUS_DMA_BUS4
 
 struct bounce_zone;
 
@@ -137,7 +140,9 @@
 static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
 				   vm_offset_t vaddr, bus_size_t size);
 static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
-static __inline int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
+int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
+int _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+			    bus_size_t buflen, int flags, int *nb);
 
 /*
  * Return true if a match is made.
@@ -147,7 +152,7 @@
  * If paddr is within the bounds of the dma tag then call the filter callback
  * to check for a match, if there is no filter callback then assume a match.
  */
-static __inline int
+int
 run_filter(bus_dma_tag_t dmat, bus_addr_t paddr)
 {
 	int retval;
@@ -202,8 +207,6 @@
 	panic("driver error: busdma dflt_lock called");
 }
 
-#define BUS_DMA_COULD_BOUNCE	BUS_DMA_BUS3
-#define BUS_DMA_MIN_ALLOC_COMP	BUS_DMA_BUS4
 /*
  * Allocate a device specific dma_tag.
  */
@@ -222,6 +225,10 @@
 	if (boundary != 0 && boundary < maxsegsz)
 		maxsegsz = boundary;
 
+	if (maxsegsz == 0) {
+		return (EINVAL);
+	}
+
 	/* Return a NULL tag on failure */
 	*dmat = NULL;
 
@@ -265,6 +272,9 @@
 		else if (parent->boundary != 0)
 			newtag->boundary = MIN(parent->boundary,
 					       newtag->boundary);
+		if ((newtag->filter != NULL) ||
+		    ((parent->flags & BUS_DMA_COULD_BOUNCE) != 0))
+			newtag->flags |= BUS_DMA_COULD_BOUNCE;
 		if (newtag->filter == NULL) {
 			/*
 			 * Short circuit looking at our parent directly
@@ -495,7 +505,16 @@
 		}
 	}
 
+	/* 
+	 * XXX:
+	 * (dmat->alignment < dmat->maxsize) is just a quick hack; the exact
+	 * alignment guarantees of malloc need to be nailed down, and the
+	 * code below should be rewritten to take that into account.
+	 *
+	 * In the meantime, we'll warn the user if malloc gets it wrong.
+	 */
 	if ((dmat->maxsize <= PAGE_SIZE) &&
+	   (dmat->alignment < dmat->maxsize) &&
 	    dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem)) {
 		*vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
 	} else {
@@ -513,7 +532,12 @@
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
 		    __func__, dmat, dmat->flags, ENOMEM);
 		return (ENOMEM);
+	} else if ((uintptr_t)*vaddr & (dmat->alignment - 1)) {
+		printf("bus_dmamem_alloc failed to align memory properly.\n");
 	}
+	if (flags & BUS_DMA_NOCACHE)
+		pmap_change_attr((vm_offset_t)*vaddr, dmat->maxsize,
+		    PAT_UNCACHEABLE);
 	CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
 	    __func__, dmat, dmat->flags, ENOMEM);
 	return (0);
@@ -532,8 +556,10 @@
 	 */
 	if (map != NULL)
 		panic("bus_dmamem_free: Invalid map freed\n");
-	if ((dmat->maxsize <= PAGE_SIZE)
-	 && dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem))
+	pmap_change_attr((vm_offset_t)vaddr, dmat->maxsize, PAT_WRITE_BACK);
+	if ((dmat->maxsize <= PAGE_SIZE) &&
+	   (dmat->alignment < dmat->maxsize) &&
+	    dmat->lowaddr >= ptoa((vm_paddr_t)Maxmem))
 		free(vaddr, M_DEVBUF);
 	else {
 		contigfree(vaddr, dmat->maxsize, M_DEVBUF);
@@ -541,37 +567,16 @@
 	CTR3(KTR_BUSDMA, "%s: tag %p flags 0x%x", __func__, dmat, dmat->flags);
 }
 
-/*
- * Utility function to load a linear buffer.  lastaddrp holds state
- * between invocations (for multiple-buffer loads).  segp contains
- * the starting segment on entrace, and the ending segment on exit.
- * first indicates if this is the first invocation of this function.
- */
-static __inline int
-_bus_dmamap_load_buffer(bus_dma_tag_t dmat,
-    			bus_dmamap_t map,
-			void *buf, bus_size_t buflen,
-			pmap_t pmap,
-			int flags,
-			bus_addr_t *lastaddrp,
-			bus_dma_segment_t *segs,
-			int *segp,
-			int first)
+int
+_bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+			bus_size_t buflen, int flags, int *nb)
 {
-	bus_size_t sgsize;
-	bus_addr_t curaddr, lastaddr, baddr, bmask;
 	vm_offset_t vaddr;
+	vm_offset_t vendaddr;
 	bus_addr_t paddr;
-	int needbounce = 0;
-	int seg;
-
-	if (map == NULL)
-		map = &nobounce_dmamap;
-
-	if ((map != &nobounce_dmamap && map->pagesneeded == 0) 
-	 && ((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0)) {
-		vm_offset_t	vendaddr;
+	int needbounce = *nb;
 
+	if ((map != &nobounce_dmamap && map->pagesneeded == 0)) {
 		CTR4(KTR_BUSDMA, "lowaddr= %d Maxmem= %d, boundary= %d, "
 		    "alignment= %d", dmat->lowaddr, ptoa((vm_paddr_t)Maxmem),
 		    dmat->boundary, dmat->alignment);
@@ -586,7 +591,8 @@
 
 		while (vaddr < vendaddr) {
 			paddr = pmap_kextract(vaddr);
-			if (run_filter(dmat, paddr) != 0) {
+			if (((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+			    run_filter(dmat, paddr) != 0) {
 				needbounce = 1;
 				map->pagesneeded++;
 			}
@@ -618,6 +624,43 @@
 		mtx_unlock(&bounce_lock);
 	}
 
+	*nb = needbounce;
+	return (0);
+}
+
+/*
+ * Utility function to load a linear buffer.  lastaddrp holds state
+ * between invocations (for multiple-buffer loads).  segp contains
+ * the starting segment on entrance, and the ending segment on exit.
+ * first indicates if this is the first invocation of this function.
+ */
+static __inline int
+_bus_dmamap_load_buffer(bus_dma_tag_t dmat,
+    			bus_dmamap_t map,
+			void *buf, bus_size_t buflen,
+			pmap_t pmap,
+			int flags,
+			bus_addr_t *lastaddrp,
+			bus_dma_segment_t *segs,
+			int *segp,
+			int first)
+{
+	bus_size_t sgsize;
+	bus_addr_t curaddr, lastaddr, baddr, bmask;
+	vm_offset_t vaddr;
+	int needbounce = 0;
+	int seg, error;
+
+	if (map == NULL)
+		map = &nobounce_dmamap;
+
+	if ((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) {
+		error = _bus_dmamap_count_pages(dmat, map, buf, buflen, flags,
+		    &needbounce);
+		if (error)
+			return (error);
+	}
+
 	vaddr = (vm_offset_t)buf;
 	lastaddr = *lastaddrp;
 	bmask = ~(dmat->boundary - 1);
@@ -635,6 +678,8 @@
 		 * Compute the segment size, and adjust counts.
 		 */
 		sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK);
+		if (sgsize > dmat->maxsegsz)
+			sgsize = dmat->maxsegsz;
 		if (buflen < sgsize)
 			sgsize = buflen;
 
@@ -647,7 +692,8 @@
 				sgsize = (baddr - curaddr);
 		}
 
-		if (map->pagesneeded != 0 && run_filter(dmat, curaddr))
+		if (((dmat->flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+		    map->pagesneeded != 0 && run_filter(dmat, curaddr))
 			curaddr = add_bounce_page(dmat, map, vaddr, sgsize);
 
 		/*
@@ -706,9 +752,10 @@
 	error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, NULL, flags,
 	     &lastaddr, dmat->segments, &nsegs, 1);
 
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, dmat->flags, error, nsegs + 1);
+
 	if (error == EINPROGRESS) {
-		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
-		    __func__, dmat, dmat->flags, error);
 		return (error);
 	}
 
@@ -717,8 +764,13 @@
 	else
 		(*callback)(callback_arg, dmat->segments, nsegs + 1, 0);
 
-	CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error 0 nsegs %d",
-	    __func__, dmat, dmat->flags, nsegs + 1);
+	/*
+	 * Return ENOMEM to the caller so that it can pass it up the stack.
+	 * This error only happens when NOWAIT is set, so deferral is disabled.
+	 */
+	if (error == ENOMEM)
+		return (error);
+
 	return (0);
 }
 
@@ -726,18 +778,17 @@
 /*
  * Like _bus_dmamap_load(), but for mbufs.
  */
-int
-bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map,
-		     struct mbuf *m0,
-		     bus_dmamap_callback2_t *callback, void *callback_arg,
-		     int flags)
+static __inline int
+_bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+			struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs,
+			int flags)
 {
-	int nsegs, error;
+	int error;
 
 	M_ASSERTPKTHDR(m0);
 
 	flags |= BUS_DMA_NOWAIT;
-	nsegs = 0;
+	*nsegs = 0;
 	error = 0;
 	if (m0->m_pkthdr.len <= dmat->maxsize) {
 		int first = 1;
@@ -749,7 +800,7 @@
 				error = _bus_dmamap_load_buffer(dmat, map,
 						m->m_data, m->m_len,
 						NULL, flags, &lastaddr,
-						dmat->segments, &nsegs, first);
+						segs, nsegs, first);
 				first = 0;
 			}
 		}
@@ -757,15 +808,33 @@
 		error = EINVAL;
 	}
 
+	/* XXX FIXME: Having to increment nsegs is really annoying */
+	++*nsegs;
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, dmat->flags, error, *nsegs);
+	return (error);
+}
+
+int
+bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map,
+		     struct mbuf *m0,
+		     bus_dmamap_callback2_t *callback, void *callback_arg,
+		     int flags)
+{
+	int nsegs, error;
+
+	error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, dmat->segments, &nsegs,
+	    flags);
+
 	if (error) {
 		/* force "no valid mappings" in callback */
 		(*callback)(callback_arg, dmat->segments, 0, 0, error);
 	} else {
 		(*callback)(callback_arg, dmat->segments,
-			    nsegs+1, m0->m_pkthdr.len, error);
+			    nsegs, m0->m_pkthdr.len, error);
 	}
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
-	    __func__, dmat, dmat->flags, error, nsegs + 1);
+	    __func__, dmat, dmat->flags, error, nsegs);
 	return (error);
 }
 
@@ -774,36 +843,7 @@
 			struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs,
 			int flags)
 {
-	int error;
-
-	M_ASSERTPKTHDR(m0);
-
-	flags |= BUS_DMA_NOWAIT;
-	*nsegs = 0;
-	error = 0;
-	if (m0->m_pkthdr.len <= dmat->maxsize) {
-		int first = 1;
-		bus_addr_t lastaddr = 0;
-		struct mbuf *m;
-
-		for (m = m0; m != NULL && error == 0; m = m->m_next) {
-			if (m->m_len > 0) {
-				error = _bus_dmamap_load_buffer(dmat, map,
-						m->m_data, m->m_len,
-						NULL, flags, &lastaddr,
-						segs, nsegs, first);
-				first = 0;
-			}
-		}
-	} else {
-		error = EINVAL;
-	}
-
-	/* XXX FIXME: Having to increment nsegs is really annoying */
-	++*nsegs;
-	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
-	    __func__, dmat, dmat->flags, error, *nsegs);
-	return (error);
+	return (_bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags));
 }
 
 /*
@@ -835,6 +875,7 @@
 	nsegs = 0;
 	error = 0;
 	first = 1;
+	lastaddr = (bus_addr_t) 0;
 	for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) {
 		/*
 		 * Now at the first iovec to load.  Load each iovec
@@ -891,7 +932,6 @@
 		 * want to add support for invalidating
 		 * the caches on broken hardware
 		 */
-		dmat->bounce_zone->total_bounced++;
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
 		    "performing bounce", __func__, op, dmat, dmat->flags);
 
@@ -902,6 +942,7 @@
 				      bpage->datacount);
 				bpage = STAILQ_NEXT(bpage, links);
 			}
+			dmat->bounce_zone->total_bounced++;
 		}
 
 		if (op & BUS_DMASYNC_POSTREAD) {
@@ -911,6 +952,7 @@
 				      bpage->datacount);
 				bpage = STAILQ_NEXT(bpage, links);
 			}
+			dmat->bounce_zone->total_bounced++;
 		}
 	}
 }
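
The busdma rework hoists the bounce-page counting walk out of
_bus_dmamap_load_buffer() into _bus_dmamap_count_pages(): iterate the buffer a
page at a time and ask the filter whether each physical page needs bouncing. A
standalone sketch with pmap_kextract() and run_filter() replaced by trivial
stand-ins:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096u
    #define PAGE_MASK   (PAGE_SIZE - 1)

    static uintptr_t
    fake_kextract(uintptr_t va)             /* identity map stand-in */
    {
        return (va);
    }

    static int
    fake_filter(uintptr_t pa, uintptr_t lowaddr)
    {
        return (pa > lowaddr);              /* "needs bounce" above lowaddr */
    }

    /* Count pages in [buf, buf + buflen) that the filter rejects. */
    static int
    count_bounce_pages(uintptr_t buf, size_t buflen, uintptr_t lowaddr)
    {
        uintptr_t vaddr = buf & ~(uintptr_t)PAGE_MASK;  /* trunc_page */
        uintptr_t vend = buf + buflen;
        int pages = 0;

        while (vaddr < vend) {
            if (fake_filter(fake_kextract(vaddr), lowaddr))
                pages++;
            vaddr += PAGE_SIZE;
        }
        return (pages);
    }

    int
    main(void)
    {
        /* 3 pages spanned; the last two lie above the 16MB boundary. */
        printf("%d pages need bouncing\n",
            count_bounce_pages(0xfff000, 2 * PAGE_SIZE + 10, 0xffffff));
        return (0);
    }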
Index: vm_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/vm_machdep.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/vm_machdep.c -L sys/i386/i386/vm_machdep.c -u -r1.2 -r1.3
--- sys/i386/i386/vm_machdep.c
+++ sys/i386/i386/vm_machdep.c
@@ -41,12 +41,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.259.2.3 2006/03/13 02:46:55 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.283 2007/07/07 16:59:01 attilio Exp $");
 
 #include "opt_isa.h"
 #include "opt_npx.h"
 #include "opt_reset.h"
 #include "opt_cpu.h"
+#include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -94,6 +95,10 @@
 #include <i386/isa/isa.h>
 #endif
 
+#ifdef XBOX
+#include <machine/xbox.h>
+#endif
+
 #ifndef NSFBUFS
 #define	NSFBUFS		(512 + maxusers * 16)
 #endif
@@ -153,23 +158,25 @@
 			struct mdproc *mdp1 = &p1->p_md;
 			struct proc_ldt *pldt;
 
-			pldt = mdp1->md_ldt;
-			if (pldt && pldt->ldt_refcnt > 1) {
+			mtx_lock_spin(&dt_lock);
+			if ((pldt = mdp1->md_ldt) != NULL &&
+			    pldt->ldt_refcnt > 1) {
 				pldt = user_ldt_alloc(mdp1, pldt->ldt_len);
 				if (pldt == NULL)
 					panic("could not copy LDT");
 				mdp1->md_ldt = pldt;
 				set_user_ldt(mdp1);
 				user_ldt_free(td1);
-			}
+			} else
+				mtx_unlock_spin(&dt_lock);
 		}
 		return;
 	}
 
 	/* Ensure that p1's pcb is up to date. */
-#ifdef DEV_NPX
 	if (td1 == curthread)
 		td1->td_pcb->pcb_gs = rgs();
+#ifdef DEV_NPX
 	savecrit = intr_disable();
 	if (PCPU_GET(fpcurthread) == td1)
 		npxsave(&td1->td_pcb->pcb_save);
@@ -228,7 +235,6 @@
 	pcb2->pcb_ebx = (int)td2;		/* fork_trampoline argument */
 	pcb2->pcb_eip = (int)fork_trampoline;
 	pcb2->pcb_psl = PSL_KERNEL;		/* ints disabled */
-	pcb2->pcb_gs = rgs();
 	/*-
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
@@ -244,7 +250,7 @@
 	pcb2->pcb_ext = 0;
 
 	/* Copy the LDT, if necessary. */
-	mtx_lock_spin(&sched_lock);
+	mtx_lock_spin(&dt_lock);
 	if (mdp2->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			mdp2->md_ldt->ldt_refcnt++;
@@ -255,9 +261,9 @@
 				panic("could not copy LDT");
 		}
 	}
-	mtx_unlock_spin(&sched_lock);
+	mtx_unlock_spin(&dt_lock);
 
-	/* Setup to release sched_lock in fork_exit(). */
+	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 
@@ -300,11 +306,13 @@
 	 * If this process has a custom LDT, release it.  Reset pc->pcb_gs
 	 * and %gs before we free it in case they refer to an LDT entry.
 	 */
+	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt) {
 		td->td_pcb->pcb_gs = _udatasel;
 		load_gs(_udatasel);
 		user_ldt_free(td);
-	}
+	} else
+		mtx_unlock_spin(&dt_lock);
 }
 
 void
@@ -429,7 +437,7 @@
 	 */
 	pcb2->pcb_ext = NULL;
 
-	/* Setup to release sched_lock in fork_exit(). */
+	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 }
@@ -536,6 +544,14 @@
 void
 cpu_reset()
 {
+#ifdef XBOX
+	if (arch_i386_is_xbox) {
+		/* Kick the PIC16L, it can reboot the box */
+		pic16l_reboot();
+		for (;;);
+	}
+#endif
+
 #ifdef SMP
 	u_int cnt, map;
 
@@ -551,7 +567,10 @@
 			cpustop_restartfunc = cpu_reset_proxy;
 			cpu_reset_proxy_active = 0;
 			printf("cpu_reset: Restarting BSP\n");
-			started_cpus = (1<<0);		/* Restart CPU #0 */
+
+			/* Restart CPU #0. */
+			/* XXX: restart_cpus(1 << 0); */
+			atomic_store_rel_int(&started_cpus, (1 << 0));
 
 			cnt = 0;
 			while (cpu_reset_proxy_active == 0 && cnt < 10000000)
@@ -575,7 +594,12 @@
 static void
 cpu_reset_real()
 {
+	struct region_descriptor null_idt;
+#ifndef PC98
+	int b;
+#endif
 
+	disable_intr();
 #ifdef CPU_ELAN
 	if (elan_mmcr != NULL)
 		elan_mmcr->RESCFG = 1;
@@ -591,7 +615,6 @@
 	/*
 	 * Attempt to do a CPU reset via CPU reset port.
 	 */
-	disable_intr();
 	if ((inb(0x35) & 0xa0) != 0xa0) {
 		outb(0x37, 0x0f);		/* SHUT0 = 0. */
 		outb(0x37, 0x0b);		/* SHUT1 = 0. */
@@ -606,16 +629,46 @@
 	 */
 	outb(IO_KBD + 4, 0xFE);
 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
-	printf("Keyboard reset did not work, attempting CPU shutdown\n");
-	DELAY(1000000);	/* wait 1 sec for printf to complete */
 #endif
+
+	/*
+	 * Attempt to force a reset via the Reset Control register at
+	 * I/O port 0xcf9.  Bit 2 forces a system reset when it is
+	 * written as 1.  Bit 1 selects the type of reset to attempt:
+	 * 0 selects a "soft" reset, and 1 selects a "hard" reset.  We
+	 * try to do a "soft" reset first, and then a "hard" reset.
+	 */
+	outb(0xcf9, 0x2);
+	outb(0xcf9, 0x6);
+	DELAY(500000);  /* wait 0.5 sec to see if that did it */
+
+	/*
+	 * Attempt to force a reset via the Fast A20 and Init register
+	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
+	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
+	 * preserve bit 1 while setting bit 0.  We also must clear bit
+	 * 0 before setting it if it isn't already clear.
+	 */
+	b = inb(0x92);
+	if (b != 0xff) {
+		if ((b & 0x1) != 0)
+			outb(0x92, b & 0xfe);
+		outb(0x92, b | 0x1);
+		DELAY(500000);  /* wait 0.5 sec to see if that did it */
+	}
 #endif /* PC98 */
 
-	/* Force a shutdown by unmapping entire address space. */
-	bzero((caddr_t)PTD, NBPTD);
+	printf("No known reset method worked, attempting CPU shutdown\n");
+	DELAY(1000000); /* wait 1 sec for printf to complete */
+
+	/* Wipe the IDT. */
+	null_idt.rd_limit = 0;
+	null_idt.rd_base = 0;
+	lidt(&null_idt);
 
 	/* "good night, sweet prince .... <THUNK!>" */
-	invltlb();
+	breakpoint();
+
 	/* NOTREACHED */
 	while(1);
 }
@@ -647,7 +700,7 @@
 }
 
 /*
- * Get an sf_buf from the freelist. Will block if none are available.
+ * Get an sf_buf from the freelist.  May block if none are available.
  */
 struct sf_buf *
 sf_buf_alloc(struct vm_page *m, int flags)
@@ -734,9 +787,7 @@
 		other_cpus = PCPU_GET(other_cpus) & ~sf->cpumask;
 		if (other_cpus != 0) {
 			sf->cpumask |= other_cpus;
-			mtx_lock_spin(&smp_ipi_mtx);
 			smp_masked_invlpg(other_cpus, sf->kva);
-			mtx_unlock_spin(&smp_ipi_mtx);
 		}
 	}
 	sched_unpin();	
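
cpu_reset_real() now tries a cascade of reset methods: the keyboard controller
pulse, the Reset Control register at I/O port 0xcf9 (bit 1 picks soft vs. hard,
bit 2 triggers), and the Fast A20/Init register at port 0x92 (bit 0 asserts
INIT#, bit 1 is the A20 gate and must be preserved). A trace-only sketch with
outb()/inb() stubbed out, since real port I/O needs ring 0:

    #include <stdio.h>

    static unsigned char port92 = 0x02;     /* pretend A20 on, INIT# clear */

    static void
    outb(unsigned short port, unsigned char v)
    {
        printf("outb(0x%x, 0x%02x)\n", port, v);
        if (port == 0x92)
            port92 = v;
    }

    static unsigned char
    inb(unsigned short port)
    {
        return (port == 0x92 ? port92 : 0xff);
    }

    int
    main(void)
    {
        unsigned char b;

        /* Reset Control register: soft reset first, then hard. */
        outb(0xcf9, 0x2);                   /* select soft reset */
        outb(0xcf9, 0x6);                   /* bit 2 set: go */

        /* Fast A20/Init register: pulse INIT#, preserving bit 1. */
        b = inb(0x92);
        if (b != 0xff) {
            if ((b & 0x1) != 0)
                outb(0x92, b & 0xfe);       /* clear INIT# first */
            outb(0x92, b | 0x1);            /* assert INIT# */
        }
        return (0);
    }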
Index: mptable.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mptable.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/mptable.c -L sys/i386/i386/mptable.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/mptable.c
+++ sys/i386/i386/mptable.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/mptable.c,v 1.241 2005/04/14 17:59:58 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/mptable.c,v 1.245 2007/05/08 22:01:03 jhb Exp $");
 
 #include "opt_mptable_force_htt.h"
 #include <sys/param.h>
@@ -51,7 +51,7 @@
 /* string defined by the Intel MP Spec as identifying the MP table */
 #define	MP_SIG			0x5f504d5f	/* _MP_ */
 
-#define	NAPICID			32	/* Max number of APIC's */
+#define	MAX_LAPIC_ID		31	/* Max local APIC ID for HTT fixup */
 
 #ifdef PC98
 #define BIOS_BASE		(0xe8000)
@@ -142,12 +142,12 @@
 
 static mpfps_t mpfps;
 static mpcth_t mpct;
-static void *ioapics[NAPICID];
+static void *ioapics[MAX_APIC_ID + 1];
 static bus_datum *busses;
 static int mptable_nioapics, mptable_nbusses, mptable_maxbusid;
 static int pci0 = -1;
 
-static MALLOC_DEFINE(M_MPTABLE, "MP Table", "MP Table Items");
+static MALLOC_DEFINE(M_MPTABLE, "mptable", "MP Table Items");
 
 static enum intr_polarity conforming_polarity(u_char src_bus,
 	    u_char src_bus_irq);
@@ -321,18 +321,20 @@
 static int
 mptable_setup_local(void)
 {
+	vm_paddr_t addr;
 
 	/* Is this a pre-defined config? */
 	printf("MPTable: <");
 	if (mpfps->config_type != 0) {
-		lapic_init(DEFAULT_APIC_BASE);
+		addr = DEFAULT_APIC_BASE;
 		printf("Default Configuration %d", mpfps->config_type);
 	} else {
-		lapic_init((uintptr_t)mpct->apic_address);
+		addr = mpct->apic_address;
 		printf("%.*s %.*s", (int)sizeof(mpct->oem_id), mpct->oem_id,
 		    (int)sizeof(mpct->product_id), mpct->product_id);
 	}
 	printf(">\n");
+	lapic_init(addr);
 	return (0);
 }
 
@@ -359,7 +361,7 @@
 	mptable_parse_ints();
 
 	/* Fourth, we register all the I/O APIC's. */
-	for (i = 0; i < NAPICID; i++)
+	for (i = 0; i <= MAX_APIC_ID; i++)
 		if (ioapics[i] != NULL)
 			ioapic_register(ioapics[i]);
 
@@ -425,8 +427,10 @@
 		if (proc->cpu_flags & PROCENTRY_FLAG_EN) {
 			lapic_create(proc->apic_id, proc->cpu_flags &
 			    PROCENTRY_FLAG_BP);
-			cpu_mask = (u_int *)arg;
-			*cpu_mask |= (1 << proc->apic_id);
+			if (proc->apic_id < MAX_LAPIC_ID) {
+				cpu_mask = (u_int *)arg;
+				*cpu_mask |= (1ul << proc->apic_id);
+			}
 		}
 		break;
 	}
@@ -513,14 +517,14 @@
 		apic = (io_apic_entry_ptr)entry;
 		if (!(apic->apic_flags & IOAPICENTRY_FLAG_EN))
 			break;
-		if (apic->apic_id >= NAPICID)
+		if (apic->apic_id > MAX_APIC_ID)
 			panic("%s: I/O APIC ID %d too high", __func__,
 			    apic->apic_id);
 		if (ioapics[apic->apic_id] != NULL)
 			panic("%s: Double APIC ID %d", __func__,
 			    apic->apic_id);
-		ioapics[apic->apic_id] = ioapic_create(
-			(uintptr_t)apic->apic_address, apic->apic_id, -1);
+		ioapics[apic->apic_id] = ioapic_create(apic->apic_address,
+		    apic->apic_id, -1);
 		break;
 	default:
 		break;
@@ -662,7 +666,7 @@
 			return;
 		}
 	}
-	if (apic_id >= NAPICID) {
+	if (apic_id > MAX_APIC_ID) {
 		printf("MPTable: Ignoring interrupt entry for ioapic%d\n",
 		    intr->dst_apic_id);
 		return;
@@ -892,7 +896,7 @@
 	 * physical processor.  If any of those ID's are
 	 * already in the table, then kill the fixup.
 	 */
-	for (id = 0; id < NAPICID; id++) {
+	for (id = 0; id <= MAX_LAPIC_ID; id++) {
 		if ((id_mask & 1 << id) == 0)
 			continue;
 		/* First, make sure we are on a logical_cpus boundary. */
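
mptable.c now sizes its I/O APIC table with MAX_APIC_ID rather than the old
NAPICID constant, and only folds a processor into the 32-bit HTT-fixup mask when
its APIC ID fits: shifting 1 by 32 or more is undefined behavior on a 32-bit
type. A small demonstration of the guard (local constant, not the kernel's
header):

    #include <stdio.h>

    #define MAX_LAPIC_ID    31      /* IDs representable in a 32-bit mask */

    static void
    add_cpu(unsigned int *cpu_mask, unsigned int apic_id)
    {
        /*
         * Guard before shifting: 1u << 32 is undefined on a 32-bit
         * unsigned int, so IDs past the mask width are skipped, as in
         * the mptable_count_items_handler() hunk above.
         */
        if (apic_id < MAX_LAPIC_ID)
            *cpu_mask |= 1u << apic_id;
        else
            printf("apic id %u not representable, skipped\n", apic_id);
    }

    int
    main(void)
    {
        unsigned int cpu_mask = 0;

        add_cpu(&cpu_mask, 0);
        add_cpu(&cpu_mask, 5);
        add_cpu(&cpu_mask, 40);             /* skipped */
        printf("cpu_mask = 0x%08x\n", cpu_mask);    /* 0x00000021 */
        return (0);
    }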
Index: local_apic.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/local_apic.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/local_apic.c -L sys/i386/i386/local_apic.c -u -r1.2 -r1.3
--- sys/i386/i386/local_apic.c
+++ sys/i386/i386/local_apic.c
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/local_apic.c,v 1.17.2.6 2006/03/10 19:37:33 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/local_apic.c,v 1.44 2007/09/11 22:54:09 attilio Exp $");
 
 #include "opt_hwpmc_hooks.h"
 
@@ -51,6 +51,7 @@
 #include <vm/pmap.h>
 
 #include <machine/apicreg.h>
+#include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
@@ -64,13 +65,6 @@
 #include <ddb/ddb.h>
 #endif
 
-/*
- * We can handle up to 60 APICs via our logical cluster IDs, but currently
- * the physical IDs on Intel processors up to the Pentium 4 are limited to
- * 16.
- */
-#define	MAX_APICID	16
-
 /* Sanity checks on IDT vectors. */
 CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT);
 CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS);
@@ -113,7 +107,7 @@
 	u_long la_hard_ticks;
 	u_long la_stat_ticks;
 	u_long la_prof_ticks;
-} static lapics[MAX_APICID];
+} static lapics[MAX_APIC_ID + 1];
 
 /* XXX: should thermal be an NMI? */
 
@@ -146,16 +140,22 @@
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
+extern inthand_t IDTVEC(rsvd);
+
 volatile lapic_t *lapic;
+vm_paddr_t lapic_paddr;
 static u_long lapic_timer_divisor, lapic_timer_period, lapic_timer_hz;
 
 static void	lapic_enable(void);
+static void	lapic_resume(struct pic *pic);
 static void	lapic_timer_enable_intr(void);
 static void	lapic_timer_oneshot(u_int count);
 static void	lapic_timer_periodic(u_int count);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 
+struct pic lapic_pic = { .pic_resume = lapic_resume };
+
 static uint32_t
 lvt_mode(struct lapic *la, u_int pin, uint32_t value)
 {
@@ -201,13 +201,14 @@
  * Map the local APIC and setup necessary interrupt vectors.
  */
 void
-lapic_init(uintptr_t addr)
+lapic_init(vm_paddr_t addr)
 {
 
 	/* Map the local APIC and setup the spurious interrupt handler. */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
-	lapic = (lapic_t *)pmap_mapdev(addr, sizeof(lapic_t));
+	lapic = pmap_mapdev(addr, sizeof(lapic_t));
+	lapic_paddr = addr;
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 
@@ -217,7 +218,6 @@
 
 	/* Set BSP's per-CPU local APIC ID. */
 	PCPU_SET(apic_id, lapic_id());
-	intr_add_cpu(PCPU_GET(apic_id));
 
 	/* Local APIC timer interrupt. */
 	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_SYS386IGT, SEL_KPL,
@@ -235,7 +235,7 @@
 {
 	int i;
 
-	if (apic_id >= MAX_APICID) {
+	if (apic_id > MAX_APIC_ID) {
 		printf("APIC: Ignoring local APIC with ID %d\n", apic_id);
 		if (boot_cpu)
 			panic("Can't ignore BSP");
@@ -278,7 +278,7 @@
 }
 
 void
-lapic_setup(void)
+lapic_setup(int boot)
 {
 	struct lapic *la;
 	u_int32_t maxlvt;
@@ -307,9 +307,13 @@
 
 	/* Program timer LVT and setup handler. */
 	lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer);
-	snprintf(buf, sizeof(buf), "cpu%d: timer", PCPU_GET(cpuid));
-	intrcnt_add(buf, &la->la_timer_count);
-	if (PCPU_GET(cpuid) != 0) {
+	if (boot) {
+		snprintf(buf, sizeof(buf), "cpu%d: timer", PCPU_GET(cpuid));
+		intrcnt_add(buf, &la->la_timer_count);
+	}
+
+	/* We don't set up the timer on the BSP during boot until later. */
+	if (!(boot && PCPU_GET(cpuid) == 0)) {
 		KASSERT(lapic_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
 		lapic_timer_set_divisor(lapic_timer_divisor);
@@ -319,6 +323,29 @@
 
 	/* XXX: Error and thermal LVTs */
 
+	if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+		/*
+		 * Detect the presence of C1E, found mostly on the latest
+		 * dual-core (and future) K8 family CPUs.  This feature
+		 * renders the local APIC timer dead, so we disable it by
+		 * reading the Interrupt Pending Message register and
+		 * clearing C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
+		 *
+		 * Reference:
+		 *   "BIOS and Kernel Developer's Guide for AMD NPT
+		 *    Family 0Fh Processors"
+		 *   #32559 revision 3.00
+		 */
+		if ((cpu_id & 0x00000f00) == 0x00000f00 &&
+		    (cpu_id & 0x0fff0000) >= 0x00040000) {
+			uint64_t msr;
+
+			msr = rdmsr(0xc0010055);
+			if (msr & 0x18000000)
+				wrmsr(0xc0010055, msr & ~0x18000000ULL);
+		}
+	}
+
 	intr_restore(eflags);
 }
 
@@ -399,6 +426,14 @@
 	lapic->svr = value;
 }
 
+/* Reset the local APIC on the BSP during resume. */
+static void
+lapic_resume(struct pic *pic)
+{
+
+	lapic_setup(0);
+}
+
 int
 lapic_id(void)
 {
@@ -596,21 +631,41 @@
 }
 
 void
-lapic_handle_intr(struct intrframe frame)
+lapic_handle_intr(int vector, struct trapframe *frame)
 {
 	struct intsrc *isrc;
 
-	if (frame.if_vec == -1)
+	if (vector == -1)
 		panic("Couldn't get vector from ISR!");
-	isrc = intr_lookup_source(apic_idt_to_irq(frame.if_vec));
-	intr_execute_handlers(isrc, &frame);
+	isrc = intr_lookup_source(apic_idt_to_irq(vector));
+	intr_execute_handlers(isrc, frame);
 }
 
 void
-lapic_handle_timer(struct clockframe frame)
+lapic_handle_timer(struct trapframe *frame)
 {
 	struct lapic *la;
 
+	/* Send EOI first thing. */
+	lapic_eoi();
+
+#if defined(SMP) && !defined(SCHED_ULE)
+	/*
+	 * Don't do any accounting for the disabled HTT cores, since it
+	 * will provide misleading numbers to userland.
+	 *
+	 * No locking is necessary here; even if we lose the race when
+	 * hlt_cpus_mask changes, it is not a big deal.
+	 *
+	 * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask
+	 * and unlike other schedulers it actually schedules threads to
+	 * those CPUs.
+	 */
+	if ((hlt_cpus_mask & (1 << PCPU_GET(cpuid))) != 0)
+		return;
+#endif
+
+	/* Look up our local APIC structure for the tick counters. */
 	la = &lapics[PCPU_GET(apic_id)];
 	(*la->la_timer_count)++;
 	critical_enter();
@@ -620,16 +675,16 @@
 	if (la->la_hard_ticks >= lapic_timer_hz) {
 		la->la_hard_ticks -= lapic_timer_hz;
 		if (PCPU_GET(cpuid) == 0)
-			hardclock(&frame);
+			hardclock(TRAPF_USERMODE(frame), TRAPF_PC(frame));
 		else
-			hardclock_process(&frame);
+			hardclock_cpu(TRAPF_USERMODE(frame));
 	}
 
 	/* Fire statclock at stathz. */
 	la->la_stat_ticks += stathz;
 	if (la->la_stat_ticks >= lapic_timer_hz) {
 		la->la_stat_ticks -= lapic_timer_hz;
-		statclock(&frame);
+		statclock(TRAPF_USERMODE(frame));
 	}
 
 	/* Fire profclock at profhz, but only when needed. */
@@ -637,7 +692,7 @@
 	if (la->la_prof_ticks >= lapic_timer_hz) {
 		la->la_prof_ticks -= lapic_timer_hz;
 		if (profprocs != 0)
-			profclock(&frame);
+			profclock(TRAPF_USERMODE(frame), TRAPF_PC(frame));
 	}
 	critical_exit();
 }
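
Each of the three clocks above uses the same accumulate-and-fire pattern: the timer interrupts at lapic_timer_hz, every interrupt credits hz (or stathz, profhz) ticks, and a clock event is due once the accumulator crosses lapic_timer_hz. Reduced to a single clock, with a hypothetical fire_event() standing in for hardclock() and friends:

	/*
	 * Sketch of the tick-accumulation pattern.  'rate' is hz,
	 * stathz or profhz; 'timer_hz' is the lapic timer frequency.
	 * fire_event() is a hypothetical stand-in for hardclock();
	 * the accumulator is per-CPU in the real code.
	 */
	extern void fire_event(void);

	static u_long acc_ticks;

	static void
	clock_tick(u_long rate, u_long timer_hz)
	{
		acc_ticks += rate;
		if (acc_ticks >= timer_hz) {
			acc_ticks -= timer_hz;
			fire_event();
		}
	}
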
@@ -710,6 +765,65 @@
 	panic("Couldn't find an APIC vector for IRQ %u", irq);
 }
 
+/*
+ * Request 'count' free contiguous IDT vectors to be used by 'count'
+ * IRQs.  'count' must be a power of two and the vectors will be
+ * aligned on a boundary of 'align'.  If the request cannot be
+ * satisfied, 0 is returned.
+ */
+u_int
+apic_alloc_vectors(u_int *irqs, u_int count, u_int align)
+{
+	u_int first, run, vector;
+
+	KASSERT(powerof2(count), ("bad count"));
+	KASSERT(powerof2(align), ("bad align"));
+	KASSERT(align >= count, ("align < count"));
+#ifdef INVARIANTS
+	for (run = 0; run < count; run++)
+		KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
+		    irqs[run], run));
+#endif
+
+	/*
+	 * Search for 'count' free vectors.  As with apic_alloc_vector(),
+	 * this just uses a simple first fit algorithm.
+	 */
+	run = 0;
+	first = 0;
+	mtx_lock_spin(&icu_lock);
+	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
+
+		/* Vector is in use, end run. */
+		if (ioint_irqs[vector] != 0) {
+			run = 0;
+			first = 0;
+			continue;
+		}
+
+		/* Start a new run if run == 0 and vector is aligned. */
+		if (run == 0) {
+			if ((vector & (align - 1)) != 0)
+				continue;
+			first = vector;
+		}
+		run++;
+
+		/* Keep looping if the run isn't long enough yet. */
+		if (run < count)
+			continue;
+
+		/* Found a run, assign IRQs and return the first vector. */
+		for (vector = 0; vector < count; vector++)
+			ioint_irqs[first + vector] = irqs[vector];
+		mtx_unlock_spin(&icu_lock);
+		return (first + APIC_IO_INTS);
+	}
+	mtx_unlock_spin(&icu_lock);
+	printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count);
+	return (0);
+}
+
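
apic_alloc_vectors() is a first-fit search for an aligned run of free IDT slots. The same scan over a generic table, assuming a zero entry marks a free slot and that count and align are powers of two with align >= count, as the KASSERTs require:

	/*
	 * Sketch of the aligned first-fit scan: find 'count' consecutive
	 * free slots whose first index is 'align'-aligned.  Returns the
	 * first index, or -1 if no such run exists.
	 */
	static int
	find_aligned_run(u_int *table, u_int nslots, u_int count,
	    u_int align)
	{
		u_int first = 0, run = 0, i;

		for (i = 0; i < nslots; i++) {
			if (table[i] != 0) {	/* in use: restart */
				run = 0;
				continue;
			}
			if (run == 0) {		/* start must be aligned */
				if ((i & (align - 1)) != 0)
					continue;
				first = i;
			}
			if (++run == count)
				return (first);
		}
		return (-1);
	}
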
 void
 apic_enable_vector(u_int vector)
 {
@@ -721,6 +835,17 @@
 	    GSEL(GCODE_SEL, SEL_KPL));
 }
 
+void
+apic_disable_vector(u_int vector)
+{
+
+	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
+	KASSERT(ioint_handlers[vector / 32] != NULL,
+	    ("No ISR handler for vector %u", vector));
+	setidt(vector, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
+	    GSEL(GCODE_SEL, SEL_KPL));
+}
+
 /* Release an APIC vector when it's no longer in use. */
 void
 apic_free_vector(u_int vector, u_int irq)
@@ -753,18 +878,16 @@
 DB_SHOW_COMMAND(apic, db_show_apic)
 {
 	struct intsrc *isrc;
-	int quit, i, verbose;
+	int i, verbose;
 	u_int irq;
 
-	quit = 0;
 	if (strcmp(modif, "vv") == 0)
 		verbose = 2;
 	else if (strcmp(modif, "v") == 0)
 		verbose = 1;
 	else
 		verbose = 0;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-	for (i = 0; i < APIC_NUM_IOINTS + 1 && !quit; i++) {
+	for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
 		irq = ioint_irqs[i];
 		if (irq != 0 && irq != IRQ_SYSCALL) {
 			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
@@ -782,6 +905,76 @@
 		}
 	}
 }
+
+static void
+dump_mask(const char *prefix, uint32_t v, int base)
+{
+	int i, first;
+
+	first = 1;
+	for (i = 0; i < 32; i++)
+		if (v & (1 << i)) {
+			if (first) {
+				db_printf("%s:", prefix);
+				first = 0;
+			}
+			db_printf(" %02x", base + i);
+		}
+	if (!first)
+		db_printf("\n");
+}
+
+/* Show info from the lapic regs for this CPU. */
+DB_SHOW_COMMAND(lapic, db_show_lapic)
+{
+	uint32_t v;
+
+	db_printf("lapic ID = %d\n", lapic_id());
+	v = lapic->version;
+	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
+	    v & 0xf);
+	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
+	v = lapic->svr;
+	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
+	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
+	db_printf("TPR      = %02x\n", lapic->tpr);
+
+#define dump_field(prefix, index)					\
+	dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index,	\
+	    index * 32)
+
+	db_printf("In-service Interrupts:\n");
+	dump_field(isr, 0);
+	dump_field(isr, 1);
+	dump_field(isr, 2);
+	dump_field(isr, 3);
+	dump_field(isr, 4);
+	dump_field(isr, 5);
+	dump_field(isr, 6);
+	dump_field(isr, 7);
+
+	db_printf("TMR Interrupts:\n");
+	dump_field(tmr, 0);
+	dump_field(tmr, 1);
+	dump_field(tmr, 2);
+	dump_field(tmr, 3);
+	dump_field(tmr, 4);
+	dump_field(tmr, 5);
+	dump_field(tmr, 6);
+	dump_field(tmr, 7);
+
+	db_printf("IRR Interrupts:\n");
+	dump_field(irr, 0);
+	dump_field(irr, 1);
+	dump_field(irr, 2);
+	dump_field(irr, 3);
+	dump_field(irr, 4);
+	dump_field(irr, 5);
+	dump_field(irr, 6);
+	dump_field(irr, 7);
+
+#undef dump_field
+}
 #endif
 
 /*
@@ -871,12 +1064,8 @@
 	if (retval != 0)
 		printf("%s: Failed to setup the local APIC: returned %d\n",
 		    best_enum->apic_name, retval);
-#ifdef SMP
-	/* Last, setup the cpu topology now that we have probed CPUs */
-	mp_topology();
-#endif
 }
-SYSINIT(apic_init, SI_SUB_CPU, SI_ORDER_FIRST, apic_init, NULL)
+SYSINIT(apic_init, SI_SUB_CPU, SI_ORDER_SECOND, apic_init, NULL)
 
 /*
  * Setup the I/O APICs.
@@ -897,9 +1086,13 @@
 	 * Finish setting up the local APIC on the BSP once we know how to
 	 * properly program the LINT pins.
 	 */
-	lapic_setup();
+	lapic_setup(1);
+	intr_register_pic(&lapic_pic);
 	if (bootverbose)
 		lapic_dump("BSP");
+
+	/* Enable the MSI "pic". */
+	msi_init();
 }
 SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL)
 
Index: vm86.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/vm86.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/vm86.c -L sys/i386/i386/vm86.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/vm86.c
+++ sys/i386/i386/vm86.c
@@ -25,10 +25,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/vm86.c,v 1.57 2004/11/27 06:51:36 das Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/vm86.c,v 1.62 2006/12/17 05:07:01 kmacy Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -54,7 +55,7 @@
 extern int vm86_bioscall(struct vm86frame *);
 extern void vm86_biosret(struct vm86frame *);
 
-void vm86_prepcall(struct vm86frame);
+void vm86_prepcall(struct vm86frame *);
 
 struct system_map {
 	int		type;
@@ -505,46 +506,33 @@
 	panic("vm86_addpage: not enough room");
 }
 
-static void
-vm86_initflags(struct vm86frame *vmf)
-{
-	int eflags = vmf->vmf_eflags;
-	struct vm86_kernel *vm86 = &PCPU_GET(curpcb)->pcb_ext->ext_vm86;
-
-	if (vm86->vm86_has_vme) {
-		eflags = (vmf->vmf_eflags & ~VME_USERCHANGE) |
-		    (eflags & VME_USERCHANGE) | PSL_VM;
-	} else {
-		vm86->vm86_eflags = eflags;     /* save VIF, VIP */
-		eflags = (vmf->vmf_eflags & ~VM_USERCHANGE) |             
-		    (eflags & VM_USERCHANGE) | PSL_VM;
-	}
-	vmf->vmf_eflags = eflags | PSL_VM;
-}
-
 /*
  * called from vm86_bioscall, while in vm86 address space, to finalize setup.
  */
 void
-vm86_prepcall(struct vm86frame vmf)
+vm86_prepcall(struct vm86frame *vmf)
 {
 	uintptr_t addr[] = { 0xA00, 0x1000 };	/* code, stack */
 	u_char intcall[] = {
 		CLI, INTn, 0x00, STI, HLT
 	};
+	struct vm86_kernel *vm86;
 
-	if ((vmf.vmf_trapno & PAGE_MASK) <= 0xff) {
+	if ((vmf->vmf_trapno & PAGE_MASK) <= 0xff) {
 		/* interrupt call requested */
-        	intcall[2] = (u_char)(vmf.vmf_trapno & 0xff);
+		intcall[2] = (u_char)(vmf->vmf_trapno & 0xff);
 		memcpy((void *)addr[0], (void *)intcall, sizeof(intcall));
-		vmf.vmf_ip = addr[0];
-		vmf.vmf_cs = 0;
+		vmf->vmf_ip = addr[0];
+		vmf->vmf_cs = 0;
 	}
-	vmf.vmf_sp = addr[1] - 2;              /* keep aligned */
-	vmf.kernel_fs = vmf.kernel_es = vmf.kernel_ds = 0;
-	vmf.vmf_ss = 0;
-	vmf.vmf_eflags = PSL_VIF | PSL_VM | PSL_USER;
-	vm86_initflags(&vmf);
+	vmf->vmf_sp = addr[1] - 2;              /* keep aligned */
+	vmf->kernel_fs = vmf->kernel_es = vmf->kernel_ds = 0;
+	vmf->vmf_ss = 0;
+	vmf->vmf_eflags = PSL_VIF | PSL_VM | PSL_USER;
+
+	vm86 = &PCPU_GET(curpcb)->pcb_ext->ext_vm86;
+	if (!vm86->vm86_has_vme)
+		vm86->vm86_eflags = vmf->vmf_eflags;  /* save VIF, VIP */
 }
 
 /*
@@ -724,7 +712,7 @@
 	case VM86_INTCALL: {
 		struct vm86_intcall_args sa;
 
-		if ((error = suser(td)))
+		if ((error = priv_check(td, PRIV_VM86_INTCALL)))
 			return (error);
 		if ((error = copyin(ua.sub_args, &sa, sizeof(sa))))
 			return (error);
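
vm86_prepcall() now takes the frame by reference; the by-value version only worked because the argument area happened to alias the frame that doreti would later consume, and the pointer makes that mutation explicit (the matching caller change is in the vm86bios.s diff below). The idiom, with an illustrative frame type rather than the real struct vm86frame:

	/*
	 * Sketch: mutate a caller-visible frame through a pointer.
	 * The struct and its fields are illustrative only.
	 */
	struct frame_sketch {
		unsigned int	ip;	/* entry point */
		unsigned int	sp;	/* stack pointer */
	};

	static void
	prep_frame(struct frame_sketch *f, unsigned int entry,
	    unsigned int stack_top)
	{
		f->ip = entry;
		f->sp = stack_top - 2;	/* keep aligned, as above */
	}
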
Index: swtch.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/swtch.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/swtch.s -L sys/i386/i386/swtch.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/swtch.s
+++ sys/i386/i386/swtch.s
@@ -29,15 +29,32 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.148 2005/04/13 22:57:17 peter Exp $
+ * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.156 2007/08/22 05:06:14 jkoshy Exp $
  */
 
 #include "opt_npx.h"
+#include "opt_sched.h"
 
 #include <machine/asmacros.h>
 
 #include "assym.s"
 
+#if defined(SMP) && defined(SCHED_ULE)
+#define	SETOP		xchgl
+#define	BLOCK_SPIN(reg)							\
+		movl		$blocked_lock,%eax ;			\
+	100: ;								\
+		lock ;							\
+		cmpxchgl	%eax,TD_LOCK(reg) ;			\
+		jne		101f ;					\
+		pause ;							\
+		jmp		100b ;					\
+	101:
+#else
+#define	SETOP		movl
+#define	BLOCK_SPIN(reg)
+#endif
+
 /*****************************************************************************/
 /* Scheduling                                                                */
 /*****************************************************************************/
@@ -82,6 +99,7 @@
 #endif
 	btsl	%esi, PM_ACTIVE(%ebx)		/* set new */
 	jmp	sw1
+END(cpu_throw)
 
 /*
  * cpu_switch(old, new)
@@ -91,6 +109,7 @@
  * 0(%esp) = ret
  * 4(%esp) = oldtd
  * 8(%esp) = newtd
+ * 12(%esp) = newlock
  */
 ENTRY(cpu_switch)
 
@@ -114,12 +133,6 @@
 	movl	%gs,PCB_GS(%edx)
 	pushfl					/* PSL */
 	popl	PCB_PSL(%edx)
-	/* Check to see if we need to call a switchout function. */
-	movl	PCB_SWITCHOUT(%edx),%eax
-	cmpl	$0, %eax
-	je	1f
-	call	*%eax
-1:
 	/* Test if debug registers should be saved. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%edx)
 	jz      1f                              /* no, skip over */
@@ -151,14 +164,14 @@
 #endif
 
 	/* Save is done.  Now fire up new thread. Leave old vmspace. */
-	movl	%ecx,%edi
+	movl	4(%esp),%edi
 	movl	8(%esp),%ecx			/* New thread */
+	movl	12(%esp),%esi			/* New lock */
 #ifdef INVARIANTS
 	testl	%ecx,%ecx			/* no thread? */
 	jz	badsw3				/* no, panic */
 #endif
 	movl	TD_PCB(%ecx),%edx
-	movl	PCPU(CPUID), %esi
 
 	/* switch address space */
 	movl	PCB_CR3(%edx),%eax
@@ -167,11 +180,14 @@
 #else
 	cmpl	%eax,IdlePTD			/* Kernel address space? */
 #endif
-	je	sw1
+	je	sw0
 	movl	%cr3,%ebx			/* The same address space? */
 	cmpl	%ebx,%eax
-	je	sw1
+	je	sw0
 	movl	%eax,%cr3			/* new address space */
+	movl	%esi,%eax
+	movl	PCPU(CPUID),%esi
+	SETOP	%eax,TD_LOCK(%edi)		/* Switchout td_lock */
 
 	/* Release bit from old pmap->pm_active */
 	movl	PCPU(CURPMAP), %ebx
@@ -189,15 +205,19 @@
 	lock
 #endif
 	btsl	%esi, PM_ACTIVE(%ebx)		/* set new */
+	jmp	sw1
 
+sw0:
+	SETOP	%esi,TD_LOCK(%edi)		/* Switchout td_lock */
 sw1:
+	BLOCK_SPIN(%ecx)
 	/*
 	 * At this point, we've switched address spaces and are ready
 	 * to load up the rest of the next context.
 	 */
 	cmpl	$0, PCB_EXT(%edx)		/* has pcb extension? */
 	je	1f				/* If not, use the default */
-	btsl	%esi, private_tss		/* mark use of private tss */
+	movl	$1, PCPU(PRIVATE_TSS) 		/* mark use of private tss */
 	movl	PCB_EXT(%edx), %edi		/* new tss descriptor */
 	jmp	2f				/* Load it up */
 
@@ -213,8 +233,9 @@
 	 * Test this CPU's  bit in the bitmap to see if this
 	 * CPU was using a private TSS.
 	 */
-	btrl	%esi, private_tss		/* Already using the common? */
-	jae	3f				/* if so, skip reloading */
+	cmpl	$0, PCPU(PRIVATE_TSS)		/* Already using the common? */
+	je	3f				/* if so, skip reloading */
+	movl	$0, PCPU(PRIVATE_TSS)
 	PCPU_ADDR(COMMON_TSSD, %edi)
 2:
 	/* Move correct tss descriptor into GDT slot, then reload tr. */
@@ -223,7 +244,7 @@
 	movl	4(%edi), %esi
 	movl	%eax, 0(%ebx)
 	movl	%esi, 4(%ebx)
-	movl	$GPROC0_SEL*8, %esi		/* GSEL(entry, SEL_KPL) */
+	movl	$GPROC0_SEL*8, %esi		/* GSEL(GPROC0_SEL, SEL_KPL) */
 	ltr	%si
 3:
 
@@ -251,6 +272,7 @@
 	popfl
 
 	movl	%edx, PCPU(CURPCB)
+	movl	TD_TID(%ecx),%eax
 	movl	%ecx, PCPU(CURTHREAD)		/* into next thread */
 
 	/*
@@ -327,6 +349,7 @@
 	call	panic
 sw0_3:	.asciz	"cpu_switch: no newthread supplied"
 #endif
+END(cpu_switch)
 
 /*
  * savectx(pcb)
@@ -392,3 +415,4 @@
 #endif	/* DEV_NPX */
 
 	ret
+END(savectx)
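
BLOCK_SPIN is the ULE thread-lock handoff: spin until the previous CPU stops pointing td_lock at the blocked_lock sentinel. A C rendering of the loop, using C11 atomics in place of the lock; cmpxchgl / pause sequence (a plain atomic load loop would behave the same, since the compare-exchange only ever rewrites the sentinel value):

	/*
	 * Sketch of BLOCK_SPIN: the compare-exchange succeeds (and
	 * harmlessly rewrites the sentinel) while td_lock still holds
	 * blocked_lock, and fails once the previous owner releases the
	 * thread by storing the new lock pointer.
	 */
	#include <stdatomic.h>

	extern void *blocked_lock;	/* scheduler sentinel */

	static void
	block_spin(void *_Atomic *td_lock)
	{
		void *expected;

		do {
			expected = blocked_lock;
			/* the 'pause' spin hint belongs here */
		} while (atomic_compare_exchange_strong(td_lock,
		    &expected, blocked_lock));
	}
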
--- /dev/null
+++ sys/i386/i386/bpf_jit_machdep.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2002 - 2003 NetGroup, Politecnico di Torino (Italy)
+ * Copyright (c) 2005 Jung-uk Kim <jkim at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Politecnico di Torino nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/i386/i386/bpf_jit_machdep.c,v 1.4 2006/01/03 20:26:02 jkim Exp $");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/malloc.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpf_jitter.h>
+
+#include <i386/i386/bpf_jit_machdep.h>
+
+bpf_filter_func	bpf_jit_compile(struct bpf_insn *, u_int, int *);
+
+/*
+ * emit routine to update the jump table
+ */
+static void
+emit_length(bpf_bin_stream *stream, u_int value, u_int len)
+{
+
+	(stream->refs)[stream->bpf_pc] += len;
+	stream->cur_ip += len;
+}
+
+/*
+ * emit routine to output the actual binary code
+ */
+static void
+emit_code(bpf_bin_stream *stream, u_int value, u_int len)
+{
+
+	switch (len) {
+	case 1:
+		stream->ibuf[stream->cur_ip] = (u_char)value;
+		stream->cur_ip++;
+		break;
+
+	case 2:
+		*((u_short *)(stream->ibuf + stream->cur_ip)) = (u_short)value;
+		stream->cur_ip += 2;
+		break;
+
+	case 4:
+		*((u_int *)(stream->ibuf + stream->cur_ip)) = value;
+		stream->cur_ip += 4;
+		break;
+	}
+
+	return;
+}
+
+/*
+ * Translate the BPF program into i386 machine code
+ */
+bpf_filter_func
+bpf_jit_compile(struct bpf_insn *prog, u_int nins, int *mem)
+{
+	struct bpf_insn *ins;
+	u_int i, pass;
+	bpf_bin_stream stream;
+
+	/*
+	 * NOTE: do not modify the name of this variable, as it's used by
+	 * the macros to emit code.
+	 */
+	emit_func emitm;
+
+	/* Do not compile an empty filter. */
+	if (nins == 0)
+		return NULL;
+
+	/* Allocate the reference table for the jumps */
+	stream.refs = (u_int *)malloc((nins + 1) * sizeof(u_int),
+	    M_BPFJIT, M_NOWAIT);
+	if (stream.refs == NULL)
+		return NULL;
+
+	/* Reset the reference table */
+	for (i = 0; i < nins + 1; i++)
+		stream.refs[i] = 0;
+
+	stream.cur_ip = 0;
+	stream.bpf_pc = 0;
+
+	/*
+	 * the first pass will emit the lengths of the instructions
+	 * to create the reference table
+	 */
+	emitm = emit_length;
+
+	pass = 0;
+	for (;;) {
+		ins = prog;
+
+		/* create the procedure header */
+		PUSH(EBP);
+		MOVrd(EBP, ESP);
+		PUSH(EDI);
+		PUSH(ESI);
+		PUSH(EBX);
+		MOVodd(EBX, EBP, 8);
+
+		for (i = 0; i < nins; i++) {
+			stream.bpf_pc++;
+
+			switch (ins->code) {
+			default:
+				return NULL;
+
+			case BPF_RET|BPF_K:
+				MOVid(EAX, ins->k);
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				break;
+
+			case BPF_RET|BPF_A:
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				break;
+
+			case BPF_LD|BPF_W|BPF_ABS:
+				MOVid(ECX, ins->k);
+				MOVrd(ESI, ECX);
+				ADDib(ECX, sizeof(int));
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(7);
+				ZERO_EAX();
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobd(EAX, EBX, ESI);
+				BSWAP(EAX);
+				break;
+
+			case BPF_LD|BPF_H|BPF_ABS:
+				ZERO_EAX();
+				MOVid(ECX, ins->k);
+				MOVrd(ESI, ECX);
+				ADDib(ECX, sizeof(short));
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(5);
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobw(AX, EBX, ESI);
+				SWAP_AX();
+				break;
+
+			case BPF_LD|BPF_B|BPF_ABS:
+				ZERO_EAX();
+				MOVid(ECX, ins->k);
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(5);
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobb(AL, EBX, ECX);
+				break;
+
+			case BPF_LD|BPF_W|BPF_LEN:
+				MOVodd(EAX, EBP, 0xc);
+				break;
+
+			case BPF_LDX|BPF_W|BPF_LEN:
+				MOVodd(EDX, EBP, 0xc);
+				break;
+
+			case BPF_LD|BPF_W|BPF_IND:
+				MOVid(ECX, ins->k);
+				ADDrd(ECX, EDX);
+				MOVrd(ESI, ECX);
+				ADDib(ECX, sizeof(int));
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(7);
+				ZERO_EAX();
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobd(EAX, EBX, ESI);
+				BSWAP(EAX);
+				break;
+
+			case BPF_LD|BPF_H|BPF_IND:
+				ZERO_EAX();
+				MOVid(ECX, ins->k);
+				ADDrd(ECX, EDX);
+				MOVrd(ESI, ECX);
+				ADDib(ECX, sizeof(short));
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(5);
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobw(AX, EBX, ESI);
+				SWAP_AX();
+				break;
+
+			case BPF_LD|BPF_B|BPF_IND:
+				ZERO_EAX();
+				MOVid(ECX, ins->k);
+				ADDrd(ECX, EDX);
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(5);
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVobb(AL, EBX, ECX);
+				break;
+
+			case BPF_LDX|BPF_MSH|BPF_B:
+				MOVid(ECX, ins->k);
+				CMPodd(ECX, EBP, 0x10);
+				JLEb(7);
+				ZERO_EAX();
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				ZERO_EDX();
+				MOVobb(DL, EBX, ECX);
+				ANDib(DL, 0xf);
+				SHLib(EDX, 2);
+				break;
+
+			case BPF_LD|BPF_IMM:
+				MOVid(EAX, ins->k);
+				break;
+
+			case BPF_LDX|BPF_IMM:
+				MOVid(EDX, ins->k);
+				break;
+
+			case BPF_LD|BPF_MEM:
+				MOVid(ECX, (uintptr_t)mem);
+				MOVid(ESI, ins->k * 4);
+				MOVobd(EAX, ECX, ESI);
+				break;
+
+			case BPF_LDX|BPF_MEM:
+				MOVid(ECX, (uintptr_t)mem);
+				MOVid(ESI, ins->k * 4);
+				MOVobd(EDX, ECX, ESI);
+				break;
+
+			case BPF_ST:
+				/*
+				 * XXX this command and the following could
+				 * be optimized if the previous instruction
+				 * was already of this type
+				 */
+				MOVid(ECX, (uintptr_t)mem);
+				MOVid(ESI, ins->k * 4);
+				MOVomd(ECX, ESI, EAX);
+				break;
+
+			case BPF_STX:
+				MOVid(ECX, (uintptr_t)mem);
+				MOVid(ESI, ins->k * 4);
+				MOVomd(ECX, ESI, EDX);
+				break;
+
+			case BPF_JMP|BPF_JA:
+				JMP(stream.refs[stream.bpf_pc + ins->k] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JGT|BPF_K:
+				CMPid(EAX, ins->k);
+				/* 5 is the size of the following JMP */
+				JG(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JGE|BPF_K:
+				CMPid(EAX, ins->k);
+				JGE(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JEQ|BPF_K:
+				CMPid(EAX, ins->k);
+				JE(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JSET|BPF_K:
+				MOVrd(ECX, EAX);
+				ANDid(ECX, ins->k);
+				JE(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JGT|BPF_X:
+				CMPrd(EAX, EDX);
+				JA(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JGE|BPF_X:
+				CMPrd(EAX, EDX);
+				JAE(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JEQ|BPF_X:
+				CMPrd(EAX, EDX);
+				JE(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_JMP|BPF_JSET|BPF_X:
+				MOVrd(ECX, EAX);
+				ANDrd(ECX, EDX);
+				JE(stream.refs[stream.bpf_pc + ins->jf] -
+				    stream.refs[stream.bpf_pc] + 5);
+				JMP(stream.refs[stream.bpf_pc + ins->jt] -
+				    stream.refs[stream.bpf_pc]);
+				break;
+
+			case BPF_ALU|BPF_ADD|BPF_X:
+				ADDrd(EAX, EDX);
+				break;
+
+			case BPF_ALU|BPF_SUB|BPF_X:
+				SUBrd(EAX, EDX);
+				break;
+
+			case BPF_ALU|BPF_MUL|BPF_X:
+				MOVrd(ECX, EDX);
+				MULrd(EDX);
+				MOVrd(EDX, ECX);
+				break;
+
+			case BPF_ALU|BPF_DIV|BPF_X:
+				CMPid(EDX, 0);
+				JNEb(7);
+				ZERO_EAX();
+				POP(EBX);
+				POP(ESI);
+				POP(EDI);
+				LEAVE_RET();
+				MOVrd(ECX, EDX);
+				ZERO_EDX();
+				DIVrd(ECX);
+				MOVrd(EDX, ECX);
+				break;
+
+			case BPF_ALU|BPF_AND|BPF_X:
+				ANDrd(EAX, EDX);
+				break;
+
+			case BPF_ALU|BPF_OR|BPF_X:
+				ORrd(EAX, EDX);
+				break;
+
+			case BPF_ALU|BPF_LSH|BPF_X:
+				MOVrd(ECX, EDX);
+				SHL_CLrb(EAX);
+				break;
+
+			case BPF_ALU|BPF_RSH|BPF_X:
+				MOVrd(ECX, EDX);
+				SHR_CLrb(EAX);
+				break;
+
+			case BPF_ALU|BPF_ADD|BPF_K:
+				ADD_EAXi(ins->k);
+				break;
+
+			case BPF_ALU|BPF_SUB|BPF_K:
+				SUB_EAXi(ins->k);
+				break;
+
+			case BPF_ALU|BPF_MUL|BPF_K:
+				MOVrd(ECX, EDX);
+				MOVid(EDX, ins->k);
+				MULrd(EDX);
+				MOVrd(EDX, ECX);
+				break;
+
+			case BPF_ALU|BPF_DIV|BPF_K:
+				MOVrd(ECX, EDX);
+				ZERO_EDX();
+				MOVid(ESI, ins->k);
+				DIVrd(ESI);
+				MOVrd(EDX, ECX);
+				break;
+
+			case BPF_ALU|BPF_AND|BPF_K:
+				ANDid(EAX, ins->k);
+				break;
+
+			case BPF_ALU|BPF_OR|BPF_K:
+				ORid(EAX, ins->k);
+				break;
+
+			case BPF_ALU|BPF_LSH|BPF_K:
+				SHLib(EAX, (ins->k) & 255);
+				break;
+
+			case BPF_ALU|BPF_RSH|BPF_K:
+				SHRib(EAX, (ins->k) & 255);
+				break;
+
+			case BPF_ALU|BPF_NEG:
+				NEGd(EAX);
+				break;
+
+			case BPF_MISC|BPF_TAX:
+				MOVrd(EDX, EAX);
+				break;
+
+			case BPF_MISC|BPF_TXA:
+				MOVrd(EAX, EDX);
+				break;
+			}
+			ins++;
+		}
+
+		pass++;
+		if (pass == 2)
+			break;
+
+		stream.ibuf = (char *)malloc(stream.cur_ip, M_BPFJIT, M_NOWAIT);
+		if (stream.ibuf == NULL) {
+			free(stream.refs, M_BPFJIT);
+			return NULL;
+		}
+
+		/*
+		 * modify the reference table to contain the offsets and
+		 * not the lengths of the instructions
+		 */
+		for (i = 1; i < nins + 1; i++)
+			stream.refs[i] += stream.refs[i - 1];
+
+		/* Reset the counters */
+		stream.cur_ip = 0;
+		stream.bpf_pc = 0;
+
+		/* the second pass creates the actual code */
+		emitm = emit_code;
+	}
+
+	/*
+	 * the reference table is needed only during compilation,
+	 * now we can free it
+	 */
+	free(stream.refs, M_BPFJIT);
+
+	return (bpf_filter_func)stream.ibuf;
+}
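
bpf_jit_compile() runs the translation loop twice with two emitters: emit_length() only advances counters so the jump reference table and the code size can be computed, then emit_code() writes real bytes into a buffer of exactly that size. The pattern, reduced to a byte-copy "translation":

	/*
	 * Sketch of the two-pass emit pattern: pass 1 measures, pass 2
	 * writes into a buffer sized by pass 1.  The per-instruction
	 * translation is reduced to copying one byte.
	 */
	#include <stdlib.h>

	typedef void emit_fn(unsigned char *buf, size_t *ip,
	    unsigned char byte);

	static void
	emit_len(unsigned char *buf, size_t *ip, unsigned char byte)
	{
		(void)buf; (void)byte;
		(*ip)++;			/* measure only */
	}

	static void
	emit_byte(unsigned char *buf, size_t *ip, unsigned char byte)
	{
		buf[(*ip)++] = byte;		/* write for real */
	}

	static unsigned char *
	two_pass_compile(const unsigned char *prog, size_t n)
	{
		emit_fn *emit = emit_len;
		unsigned char *buf = NULL;
		size_t ip, i;
		int pass;

		for (pass = 0; pass < 2; pass++) {
			ip = 0;
			for (i = 0; i < n; i++)
				emit(buf, &ip, prog[i]);
			if (pass == 0) {
				buf = malloc(ip);
				if (buf == NULL)
					return (NULL);
				emit = emit_byte;
			}
		}
		return (buf);
	}
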
Index: identcpu.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/identcpu.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/i386/i386/identcpu.c -L sys/i386/i386/identcpu.c -u -r1.4 -r1.5
--- sys/i386/i386/identcpu.c
+++ sys/i386/i386/identcpu.c
@@ -39,13 +39,14 @@
  */
 
 #include <sys/cdefs.h>
-/*$FreeBSD: src/sys/i386/i386/identcpu.c,v 1.145.2.5 2006/08/08 08:41:34 mr Exp $ */
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: src/sys/i386/i386/identcpu.c,v 1.180 2007/05/29 19:39:18 des Exp $");
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
@@ -75,14 +76,21 @@
 void panicifcpuunsupported(void);
 
 static void identifycyrix(void);
+static void init_exthigh(void);
+void setPQL2(int *const size, int *const ways);
+static void setPQL2_AMD(int *const size, int *const ways);
+static void setPQL2_INTEL(int *const size, int *const ways);
+static void get_INTEL_TLB(u_int data, int *const size, int *const ways);
 static void print_AMD_info(void);
+static void print_INTEL_info(void);
+static void print_INTEL_TLB(u_int data);
 static void print_AMD_assoc(int i);
 static void print_transmeta_info(void);
 
 int	cpu_class;
 u_int	cpu_exthigh;		/* Highest arg to extended CPUID */
 u_int	cyrix_did;		/* Device ID of Cyrix CPU */
-char machine[] = "i386";
+char machine[] = MACHINE;
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, 
     machine, 0, "Machine class");
 
@@ -94,7 +102,6 @@
 SYSCTL_INT(_hw, OID_AUTO, clockrate, CTLFLAG_RD, 
     &hw_clockrate, 0, "CPU instruction clock rate");
 
-#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 static char cpu_brand[48];
 
 #define	MAX_BRAND_INDEX	8
@@ -110,7 +117,6 @@
 	NULL,
 	"Intel Pentium 4"
 };
-#endif
 
 static struct {
 	char	*cpu_name;
@@ -139,57 +145,53 @@
 int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
 #endif
 
+static void
+init_exthigh(void)
+{
+	static int done = 0;
+	u_int regs[4];
+
+	if (done == 0) {
+		if (cpu_high > 0 &&
+		    (strcmp(cpu_vendor, "GenuineIntel") == 0 ||
+		    strcmp(cpu_vendor, "AuthenticAMD") == 0 ||
+		    strcmp(cpu_vendor, "GenuineTMx86") == 0 ||
+		    strcmp(cpu_vendor, "TransmetaCPU") == 0 ||
+		    strcmp(cpu_vendor, "Geode by NSC") == 0)) {
+			do_cpuid(0x80000000, regs);
+			if (regs[0] >= 0x80000000)
+				cpu_exthigh = regs[0];
+		}
+
+		done = 1;
+	}
+}
+
 void
 printcpuinfo(void)
 {
-#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 	u_int regs[4], i;
 	char *brand;
-#endif
 
 	cpu_class = i386_cpus[cpu].cpu_class;
 	printf("CPU: ");
 	strncpy(cpu_model, i386_cpus[cpu].cpu_name, sizeof (cpu_model));
 
-#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 	/* Check for extended CPUID information and a processor name. */
-	if (cpu_high > 0 &&
-	    (strcmp(cpu_vendor, "GenuineIntel") == 0 ||
-	    strcmp(cpu_vendor, "AuthenticAMD") == 0 ||
-	    strcmp(cpu_vendor, "GenuineTMx86") == 0 ||
-	    strcmp(cpu_vendor, "TransmetaCPU") == 0 ||
-	    strcmp(cpu_vendor, "Geode by NSC") == 0)) {
-		do_cpuid(0x80000000, regs);
-		if (regs[0] >= 0x80000000) {
-			cpu_exthigh = regs[0];
-			if (cpu_exthigh >= 0x80000004) {
-				brand = cpu_brand;
-				for (i = 0x80000002; i < 0x80000005; i++) {
-					do_cpuid(i, regs);
-					memcpy(brand, regs, sizeof(regs));
-					brand += sizeof(regs);
-				}
-			}
-		}
-	}
-
-	/* Detect AMD features (PTE no-execute bit, 3dnow, 64 bit mode etc) */
-	if (strcmp(cpu_vendor, "GenuineIntel") == 0 ||
-	    strcmp(cpu_vendor, "AuthenticAMD") == 0) {
-		if (cpu_exthigh >= 0x80000001) {
-			do_cpuid(0x80000001, regs);
-			amd_feature = regs[3] & ~(cpu_feature & 0x0183f3ff);
-			amd_feature2 = regs[2];
-		}
-		if (cpu_exthigh >= 0x80000008) {
-			do_cpuid(0x80000008, regs);
-			cpu_procinfo2 = regs[2];
+	init_exthigh();
+	if (cpu_exthigh >= 0x80000004) {
+		brand = cpu_brand;
+		for (i = 0x80000002; i < 0x80000005; i++) {
+			do_cpuid(i, regs);
+			memcpy(brand, regs, sizeof(regs));
+			brand += sizeof(regs);
 		}
 	}
 
 	if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
 		if ((cpu_id & 0xf00) > 0x300) {
 			u_int brand_index;
+			u_int model;
 
 			cpu_model[0] = '\0';
 
@@ -302,6 +304,16 @@
 			case 0xf00:
 				strcat(cpu_model, "Pentium 4");
 				cpu = CPU_P4;
+				model = (cpu_id & 0x0f0) >> 4;
+				if (model == 3 || model == 4 || model == 6) {
+					uint64_t tmp;
+
+					tmp = rdmsr(MSR_IA32_MISC_ENABLE);
+					wrmsr(MSR_IA32_MISC_ENABLE,
+					      tmp & ~(1LL << 22));
+					do_cpuid(0, regs);
+					cpu_high = regs[0];
+				}
 				break;
 			default:
 				strcat(cpu_model, "unknown");
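
The new model 3/4/6 branch undoes the BIOS "limit CPUID maxval" setting: clearing bit 22 of IA32_MISC_ENABLE re-enables the full basic leaf range, so leaf 0 is re-read to refresh cpu_high. That step in isolation, assuming MSR_IA32_MISC_ENABLE from <machine/specialreg.h>:

	/*
	 * Sketch: clear the "limit CPUID maxval" bit (bit 22) of
	 * IA32_MISC_ENABLE, then re-read the highest basic leaf.
	 */
	static u_int
	uncap_cpuid_maxval(void)
	{
		u_int regs[4];
		uint64_t misc;

		misc = rdmsr(MSR_IA32_MISC_ENABLE);
		wrmsr(MSR_IA32_MISC_ENABLE, misc & ~(1ULL << 22));
		do_cpuid(0, regs);
		return (regs[0]);	/* new cpu_high */
	}
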
@@ -374,6 +386,14 @@
 		case 0x590:
 			strcat(cpu_model, "K6-III");
 			break;
+		case 0x5a0:
+			strcat(cpu_model, "Geode LX");
+			/*
+			 * Make sure the TSC runs through suspension,
+			 * otherwise we can't use it as timecounter
+			 */
+			wrmsr(0x1900, rdmsr(0x1900) | 0x20ULL);
+			break;
 		default:
 			strcat(cpu_model, "Unknown");
 			break;
@@ -575,16 +595,12 @@
 				i = 0;
 			if (i & VIA_CPUID_HAS_RNG)
 				strcat(cpu_model, "+RNG");
-
 			if (i & VIA_CPUID_HAS_ACE)
 				strcat(cpu_model, "+AES");
-
 			if (i & VIA_CPUID_HAS_ACE2)
 				strcat(cpu_model, "+AES-CTR");
-
 			if (i & VIA_CPUID_HAS_PHE)
 				strcat(cpu_model, "+SHA1+SHA256");
-
 			if (i & VIA_CPUID_HAS_PMM)
 				strcat(cpu_model, "+RSA");
 			break;
@@ -616,8 +632,6 @@
 	if (*brand != '\0')
 		strcpy(cpu_model, brand);
 
-#endif
-
 	printf("%s (", cpu_model);
 	switch(cpu_class) {
 	case CPUCLASS_286:
@@ -654,7 +668,6 @@
 		printf("Unknown");	/* will panic below... */
 	}
 	printf("-class CPU)\n");
-#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 	if(*cpu_vendor)
 		printf("  Origin = \"%s\"",cpu_vendor);
 	if(cpu_id)
@@ -858,7 +871,8 @@
 			else if (strcmp(cpu_vendor, "GenuineIntel") == 0 &&
 			    (cpu_high >= 4)) {
 				cpuid_count(4, 0, regs);
-				cmp = ((regs[0] & 0xfc000000) >> 26) + 1;
+				if ((regs[0] & 0x1f) != 0)
+					cmp = ((regs[0] >> 26) & 0x3f) + 1;
 			}
 			if (cmp > 1)
 				printf("\n  Cores per package: %d", cmp);
@@ -879,24 +893,16 @@
 	if (*cpu_vendor || cpu_id)
 		printf("\n");
 
-#endif
-
 	if (!bootverbose)
 		return;
 
 	if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
 		print_AMD_info();
+	else if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+		print_INTEL_info();
 	else if (strcmp(cpu_vendor, "GenuineTMx86") == 0 ||
 		 strcmp(cpu_vendor, "TransmetaCPU") == 0)
 		print_transmeta_info();
-
-#ifdef I686_CPU
-	/*
-	 * XXX - Do PPro CPUID level=2 stuff here?
-	 *
-	 * No, but maybe in a print_Intel_info() function called from here.
-	 */
-#endif
 }
 
 void
@@ -1062,6 +1068,21 @@
 	write_eflags(eflags);
 }
 
+/* Update hw_clockrate with the value indicated by the caller. */
+static void
+tsc_freq_changed(void *arg, const struct cf_level *level, int status)
+{
+	/* If there was an error during the transition, don't do anything. */
+	if (status != 0)
+		return;
+
+	/* Total setting for this level gives the new frequency in MHz. */
+	hw_clockrate = level->total_set.freq;
+}
+
+EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
+    EVENTHANDLER_PRI_ANY);
+
 /*
  * Final stage of CPU identification. -- Should I check TI?
  */
@@ -1072,7 +1093,20 @@
 	u_char	ccr3;
 	u_int	regs[4];
 
-	if (strcmp(cpu_vendor, "CyrixInstead") == 0) {
+	/* Detect AMD features (PTE no-execute bit, 3dnow, 64 bit mode etc) */
+	if (strcmp(cpu_vendor, "GenuineIntel") == 0 ||
+	    strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+		init_exthigh();
+		if (cpu_exthigh >= 0x80000001) {
+			do_cpuid(0x80000001, regs);
+			amd_feature = regs[3] & ~(cpu_feature & 0x0183f3ff);
+			amd_feature2 = regs[2];
+		}
+		if (cpu_exthigh >= 0x80000008) {
+			do_cpuid(0x80000008, regs);
+			cpu_procinfo2 = regs[2];
+		}
+	} else if (strcmp(cpu_vendor, "CyrixInstead") == 0) {
 		if (cpu == CPU_486) {
 			/*
 			 * These conditions are equivalent to:
@@ -1231,6 +1265,506 @@
 }
 
 static void
+print_INTEL_info(void)
+{
+	u_int regs[4];
+	u_int rounds, regnum;
+	u_int nwaycode, nway;
+
+	if (cpu_high >= 2) {
+		rounds = 0;
+		do {
+			do_cpuid(0x2, regs);
+			if (rounds == 0 && (rounds = (regs[0] & 0xff)) == 0)
+				break;	/* we have a buggy CPU */
+
+			for (regnum = 0; regnum <= 3; ++regnum) {
+				if (regs[regnum] & (1<<31))
+					continue;
+				if (regnum != 0)
+					print_INTEL_TLB(regs[regnum] & 0xff);
+				print_INTEL_TLB((regs[regnum] >> 8) & 0xff);
+				print_INTEL_TLB((regs[regnum] >> 16) & 0xff);
+				print_INTEL_TLB((regs[regnum] >> 24) & 0xff);
+			}
+		} while (--rounds > 0);
+	}
+
+	if (cpu_exthigh >= 0x80000006) {
+		do_cpuid(0x80000006, regs);
+		nwaycode = (regs[2] >> 12) & 0x0f;
+		if (nwaycode >= 0x02 && nwaycode <= 0x08)
+			nway = 1 << (nwaycode / 2);
+		else
+			nway = 0;
+		printf("\nL2 cache: %u kbytes, %u-way associative, %u bytes/line",
+		    (regs[2] >> 16) & 0xffff, nway, regs[2] & 0xff);
+	}
+
+	printf("\n");
+}
+
+static void
+print_INTEL_TLB(u_int data)
+{
+	switch (data) {
+	case 0x0:
+	case 0x40:
+	default:
+		break;
+	case 0x1:
+		printf("\nInstruction TLB: 4 KB pages, 4-way set associative, 32 entries");
+		break;
+	case 0x2:
+		printf("\nInstruction TLB: 4 MB pages, fully associative, 2 entries");
+		break;
+	case 0x3:
+		printf("\nData TLB: 4 KB pages, 4-way set associative, 64 entries");
+		break;
+	case 0x4:
+		printf("\nData TLB: 4 MB Pages, 4-way set associative, 8 entries");
+		break;
+	case 0x6:
+		printf("\n1st-level instruction cache: 8 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x8:
+		printf("\n1st-level instruction cache: 16 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0xa:
+		printf("\n1st-level data cache: 8 KB, 2-way set associative, 32 byte line size");
+		break;
+	case 0xc:
+		printf("\n1st-level data cache: 16 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x22:
+		printf("\n3rd-level cache: 512 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x23:
+		printf("\n3rd-level cache: 1 MB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x25:
+		printf("\n3rd-level cache: 2 MB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x29:
+		printf("\n3rd-level cache: 4 MB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x2c:
+		printf("\n1st-level data cache: 32 KB, 8-way set associative, 64 byte line size");
+		break;
+	case 0x30:
+		printf("\n1st-level instruction cache: 32 KB, 8-way set associative, 64 byte line size");
+		break;
+	case 0x39:
+		printf("\n2nd-level cache: 128 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x3b:
+		printf("\n2nd-level cache: 128 KB, 2-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x3c:
+		printf("\n2nd-level cache: 256 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x41:
+		printf("\n2nd-level cache: 128 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x42:
+		printf("\n2nd-level cache: 256 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x43:
+		printf("\n2nd-level cache: 512 KB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x44:
+		printf("\n2nd-level cache: 1 MB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x45:
+		printf("\n2nd-level cache: 2 MB, 4-way set associative, 32 byte line size");
+		break;
+	case 0x46:
+		printf("\n3rd-level cache: 4 MB, 4-way set associative, 64 byte line size");
+		break;
+	case 0x47:
+		printf("\n3rd-level cache: 8 MB, 8-way set associative, 64 byte line size");
+		break;
+	case 0x50:
+		printf("\nInstruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 64 entries");
+		break;
+	case 0x51:
+		printf("\nInstruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 128 entries");
+		break;
+	case 0x52:
+		printf("\nInstruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 256 entries");
+		break;
+	case 0x5b:
+		printf("\nData TLB: 4 KB or 4 MB pages, fully associative, 64 entries");
+		break;
+	case 0x5c:
+		printf("\nData TLB: 4 KB or 4 MB pages, fully associative, 128 entries");
+		break;
+	case 0x5d:
+		printf("\nData TLB: 4 KB or 4 MB pages, fully associative, 256 entries");
+		break;
+	case 0x60:
+		printf("\n1st-level data cache: 16 KB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x66:
+		printf("\n1st-level data cache: 8 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x67:
+		printf("\n1st-level data cache: 16 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x68:
+		printf("\n1st-level data cache: 32 KB, 4-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x70:
+		printf("\nTrace cache: 12K-uops, 8-way set associative");
+		break;
+	case 0x71:
+		printf("\nTrace cache: 16K-uops, 8-way set associative");
+		break;
+	case 0x72:
+		printf("\nTrace cache: 32K-uops, 8-way set associative");
+		break;
+	case 0x78:
+		printf("\n2nd-level cache: 1 MB, 4-way set associative, 64-byte line size");
+		break;
+	case 0x79:
+		printf("\n2nd-level cache: 128 KB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x7a:
+		printf("\n2nd-level cache: 256 KB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x7b:
+		printf("\n2nd-level cache: 512 KB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x7c:
+		printf("\n2nd-level cache: 1 MB, 8-way set associative, sectored cache, 64 byte line size");
+		break;
+	case 0x7d:
+		printf("\n2nd-level cache: 2 MB, 8-way set associative, 64-byte line size");
+		break;
+	case 0x7f:
+		printf("\n2nd-level cache: 512 KB, 2-way set associative, 64-byte line size");
+		break;
+	case 0x82:
+		printf("\n2nd-level cache: 256 KB, 8-way set associative, 32 byte line size");
+		break;
+	case 0x83:
+		printf("\n2nd-level cache: 512 KB, 8-way set associative, 32 byte line size");
+		break;
+	case 0x84:
+		printf("\n2nd-level cache: 1 MB, 8-way set associative, 32 byte line size");
+		break;
+	case 0x85:
+		printf("\n2nd-level cache: 2 MB, 8-way set associative, 32 byte line size");
+		break;
+	case 0x86:
+		printf("\n2nd-level cache: 512 KB, 4-way set associative, 64 byte line size");
+		break;
+	case 0x87:
+		printf("\n2nd-level cache: 1 MB, 8-way set associative, 64 byte line size");
+		break;
+	case 0xb0:
+		printf("\nInstruction TLB: 4 KB Pages, 4-way set associative, 128 entries");
+		break;
+	case 0xb3:
+		printf("\nData TLB: 4 KB Pages, 4-way set associative, 128 entries");
+		break;
+	}
+}
+
+
+static void
+setPQL2_AMD(int *const size, int *const ways) {
+	if (cpu_exthigh >= 0x80000006) {
+		u_int regs[4];
+
+		do_cpuid(0x80000006, regs);
+		*size = regs[2] >> 16;
+		*ways = (regs[2] >> 12) & 0x0f;
+	}
+}
+
+
+static void
+setPQL2_INTEL(int *const size, int *const ways)
+{
+	u_int rounds, regnum;
+	u_int regs[4];
+	u_int nwaycode;
+
+	if (cpu_high >= 2) {
+		rounds = 0;
+		do {
+			do_cpuid(0x2, regs);
+			if (rounds == 0 && (rounds = (regs[0] & 0xff)) == 0)
+				break;	/* we have a buggy CPU */
+
+			for (regnum = 0; regnum <= 3; ++regnum) {
+				if (regs[regnum] & (1<<31))
+					continue;
+				if (regnum != 0)
+					get_INTEL_TLB(regs[regnum] & 0xff,
+					    size, ways);
+				get_INTEL_TLB((regs[regnum] >> 8) & 0xff,
+				    size, ways);
+				get_INTEL_TLB((regs[regnum] >> 16) & 0xff,
+				    size, ways);
+				get_INTEL_TLB((regs[regnum] >> 24) & 0xff,
+				    size, ways);
+			}
+		} while (--rounds > 0);
+	}
+
+	if (cpu_exthigh >= 0x80000006) {
+		do_cpuid(0x80000006, regs);
+		if (*size < ((regs[2] >> 16) & 0xffff)) {
+			*size = (regs[2] >> 16) & 0xffff;
+			nwaycode = (regs[2] >> 12) & 0x0f;
+			if (nwaycode >= 0x02 && nwaycode <= 0x08)
+				*ways = 1 << (nwaycode / 2);
+			else
+				*ways = 0;
+		}
+	}
+}
+
+static void
+get_INTEL_TLB(u_int data, int *const size, int *const ways)
+{
+	switch (data) {
+	default:
+		break;
+	case 0x22:
+		/* 3rd-level cache: 512 KB, 4-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 4;
+		}
+		break;
+	case 0x23:
+		/* 3rd-level cache: 1 MB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 8;
+		}
+		break;
+	case 0x25:
+		/* 3rd-level cache: 2 MB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 2048) {
+			*size = 2048;
+			*ways = 8;
+		}
+		break;
+	case 0x29:
+		/* 3rd-level cache: 4 MB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 4096) {
+			*size = 4096;
+			*ways = 8;
+		}
+		break;
+	case 0x39:
+		/* 2nd-level cache: 128 KB, 4-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 128) {
+			*size = 128;
+			*ways = 4;
+		}
+		break;
+	case 0x3b:
+		/* 2nd-level cache: 128 KB, 2-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 128) {
+			*size = 128;
+			*ways = 2;
+		}
+		break;
+	case 0x3c:
+		/* 2nd-level cache: 256 KB, 4-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 256) {
+			*size = 256;
+			*ways = 4;
+		}
+		break;
+	case 0x41:
+		/* 2nd-level cache: 128 KB, 4-way set associative,
+		 * 32 byte line size */
+		if (*size < 128) {
+			*size = 128;
+			*ways = 4;
+		}
+		break;
+	case 0x42:
+		/* 2nd-level cache: 256 KB, 4-way set associative,
+		 * 32 byte line size */
+		if (*size < 256) {
+			*size = 256;
+			*ways = 4;
+		}
+		break;
+	case 0x43:
+		/* 2nd-level cache: 512 KB, 4-way set associative,
+		 * 32 byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 4;
+		}
+		break;
+	case 0x44:
+		/* 2nd-level cache: 1 MB, 4-way set associative,
+		 * 32 byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 4;
+		}
+		break;
+	case 0x45:
+		/* 2nd-level cache: 2 MB, 4-way set associative,
+		 * 32 byte line size */
+		if (*size < 2048) {
+			*size = 2048;
+			*ways = 4;
+		}
+		break;
+	case 0x46:
+		/* 3rd-level cache: 4 MB, 4-way set associative,
+		 * 64 byte line size */
+		if (*size < 4096) {
+			*size = 4096;
+			*ways = 4;
+		}
+		break;
+	case 0x47:
+		/* 3rd-level cache: 8 MB, 8-way set associative,
+		 * 64 byte line size */
+		if (*size < 8192) {
+			*size = 8192;
+			*ways = 8;
+		}
+		break;
+	case 0x78:
+		/* 2nd-level cache: 1 MB, 4-way set associative,
+		 * 64-byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 4;
+		}
+		break;
+	case 0x79:
+		/* 2nd-level cache: 128 KB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 128) {
+			*size = 128;
+			*ways = 8;
+		}
+		break;
+	case 0x7a:
+		/* 2nd-level cache: 256 KB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 256) {
+			*size = 256;
+			*ways = 8;
+		}
+		break;
+	case 0x7b:
+		/* 2nd-level cache: 512 KB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 8;
+		}
+		break;
+	case 0x7c:
+		/* 2nd-level cache: 1 MB, 8-way set associative,
+		 * sectored cache, 64 byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 8;
+		}
+		break;
+	case 0x7d:
+		/* 2nd-level cache: 2 MB, 8-way set associative,
+		 * 64-byte line size */
+		if (*size < 2048) {
+			*size = 2048;
+			*ways = 8;
+		}
+		break;
+	case 0x7f:
+		/* 2nd-level cache: 512 KB, 2-way set associative,
+		 * 64-byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 2;
+		}
+		break;
+	case 0x82:
+		/* 2nd-level cache: 256 KB, 8-way set associative,
+		 * 32 byte line size */
+		if (*size < 256) {
+			*size = 256;
+			*ways = 8;
+		}
+		break;
+	case 0x83:
+		/* 2nd-level cache: 512 KB, 8-way set associative,
+		 * 32 byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 8;
+		}
+		break;
+	case 0x84:
+		/* 2nd-level cache: 1 MB, 8-way set associative,
+		 * 32 byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 8;
+		}
+		break;
+	case 0x85:
+		/* 2nd-level cache: 2 MB, 8-way set associative,
+		 * 32 byte line size */
+		if (*size < 2048) {
+			*size = 2048;
+			*ways = 8;
+		}
+		break;
+	case 0x86:
+		/* 2nd-level cache: 512 KB, 4-way set associative,
+		 * 64 byte line size */
+		if (*size < 512) {
+			*size = 512;
+			*ways = 4;
+		}
+		break;
+	case 0x87:
+		/* 2nd-level cache: 1 MB, 8-way set associative,
+		 * 64 byte line size */
+		if (*size < 1024) {
+			*size = 1024;
+			*ways = 8;
+		}
+		break;
+	}
+}
+
+void
+setPQL2(int *const size, int *const ways)
+{
+	/* make sure the cpu_exthigh variable is initialized */
+	init_exthigh();
+
+	if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+		setPQL2_AMD(size, ways);
+	else if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+		setPQL2_INTEL(size, ways);
+}
+
+static void
 print_transmeta_info()
 {
 	u_int regs[4], nreg = 0;
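
print_INTEL_info() and setPQL2_INTEL() both decode CPUID leaf 0x80000006 the same way: ECX carries the L2 size in KB in its top 16 bits, a 4-bit associativity code at bits 15:12 (decoded as 1 << (code / 2) ways for codes 0x2..0x8), and the line size in the low byte. As a single helper:

	/*
	 * Sketch: decode L2 cache geometry from CPUID leaf 0x80000006
	 * (ECX).  Associativity codes outside 0x2..0x8 are reported as
	 * 0 (unknown; the fully-associative case is not handled here).
	 */
	static void
	decode_l2(u_int ecx, u_int *size_kb, u_int *ways, u_int *linesz)
	{
		u_int nwaycode;

		*size_kb = (ecx >> 16) & 0xffff;
		*linesz = ecx & 0xff;
		nwaycode = (ecx >> 12) & 0x0f;
		if (nwaycode >= 0x02 && nwaycode <= 0x08)
			*ways = 1 << (nwaycode / 2);
		else
			*ways = 0;
	}
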
Index: vm86bios.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/vm86bios.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/vm86bios.s -L sys/i386/i386/vm86bios.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/vm86bios.s
+++ sys/i386/i386/vm86bios.s
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/vm86bios.s,v 1.31 2005/04/13 18:13:40 peter Exp $
+ * $FreeBSD: src/sys/i386/i386/vm86bios.s,v 1.32 2006/12/17 05:07:01 kmacy Exp $
  */
 
 #include "opt_npx.h"
@@ -128,9 +128,11 @@
 #endif
 	movl	%ecx,%cr3		/* new page tables */
 	movl	SCR_VMFRAME(%edx),%esp	/* switch to new stack */
-	
-	call	vm86_prepcall		/* finish setup */
 
+	pushl	%esp
+	call	vm86_prepcall		/* finish setup */
+	add	$4, %esp
+	
 	/*
 	 * Return via doreti
 	 */
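
This is the caller-side half of the vm86_prepcall() prototype change: under the i386 cdecl convention the pointer argument is pushed on the stack and popped by the caller, which is exactly the pushl %esp / add $4,%esp pair around the call. In C terms:

	/*
	 * Sketch: what the new call sequence amounts to.  The vm86
	 * frame sits at the top of the freshly loaded stack, so its
	 * address is simply the stack pointer at the call site.
	 */
	struct vm86frame;
	extern void vm86_prepcall(struct vm86frame *);

	static void
	call_prepcall(struct vm86frame *frame_on_stack)
	{
		/* compiles to push / call / add $4,%esp under cdecl */
		vm86_prepcall(frame_on_stack);
	}
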
Index: tsc.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/tsc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/tsc.c -L sys/i386/i386/tsc.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/tsc.c
+++ sys/i386/i386/tsc.c
@@ -25,11 +25,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/tsc.c,v 1.204 2003/10/21 18:28:34 silby Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/tsc.c,v 1.208 2007/06/04 18:25:06 dwmalone Exp $");
 
 #include "opt_clock.h"
 
 #include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
@@ -41,9 +44,12 @@
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+#include "cpufreq_if.h"
+
 uint64_t	tsc_freq;
 int		tsc_is_broken;
 u_int		tsc_present;
+static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
 
 #ifdef SMP
 static int	smp_tsc;
@@ -52,14 +58,19 @@
 TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc);
 #endif
 
+static void tsc_freq_changed(void *arg, const struct cf_level *level,
+    int status);
+static void tsc_freq_changing(void *arg, const struct cf_level *level,
+    int *status);
 static	unsigned tsc_get_timecount(struct timecounter *tc);
+static void tsc_levels_changed(void *arg, int unit);
 
 static struct timecounter tsc_timecounter = {
 	tsc_get_timecount,	/* get_timecount */
 	0,			/* no poll_pps */
- 	~0u,			/* counter_mask */
+	~0u,			/* counter_mask */
 	0,			/* frequency */
-	 "TSC",			/* name */
+	"TSC",			/* name */
 	800,			/* quality (adjusted in code) */
 };
 
@@ -86,18 +97,33 @@
 	tsc_freq = tscval[1] - tscval[0];
 	if (bootverbose)
 		printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
-}
 
+	/*
+	 * Inform CPU accounting about our boot-time clock rate.  Once the
+	 * system is finished booting, we will get the real max clock rate
+	 * via tsc_freq_max().  This also will be updated if someone loads
+	 * a cpufreq driver after boot that discovers a new max frequency.
+	 */
+	set_cputicker(rdtsc, tsc_freq, 1);
+
+	/* Register to find out about changes in CPU frequency. */
+	tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
+	    tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
+	tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
+	    tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST);
+	tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed,
+	    tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY);
+}
 
 void
 init_TSC_tc(void)
 {
 	/*
-	 * We can not use the TSC if we support APM. Precise timekeeping
+	 * We can not use the TSC if we support APM.  Precise timekeeping
 	 * on an APM'ed machine is at best a fools pursuit, since 
 	 * any and all of the time spent in various SMM code can't 
 	 * be reliably accounted for.  Reading the RTC is your only
-	 * source of reliable time info.  The i8254 looses too of course
+	 * source of reliable time info.  The i8254 loses too, of course,
 	 * but we need to have some kind of time...
 	 * We don't know at this point whether APM is going to be used
 	 * or not, nor when it might be activated.  Play it safe.
@@ -127,6 +153,72 @@
 	}
 }
 
+/*
+ * When cpufreq levels change, find out about the (new) max frequency.  We
+ * use this to update CPU accounting in case it got a lower estimate at boot.
+ */
+static void
+tsc_levels_changed(void *arg, int unit)
+{
+	device_t cf_dev;
+	struct cf_level *levels;
+	int count, error;
+	uint64_t max_freq;
+
+	/* Only use values from the first CPU, assuming all are equal. */
+	if (unit != 0)
+		return;
+
+	/* Find the appropriate cpufreq device instance. */
+	cf_dev = devclass_get_device(devclass_find("cpufreq"), unit);
+	if (cf_dev == NULL) {
+		printf("tsc_levels_changed() called but no cpufreq device?\n");
+		return;
+	}
+
+	/* Get settings from the device and find the max frequency. */
+	count = 64;
+	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
+	if (levels == NULL)
+		return;
+	error = CPUFREQ_LEVELS(cf_dev, levels, &count);
+	if (error == 0 && count != 0) {
+		max_freq = (uint64_t)levels[0].total_set.freq * 1000000;
+		set_cputicker(rdtsc, max_freq, 1);
+	} else
+		printf("tsc_levels_changed: no max freq found\n");
+	free(levels, M_TEMP);
+}
+
+/*
+ * If the TSC timecounter is in use, veto the pending change.  It may be
+ * possible in the future to handle a dynamically-changing timecounter rate.
+ */
+static void
+tsc_freq_changing(void *arg, const struct cf_level *level, int *status)
+{
+
+	if (*status != 0 || timecounter != &tsc_timecounter)
+		return;
+
+	printf("timecounter TSC must not be in use when "
+	     "changing frequencies; change denied\n");
+	*status = EBUSY;
+}
+
+/* Update TSC freq with the value indicated by the caller. */
+static void
+tsc_freq_changed(void *arg, const struct cf_level *level, int status)
+{
+	/* If there was an error during the transition, don't do anything. */
+	if (status != 0)
+		return;
+
+	/* Total setting for this level gives the new frequency in MHz. */
+	tsc_freq = (uint64_t)level->total_set.freq * 1000000;
+	tsc_timecounter.tc_frequency = tsc_freq;
+}
+
 static int
 sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
 {
@@ -136,7 +228,7 @@
 	if (tsc_timecounter.tc_frequency == 0)
 		return (EOPNOTSUPP);
 	freq = tsc_freq;
-	error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+	error = sysctl_handle_quad(oidp, &freq, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		tsc_freq = freq;
 		tsc_timecounter.tc_frequency = tsc_freq;
@@ -145,7 +237,7 @@
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_QUAD | CTLFLAG_RW,
-    0, sizeof(u_int), sysctl_machdep_tsc_freq, "IU", "");
+    0, sizeof(u_int), sysctl_machdep_tsc_freq, "QU", "");
 
 static unsigned
 tsc_get_timecount(struct timecounter *tc)
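
tsc.c now tracks CPU frequency changes through three eventhandler(9) hooks: cpufreq_pre_change can veto a pending transition, cpufreq_post_change updates tsc_freq afterwards, and cpufreq_levels_changed recalibrates the cputicker. The veto half of the pattern, with a hypothetical busy predicate in place of the timecounter check:

	/*
	 * Sketch of the pre-change veto: a subsystem that cannot
	 * tolerate the transition sets *status to EBUSY.
	 * resource_is_busy() is a hypothetical predicate; the event
	 * name and priority are the real eventhandler(9) usage above.
	 */
	extern int resource_is_busy(void);

	static eventhandler_tag pre_tag;

	static void
	freq_changing(void *arg, const struct cf_level *level,
	    int *status)
	{
		if (*status != 0)
			return;		/* someone already vetoed */
		if (resource_is_busy())
			*status = EBUSY;
	}

	static void
	register_veto(void)
	{
		pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
		    freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
	}
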
Index: sys_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/sys_machdep.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/sys_machdep.c -L sys/i386/i386/sys_machdep.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/sys_machdep.c
+++ sys/i386/i386/sys_machdep.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/sys_machdep.c,v 1.102.2.1 2005/09/26 19:38:11 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/sys_machdep.c,v 1.112 2007/07/08 18:17:42 attilio Exp $");
 
 #include "opt_kstack_pages.h"
 #include "opt_mac.h"
@@ -38,9 +38,9 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sysproto.h>
@@ -56,21 +56,22 @@
 #include <machine/proc.h>
 #include <machine/sysarch.h>
 
+#include <security/audit/audit.h>
+
 #include <vm/vm_kern.h>		/* for kernel_map */
 
 #define MAX_LD 8192
 #define LD_PER_PAGE 512
 #define NEW_MAX_LD(num)  ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1))
 #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3)
+#define	NULL_LDT_BASE	((caddr_t)NULL)
 
-
-
+#ifdef SMP
+static void set_user_ldt_rv(struct vmspace *vmsp);
+#endif
 static int i386_set_ldt_data(struct thread *, int start, int num,
 	union descriptor *descs);
 static int i386_ldt_grow(struct thread *td, int len);
-#ifdef SMP
-static void set_user_ldt_rv(struct thread *);
-#endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct sysarch_args {
@@ -93,6 +94,7 @@
 	uint32_t base;
 	struct segment_descriptor sd, *sdp;
 
+	AUDIT_ARG(cmd, uap->op);
 	switch (uap->op) {
 	case I386_GET_IOPERM:
 	case I386_SET_IOPERM:
@@ -112,7 +114,6 @@
 		break;
 	}
 
-	mtx_lock(&Giant);
 	switch(uap->op) {
 	case I386_GET_LDT:
 		error = i386_get_ldt(td, &kargs.largs);
@@ -212,7 +213,6 @@
 		error = EINVAL;
 		break;
 	}
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -267,12 +267,12 @@
 	KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!"));
 
 	/* Switch to the new TSS. */
-	mtx_lock_spin(&sched_lock);
+	critical_enter();
 	td->td_pcb->pcb_ext = ext;
-	private_tss |= PCPU_GET(cpumask);
+	PCPU_SET(private_tss, 1);
 	*PCPU_GET(tss_gdt) = ext->ext_tssd;
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
-	mtx_unlock_spin(&sched_lock);
+	critical_exit();
 
 	return 0;
 }
@@ -285,11 +285,7 @@
 	int i, error;
 	char *iomap;
 
-#ifdef MAC
-	if ((error = mac_check_sysarch_ioperm(td->td_ucred)) != 0)
-		return (error);
-#endif
-	if ((error = suser(td)) != 0)
+	if ((error = priv_check(td, PRIV_IO)) != 0)
 		return (error);
 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
 		return (error);
@@ -352,16 +348,19 @@
 
 /*
  * Update the GDT entry pointing to the LDT to point to the LDT of the
- * current process.
- *
- * This must be called with sched_lock held.  Unfortunately, we can't use a
- * mtx_assert() here because cpu_switch() calls this function after changing
- * curproc but before sched_lock's owner is updated in mi_switch().
+ * current process.  dt_lock is acquired and released here when the
+ * caller does not already hold it.
  */   
 void
 set_user_ldt(struct mdproc *mdp)
 {
 	struct proc_ldt *pldt;
+	int dtlocked;
+
+	dtlocked = 0;
+	if (!mtx_owned(&dt_lock)) {
+		mtx_lock_spin(&dt_lock);
+		dtlocked = 1;
+	}
 
 	pldt = mdp->md_ldt;
 #ifdef SMP
@@ -371,14 +370,18 @@
 #endif
 	lldt(GSEL(GUSERLDT_SEL, SEL_KPL));
 	PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL));
+	if (dtlocked)
+		mtx_unlock_spin(&dt_lock);
 }
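
The conditional locking above lets set_user_ldt() serve both callers that
already own dt_lock and callers that do not.  A hypothetical caller sketch
(not part of this diff) under that assumption:

	struct mdproc *mdp = &td->td_proc->p_md;

	mtx_lock_spin(&dt_lock);
	if (mdp->md_ldt != NULL)
		set_user_ldt(mdp);	/* sees dt_lock held; won't relock */
	mtx_unlock_spin(&dt_lock);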
 
 #ifdef SMP
 static void
-set_user_ldt_rv(struct thread *td)
+set_user_ldt_rv(struct vmspace *vmsp)
 {
+	struct thread *td;
 
-	if (td->td_proc != curthread->td_proc)
+	td = curthread;
+	if (vmsp != td->td_proc->p_vmspace)
 		return;
 
 	set_user_ldt(&td->td_proc->p_md);
@@ -386,17 +389,15 @@
 #endif
 
 /*
- * Must be called with either sched_lock free or held but not recursed.
- * If it does not return NULL, it will return with it owned.
+ * dt_lock must be held. Returns with dt_lock held.
  */
 struct proc_ldt *
 user_ldt_alloc(struct mdproc *mdp, int len)
 {
 	struct proc_ldt *pldt, *new_ldt;
 
-	if (mtx_owned(&sched_lock))
-		mtx_unlock_spin(&sched_lock);
-	mtx_assert(&sched_lock, MA_NOTOWNED);
+	mtx_assert(&dt_lock, MA_OWNED);
+	mtx_unlock_spin(&dt_lock);
 	MALLOC(new_ldt, struct proc_ldt *, sizeof(struct proc_ldt),
 		M_SUBPROC, M_WAITOK);
 
@@ -410,38 +411,35 @@
 	new_ldt->ldt_refcnt = 1;
 	new_ldt->ldt_active = 0;
 
-	mtx_lock_spin(&sched_lock);
+	mtx_lock_spin(&dt_lock);
 	gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base;
 	gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1;
 	ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd);
 
-	if ((pldt = mdp->md_ldt)) {
+	if ((pldt = mdp->md_ldt) != NULL) {
 		if (len > pldt->ldt_len)
 			len = pldt->ldt_len;
 		bcopy(pldt->ldt_base, new_ldt->ldt_base,
 		    len * sizeof(union descriptor));
-	} else {
+	} else
 		bcopy(ldt, new_ldt->ldt_base, sizeof(ldt));
-	}
-	return new_ldt;
+	
+	return (new_ldt);
 }
 
 /*
- * Must be called either with sched_lock free or held but not recursed.
- * If md_ldt is not NULL, it will return with sched_lock released.
+ * Must be called with dt_lock held.  Returns with dt_lock unheld.
  */
 void
 user_ldt_free(struct thread *td)
 {
 	struct mdproc *mdp = &td->td_proc->p_md;
-	struct proc_ldt *pldt = mdp->md_ldt;
+	struct proc_ldt *pldt;
 
-	if (pldt == NULL)
+	mtx_assert(&dt_lock, MA_OWNED);
+	if ((pldt = mdp->md_ldt) == NULL)
 		return;
 
-	if (!mtx_owned(&sched_lock))
-		mtx_lock_spin(&sched_lock);
-	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 	if (td == PCPU_GET(curthread)) {
 		lldt(_default_ldt);
 		PCPU_SET(currentldt, _default_ldt);
@@ -449,12 +447,12 @@
 
 	mdp->md_ldt = NULL;
 	if (--pldt->ldt_refcnt == 0) {
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&dt_lock);
 		kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base,
 			pldt->ldt_len * sizeof(union descriptor));
 		FREE(pldt, M_SUBPROC);
 	} else
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&dt_lock);
 }
 
 /*
@@ -469,7 +467,7 @@
 	struct i386_ldt_args *uap;
 {
 	int error = 0;
-	struct proc_ldt *pldt = td->td_proc->p_md.md_ldt;
+	struct proc_ldt *pldt;
 	int nldt, num;
 	union descriptor *lp;
 
@@ -478,11 +476,14 @@
 	    uap->start, uap->num, (void *)uap->descs);
 #endif
 
-	if (pldt) {
+	mtx_lock_spin(&dt_lock);
+	if ((pldt = td->td_proc->p_md.md_ldt) != NULL) {
 		nldt = pldt->ldt_len;
-		num = min(uap->num, nldt);
 		lp = &((union descriptor *)(pldt->ldt_base))[uap->start];
+		mtx_unlock_spin(&dt_lock);
+		num = min(uap->num, nldt);
 	} else {
+		mtx_unlock_spin(&dt_lock);
 		nldt = sizeof(ldt)/sizeof(ldt[0]);
 		num = min(uap->num, nldt);
 		lp = &ldt[uap->start];
@@ -532,10 +533,10 @@
 		}
 		if (uap->num <= 0)
 			return (EINVAL);
-		mtx_lock_spin(&sched_lock);
-		pldt = mdp->md_ldt;
-		if (pldt == NULL || uap->start >= pldt->ldt_len) {
-			mtx_unlock_spin(&sched_lock);
+		mtx_lock_spin(&dt_lock);
+		if ((pldt = mdp->md_ldt) == NULL ||
+		    uap->start >= pldt->ldt_len) {
+			mtx_unlock_spin(&dt_lock);
 			return (0);
 		}
 		largest_ld = uap->start + uap->num;
@@ -544,7 +545,7 @@
 		i = largest_ld - uap->start;
 		bzero(&((union descriptor *)(pldt->ldt_base))[uap->start],
 		    sizeof(union descriptor) * i);
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&dt_lock);
 		return (0);
 	}
 
@@ -627,15 +628,15 @@
 
 	if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) {
 		/* Allocate a free slot */
-		pldt = mdp->md_ldt;
-		if (pldt == NULL) {
-			error = i386_ldt_grow(td, NLDT + 1);
-			if (error)
+		mtx_lock_spin(&dt_lock);
+		if ((pldt = mdp->md_ldt) == NULL) {
+			if ((error = i386_ldt_grow(td, NLDT + 1))) {
+				mtx_unlock_spin(&dt_lock);
 				return (error);
+			}
 			pldt = mdp->md_ldt;
 		}
 again:
-		mtx_lock_spin(&sched_lock);
 		/*
 		 * start scanning a bit up to leave room for NVidia and
 		 * Wine, which still use the "Blat" method of allocation.
@@ -647,24 +648,23 @@
 			dp++;
 		}
 		if (i >= pldt->ldt_len) {
-			mtx_unlock_spin(&sched_lock);
-			error = i386_ldt_grow(td, pldt->ldt_len+1);
-			if (error)
+			if ((error = i386_ldt_grow(td, pldt->ldt_len+1))) {
+				mtx_unlock_spin(&dt_lock);
 				return (error);
+			}
 			goto again;
 		}
 		uap->start = i;
 		error = i386_set_ldt_data(td, i, 1, descs);
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&dt_lock);
 	} else {
 		largest_ld = uap->start + uap->num;
-		error = i386_ldt_grow(td, largest_ld);
-		if (error == 0) {
-			mtx_lock_spin(&sched_lock);
+		mtx_lock_spin(&dt_lock);
+		if (!(error = i386_ldt_grow(td, largest_ld))) {
 			error = i386_set_ldt_data(td, uap->start, uap->num,
 			    descs);
-			mtx_unlock_spin(&sched_lock);
 		}
+		mtx_unlock_spin(&dt_lock);
 	}
 	if (error == 0)
 		td->td_retval[0] = uap->start;
@@ -678,7 +678,7 @@
 	struct mdproc *mdp = &td->td_proc->p_md;
 	struct proc_ldt *pldt = mdp->md_ldt;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	mtx_assert(&dt_lock, MA_OWNED);
 
 	/* Fill in range */
 	bcopy(descs,
@@ -691,9 +691,11 @@
 i386_ldt_grow(struct thread *td, int len) 
 {
 	struct mdproc *mdp = &td->td_proc->p_md;
-	struct proc_ldt *pldt;
-	caddr_t old_ldt_base;
-	int old_ldt_len;
+	struct proc_ldt *new_ldt, *pldt;
+	caddr_t old_ldt_base = NULL_LDT_BASE;
+	int old_ldt_len = 0;
+
+	mtx_assert(&dt_lock, MA_OWNED);
 
 	if (len > MAX_LD)
 		return (ENOMEM);
@@ -701,52 +703,58 @@
 		len = NLDT + 1;
 
 	/* Allocate a user ldt. */
-	pldt = mdp->md_ldt;
-	if (!pldt || len > pldt->ldt_len) {
-		struct proc_ldt *new_ldt;
-
+	if ((pldt = mdp->md_ldt) == NULL || len > pldt->ldt_len) {
 		new_ldt = user_ldt_alloc(mdp, len);
 		if (new_ldt == NULL)
 			return (ENOMEM);
 		pldt = mdp->md_ldt;
 
-		/* sched_lock was acquired by user_ldt_alloc. */
-		if (pldt) {
-			if (new_ldt->ldt_len > pldt->ldt_len) {
-				old_ldt_base = pldt->ldt_base;
-				old_ldt_len = pldt->ldt_len;
-				pldt->ldt_sd = new_ldt->ldt_sd;
-				pldt->ldt_base = new_ldt->ldt_base;
-				pldt->ldt_len = new_ldt->ldt_len;
-				mtx_unlock_spin(&sched_lock);
-				kmem_free(kernel_map, (vm_offset_t)old_ldt_base,
-					old_ldt_len * sizeof(union descriptor));
-				FREE(new_ldt, M_SUBPROC);
-				mtx_lock_spin(&sched_lock);
-			} else {
+		if (pldt != NULL) {
+			if (new_ldt->ldt_len <= pldt->ldt_len) {
 				/*
-				 * If other threads already did the work,
-				 * do nothing.
+				 * We just lost the race for allocation, so
+				 * free the new object and return.
 				 */
-				mtx_unlock_spin(&sched_lock);
+				mtx_unlock_spin(&dt_lock);
 				kmem_free(kernel_map,
 				   (vm_offset_t)new_ldt->ldt_base,
 				   new_ldt->ldt_len * sizeof(union descriptor));
 				FREE(new_ldt, M_SUBPROC);
+				mtx_lock_spin(&dt_lock);
 				return (0);
 			}
-		} else {
+
+			/*
+			 * We have to substitute the current LDT entry for
+			 * curproc with the new one since its size grew.
+			 */
+			old_ldt_base = pldt->ldt_base;
+			old_ldt_len = pldt->ldt_len;
+			pldt->ldt_sd = new_ldt->ldt_sd;
+			pldt->ldt_base = new_ldt->ldt_base;
+			pldt->ldt_len = new_ldt->ldt_len;
+		} else
 			mdp->md_ldt = pldt = new_ldt;
-		}
 #ifdef SMP
-		mtx_unlock_spin(&sched_lock);
-		/* signal other cpus to reload ldt */
+		/*
+		 * Signal other cpus to reload ldt.  We need to unlock dt_lock
+		 * here because the other CPUs will contend for it: their
+		 * curthreads do not hold the lock and would block when
+		 * trying to acquire it.
+		 */
+		mtx_unlock_spin(&dt_lock);
 		smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv,
-		    NULL, td);
+		    NULL, td->td_proc->p_vmspace);
 #else
-		set_user_ldt(mdp);
-		mtx_unlock_spin(&sched_lock);
+		set_user_ldt(&td->td_proc->p_md);
+		mtx_unlock_spin(&dt_lock);
 #endif
+		if (old_ldt_base != NULL_LDT_BASE) {
+			kmem_free(kernel_map, (vm_offset_t)old_ldt_base,
+			    old_ldt_len * sizeof(union descriptor));
+			FREE(new_ldt, M_SUBPROC);
+		}
+		mtx_lock_spin(&dt_lock);
 	}
 	return (0);
 }
Index: geode.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/geode.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/geode.c -L sys/i386/i386/geode.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/geode.c
+++ sys/i386/i386/geode.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/geode.c,v 1.5.8.1 2005/08/16 22:47:14 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/geode.c,v 1.10 2007/09/18 09:19:44 phk Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +49,16 @@
 	}
 };
 
+static struct bios_oem bios_soekris_55 = {
+	{ 0xf0000, 0xf1000 },
+	{
+		{ "Soekris", 0, 8 },	/* Soekris Engineering. */
+		{ "net5", 0, 8 },	/* net5xxx */
+		{ "comBIOS", 0, 54 },	/* comBIOS ver. 1.26a  20040819 ... */
+		{ NULL, 0, 0 },
+	}
+};
+
 static struct bios_oem bios_pcengines = {
 	{ 0xf9000, 0xfa000 },
 	{
@@ -94,6 +104,25 @@
 	outl(gpio, u);
 }
 
+static void
+cs5536_led_func(void *ptr, int onoff)
+{
+	int bit;
+	uint16_t a;
+
+	bit = *(int *)ptr;
+	if (bit < 0) {
+		bit = -bit;
+		onoff = !onoff;
+	}
+
+	a = rdmsr(0x5140000c);
+	if (onoff)
+		outl(a, 1 << bit);
+	else
+		outl(a, 1 << (bit + 16));
+}
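
A note on the writes above: on the CS5536, the MSR at 0x5140000c presumably
returns the I/O base of the GPIO atomic output registers, where writing
1 << bit drives a pin high and writing 1 << (bit + 16) drives it low.  For
the led1b = 6 used in the device-ID switch below, that is outl(a, 0x0040)
to turn the LED on and outl(a, 0x00400000) to turn it off; a negative bit
number in the softc flips the polarity for active-low wiring.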
+
 
 static unsigned
 geode_get_timecount(struct timecounter *tc)
@@ -110,6 +139,20 @@
 	1000
 };
 
+static uint64_t
+geode_cputicks(void)
+{
+	unsigned c;
+	static unsigned last;
+	static uint64_t offset;
+
+	c = inl(geode_counter);
+	if (c < last)
+		offset += (1LL << 32);
+	last = c;
+	return (offset | c);
+}
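
Worked numbers for the wrap handling above: with the 27 MHz rate passed to
set_cputicker() below, the 32-bit counter rolls over every 2^32 / 27e6 ~=
159 seconds, so as long as geode_cputicks() runs at least that often the
static last/offset pair extends it into a monotonic 64-bit tick count.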
+
 /*
  * The GEODE watchdog runs from a 32kHz frequency.  One period of that is
  * 31250 nanoseconds which we round down to 2^14 nanoseconds.  The watchdog
@@ -122,7 +165,7 @@
 	u_int u, p, r;
 
 	u = cmd & WD_INTERVAL;
-	if (cmd && u >= 14 && u <= 43) {
+	if (u >= 14 && u <= 43) {
 		u -= 14;
 		if (u > 16) {
 			p = u - 16;
@@ -144,6 +187,43 @@
 }
 
 /*
+ * We run MFGPT0 off the 32kHz frequency and prescale by 16384 giving a
+ * period of half a second.
+ * Range becomes 2^30 (= 1 sec) to 2^44 (almost 5 hours)
+ */
+static void
+cs5536_watchdog(void *foo __unused, u_int cmd, int *error)
+{
+	u_int u, p;
+	uint16_t a;
+	uint32_t m;
+
+	a = rdmsr(0x5140000d);
+	m = rdmsr(0x51400029);
+	m &= ~(1 << 24);
+	wrmsr(0x51400029, m);
+
+	u = cmd & WD_INTERVAL;
+	if (u >= 30 && u <= 44) {
+		p = 1 << (u - 29);
+
+		/* Set up MFGPT0, 32khz, prescaler 16k, C2 event */
+		outw(a + 6, 0x030e);
+		/* set comparator 2 */
+		outw(a + 2, p);
+		/* reset counter */
+		outw(a + 4, 0);
+		/* Arm reset mechanism */
+		m |= (1 << 24);
+		wrmsr(0x51400029, m);
+		/* Start counter */
+		outw(a + 6, 0x8000);
+
+		*error = 0;
+	}
+}
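
Checking the stated range: one MFGPT tick at 32768 Hz with a 16384 prescale
is 0.5 s, and the comparator is armed with p = 1 << (u - 29).  The smallest
accepted u (30, a requested timeout of 2^30 ns ~= 1.07 s) arms 2 ticks =
1 s; the largest (44, 2^44 ns ~= 4.9 h) arms 32768 ticks = 16384 s ~=
4.55 h, i.e. the "almost 5 hours" in the comment.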
+
+/*
  * The Advantech PCM-582x watchdog expects 0x1 at I/O port 0x0443
  * every 1.6 secs +/- 30%. Writing 0x0 disables the watchdog
  * NB: reading the I/O port enables the timer as well
@@ -151,8 +231,15 @@
 static void
 advantech_watchdog(void *foo __unused, u_int cmd, int *error)
 {
-	outb(0x0443, (cmd & WD_INTERVAL) ? 1 : 0);
-	*error = 0;
+	u_int u;
+
+	u = cmd & WD_INTERVAL;
+	if (u > 0 && u <= WD_TO_1SEC) {
+		outb(0x0443, 1);
+		*error = 0;
+	} else {
+		outb(0x0443, 0);
+	}
 }
 
 static int
@@ -161,7 +248,8 @@
 #define BIOS_OEM_MAXLEN 80
 	static u_char bios_oem[BIOS_OEM_MAXLEN] = "\0";
 
-	if (pci_get_devid(self) == 0x0515100b) {
+	switch (pci_get_devid(self)) {
+	case 0x0515100b:
 		if (geode_counter == 0) {
 			/*
 			 * The address of the CBA is written to this register
@@ -176,8 +264,10 @@
 			tc_init(&geode_timecounter);
 			EVENTHANDLER_REGISTER(watchdog_list, geode_watchdog,
 			    NULL, 0);
+			set_cputicker(geode_cputicks, 27000000, 0);
 		}
-	} else if (pci_get_devid(self) == 0x0510100b) {
+		break;
+	case 0x0510100b:
 		gpio = pci_read_config(self, PCIR_BAR(0), 4);
 		gpio &= ~0x1f;
 		printf("Geode GPIO@ = %x\n", gpio);
@@ -201,13 +291,26 @@
 		}
 		if ( strlen(bios_oem) )
 			printf("Geode %s\n", bios_oem);
-	} else if (pci_get_devid(self) == 0x01011078) {
+		break;
+	case 0x01011078:
 		if ( bios_oem_strings(&bios_advantech,
 				bios_oem, BIOS_OEM_MAXLEN) > 0 ) {
 			printf("Geode %s\n", bios_oem);
 			EVENTHANDLER_REGISTER(watchdog_list, advantech_watchdog,
 			    NULL, 0);
 		}
+		break;
+	case 0x20801022:
+		if ( bios_oem_strings(&bios_soekris_55,
+		    bios_oem, BIOS_OEM_MAXLEN) > 0 ) {
+			printf("Geode LX: %s\n", bios_oem);
+			led1b = 6;
+			led1 = led_create(cs5536_led_func, &led1b, "error");
+		}
+		printf("MFGPT bar: %jx\n", rdmsr(0x5140000d));
+		EVENTHANDLER_REGISTER(watchdog_list, cs5536_watchdog,
+		    NULL, 0);
+		break;
 	}
 	return (ENXIO);
 }
Index: initcpu.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/initcpu.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/i386/i386/initcpu.c -L sys/i386/i386/initcpu.c -u -r1.3 -r1.4
--- sys/i386/i386/initcpu.c
+++ sys/i386/i386/initcpu.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/initcpu.c,v 1.52.2.3 2006/07/21 15:12:02 mr Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/initcpu.c,v 1.56 2007/04/06 18:15:02 ru Exp $");
 
 #include "opt_cpu.h"
 
@@ -41,6 +41,9 @@
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
@@ -686,6 +689,15 @@
 				break;
 			}
 		}
+#ifdef PAE
+		if ((amd_feature & AMDID_NX) != 0) {
+			uint64_t msr;
+
+			msr = rdmsr(MSR_EFER) | EFER_NXE;
+			wrmsr(MSR_EFER, msr);
+			pg_nx = PG_NX;
+		}
+#endif
 		break;
 #endif
 	default:
Index: bios.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/bios.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/bios.c -L sys/i386/i386/bios.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/bios.c
+++ sys/i386/i386/bios.c
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/bios.c,v 1.72.2.1 2005/08/16 22:47:14 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/bios.c,v 1.74 2007/04/19 09:18:51 phk Exp $");
 
 /*
  * Code for dealing with the BIOS in x86 PC systems.
@@ -475,7 +475,8 @@
     return (i);
 }
 
-int bios_oem_strings(struct bios_oem *oem, u_char *buffer, size_t maxlen)
+int
+bios_oem_strings(struct bios_oem *oem, u_char *buffer, size_t maxlen)
 {
 	size_t idx = 0;
 	struct bios_oem_signature *sig;
Index: legacy.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/legacy.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/legacy.c -L sys/i386/i386/legacy.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/legacy.c
+++ sys/i386/i386/legacy.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/legacy.c,v 1.61 2005/02/15 07:21:20 njl Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/legacy.c,v 1.63 2007/09/30 11:05:16 marius Exp $");
 
 /*
  * This code implements a system driver for legacy systems that do not
@@ -110,10 +110,10 @@
 {
 
 	/*
-	 * Add child device with order of 1 so it gets probed
-	 * after ACPI (which is at order 0.
+	 * Add child device with order of 11 so it gets probed
+	 * after ACPI (which is at order 10).
 	 */
-	if (BUS_ADD_CHILD(parent, 1, "legacy", 0) == NULL)
+	if (BUS_ADD_CHILD(parent, 11, "legacy", 0) == NULL)
 		panic("legacy: could not attach");
 }
 
@@ -228,6 +228,9 @@
 	struct legacy_device *atdev = DEVTOAT(child);
 
 	switch (which) {
+	case LEGACY_IVAR_PCIDOMAIN:
+		*result = 0;
+		break;
 	case LEGACY_IVAR_PCIBUS:
 		*result = atdev->lg_pcibus;
 		break;
@@ -244,6 +247,8 @@
 	struct legacy_device *atdev = DEVTOAT(child);
 
 	switch (which) {
+	case LEGACY_IVAR_PCIDOMAIN:
+		return EINVAL;
 	case LEGACY_IVAR_PCIBUS:
 		atdev->lg_pcibus = value;
 		break;
Index: machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/machdep.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/i386/i386/machdep.c -L sys/i386/i386/machdep.c -u -r1.4 -r1.5
--- sys/i386/i386/machdep.c
+++ sys/i386/i386/machdep.c
@@ -14,7 +14,11 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
@@ -34,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/machdep.c,v 1.616.2.2 2006/02/07 00:29:33 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/machdep.c,v 1.658.2.1.2.1 2008/01/19 18:15:03 kib Exp $");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
@@ -49,6 +53,7 @@
 #include "opt_msgbuf.h"
 #include "opt_npx.h"
 #include "opt_perfmon.h"
+#include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
@@ -57,6 +62,7 @@
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
+#include <sys/clock.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
@@ -129,6 +135,13 @@
 #include <i386/isa/icu.h>
 #endif
 
+#ifdef XBOX
+#include <machine/xbox.h>
+
+int arch_i386_is_xbox = 0;
+uint32_t arch_i386_xbox_memsize = 0;
+#endif
+
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
@@ -161,24 +174,35 @@
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
+/* Intel ICH registers */
+#define ICH_PMBASE	0x400
+#define ICH_SMI_EN	ICH_PMBASE + 0x30
+
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 int cold = 1;
 
 #ifdef COMPAT_43
-static void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code);
+static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
-static void freebsd4_sendsig(sig_t catcher, int sig, sigset_t *mask,
-    u_long code);
+static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 long Maxmem = 0;
 long realmem = 0;
 
-vm_paddr_t phys_avail[10];
-vm_paddr_t dump_avail[10];
+/*
+ * The number of PHYSMAP entries must be one less than the number of
+ * PHYSSEG entries because the PHYSMAP entry that spans the largest
+ * physical address that is accessible by ISA DMA is split into two
+ * PHYSSEG entries.
+ */
+#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
+
+vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
+vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
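
A concrete instance of the sizing rule above, assuming a hypothetical
VM_PHYSSEG_MAX of 17: PHYSMAP_SIZE = 2 * (17 - 1) = 32 base/bound values
(16 physmap segments), so phys_avail[] and dump_avail[] get 34 slots and
the trailing pair can stay zero as the end-of-chunks sentinel noted just
above.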
@@ -199,6 +223,27 @@
 cpu_startup(dummy)
 	void *dummy;
 {
+	char *sysenv;
+	
+	/*
+	 * On MacBooks, we need to prevent the legacy USB circuit from
+	 * generating an SMI# because this can cause several problems,
+	 * namely: incorrect CPU frequency detection and failure to
+	 * start the APs.
+	 * We do this by disabling a bit in the SMI_EN (SMI Control and
+	 * Enable register) of the Intel ICH LPC Interface Bridge.
+	 */
+	sysenv = getenv("smbios.system.product");
+	if (sysenv != NULL) {
+		if (strncmp(sysenv, "MacBook", 7) == 0) {
+			if (bootverbose)
+				printf("Disabling LEGACY_USB_EN bit on "
+				    "Intel ICH.\n");
+			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
+		}
+		freeenv(sysenv);
+	}
+
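This matches the ICH datasheets as far as I can tell: bit 3 of SMI_EN
(offset 0x30 from PMBASE, assumed to be 0x400 in the defines above) is
LEGACY_USB_EN, so clearing 0x8 stops the legacy USB logic from raising
SMI# while leaving the other SMI sources alone.
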
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
@@ -257,22 +302,20 @@
  */
 #ifdef COMPAT_43
 static void
-osendsig(catcher, sig, mask, code)
-	sig_t catcher;
-	int sig;
-	sigset_t *mask;
-	u_long code;
+osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
+	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
@@ -300,12 +343,12 @@
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
-		sf.sf_siginfo.si_code = code;
+		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 	} else {
 		/* Old FreeBSD-style arguments. */
-		sf.sf_arg2 = code;
-		sf.sf_addr = regs->tf_err;
+		sf.sf_arg2 = ksi->ksi_code;
+		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
@@ -387,22 +430,20 @@
 
 #ifdef COMPAT_FREEBSD4
 static void
-freebsd4_sendsig(catcher, sig, mask, code)
-	sig_t catcher;
-	int sig;
-	sigset_t *mask;
-	u_long code;
+freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
+	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
@@ -443,12 +484,12 @@
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
-		sf.sf_si.si_code = code;
-		sf.sf_si.si_addr = (void *)regs->tf_err;
+		sf.sf_si.si_code = ksi->ksi_code;
+		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
-		sf.sf_siginfo = code;
-		sf.sf_addr = regs->tf_err;
+		sf.sf_siginfo = ksi->ksi_code;
+		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
@@ -508,11 +549,7 @@
 #endif	/* COMPAT_FREEBSD4 */
 
 void
-sendsig(catcher, sig, mask, code)
-	sig_t catcher;
-	int sig;
-	sigset_t *mask;
-	u_long code;
+sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
@@ -520,22 +557,24 @@
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
+	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
-		freebsd4_sendsig(catcher, sig, mask, code);
+		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
-		osendsig(catcher, sig, mask, code);
+		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
@@ -581,13 +620,12 @@
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
-		sf.sf_si.si_signo = sig;
-		sf.sf_si.si_code = code;
-		sf.sf_si.si_addr = (void *)regs->tf_err;
+		sf.sf_si = ksi->ksi_info;
+		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
-		sf.sf_siginfo = code;
-		sf.sf_addr = regs->tf_err;
+		sf.sf_siginfo = ksi->ksi_code;
+		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
@@ -646,26 +684,6 @@
 }
 
 /*
- * Build siginfo_t for SA thread
- */
-void
-cpu_thread_siginfo(int sig, u_long code, siginfo_t *si)
-{
-	struct proc *p;
-	struct thread *td;
-
-	td = curthread;
-	p = td->td_proc;
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-
-	bzero(si, sizeof(*si));
-	si->si_signo = sig;
-	si->si_code = code;
-	si->si_addr = (void *)td->td_frame->tf_err;
-	/* XXXKSE fill other fields */
-}
-
-/*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
@@ -689,6 +707,7 @@
 	struct osigcontext *scp;
 	struct proc *p = td->td_proc;
 	int eflags, error;
+	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
@@ -711,8 +730,13 @@
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
-		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
-			trapsignal(td, SIGBUS, 0);
+		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
+		}
 
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
@@ -753,7 +777,12 @@
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
-			trapsignal(td, SIGBUS, T_PROTFLT);
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_trapno = T_PROTFLT;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
@@ -807,6 +836,7 @@
 	struct trapframe *regs;
 	const struct ucontext4 *ucp;
 	int cs, eflags, error;
+	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
@@ -829,9 +859,13 @@
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
-		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
-			trapsignal(td, SIGBUS, 0);
-
+		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
+		}
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
@@ -876,7 +910,12 @@
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("freebsd4_sigreturn: cs = 0x%x\n", cs);
-			trapsignal(td, SIGBUS, T_PROTFLT);
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_trapno = T_PROTFLT;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
@@ -906,7 +945,7 @@
 sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
-		const __ucontext *sigcntxp;
+		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
@@ -914,6 +953,7 @@
 	struct trapframe *regs;
 	const ucontext_t *ucp;
 	int cs, eflags, error, ret;
+	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
@@ -936,8 +976,13 @@
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
-		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
-			trapsignal(td, SIGBUS, 0);
+		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
+		}
 
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
@@ -983,7 +1028,12 @@
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("sigreturn: cs = 0x%x\n", cs);
-			trapsignal(td, SIGBUS, T_PROTFLT);
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGBUS;
+			ksi.ksi_code = BUS_OBJERR;
+			ksi.ksi_trapno = T_PROTFLT;
+			ksi.ksi_addr = (void *)regs->tf_eip;
+			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
@@ -1039,9 +1089,9 @@
 
 #ifdef SMP
 	/* Schedule ourselves on the indicated cpu. */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(curthread);
 	sched_bind(curthread, cpu_id);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(curthread);
 #endif
 
 	/* Calibrate by measuring a short delay. */
@@ -1052,9 +1102,9 @@
 	intr_restore(reg);
 
 #ifdef SMP
-	mtx_lock_spin(&sched_lock);
+	thread_lock(curthread);
 	sched_unbind(curthread);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(curthread);
 #endif
 
 	/*
@@ -1093,6 +1143,7 @@
  * help lock contention somewhat, and this is critical for HTT. -Peter
  */
 static int	cpu_idle_hlt = 1;
+TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
     &cpu_idle_hlt, 0, "Idle loop HLT enable");
 
@@ -1151,8 +1202,11 @@
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
+	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
+	else
+		mtx_unlock_spin(&dt_lock);
   
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = entry;
@@ -1218,38 +1272,28 @@
 	unsigned int cr0;
 
 	cr0 = rcr0();
+
 	/*
-	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
-	 * BSP.  See the comments there about why we set them.
+	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
+	 *
+	 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
+	 * instructions.  We must set the CR0_MP bit and use the CR0_TS
+	 * bit to control the trap, because setting the CR0_EM bit does
+	 * not cause WAIT instructions to trap.  It's important to trap
+	 * WAIT instructions - otherwise the "wait" variants of no-wait
+	 * control instructions would degenerate to the "no-wait" variants
+	 * after FP context switches but work correctly otherwise.  It's
+	 * particularly important to trap WAITs when there is no NPX -
+	 * otherwise the "wait" variants would always degenerate.
+	 *
+	 * Try setting CR0_NE to get correct error reporting on 486DX's.
+	 * Setting it should fail or do nothing on lesser processors.
 	 */
 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(cr0);
 	load_gs(_udatasel);
 }
 
-static int
-sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
-{
-	int error;
-	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
-		req);
-	if (!error && req->newptr)
-		resettodr();
-	return (error);
-}
-
-SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
-	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
-
-SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
-	CTLFLAG_RW, &disable_rtc_set, 0, "");
-
-SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, 
-	CTLFLAG_RD, &bootinfo, bootinfo, "");
-
-SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
-	CTLFLAG_RW, &wall_cmos_clock, 0, "");
-
 u_long bootdev;		/* not a struct cdev *- encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
@@ -1268,8 +1312,7 @@
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
-
-int private_tss;			/* flag indicating private tss */
+struct mtx dt_lock;			/* lock for GDT and LDT */
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
@@ -1540,8 +1583,6 @@
 	ip->gd_hioffset = ((int)func)>>16 ;
 }
 
-#define	IDTVEC(name)	__CONCAT(X,name)
-
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
@@ -1557,12 +1598,11 @@
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
-	int idx, quit;
+	int idx;
 	uintptr_t func;
 
 	ip = idt;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-	for (idx = 0, quit = 0; idx < NIDT; idx++) {
+	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
@@ -1572,6 +1612,25 @@
 		ip++;
 	}
 }
+
+/* Show privileged registers. */
+DB_SHOW_COMMAND(sysregs, db_show_sysregs)
+{
+	uint64_t idtr, gdtr;
+
+	idtr = ridt();
+	db_printf("idtr\t0x%08x/%04x\n",
+	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
+	gdtr = rgdt();
+	db_printf("gdtr\t0x%08x/%04x\n",
+	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
+	db_printf("ldtr\t0x%04x\n", rldt());
+	db_printf("tr\t0x%04x\n", rtr());
+	db_printf("cr0\t0x%08x\n", rcr0());
+	db_printf("cr2\t0x%08x\n", rcr2());
+	db_printf("cr3\t0x%08x\n", rcr3());
+	db_printf("cr4\t0x%08x\n", rcr4());
+}
 #endif
 
 void
@@ -1588,8 +1647,6 @@
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
-#define PHYSMAP_SIZE	(2 * 8)
-
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
@@ -1606,8 +1663,8 @@
 static void
 getmemsize(int first)
 {
-	int i, physmap_idx, pa_indx, da_indx;
-	int hasbrokenint12;
+	int i, off, physmap_idx, pa_indx, da_indx;
+	int hasbrokenint12, has_smap;
 	u_long physmem_tunable;
 	u_int extmem;
 	struct vm86frame vmf;
@@ -1617,6 +1674,20 @@
 	struct bios_smap *smap;
 	quad_t dcons_addr, dcons_size;
 
+	has_smap = 0;
+#ifdef XBOX
+	if (arch_i386_is_xbox) {
+		/*
+		 * We queried the memory size before, so chop off 4MB for
+		 * the framebuffer and inform the OS of this.
+		 */
+		physmap[0] = 0;
+		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
+		physmap_idx = 0;
+		goto physmap_done;
+	}
+#endif
+
 	hasbrokenint12 = 0;
 	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 	bzero(&vmf, sizeof(vmf));
@@ -1703,6 +1774,7 @@
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016llx len=%016llx\n",
 			    smap->type, smap->base, smap->length);
+		has_smap = 1;
 
 		if (smap->type != 0x01)
 			continue;
@@ -1722,7 +1794,7 @@
 			if (smap->base < physmap[i + 1]) {
 				if (boothowto & RB_VERBOSE)
 					printf(
-	"Overlapping or non-montonic memory region, ignoring second region\n");
+	"Overlapping or non-monotonic memory region, ignoring second region\n");
 				continue;
 			}
 		}
@@ -1844,6 +1916,13 @@
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
+	/*
+	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
+	 * the amount of memory in the system.
+	 */
+	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
+		Maxmem = atop(physmap[physmap_idx + 1]);
+
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
@@ -1856,7 +1935,7 @@
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/* call pmap initialization to make new kernel address space */
-	pmap_bootstrap(first, 0);
+	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
@@ -2012,7 +2091,10 @@
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
 
-	avail_end = phys_avail[pa_indx];
+	/* Map the message buffer. */
+	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
+		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
+		    off);
 }
 
 void
@@ -2020,7 +2102,7 @@
 	int first;
 {
 	struct gate_descriptor *gdp;
-	int gsel_tss, metadata_missing, off, x;
+	int gsel_tss, metadata_missing, x;
 	struct pcpu *pc;
 
 	thread0.td_kstack = proc0kstack;
@@ -2031,7 +2113,7 @@
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
-	proc_linkup(&proc0, &ksegrp0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
@@ -2073,6 +2155,7 @@
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
+	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
@@ -2089,8 +2172,7 @@
 	 *	     under witness.
 	 */
 	mutex_init();
-	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN);
-	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
+	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
@@ -2150,6 +2232,34 @@
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
+#ifdef XBOX
+	/*
+	 * The following code queries the PCI ID of 0:0:0. For the XBOX,
+	 * This should be 0x10de / 0x02a5.
+	 *
+	 * This is exactly what Linux does.
+	 */
+	outl(0xcf8, 0x80000000);
+	if (inl(0xcfc) == 0x02a510de) {
+		arch_i386_is_xbox = 1;
+		pic16l_setled(XBOX_LED_GREEN);
+
+		/*
+		 * We are an XBOX, but we may have either 64MB or 128MB of
+		 * memory. The PCI host bridge should be programmed for this,
+		 * so we just query it. 
+		 */
+		outl(0xcf8, 0x80000084);
+		arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
+	}
+#endif /* XBOX */
+
+	/*
+	 * Initialize the i8254 before the console so that console
+	 * initialization can use DELAY().
+	 */
+	i8254_init();
+
 	/*
 	 * Initialize the console before we print anything out.
 	 */
@@ -2188,7 +2298,6 @@
 	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
-	private_tss = 0;
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
@@ -2220,10 +2329,6 @@
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
-	/* Map the message buffer. */
-	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
-		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
-
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 	/* make a call gate to reenter kernel with */
@@ -2249,7 +2354,7 @@
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
-	thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
+	thread0.td_pcb->pcb_flags = 0;
 #ifdef PAE
 	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 #else
@@ -2650,8 +2755,8 @@
 		}
 #ifdef DEV_NPX
 #ifdef CPU_ENABLE_SSE
-	if (cpu_fxsr)
-		addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask;
+		if (cpu_fxsr)
+			addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask;
 #endif
 		/*
 		 * XXX we violate the dubious requirement that npxsetregs()
@@ -2726,7 +2831,6 @@
 {
 	struct pcb *pcb;
 	int i;
-	u_int32_t mask1, mask2;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
@@ -2744,10 +2848,12 @@
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP.
 		 */
-		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; 
-		     i++, mask1 <<= 2, mask2 <<= 2)
-			if ((dbregs->dr[7] & mask1) == mask2)
+		for (i = 0; i < 4; i++) {
+			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
+			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
+				return (EINVAL);
+		}
 		
 		pcb = td->td_pcb;
 		
@@ -2765,25 +2871,25 @@
 		 * from within kernel mode?
 		 */
 
-		if (dbregs->dr[7] & 0x3) {
+		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
-		if (dbregs->dr[7] & (0x3<<2)) {
+		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
-		if (dbregs->dr[7] & (0x3<<4)) {
+		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
-		if (dbregs->dr[7] & (0x3<<6)) {
+		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
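
The DBREG_DR7_* macros used above replace the old open-coded mask walk.
Assuming the standard x86 DR7 layout (L/G enable bit pairs in bits 0-7,
R/W and LEN fields in 4-bit groups starting at bit 16), they presumably
expand along these lines:

	#define	DBREG_DR7_ENABLED(d, i)	(((d) >> ((i) * 2)) & 0x3)
	#define	DBREG_DR7_ACCESS(d, i)	(((d) >> (16 + (i) * 4)) & 0x3)
	#define	DBREG_DR7_LEN(d, i)	(((d) >> (16 + (i) * 4 + 2)) & 0x3)

so the checks in the two hunks above reject an R/W code of 0x02 (an I/O
breakpoint) and a LEN code of 0x02 for each of the four debug registers,
both of which the earlier comment notes can yield undefined behaviour or
an unexpected TRCTRAP.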
@@ -2855,9 +2961,8 @@
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
-        for (i=0; i<nbp; i++) {
-                if (addr[i] <
-                    (caddr_t)VM_MAXUSER_ADDRESS) {
+        for (i = 0; i < nbp; i++) {
+                if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
@@ -2886,7 +2991,7 @@
 }
 
 void *
-ioapic_create(uintptr_t addr, int32_t id, int intbase)
+ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 {
 	return (NULL);
 }
@@ -2944,7 +3049,7 @@
 }
 
 void
-lapic_init(uintptr_t addr)
+lapic_init(vm_paddr_t addr)
 {
 }
 
Index: pmap.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/pmap.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/i386/i386/pmap.c -L sys/i386/i386/pmap.c -u -r1.2 -r1.3
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -75,7 +75,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/pmap.c,v 1.523.2.6 2006/03/08 23:59:41 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/pmap.c,v 1.594.2.4.2.1 2008/01/19 18:15:03 kib Exp $");
 
 /*
  *	Manages physical address maps.
@@ -106,6 +106,8 @@
 #include "opt_cpu.h"
 #include "opt_pmap.h"
 #include "opt_msgbuf.h"
+#include "opt_smp.h"
+#include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -144,6 +146,10 @@
 #include <machine/smp.h>
 #endif
 
+#ifdef XBOX
+#include <machine/xbox.h>
+#endif
+
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
@@ -157,11 +163,18 @@
 #endif
 
 #if !defined(PMAP_DIAGNOSTIC)
-#define PMAP_INLINE __inline
+#define PMAP_INLINE	__gnu89_inline
 #else
 #define PMAP_INLINE
 #endif
 
+#define PV_STATS
+#ifdef PV_STATS
+#define PV_STAT(x)	do { x ; } while (0)
+#else
+#define PV_STAT(x)	do { } while (0)
+#endif
+
 /*
  * Get PDEs and PTEs for user/kernel address space
  */
@@ -183,7 +196,6 @@
 static struct pmaplist allpmaps;
 static struct mtx allpmaps_lock;
 
-vm_paddr_t avail_end;	/* PA of last available physical page */
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 int pgeflag = 0;		/* PG_G or-in */
@@ -194,16 +206,19 @@
 extern u_int32_t KERNend;
 
 #ifdef PAE
+pt_entry_t pg_nx;
 static uma_zone_t pdptzone;
 #endif
 
 /*
  * Data for the pv entry allocation mechanism
  */
-static uma_zone_t pvzone;
-static struct vm_object pvzone_obj;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
-int pmap_pagedaemon_waken;
+static int shpgperproc = PMAP_SHPGPERPROC;
+
+struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
+int pv_maxchunks;			/* How many chunks we have KVA for */
+vm_offset_t pv_vafree;			/* freelist stored in the PTE */
 
 /*
  * All those kernel PT submaps that BSD is so fond of
@@ -249,23 +264,28 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
-static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
-static pv_entry_t get_pv_entry(void);
-static void	pmap_clear_ptes(vm_page_t m, int bit);
+static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
+static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
 
-static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
-static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
+static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
+    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
+static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
+    vm_page_t *free);
+static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
+    vm_page_t *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 					vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
+static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
+    vm_page_t m);
 
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
-static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
+static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 static void pmap_pte_release(pt_entry_t *pte);
-static int pmap_unuse_pt(pmap_t, vm_offset_t);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 #ifdef PAE
 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
@@ -303,9 +323,7 @@
  *	(physical) address starting relative to 0]
  */
 void
-pmap_bootstrap(firstaddr, loadaddr)
-	vm_paddr_t firstaddr;
-	vm_paddr_t loadaddr;
+pmap_bootstrap(vm_paddr_t firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
@@ -334,7 +352,7 @@
 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 #endif
 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
-	TAILQ_INIT(&kernel_pmap->pm_pvlist);
+	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	LIST_INIT(&allpmaps);
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 	mtx_lock_spin(&allpmaps_lock);
@@ -392,14 +410,71 @@
 	virtual_avail = va;
 
 	*CMAP1 = 0;
-	for (i = 0; i < NKPT; i++)
+
+	/*
+	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
+	 * physical memory region that is used by the ACPI wakeup code.  This
+	 * mapping must not have PG_G set. 
+	 */
+#ifdef XBOX
+	/* FIXME: This is gross, but needed for the XBOX.  Since we are at
+	 * such an early stage, we cannot yet neatly map video memory ... :-(
+	 * Better fixes are very welcome! */
+	if (!arch_i386_is_xbox)
+#endif
+	for (i = 1; i < NKPT; i++)
 		PTD[i] = 0;
 
+	/* Initialize the PAT MSR if present. */
+	pmap_init_pat();
+
 	/* Turn on PG_G on kernel page(s) */
 	pmap_set_pg();
 }
 
 /*
+ * Setup the PAT MSR.
+ */
+void
+pmap_init_pat(void)
+{
+	uint64_t pat_msr;
+
+	/* Bail if this CPU doesn't implement PAT. */
+	if (!(cpu_feature & CPUID_PAT))
+		return;
+
+#ifdef PAT_WORKS
+	/*
+	 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
+	 * Program 4 and 5 as WP and WC.
+	 * Leave 6 and 7 as UC and UC-.
+	 */
+	pat_msr = rdmsr(MSR_PAT);
+	pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
+	pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
+	    PAT_VALUE(5, PAT_WRITE_COMBINING);
+#else
+	/*
+	 * Due to some Intel errata, we can only safely use the lower 4
+	 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
+	 * of UC-.
+	 *
+	 *   Intel Pentium III Processor Specification Update
+	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
+	 * or Mode C Paging)
+	 *
+	 *   Intel Pentium IV  Processor Specification Update
+	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
+	 */
+	pat_msr = rdmsr(MSR_PAT);
+	pat_msr &= ~PAT_MASK(2);
+	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
+#endif
+	wrmsr(MSR_PAT, pat_msr);
+}
+
+/*
  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
  */
 void
@@ -463,6 +538,61 @@
 #endif
 
 /*
+ * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
+ * Requirements:
+ *  - Must deal with pages in order to ensure that none of the PG_* bits
+ *    are ever set, PG_V in particular.
+ *  - Assumes we can write to ptes without pte_store() atomic ops, even
+ *    on PAE systems.  This should be ok.
+ *  - Assumes nothing will ever test these addresses for 0 to indicate
+ *    no mapping instead of correctly checking PG_V.
+ *  - Assumes a vm_offset_t will fit in a pte (true for i386).
+ * Because PG_V is never set, there can be no mappings to invalidate.
+ */
+static vm_offset_t
+pmap_ptelist_alloc(vm_offset_t *head)
+{
+	pt_entry_t *pte;
+	vm_offset_t va;
+
+	va = *head;
+	if (va == 0)
+		return (va);	/* Out of memory */
+	pte = vtopte(va);
+	*head = *pte;
+	if (*head & PG_V)
+		panic("pmap_ptelist_alloc: va with PG_V set!");
+	*pte = 0;
+	return (va);
+}
+
+static void
+pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
+{
+	pt_entry_t *pte;
+
+	if (va & PG_V)
+		panic("pmap_ptelist_free: freeing va with PG_V set!");
+	pte = vtopte(va);
+	*pte = *head;		/* virtual! PG_V is 0 though */
+	*head = va;
+}
+
+static void
+pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
+{
+	int i;
+	vm_offset_t va;
+
+	*head = 0;
+	for (i = npages - 1; i >= 0; i--) {
+		va = (vm_offset_t)base + i * PAGE_SIZE;
+		pmap_ptelist_free(head, va);
+	}
+}
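
A minimal usage sketch for the freelist above (hypothetical caller, not
part of this diff): pv-chunk allocation pulls one page of KVA off the
list and hands it back when the chunk is released.

	vm_offset_t va;

	va = pmap_ptelist_alloc(&pv_vafree);	/* 0 means the list is empty */
	if (va != 0) {
		/* ... map a page at va and use it as a struct pv_chunk ... */
		pmap_ptelist_free(&pv_vafree, va);
	}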
+
+
+/*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
@@ -470,21 +600,24 @@
 void
 pmap_init(void)
 {
-	int shpgperproc = PMAP_SHPGPERPROC;
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
-	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
+	pv_entry_max = roundup(pv_entry_max, _NPCPV);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
-	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
 
+	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
+	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
+	    PAGE_SIZE * pv_maxchunks);
+	if (pv_chunkbase == NULL)
+		panic("pmap_init: not enough kvm for pv chunks");
+	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 #ifdef PAE
 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
@@ -493,53 +626,114 @@
 #endif
 }
 
-void
-pmap_init2()
-{
-}
 
+SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
+	"Max number of PV entries");
+SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
+	"Page share factor per proc");
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
-#if defined(PMAP_DIAGNOSTIC)
-
 /*
- * This code checks for non-writeable/modified pages.
- * This should be an invalid condition.
+ * Determine the appropriate bits to set in a PTE or PDE for a specified
+ * caching mode.
  */
 static int
-pmap_nw_modified(pt_entry_t ptea)
+pmap_cache_bits(int mode, boolean_t is_pde)
 {
-	int pte;
-
-	pte = (int) ptea;
-
-	if ((pte & (PG_M|PG_RW)) == PG_M)
-		return 1;
-	else
-		return 0;
-}
-#endif
+	int pat_flag, pat_index, cache_bits;
 
+	/* The PAT bit is different for PTE's and PDE's. */
+	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 
-/*
- * this routine defines the region(s) of memory that should
- * not be tested for the modified bit.
- */
-static PMAP_INLINE int
-pmap_track_modified(vm_offset_t va)
-{
-	if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) 
-		return 1;
-	else
-		return 0;
+	/* If we don't support PAT, map extended modes to older ones. */
+	if (!(cpu_feature & CPUID_PAT)) {
+		switch (mode) {
+		case PAT_UNCACHEABLE:
+		case PAT_WRITE_THROUGH:
+		case PAT_WRITE_BACK:
+			break;
+		case PAT_UNCACHED:
+		case PAT_WRITE_COMBINING:
+		case PAT_WRITE_PROTECTED:
+			mode = PAT_UNCACHEABLE;
+			break;
+		}
+	}
+	
+	/* Map the caching mode to a PAT index. */
+	switch (mode) {
+#ifdef PAT_WORKS
+	case PAT_UNCACHEABLE:
+		pat_index = 3;
+		break;
+	case PAT_WRITE_THROUGH:
+		pat_index = 1;
+		break;
+	case PAT_WRITE_BACK:
+		pat_index = 0;
+		break;
+	case PAT_UNCACHED:
+		pat_index = 2;
+		break;
+	case PAT_WRITE_COMBINING:
+		pat_index = 5;
+		break;
+	case PAT_WRITE_PROTECTED:
+		pat_index = 4;
+		break;
+#else
+	case PAT_UNCACHED:
+	case PAT_UNCACHEABLE:
+	case PAT_WRITE_PROTECTED:
+		pat_index = 3;
+		break;
+	case PAT_WRITE_THROUGH:
+		pat_index = 1;
+		break;
+	case PAT_WRITE_BACK:
+		pat_index = 0;
+		break;
+	case PAT_WRITE_COMBINING:
+		pat_index = 2;
+		break;
+#endif
+	default:
+		panic("Unknown caching mode %d\n", mode);
+	}	
+
+	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+	cache_bits = 0;
+	if (pat_index & 0x4)
+		cache_bits |= pat_flag;
+	if (pat_index & 0x2)
+		cache_bits |= PG_NC_PCD;
+	if (pat_index & 0x1)
+		cache_bits |= PG_NC_PWT;
+	return (cache_bits);
 }
-
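
Tracing the table above through the non-PAT_WORKS path: PAT_WRITE_COMBINING
maps to pat_index 2, so cache_bits comes back as PG_NC_PCD, selecting the
PAT slot that pmap_init_pat() reprogrammed to write-combining; the three
uncacheable modes collapse to pat_index 3 and return PG_NC_PCD | PG_NC_PWT.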
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
+ *
+ * N.B.: Before calling any of the following TLB invalidation functions,
+ * the calling processor must ensure that all stores updating a non-
+ * kernel page table are globally performed.  Otherwise, another
+ * processor could cache an old, pre-update entry without being
+ * invalidated.  This can happen one of two ways: (1) The pmap becomes
+ * active on another processor after its pm_active field is checked by
+ * one of the following functions but before a store updating the page
+ * table is globally performed. (2) The pmap becomes active on another
+ * processor before its pm_active field is checked but due to
+ * speculative loads one of the following functions still reads the
+ * pmap as inactive on the other processor.
+ * 
+ * The kernel page table is exempt because its pm_active field is
+ * immutable.  The kernel page table is always active on every
+ * processor.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
@@ -547,18 +741,7 @@
 	u_int cpumask;
 	u_int other_cpus;
 
-	if (smp_started) {
-		if (!(read_eflags() & PSL_I))
-			panic("%s: interrupts disabled", __func__);
-		mtx_lock_spin(&smp_ipi_mtx);
-	} else
-		critical_enter();
-	/*
-	 * We need to disable interrupt preemption but MUST NOT have
-	 * interrupts disabled here.
-	 * XXX we may need to hold schedlock to get a coherent pm_active
-	 * XXX critical sections disable interrupts again
-	 */
+	sched_pin();
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		invlpg(va);
 		smp_invlpg(va);
@@ -570,10 +753,7 @@
 		if (pmap->pm_active & other_cpus)
 			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
 	}
-	if (smp_started)
-		mtx_unlock_spin(&smp_ipi_mtx);
-	else
-		critical_exit();
+	sched_unpin();
 }
 
 void
@@ -583,18 +763,7 @@
 	u_int other_cpus;
 	vm_offset_t addr;
 
-	if (smp_started) {
-		if (!(read_eflags() & PSL_I))
-			panic("%s: interrupts disabled", __func__);
-		mtx_lock_spin(&smp_ipi_mtx);
-	} else
-		critical_enter();
-	/*
-	 * We need to disable interrupt preemption but MUST NOT have
-	 * interrupts disabled here.
-	 * XXX we may need to hold schedlock to get a coherent pm_active
-	 * XXX critical sections disable interrupts again
-	 */
+	sched_pin();
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
@@ -609,10 +778,7 @@
 			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
 			    sva, eva);
 	}
-	if (smp_started)
-		mtx_unlock_spin(&smp_ipi_mtx);
-	else
-		critical_exit();
+	sched_unpin();
 }
 
 void
@@ -621,18 +787,7 @@
 	u_int cpumask;
 	u_int other_cpus;
 
-	if (smp_started) {
-		if (!(read_eflags() & PSL_I))
-			panic("%s: interrupts disabled", __func__);
-		mtx_lock_spin(&smp_ipi_mtx);
-	} else
-		critical_enter();
-	/*
-	 * We need to disable interrupt preemption but MUST NOT have
-	 * interrupts disabled here.
-	 * XXX we may need to hold schedlock to get a coherent pm_active
-	 * XXX critical sections disable interrupts again
-	 */
+	sched_pin();
 	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 		invltlb();
 		smp_invltlb();
@@ -644,10 +799,17 @@
 		if (pmap->pm_active & other_cpus)
 			smp_masked_invltlb(pmap->pm_active & other_cpus);
 	}
-	if (smp_started)
-		mtx_unlock_spin(&smp_ipi_mtx);
-	else
-		critical_exit();
+	sched_unpin();
+}
+
+void
+pmap_invalidate_cache(void)
+{
+
+	sched_pin();
+	wbinvd();
+	smp_cache_flush();
+	sched_unpin();
 }
 #else /* !SMP */
 /*
@@ -679,6 +841,13 @@
 	if (pmap == kernel_pmap || pmap->pm_active)
 		invltlb();
 }
+
+PMAP_INLINE void
+pmap_invalidate_cache(void)
+{
+
+	wbinvd();
+}
 #endif /* !SMP */
 
 /*
@@ -808,7 +977,7 @@
 	pde = pmap->pm_pdir[va >> PDRSHIFT];
 	if (pde != 0) {
 		if ((pde & PG_PS) != 0) {
-			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
+			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
 			PMAP_UNLOCK(pmap);
 			return rtval;
 		}
@@ -841,7 +1010,7 @@
 	if (pde != 0) {
 		if (pde & PG_PS) {
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
-				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
+				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 				    (va & PDRMASK));
 				vm_page_hold(m);
 			}
@@ -878,6 +1047,15 @@
 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
 }
 
+PMAP_INLINE void 
+pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
+{
+	pt_entry_t *pte;
+
+	pte = vtopte(va);
+	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
+}
+
 /*
  * Remove a page from the kernel pagetables.
  * Note: not SMP coherent.
@@ -930,17 +1108,22 @@
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
-pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
+pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
-	vm_offset_t va;
+	pt_entry_t *endpte, oldpte, *pte;
 
-	va = sva;
-	while (count-- > 0) {
-		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
-		va += PAGE_SIZE;
-		m++;
-	}
-	pmap_invalidate_range(kernel_pmap, sva, va);
+	oldpte = 0;
+	pte = vtopte(sva);
+	endpte = pte + count;
+	while (pte < endpte) {
+		oldpte |= *pte;
+		pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
+		pte++;
+		ma++;
+	}
+	if ((oldpte & PG_V) != 0)
+		pmap_invalidate_range(kernel_pmap, sva, sva + count *
+		    PAGE_SIZE);
 }
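
The rewritten pmap_qenter() now stores all of the PTEs first and issues at
most one ranged shootdown, and only when a previously valid entry was
overwritten.  A hedged caller sketch; the KVA window and page array are
assumptions supplied by the caller, not part of this commit:

	/*
	 * Hypothetical caller: map npages pages into a reserved KVA
	 * window in one shot, use them, then unmap.
	 */
	static void
	with_mapped_window(vm_offset_t kva, vm_page_t *pages, int npages)
	{
		pmap_qenter(kva, pages, npages);  /* at most one ranged IPI */
		/* ... access the pages through kva ... */
		pmap_qremove(kva, npages);
	}
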
 
 /*
@@ -964,24 +1147,35 @@
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
+static __inline void
+pmap_free_zero_pages(vm_page_t free)
+{
+	vm_page_t m;
+
+	while (free != NULL) {
+		m = free;
+		free = m->right;
+		vm_page_free_zero(m);
+	}
+}
 
 /*
  * This routine unholds page table pages, and if the hold count
  * drops to zero, then it decrements the wire count.
  */
-static PMAP_INLINE int
-pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
+static __inline int
+pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
 {
 
 	--m->wire_count;
 	if (m->wire_count == 0)
-		return _pmap_unwire_pte_hold(pmap, m);
+		return _pmap_unwire_pte_hold(pmap, m, free);
 	else
 		return 0;
 }
 
 static int 
-_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
+_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
 {
 	vm_offset_t pteva;
 
@@ -992,14 +1186,26 @@
 	--pmap->pm_stats.resident_count;
 
 	/*
+	 * This is a release store so that the ordinary store unmapping
+	 * the page table page is globally performed before TLB shoot-
+	 * down is begun.
+	 */
+	atomic_subtract_rel_int(&cnt.v_wire_count, 1);
+
+	/*
 	 * Do an invltlb to make the invalidated mapping
 	 * take effect immediately.
 	 */
 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
 	pmap_invalidate_page(pmap, pteva);
 
-	vm_page_free_zero(m);
-	atomic_subtract_int(&cnt.v_wire_count, 1);
+	/* 
+	 * Put page on a list so that it is released after
+	 * *ALL* TLB shootdown is done
+	 */
+	m->right = *free;
+	*free = m;
+
 	return 1;
 }
 
@@ -1008,7 +1214,7 @@
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
-pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
 {
 	pd_entry_t ptepde;
 	vm_page_t mpte;
@@ -1017,12 +1223,11 @@
 		return 0;
 	ptepde = *pmap_pde(pmap, va);
 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
-	return pmap_unwire_pte_hold(pmap, mpte);
+	return pmap_unwire_pte_hold(pmap, mpte, free);
 }
 
 void
-pmap_pinit0(pmap)
-	struct pmap *pmap;
+pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
@@ -1032,7 +1237,7 @@
 #endif
 	pmap->pm_active = 0;
 	PCPU_SET(curpmap, pmap);
-	TAILQ_INIT(&pmap->pm_pvlist);
+	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
@@ -1043,9 +1248,8 @@
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
-void
-pmap_pinit(pmap)
-	register struct pmap *pmap;
+int
+pmap_pinit(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	vm_paddr_t pa;
@@ -1061,6 +1265,11 @@
 	if (pmap->pm_pdir == NULL) {
 		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
 		    NBPTD);
+
+		if (pmap->pm_pdir == NULL) {
+			PMAP_LOCK_DESTROY(pmap);
+			return (0);
+		}
 #ifdef PAE
 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
@@ -1112,8 +1321,10 @@
 	}
 
 	pmap->pm_active = 0;
-	TAILQ_INIT(&pmap->pm_pvlist);
+	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
+	return (1);
 }
 
 /*
@@ -1203,7 +1414,7 @@
 	 * hold count, and activate it.
 	 */
 	if (ptepa) {
-		m = PHYS_TO_VM_PAGE(ptepa);
+		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 		m->wire_count++;
 	} else {
 		/*
@@ -1238,6 +1449,9 @@
 {
 	u_int mymask = PCPU_GET(cpumask);
 
+#ifdef COUNT_IPIS
+	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
+#endif
 	if (rcr3() == lazyptd)
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 	atomic_clear_int(lazymask, mymask);
@@ -1259,7 +1473,7 @@
 {
 	u_int mymask;
 	u_int mask;
-	register u_int spins;
+	u_int spins;
 
 	while ((mask = pmap->pm_active) != 0) {
 		spins = 50000000;
@@ -1333,7 +1547,8 @@
 	mtx_unlock_spin(&allpmaps_lock);
 
 	for (i = 0; i < NPGPTD; i++)
-		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
+		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
+		    PG_FRAME);
 
 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
 	    sizeof(*pmap->pm_pdir));
@@ -1343,7 +1558,6 @@
 
 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 
-	vm_page_lock_queues();
 	for (i = 0; i < NPGPTD; i++) {
 		m = ptdpg[i];
 #ifdef PAE
@@ -1354,7 +1568,6 @@
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 	}
-	vm_page_unlock_queues();
 	PMAP_LOCK_DESTROY(pmap);
 }
 

@@ -1450,35 +1663,260 @@
  * page management routines.
  ***************************************************/
 
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 11);
+
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
+#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
+
+static uint32_t pc_freemask[11] = {
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE10
+};
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+	"Current number of pv entries");
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+	"Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+	"Total number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+	"Total number of pv entry chunks freed");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+	"Number of failed attempts to allocate a pv entry chunk page");
+
+static long pv_entry_frees, pv_entry_allocs;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+	"Total number of pv entries freed");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
+	"Total number of pv entries allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+	"Current number of spare pv entries");
+
+static int pmap_collect_inactive, pmap_collect_active;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
+	"Number of times pmap_collect has run on the inactive queue");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
+	"Number of times pmap_collect has run on the active queue");
+#endif
+
+/*
+ * We are in a serious low memory condition.  Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk.  This is normally called to
+ * unmap inactive pages and, if necessary, active pages.
+ */
+static void
+pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+{
+	pmap_t pmap;
+	pt_entry_t *pte, tpte;
+	pv_entry_t next_pv, pv;
+	vm_offset_t va;
+	vm_page_t m, free;
+
+	sched_pin();
+	TAILQ_FOREACH(m, &vpq->pl, pageq) {
+		if (m->hold_count || m->busy)
+			continue;
+		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
+			va = pv->pv_va;
+			pmap = PV_PMAP(pv);
+			/* Avoid deadlock and lock recursion. */
+			if (pmap > locked_pmap)
+				PMAP_LOCK(pmap);
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+				continue;
+			pmap->pm_stats.resident_count--;
+			pte = pmap_pte_quick(pmap, va);
+			tpte = pte_load_clear(pte);
+			KASSERT((tpte & PG_W) == 0,
+			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
+			if (tpte & PG_A)
+				vm_page_flag_set(m, PG_REFERENCED);
+			if (tpte & PG_M) {
+				KASSERT((tpte & PG_RW),
+	("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
+				    va, (uintmax_t)tpte));
+				vm_page_dirty(m);
+			}
+			free = NULL;
+			pmap_unuse_pt(pmap, va, &free);
+			pmap_invalidate_page(pmap, va);
+			pmap_free_zero_pages(free);
+			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+			if (TAILQ_EMPTY(&m->md.pv_list))
+				vm_page_flag_clear(m, PG_WRITEABLE);
+			m->md.pv_list_count--;
+			free_pv_entry(pmap, pv);
+			if (pmap != locked_pmap)
+				PMAP_UNLOCK(pmap);
+		}
+	}
+	sched_unpin();
+}
+
+
 /*
  * free the pv_entry back to the free list
  */
-static PMAP_INLINE void
-free_pv_entry(pv_entry_t pv)
+static void
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
+	vm_page_t m;
+	struct pv_chunk *pc;
+	int idx, field, bit;
+
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PV_STAT(pv_entry_frees++);
+	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
-	uma_zfree(pvzone, pv);
+	pc = pv_to_chunk(pv);
+	idx = pv - &pc->pc_pventry[0];
+	field = idx / 32;
+	bit = idx % 32;
+	pc->pc_map[field] |= 1ul << bit;
+	/* move to head of list */
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	for (idx = 0; idx < _NPCM; idx++)
+		if (pc->pc_map[idx] != pc_freemask[idx])
+			return;
+	PV_STAT(pv_entry_spare -= _NPCPV);
+	PV_STAT(pc_chunk_count--);
+	PV_STAT(pc_chunk_frees++);
+	/* entire chunk is free, return it */
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+	pmap_qremove((vm_offset_t)pc, 1);
+	vm_page_unwire(m, 0);
+	vm_page_free(m);
+	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 }
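
Because a pv_chunk occupies exactly one page (the CTASSERT above), pv_to_chunk()
recovers the owning chunk by masking the entry's address, and free_pv_entry()
converts the entry's index within the chunk into a word/bit pair in the
11-word free mask.  A worked example of that index math, with made-up numbers:

	#include <stdio.h>

	int
	main(void)
	{
		int idx = 70;		/* pv entry's index within its chunk */
		int field = idx / 32;	/* 32 map bits per pc_map[] word */
		int bit = idx % 32;

		/* free_pv_entry() then does: pc->pc_map[field] |= 1ul << bit */
		printf("entry %d -> pc_map[%d], bit %d\n", idx, field, bit);
		return (0);
	}
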
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
- * the memory allocation is performed bypassing the malloc code
- * because of the possibility of allocations at interrupt time.
  */
 static pv_entry_t
-get_pv_entry(void)
+get_pv_entry(pmap_t pmap, int try)
 {
+	static const struct timeval printinterval = { 60, 0 };
+	static struct timeval lastprint;
+	static vm_pindex_t colour;
+	int bit, field;
+	pv_entry_t pv;
+	struct pv_chunk *pc;
+	vm_page_t m;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
-	if ((pv_entry_count > pv_entry_high_water) &&
-		(pmap_pagedaemon_waken == 0)) {
-		pmap_pagedaemon_waken = 1;
-		wakeup (&vm_pages_needed);
+	if (pv_entry_count > pv_entry_high_water)
+		if (ratecheck(&lastprint, &printinterval))
+			printf("Approaching the limit on PV entries, consider "
+			    "increasing either the vm.pmap.shpgperproc or the "
+			    "vm.pmap.pv_entry_max tunable.\n");
+	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+	if (pc != NULL) {
+		for (field = 0; field < _NPCM; field++) {
+			if (pc->pc_map[field]) {
+				bit = bsfl(pc->pc_map[field]);
+				break;
+			}
+		}
+		if (field < _NPCM) {
+			pv = &pc->pc_pventry[field * 32 + bit];
+			pc->pc_map[field] &= ~(1ul << bit);
+			/* If this was the last free entry, move the chunk to the tail. */
+			for (field = 0; field < _NPCM; field++)
+				if (pc->pc_map[field] != 0) {
+					PV_STAT(pv_entry_spare--);
+					return (pv);	/* not full, return */
+				}
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+			PV_STAT(pv_entry_spare--);
+			return (pv);
+		}
 	}
-	return uma_zalloc(pvzone, M_NOWAIT);
+	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
+	m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
+	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
+	if (m == NULL || pc == NULL) {
+		if (try) {
+			pv_entry_count--;
+			PV_STAT(pc_chunk_tryfail++);
+			if (m) {
+				vm_page_lock_queues();
+				vm_page_unwire(m, 0);
+				vm_page_free(m);
+				vm_page_unlock_queues();
+			}
+			if (pc)
+				pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			return (NULL);
+		}
+		/*
+		 * Reclaim pv entries: At first, destroy mappings to
+		 * inactive pages.  After that, if a pv chunk entry
+		 * is still needed, destroy mappings to active pages.
+		 */
+		PV_STAT(pmap_collect_inactive++);
+		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
+		if (m == NULL)
+			m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
+			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
+		if (pc == NULL)
+			pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
+		if (m == NULL || pc == NULL) {
+			PV_STAT(pmap_collect_active++);
+			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
+			if (m == NULL)
+				m = vm_page_alloc(NULL, colour,
+				    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+				    VM_ALLOC_WIRED);
+			if (pc == NULL)
+				pc = (struct pv_chunk *)
+				    pmap_ptelist_alloc(&pv_vafree);
+			if (m == NULL || pc == NULL)
+				panic("get_pv_entry: increase vm.pmap.shpgperproc");
+		}
+	}
+	PV_STAT(pc_chunk_count++);
+	PV_STAT(pc_chunk_allocs++);
+	colour++;
+	pmap_qenter((vm_offset_t)pc, &m, 1);
+	pc->pc_pmap = pmap;
+	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
+	for (field = 1; field < _NPCM; field++)
+		pc->pc_map[field] = pc_freemask[field];
+	pv = &pc->pc_pventry[0];
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	PV_STAT(pv_entry_spare += _NPCPV - 1);
+	return (pv);
 }
 
-
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
@@ -1486,24 +1924,16 @@
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
-		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-			if (pmap == pv->pv_pmap && va == pv->pv_va) 
-				break;
-		}
-	} else {
-		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
-			if (va == pv->pv_va) 
-				break;
-		}
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		if (pmap == PV_PMAP(pv) && va == pv->pv_va)
+			break;
 	}
 	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		vm_page_flag_clear(m, PG_WRITEABLE);
-	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-	free_pv_entry(pv);
+	free_pv_entry(pmap, pv);
 }
 
 /*
@@ -1515,24 +1945,39 @@
 {
 	pv_entry_t pv;
 
-	pv = get_pv_entry();
-	if (pv == NULL)
-		panic("no pv entries: increase vm.pmap.shpgperproc");
-	pv->pv_va = va;
-	pv->pv_pmap = pmap;
-
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
+	pv = get_pv_entry(pmap, FALSE);
+	pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count++;
 }
 
 /*
+ * Conditionally create a pv entry.
+ */
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+	pv_entry_t pv;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if (pv_entry_count < pv_entry_high_water && 
+	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
+		pv->pv_va = va;
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+		m->md.pv_list_count++;
+		return (TRUE);
+	} else
+		return (FALSE);
+}
+
+/*
  * pmap_remove_pte: do the things to unmap a page in a process
  */
 static int
-pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
+pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
 {
 	pt_entry_t oldpte;
 	vm_page_t m;
@@ -1550,30 +1995,25 @@
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
-		m = PHYS_TO_VM_PAGE(oldpte);
+		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 		if (oldpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
-			if (pmap_nw_modified((pt_entry_t) oldpte)) {
-				printf(
-	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
-				    va, oldpte);
-			}
-#endif
-			if (pmap_track_modified(va))
-				vm_page_dirty(m);
+			KASSERT((oldpte & PG_RW),
+	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
+			    va, (uintmax_t)oldpte));
+			vm_page_dirty(m);
 		}
 		if (oldpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 		pmap_remove_entry(pmap, m, va);
 	}
-	return (pmap_unuse_pt(pmap, va));
+	return (pmap_unuse_pt(pmap, va, free));
 }
 
 /*
  * Remove a single page from a process address space
  */
 static void
-pmap_remove_page(pmap_t pmap, vm_offset_t va)
+pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
 {
 	pt_entry_t *pte;
 
@@ -1582,7 +2022,7 @@
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
 		return;
-	pmap_remove_pte(pmap, pte, va);
+	pmap_remove_pte(pmap, pte, va, free);
 	pmap_invalidate_page(pmap, va);
 }
 
@@ -1598,6 +2038,7 @@
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
+	vm_page_t free = NULL;
 	int anyvalid;
 
 	/*
@@ -1619,7 +2060,7 @@
 	 */
 	if ((sva + PAGE_SIZE == eva) && 
 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
-		pmap_remove_page(pmap, sva);
+		pmap_remove_page(pmap, sva, &free);
 		goto out;
 	}
 
@@ -1665,17 +2106,24 @@
 		    sva += PAGE_SIZE) {
 			if (*pte == 0)
 				continue;
-			anyvalid = 1;
-			if (pmap_remove_pte(pmap, pte, sva))
+
+			/*
+			 * The TLB entry for a PG_G mapping is invalidated
+			 * by pmap_remove_pte().
+			 */
+			if ((*pte & PG_G) == 0)
+				anyvalid = 1;
+			if (pmap_remove_pte(pmap, pte, sva, &free))
 				break;
 		}
 	}
 out:
 	sched_unpin();
-	vm_page_unlock_queues();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
+	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
+	pmap_free_zero_pages(free);
 }
 
 /*
@@ -1694,8 +2142,10 @@
 void
 pmap_remove_all(vm_page_t m)
 {
-	register pv_entry_t pv;
+	pv_entry_t pv;
+	pmap_t pmap;
 	pt_entry_t *pte, tpte;
+	vm_page_t free;
 
 #if defined(PMAP_DIAGNOSTIC)
 	/*
@@ -1709,12 +2159,13 @@
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	sched_pin();
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-		PMAP_LOCK(pv->pv_pmap);
-		pv->pv_pmap->pm_stats.resident_count--;
-		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pmap->pm_stats.resident_count--;
+		pte = pmap_pte_quick(pmap, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
-			pv->pv_pmap->pm_stats.wired_count--;
+			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 
@@ -1722,23 +2173,19 @@
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
-			if (pmap_nw_modified((pt_entry_t) tpte)) {
-				printf(
-	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
-				    pv->pv_va, tpte);
-			}
-#endif
-			if (pmap_track_modified(pv->pv_va))
-				vm_page_dirty(m);
+			KASSERT((tpte & PG_RW),
+	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
+			    pv->pv_va, (uintmax_t)tpte));
+			vm_page_dirty(m);
 		}
-		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
-		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
+		free = NULL;
+		pmap_unuse_pt(pmap, pv->pv_va, &free);
+		pmap_invalidate_page(pmap, pv->pv_va);
+		pmap_free_zero_pages(free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
-		pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
-		PMAP_UNLOCK(pv->pv_pmap);
-		free_pv_entry(pv);
+		free_pv_entry(pmap, pv);
+		PMAP_UNLOCK(pmap);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
@@ -1761,8 +2208,14 @@
 		return;
 	}
 
+#ifdef PAE
+	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
+	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
+		return;
+#else
 	if (prot & VM_PROT_WRITE)
 		return;
+#endif
 
 	anychanged = 0;
 
@@ -1770,7 +2223,8 @@
 	sched_pin();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
-		unsigned obits, pbits, pdirindex;
+		pt_entry_t obits, pbits;
+		unsigned pdirindex;
 
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 
@@ -1788,7 +2242,12 @@
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
-			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
+			if ((prot & VM_PROT_WRITE) == 0)
+				pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
+#ifdef PAE
+			if ((prot & VM_PROT_EXECUTE) == 0)
+				pmap->pm_pdir[pdirindex] |= pg_nx;
+#endif
 			anychanged = 1;
 			continue;
 		}
@@ -1806,28 +2265,39 @@
 			 * size, PG_RW, PG_A, and PG_M are among the least
 			 * significant 32 bits.
 			 */
-			obits = pbits = *(u_int *)pte;
+			obits = pbits = *pte;
+			if ((pbits & PG_V) == 0)
+				continue;
 			if (pbits & PG_MANAGED) {
 				m = NULL;
 				if (pbits & PG_A) {
-					m = PHYS_TO_VM_PAGE(*pte);
+					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_flag_set(m, PG_REFERENCED);
 					pbits &= ~PG_A;
 				}
-				if ((pbits & PG_M) != 0 &&
-				    pmap_track_modified(sva)) {
+				if ((pbits & PG_M) != 0) {
 					if (m == NULL)
-						m = PHYS_TO_VM_PAGE(*pte);
+						m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 			}
 
-			pbits &= ~(PG_RW | PG_M);
+			if ((prot & VM_PROT_WRITE) == 0)
+				pbits &= ~(PG_RW | PG_M);
+#ifdef PAE
+			if ((prot & VM_PROT_EXECUTE) == 0)
+				pbits |= pg_nx;
+#endif
 
 			if (pbits != obits) {
+#ifdef PAE
+				if (!atomic_cmpset_64(pte, obits, pbits))
+					goto retry;
+#else
 				if (!atomic_cmpset_int((u_int *)pte, obits,
 				    pbits))
 					goto retry;
+#endif
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
@@ -1836,9 +2306,9 @@
 		}
 	}
 	sched_unpin();
-	vm_page_unlock_queues();
 	if (anychanged)
 		pmap_invalidate_all(pmap);
+	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
@@ -1859,13 +2329,14 @@
 	   boolean_t wired)
 {
 	vm_paddr_t pa;
-	register pt_entry_t *pte;
+	pd_entry_t *pde;
+	pt_entry_t *pte;
 	vm_paddr_t opa;
 	pt_entry_t origpte, newpte;
 	vm_page_t mpte, om;
 	boolean_t invlva;
 
-	va &= PG_FRAME;
+	va = trunc_page(va);
 #ifdef PMAP_DIAGNOSTIC
 	if (va > VM_MAX_KERNEL_ADDRESS)
 		panic("pmap_enter: toobig");
@@ -1897,6 +2368,9 @@
 	}
 #endif
 
+	pde = pmap_pde(pmap, va);
+	if ((*pde & PG_PS) != 0)
+		panic("pmap_enter: attempted pmap_enter on 4MB page");
 	pte = pmap_pte_quick(pmap, va);
 
 	/*
@@ -1912,16 +2386,6 @@
 	origpte = *pte;
 	opa = origpte & PG_FRAME;
 
-	if (origpte & PG_PS) {
-		/*
-		 * Yes, I know this will truncate upper address bits for PAE,
-		 * but I'm actually more interested in the lower bits
-		 */
-		printf("pmap_enter: va %p, pte %p, origpte %p\n",
-		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
-		panic("pmap_enter: attempted pmap_enter on 4MB page");
-	}
-
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
@@ -1977,6 +2441,8 @@
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
+		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
+		    ("pmap_enter: managed mapping within the clean submap"));
 		pmap_insert_entry(pmap, va, m);
 		pa |= PG_MANAGED;
 	}
@@ -1992,8 +2458,14 @@
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	newpte = (pt_entry_t)(pa | PG_V);
-	if ((prot & VM_PROT_WRITE) != 0)
+	if ((prot & VM_PROT_WRITE) != 0) {
 		newpte |= PG_RW;
+		vm_page_flag_set(m, PG_WRITEABLE);
+	}
+#ifdef PAE
+	if ((prot & VM_PROT_EXECUTE) == 0)
+		newpte |= pg_nx;
+#endif
 	if (wired)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
@@ -2014,13 +2486,17 @@
 					vm_page_flag_set(om, PG_REFERENCED);
 				if (opa != VM_PAGE_TO_PHYS(m))
 					invlva = TRUE;
+#ifdef PAE
+				if ((origpte & PG_NX) == 0 &&
+				    (newpte & PG_NX) != 0)
+					invlva = TRUE;
+#endif
 			}
 			if (origpte & PG_M) {
 				KASSERT((origpte & PG_RW),
-				    ("pmap_enter: modified page not writable:"
-				     " va: 0x%x, pte: 0x%x", va, origpte));
-				if ((origpte & PG_MANAGED) &&
-				    pmap_track_modified(va))
+	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
+				    va, (uintmax_t)origpte));
+				if ((origpte & PG_MANAGED) != 0)
 					vm_page_dirty(om);
 				if ((prot & VM_PROT_WRITE) == 0)
 					invlva = TRUE;
@@ -2036,6 +2512,38 @@
 }
 
 /*
+ * Maps a sequence of resident pages belonging to the same object.
+ * The sequence begins with the given page m_start.  This page is
+ * mapped at the given virtual address start.  Each subsequent page is
+ * mapped at a virtual address that is offset from start by the same
+ * amount as the page is offset from m_start within the object.  The
+ * last page in the sequence is the page with the largest offset from
+ * m_start that can be mapped at a virtual address less than the given
+ * virtual address end.  Not every virtual page between start and end
+ * is mapped; only those for which a resident page exists with the
+ * corresponding offset from m_start are mapped.
+ */
+void
+pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
+    vm_page_t m_start, vm_prot_t prot)
+{
+	vm_page_t m, mpte;
+	vm_pindex_t diff, psize;
+
+	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
+	psize = atop(end - start);
+	mpte = NULL;
+	m = m_start;
+	PMAP_LOCK(pmap);
+	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
+		mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
+		    prot, mpte);
+		m = TAILQ_NEXT(m, listq);
+	}
+ 	PMAP_UNLOCK(pmap);
+}
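
pmap_enter_object() walks the object's resident pages once, carrying the
page-table page (mpte) between iterations instead of re-looking it up per
page.  A hedged sketch of how a prefault path might drive it; the helper
vm_page_find_least() and the surrounding names are assumptions, not part of
this diff:

	static void
	prefault_object_example(pmap_t pmap, vm_object_t object,
	    vm_pindex_t pindex, vm_offset_t start, vm_offset_t end,
	    vm_prot_t prot)
	{
		vm_page_t m_start;

		VM_OBJECT_LOCK(object);
		m_start = vm_page_find_least(object, pindex);	/* assumed */
		if (m_start != NULL) {
			/* pmap_enter_quick_locked() asserts the queue lock. */
			vm_page_lock_queues();
			pmap_enter_object(pmap, start, end, m_start,
			    prot & ~VM_PROT_WRITE);
			vm_page_unlock_queues();
		}
		VM_OBJECT_UNLOCK(object);
	}
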
+
+/*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
@@ -2044,16 +2552,28 @@
  * but is *MUCH* faster than pmap_enter...
  */
 
-vm_page_t
-pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
-    vm_page_t mpte)
+void
+pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+
+	PMAP_LOCK(pmap);
+	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
+	PMAP_UNLOCK(pmap);
+}
+
+static vm_page_t
+pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    vm_prot_t prot, vm_page_t mpte)
 {
 	pt_entry_t *pte;
 	vm_paddr_t pa;
+	vm_page_t free;
 
+	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
+	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
+	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	PMAP_LOCK(pmap);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
@@ -2070,7 +2590,6 @@
 		if (mpte && (mpte->pindex == ptepindex)) {
 			mpte->wire_count++;
 		} else {
-retry:
 			/*
 			 * Get the page directory entry
 			 */
@@ -2083,23 +2602,13 @@
 			if (ptepa) {
 				if (ptepa & PG_PS)
 					panic("pmap_enter_quick: unexpected mapping into 4MB page");
-				mpte = PHYS_TO_VM_PAGE(ptepa);
+				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    M_NOWAIT);
-				if (mpte == NULL) {
-					PMAP_UNLOCK(pmap);
-					vm_page_busy(m);
-					vm_page_unlock_queues();
-					VM_OBJECT_UNLOCK(m->object);
-					VM_WAIT;
-					VM_OBJECT_LOCK(m->object);
-					vm_page_lock_queues();
-					vm_page_wakeup(m);
-					PMAP_LOCK(pmap);
-					goto retry;
-				}
+				if (mpte == NULL)
+					return (mpte);
 			}
 		}
 	} else {
@@ -2115,19 +2624,28 @@
 	pte = vtopte(va);
 	if (*pte) {
 		if (mpte != NULL) {
-			pmap_unwire_pte_hold(pmap, mpte);
+			mpte->wire_count--;
 			mpte = NULL;
 		}
-		goto out;
+		return (mpte);
 	}
 
 	/*
-	 * Enter on the PV list if part of our managed memory. Note that we
-	 * raise IPL while manipulating pv_table since pmap_enter can be
-	 * called at interrupt time.
+	 * Enter on the PV list if part of our managed memory.
 	 */
-	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
-		pmap_insert_entry(pmap, va, m);
+	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
+	    !pmap_try_insert_pv_entry(pmap, va, m)) {
+		if (mpte != NULL) {
+			free = NULL;
+			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
+				pmap_invalidate_page(pmap, va);
+				pmap_free_zero_pages(free);
+			}
+			
+
+		}
+		return (mpte);
+	}
 
 	/*
 	 * Increment counters
@@ -2135,6 +2653,10 @@
 	pmap->pm_stats.resident_count++;
 
 	pa = VM_PAGE_TO_PHYS(m);
+#ifdef PAE
+	if ((prot & VM_PROT_EXECUTE) == 0)
+		pa |= pg_nx;
+#endif
 
 	/*
 	 * Now validate mapping with RO protection
@@ -2143,8 +2665,6 @@
 		pte_store(pte, pa | PG_V | PG_U);
 	else
 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
-out:
-	PMAP_UNLOCK(pmap);
 	return mpte;
 }
 
@@ -2193,7 +2713,6 @@
 retry:
 		p = vm_page_lookup(object, pindex);
 		if (p != NULL) {
-			vm_page_lock_queues();
 			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
 				goto retry;
 		} else {
@@ -2212,8 +2731,8 @@
 			p = vm_page_lookup(object, pindex);
 			vm_page_lock_queues();
 			vm_page_wakeup(p);
+			vm_page_unlock_queues();
 		}
-		vm_page_unlock_queues();
 
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
@@ -2244,12 +2763,9 @@
  *			The mapping must already exist in the pmap.
  */
 void
-pmap_change_wiring(pmap, va, wired)
-	register pmap_t pmap;
-	vm_offset_t va;
-	boolean_t wired;
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
-	register pt_entry_t *pte;
+	pt_entry_t *pte;
 
 	PMAP_LOCK(pmap);
 	pte = pmap_pte(pmap, va);
@@ -2282,10 +2798,10 @@
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 	  vm_offset_t src_addr)
 {
+	vm_page_t   free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t pdnxt;
-	vm_page_t m;
 
 	if (dst_addr != src_addr)
 		return;
@@ -2311,15 +2827,6 @@
 		if (addr >= UPT_MIN_ADDRESS)
 			panic("pmap_copy: invalid to pmap_copy page tables");
 
-		/*
-		 * Don't let optional prefaulting of pages make us go
-		 * way below the low water mark of free pages or way
-		 * above high water mark of used pv entries.
-		 */
-		if (cnt.v_free_count < cnt.v_free_reserved ||
-		    pv_entry_count > pv_entry_high_water)
-			break;
-		
 		pdnxt = (addr + NBPDR) & ~PDRMASK;
 		ptepindex = addr >> PDRSHIFT;
 
@@ -2329,14 +2836,15 @@
 			
 		if (srcptepaddr & PG_PS) {
 			if (dst_pmap->pm_pdir[ptepindex] == 0) {
-				dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
+				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
+				    ~PG_W;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 			}
 			continue;
 		}
 
-		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
+		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 		if (srcmpte->wire_count == 0)
 			panic("pmap_copy: source page table page is unused");
 
@@ -2351,28 +2859,31 @@
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
-				/*
-				 * We have to check after allocpte for the
-				 * pte still being around...  allocpte can
-				 * block.
-				 */
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    M_NOWAIT);
 				if (dstmpte == NULL)
 					break;
 				dst_pte = pmap_pte_quick(dst_pmap, addr);
-				if (*dst_pte == 0) {
+				if (*dst_pte == 0 &&
+				    pmap_try_insert_pv_entry(dst_pmap, addr,
+				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 					/*
-					 * Clear the modified and
+					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
-					m = PHYS_TO_VM_PAGE(ptetemp);
-					*dst_pte = ptetemp & ~(PG_M | PG_A);
+					*dst_pte = ptetemp & ~(PG_W | PG_M |
+					    PG_A);
 					dst_pmap->pm_stats.resident_count++;
-					pmap_insert_entry(dst_pmap, addr, m);
-	 			} else
-					pmap_unwire_pte_hold(dst_pmap, dstmpte);
+	 			} else {
+					free = NULL;
+					if (pmap_unwire_pte_hold(dst_pmap,
+					    dstmpte, &free)) {
+						pmap_invalidate_page(dst_pmap,
+						    addr);
+						pmap_free_zero_pages(free);
+					}
+				}
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
@@ -2508,9 +3019,7 @@
  * subset of pmaps for proper page aging.
  */
 boolean_t
-pmap_page_exists_quick(pmap, m)
-	pmap_t pmap;
-	vm_page_t m;
+pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int loops = 0;
@@ -2520,7 +3029,7 @@
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		if (pv->pv_pmap == pmap) {
+		if (PV_PMAP(pv) == pmap) {
 			return TRUE;
 		}
 		loops++;
@@ -2530,7 +3039,6 @@
 	return (FALSE);
 }
 
-#define PMAP_REMOVE_PAGES_CURPROC_ONLY
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
@@ -2540,85 +3048,103 @@
  * in the case of running down an entire address space.
  */
 void
-pmap_remove_pages(pmap, sva, eva)
-	pmap_t pmap;
-	vm_offset_t sva, eva;
+pmap_remove_pages(pmap_t pmap)
 {
 	pt_entry_t *pte, tpte;
-	vm_page_t m;
-	pv_entry_t pv, npv;
+	vm_page_t m, free = NULL;
+	pv_entry_t pv;
+	struct pv_chunk *pc, *npc;
+	int field, idx;
+	int32_t bit;
+	uint32_t inuse, bitmask;
+	int allfree;
 
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-#endif
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	sched_pin();
-	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
-
-		if (pv->pv_va >= eva || pv->pv_va < sva) {
-			npv = TAILQ_NEXT(pv, pv_plist);
-			continue;
-		}
-
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
-		pte = vtopte(pv->pv_va);
-#else
-		pte = pmap_pte_quick(pmap, pv->pv_va);
-#endif
-		tpte = *pte;
-
-		if (tpte == 0) {
-			printf("TPTE at %p  IS ZERO @ VA %08x\n",
-							pte, pv->pv_va);
-			panic("bad pte");
-		}
+	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+		allfree = 1;
+		for (field = 0; field < _NPCM; field++) {
+			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			while (inuse != 0) {
+				bit = bsfl(inuse);
+				bitmask = 1UL << bit;
+				idx = field * 32 + bit;
+				pv = &pc->pc_pventry[idx];
+				inuse &= ~bitmask;
+
+				pte = vtopte(pv->pv_va);
+				tpte = *pte;
+
+				if (tpte == 0) {
+					printf(
+					    "TPTE at %p  IS ZERO @ VA %08x\n",
+					    pte, pv->pv_va);
+					panic("bad pte");
+				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
-		if (tpte & PG_W) {
-			npv = TAILQ_NEXT(pv, pv_plist);
-			continue;
-		}
-
-		m = PHYS_TO_VM_PAGE(tpte);
-		KASSERT(m->phys_addr == (tpte & PG_FRAME),
-		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
-		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
-
-		KASSERT(m < &vm_page_array[vm_page_array_size],
-			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
+				if (tpte & PG_W) {
+					allfree = 0;
+					continue;
+				}
 
-		pmap->pm_stats.resident_count--;
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				KASSERT(m->phys_addr == (tpte & PG_FRAME),
+				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
+				    m, (uintmax_t)m->phys_addr,
+				    (uintmax_t)tpte));
+
+				KASSERT(m < &vm_page_array[vm_page_array_size],
+					("pmap_remove_pages: bad tpte %#jx",
+					(uintmax_t)tpte));
 
-		pte_clear(pte);
+				pmap->pm_stats.resident_count--;
 
-		/*
-		 * Update the vm_page_t clean and reference bits.
-		 */
-		if (tpte & PG_M) {
-			vm_page_dirty(m);
-		}
+				pte_clear(pte);
 
-		npv = TAILQ_NEXT(pv, pv_plist);
-		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
+				/*
+				 * Update the vm_page_t clean/reference bits.
+				 */
+				if (tpte & PG_M)
+					vm_page_dirty(m);
 
-		m->md.pv_list_count--;
-		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-		if (TAILQ_EMPTY(&m->md.pv_list))
-			vm_page_flag_clear(m, PG_WRITEABLE);
+				/* Mark free */
+				PV_STAT(pv_entry_frees++);
+				PV_STAT(pv_entry_spare++);
+				pv_entry_count--;
+				pc->pc_map[field] |= bitmask;
+				m->md.pv_list_count--;
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_flag_clear(m, PG_WRITEABLE);
 
-		pmap_unuse_pt(pmap, pv->pv_va);
-		free_pv_entry(pv);
+				pmap_unuse_pt(pmap, pv->pv_va, &free);
+			}
+		}
+		if (allfree) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+			pmap_qremove((vm_offset_t)pc, 1);
+			vm_page_unwire(m, 0);
+			vm_page_free(m);
+			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+		}
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
-	PMAP_UNLOCK(pmap);
 	vm_page_unlock_queues();
+	PMAP_UNLOCK(pmap);
+	pmap_free_zero_pages(free);
 }
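
pmap_remove_pages() visits only the live entries of each chunk by computing
inuse = ~pc_map & freemask and peeling off the lowest set bit with bsfl()
each pass.  The same loop in portable C, with the GCC builtin standing in
for bsfl() (numbers are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint32_t inuse = 0x115;	/* example: entries 0, 2, 4, 8 live */

		while (inuse != 0) {
			int bit = __builtin_ctz(inuse);	/* like bsfl */

			printf("visit entry %d\n", bit);
			inuse &= ~(1u << bit);	/* clear it and continue */
		}
		return (0);
	}
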
 
 /*
@@ -2632,6 +3158,7 @@
 {
 	pv_entry_t pv;
 	pt_entry_t *pte;
+	pmap_t pmap;
 	boolean_t rv;
 
 	rv = FALSE;
@@ -2641,17 +3168,11 @@
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		/*
-		 * if the bit being tested is the modified bit, then
-		 * mark clean_map and ptes as never
-		 * modified.
-		 */
-		if (!pmap_track_modified(pv->pv_va))
-			continue;
-		PMAP_LOCK(pv->pv_pmap);
-		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte_quick(pmap, pv->pv_va);
 		rv = (*pte & PG_M) != 0;
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
@@ -2682,82 +3203,46 @@
 }
 
 /*
- *	Clear the given bit in each of the given page's ptes.  The bit is
- *	expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
- *	size, only a bit within the least significant 32 can be cleared.
+ * Clear the write and modified bits in each of the given page's mappings.
  */
-static __inline void
-pmap_clear_ptes(vm_page_t m, int bit)
+void
+pmap_remove_write(vm_page_t m)
 {
-	register pv_entry_t pv;
-	pt_entry_t pbits, *pte;
+	pv_entry_t pv;
+	pmap_t pmap;
+	pt_entry_t oldpte, *pte;
 
-	if ((m->flags & PG_FICTITIOUS) ||
-	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if ((m->flags & PG_FICTITIOUS) != 0 ||
+	    (m->flags & PG_WRITEABLE) == 0)
 		return;
-
 	sched_pin();
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	/*
-	 * Loop over all current mappings setting/clearing as appropos If
-	 * setting RO do we need to clear the VAC?
-	 */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		/*
-		 * don't write protect pager mappings
-		 */
-		if (bit == PG_RW) {
-			if (!pmap_track_modified(pv->pv_va))
-				continue;
-		}
-
-		PMAP_LOCK(pv->pv_pmap);
-		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte_quick(pmap, pv->pv_va);
 retry:
-		pbits = *pte;
-		if (pbits & bit) {
-			if (bit == PG_RW) {
-				/*
-				 * Regardless of whether a pte is 32 or 64 bits
-				 * in size, PG_RW and PG_M are among the least
-				 * significant 32 bits.
-				 */
-				if (!atomic_cmpset_int((u_int *)pte, pbits,
-				    pbits & ~(PG_RW | PG_M)))
-					goto retry;
-				if (pbits & PG_M) {
-					vm_page_dirty(m);
-				}
-			} else {
-				atomic_clear_int((u_int *)pte, bit);
-			}
-			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+		oldpte = *pte;
+		if ((oldpte & PG_RW) != 0) {
+			/*
+			 * Regardless of whether a pte is 32 or 64 bits
+			 * in size, PG_RW and PG_M are among the least
+			 * significant 32 bits.
+			 */
+			if (!atomic_cmpset_int((u_int *)pte, oldpte,
+			    oldpte & ~(PG_RW | PG_M)))
+				goto retry;
+			if ((oldpte & PG_M) != 0)
+				vm_page_dirty(m);
+			pmap_invalidate_page(pmap, pv->pv_va);
 		}
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 	}
-	if (bit == PG_RW)
-		vm_page_flag_clear(m, PG_WRITEABLE);
+	vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
 }
 
 /*
- *      pmap_page_protect:
- *
- *      Lower the permission for all mappings to a given page.
- */
-void
-pmap_page_protect(vm_page_t m, vm_prot_t prot)
-{
-	if ((prot & VM_PROT_WRITE) == 0) {
-		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
-			pmap_clear_ptes(m, PG_RW);
-		} else {
-			pmap_remove_all(m);
-		}
-	}
-}
-
-/*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
@@ -2772,48 +3257,35 @@
 int
 pmap_ts_referenced(vm_page_t m)
 {
-	register pv_entry_t pv, pvf, pvn;
+	pv_entry_t pv, pvf, pvn;
+	pmap_t pmap;
 	pt_entry_t *pte;
-	pt_entry_t v;
 	int rtval = 0;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (rtval);
-
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-
 		pvf = pv;
-
 		do {
 			pvn = TAILQ_NEXT(pv, pv_list);
-
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-
-			if (!pmap_track_modified(pv->pv_va))
-				continue;
-
-			PMAP_LOCK(pv->pv_pmap);
-			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-
-			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
+			pmap = PV_PMAP(pv);
+			PMAP_LOCK(pmap);
+			pte = pmap_pte_quick(pmap, pv->pv_va);
+			if ((*pte & PG_A) != 0) {
 				atomic_clear_int((u_int *)pte, PG_A);
-				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
-
+				pmap_invalidate_page(pmap, pv->pv_va);
 				rtval++;
-				if (rtval > 4) {
-					PMAP_UNLOCK(pv->pv_pmap);
-					break;
-				}
+				if (rtval > 4)
+					pvn = NULL;
 			}
-			PMAP_UNLOCK(pv->pv_pmap);
+			PMAP_UNLOCK(pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 	sched_unpin();
-
 	return (rtval);
 }
 
@@ -2823,7 +3295,30 @@
 void
 pmap_clear_modify(vm_page_t m)
 {
-	pmap_clear_ptes(m, PG_M);
+	pv_entry_t pv;
+	pmap_t pmap;
+	pt_entry_t *pte;
+
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		return;
+	sched_pin();
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte_quick(pmap, pv->pv_va);
+		if ((*pte & PG_M) != 0) {
+			/*
+			 * Regardless of whether a pte is 32 or 64 bits
+			 * in size, PG_M is among the least significant
+			 * 32 bits. 
+			 */
+			atomic_clear_int((u_int *)pte, PG_M);
+			pmap_invalidate_page(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	sched_unpin();
 }
 
 /*
@@ -2834,7 +3329,30 @@
 void
 pmap_clear_reference(vm_page_t m)
 {
-	pmap_clear_ptes(m, PG_A);
+	pv_entry_t pv;
+	pmap_t pmap;
+	pt_entry_t *pte;
+
+	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		return;
+	sched_pin();
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte_quick(pmap, pv->pv_va);
+		if ((*pte & PG_A) != 0) {
+			/*
+			 * Regardless of whether a pte is 32 or 64 bits
+			 * in size, PG_A is among the least significant
+			 * 32 bits. 
+			 */
+			atomic_clear_int((u_int *)pte, PG_A);
+			pmap_invalidate_page(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	sched_unpin();
 }
 
 /*
@@ -2848,9 +3366,7 @@
  * NOT real memory.
  */
 void *
-pmap_mapdev(pa, size)
-	vm_paddr_t pa;
-	vm_size_t size;
+pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	vm_offset_t va, tmpva, offset;
 
@@ -2866,25 +3382,38 @@
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 
 	for (tmpva = va; size > 0; ) {
-		pmap_kenter(tmpva, pa);
+		pmap_kenter_attr(tmpva, pa, mode);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		pa += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, va, tmpva);
+	pmap_invalidate_cache();
 	return ((void *)(va + offset));
 }
 
+void *
+pmap_mapdev(vm_paddr_t pa, vm_size_t size)
+{
+
+	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
+}
+
+void *
+pmap_mapbios(vm_paddr_t pa, vm_size_t size)
+{
+
+	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
+}
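
With this change the device-mapping KPI is attribute-aware: pmap_mapdev()
keeps its historical uncacheable semantics, pmap_mapbios() maps write-back
for BIOS data, and a driver that wants write-combining can call
pmap_mapdev_attr() directly.  A hedged driver-style fragment; the physical
addresses and sizes are invented for the sketch:

	static void
	map_device_example(void)
	{
		void *regs, *fb;

		regs = pmap_mapdev(0xfeb00000, PAGE_SIZE); /* uncacheable */
		fb = pmap_mapdev_attr(0xd0000000, 8 << 20,
		    PAT_WRITE_COMBINING);	/* write-combining */
		/* ... program the device, stream pixels into fb ... */
		pmap_unmapdev((vm_offset_t)fb, 8 << 20);
		pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
	}
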
+
 void
-pmap_unmapdev(va, size)
-	vm_offset_t va;
-	vm_size_t size;
+pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	vm_offset_t base, offset, tmpva;
 
 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
 		return;
-	base = va & PG_FRAME;
+	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
@@ -2893,13 +3422,72 @@
 	kmem_free(kernel_map, base, size);
 }
 
+int
+pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
+{
+	vm_offset_t base, offset, tmpva;
+	pt_entry_t *pte;
+	u_int opte, npte;
+	pd_entry_t *pde;
+
+	base = trunc_page(va);
+	offset = va & PAGE_MASK;
+	size = roundup(offset + size, PAGE_SIZE);
+
+	/* Only supported on kernel virtual addresses. */
+	if (base <= VM_MAXUSER_ADDRESS)
+		return (EINVAL);
+
+	/* 4MB pages and pages that aren't mapped aren't supported. */
+	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
+		pde = pmap_pde(kernel_pmap, tmpva);
+		if (*pde & PG_PS)
+			return (EINVAL);
+		if (*pde == 0)
+			return (EINVAL);
+		pte = vtopte(tmpva);
+		if (*pte == 0)
+			return (EINVAL);
+	}
+
+	/*
+	 * Ok, all the pages exist and are 4k, so run through them updating
+	 * their cache mode.
+	 */
+	for (tmpva = base; size > 0; ) {
+		pte = vtopte(tmpva);
+
+		/*
+		 * The cache mode bits are all in the low 32-bits of the
+		 * PTE, so we can just spin on updating the low 32-bits.
+		 */
+		do {
+			opte = *(u_int *)pte;
+			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
+			npte |= pmap_cache_bits(mode, 0);
+		} while (npte != opte &&
+		    !atomic_cmpset_int((u_int *)pte, opte, npte));
+		tmpva += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	/*
+	 * Flush the CPU caches so that stale data for the remapped range
+	 * is not left cached under the old attributes.
+	 */
+	pmap_invalidate_range(kernel_pmap, base, tmpva);
+	pmap_invalidate_cache();
+	return (0);
+}
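
pmap_change_attr() rewrites only the low 32 bits of each PTE inside a
compare-and-swap retry loop, so concurrent hardware updates of the
accessed/modified bits cannot be lost, and it finishes with a TLB and cache
flush.  A hedged usage sketch, assuming va/len came from an earlier kernel
mapping such as pmap_mapdev():

	static int
	make_write_combining(vm_offset_t va, vm_size_t len)
	{
		int error;

		error = pmap_change_attr(va, len, PAT_WRITE_COMBINING);
		if (error != 0)
			printf("pmap_change_attr: unsupported range (%d)\n",
			    error);
		return (error);
	}
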
+
 /*
  * perform the pmap work for mincore
  */
 int
-pmap_mincore(pmap, addr)
-	pmap_t pmap;
-	vm_offset_t addr;
+pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *ptep, pte;
 	vm_page_t m;
@@ -2960,7 +3548,6 @@
 void
 pmap_activate(struct thread *td)
 {
-	struct proc *p = td->td_proc;
 	pmap_t	pmap, oldpmap;
 	u_int32_t  cr3;
 
@@ -2979,18 +3566,10 @@
 #else
 	cr3 = vtophys(pmap->pm_pdir);
 #endif
-	/* XXXKSE this is wrong.
+	/*
 	 * pmap_activate is for the current thread on the current cpu
 	 */
-	if (p->p_flag & P_SA) {
-		/* Make sure all other cr3 entries are updated. */
-		/* what if they are running?  XXXKSE (maybe abort them) */
-		FOREACH_THREAD_IN_PROC(p, td) {
-			td->td_pcb->pcb_cr3 = cr3;
-		}
-	} else {
-		td->td_pcb->pcb_cr3 = cr3;
-	}
+	td->td_pcb->pcb_cr3 = cr3;
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
@@ -3018,7 +3597,7 @@
 	int index;
 
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_pid != pid)
 			continue;
 
@@ -3048,7 +3627,7 @@
 							pt_entry_t pa;
 							vm_page_t m;
 							pa = *pte;
-							m = PHYS_TO_VM_PAGE(pa);
+							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
 							printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
 								va, pa, m->hold_count, m->wire_count, m->flags);
 							npte++;
@@ -3077,8 +3656,7 @@
 
 /* print address space of pmap*/
 static void
-pads(pm)
-	pmap_t pm;
+pads(pmap_t pm)
 {
 	int i, j;
 	vm_paddr_t va;
@@ -3102,17 +3680,18 @@
 }
 
 void
-pmap_pvdump(pa)
-	vm_paddr_t pa;
+pmap_pvdump(vm_paddr_t pa)
 {
 	pv_entry_t pv;
+	pmap_t pmap;
 	vm_page_t m;
 
 	printf("pa %x", pa);
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
-		pads(pv->pv_pmap);
+		pmap = PV_PMAP(pv);
+		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
+		pads(pmap);
 	}
 	printf(" ");
 }
Index: io.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/io.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/io.c -L sys/i386/i386/io.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/io.c
+++ sys/i386/i386/io.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/io.c,v 1.1 2004/08/01 11:40:52 markm Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/io.c,v 1.2 2006/11/06 13:41:59 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -33,6 +33,7 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/systm.h>
@@ -54,7 +55,7 @@
 {
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_IO);
 	if (error != 0)
 		return (error);
 	error = securelevel_gt(td->td_ucred, 0);
Index: apic_vector.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/apic_vector.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/apic_vector.s -L sys/i386/i386/apic_vector.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/apic_vector.s
+++ sys/i386/i386/apic_vector.s
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
- * $FreeBSD: src/sys/i386/i386/apic_vector.s,v 1.103.2.1 2005/10/04 15:15:21 jhb Exp $
+ * $FreeBSD: src/sys/i386/i386/apic_vector.s,v 1.113 2006/12/17 05:07:00 kmacy Exp $
  */
 
 /*
@@ -36,31 +36,14 @@
  * as well as IPI handlers.
  */
 
+#include "opt_smp.h"
+
 #include <machine/asmacros.h>
 #include <machine/apicreg.h>
-#include <machine/smptests.h>
 
 #include "assym.s"
 
 /*
- * Macros to create and destroy a trap frame.
- */
-#define PUSH_FRAME							\
-	pushl	$0 ;		/* dummy error code */			\
-	pushl	$0 ;		/* dummy trap type */			\
-	pushal ;		/* 8 ints */				\
-	pushl	%ds ;		/* save data and extra segments ... */	\
-	pushl	%es ;							\
-	pushl	%fs
-
-#define POP_FRAME							\
-	popl	%fs ;							\
-	popl	%es ;							\
-	popl	%ds ;							\
-	popal ;								\
-	addl	$4+4,%esp
-
-/*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
@@ -72,11 +55,7 @@
 	SUPERALIGN_TEXT ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
-	movl	$KDSEL, %eax ;	/* reload with kernel's data segment */	\
-	movl	%eax, %ds ;						\
-	movl	%eax, %es ;						\
-	movl	$KPSEL, %eax ;	/* reload with per-CPU data segment */	\
-	movl	%eax, %fs ;						\
+	SET_KERNEL_SREGS ;						\
 	FAKE_MCOUNT(TF_EIP(%esp)) ;					\
 	movl	lapic, %edx ;	/* pointer to local APIC */		\
 	movl	LA_ISR + 16 * (index)(%edx), %eax ;	/* load ISR */	\
@@ -84,9 +63,10 @@
 	jz	2f ;							\
 	addl	$(32 * index),%eax ;					\
 1: ;									\
+	pushl	%esp		;                                       \
 	pushl	%eax ;		/* pass the IRQ */			\
 	call	lapic_handle_intr ;					\
-	addl	$4, %esp ;	/* discard parameter */			\
+	addl	$8, %esp ;	/* discard parameters */			\
 	MEXITCOUNT ;							\
 	jmp	doreti ;						\
 2:	movl	$-1, %eax ;	/* send a vector of -1 */		\
@@ -122,20 +102,11 @@
 	SUPERALIGN_TEXT
 IDTVEC(timerint)
 	PUSH_FRAME
-	movl	$KDSEL, %eax	/* reload with kernel's data segment */
-	movl	%eax, %ds
-	movl	%eax, %es
-	movl	$KPSEL, %eax
-	movl	%eax, %fs
-
-	movl	lapic, %edx
-	movl	$0, LA_EOI(%edx)	/* End Of Interrupt to APIC */
-	
+	SET_KERNEL_SREGS
 	FAKE_MCOUNT(TF_EIP(%esp))
-
-	pushl	$0		/* XXX convert trapframe to clockframe */
+	pushl	%esp
 	call	lapic_handle_timer
-	addl	$4, %esp	/* XXX convert clockframe to trapframe */
+	addl	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
@@ -264,97 +235,71 @@
 	iret
 
 /*
- * Forward hardclock to another CPU.  Pushes a clockframe and calls
- * forwarded_hardclock().
+ * Invalidate cache.
  */
 	.text
 	SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)	
-	
-	PUSH_FRAME
-	movl	$KDSEL, %eax	/* reload with kernel's data segment */
+IDTVEC(invlcache)
+	pushl	%eax
+	pushl	%ds
+	movl	$KDSEL, %eax		/* Kernel data selector */
 	movl	%eax, %ds
-	movl	%eax, %es
-	movl	$KPSEL, %eax
+
+#ifdef COUNT_IPIS
+	pushl	%fs
+	movl	$KPSEL, %eax		/* Private space selector */
 	movl	%eax, %fs
+	movl	PCPU(CPUID), %eax
+	popl	%fs
+	movl	ipi_invlcache_counts(,%eax,4),%eax
+	incl	(%eax)
+#endif
+
+	wbinvd
+
+	movl	lapic, %eax
+	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
+
+	lock
+	incl	smp_tlb_wait
+
+	popl	%ds
+	popl	%eax
+	iret
+
+/*
+ * Handler for IPIs sent via the per-cpu IPI bitmap.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(ipi_intr_bitmap_handler)	
+	PUSH_FRAME
+	SET_KERNEL_SREGS
 
 	movl	lapic, %edx
 	movl	$0, LA_EOI(%edx)	/* End Of Interrupt to APIC */
 	
 	FAKE_MCOUNT(TF_EIP(%esp))
 
-	pushl	$0		/* XXX convert trapframe to clockframe */
 	call	ipi_bitmap_handler
-	addl	$4, %esp	/* XXX convert clockframe to trapframe */
 	MEXITCOUNT
 	jmp	doreti
 
 /*
- * Executed by a CPU when it receives an Xcpustop IPI from another CPU,
- *
- *  - Signals its receipt.
- *  - Waits for permission to restart.
- *  - Signals its restart.
+ * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpustop)
-	pushl	%ebp
-	movl	%esp, %ebp
-	pushl	%eax
-	pushl	%ecx
-	pushl	%edx
-	pushl	%ds			/* save current data segment */
-	pushl	%es
-	pushl	%fs
-
-	movl	$KDSEL, %eax
-	movl	%eax, %ds		/* use KERNEL data segment */
-	movl	%eax, %es
-	movl	$KPSEL, %eax
-	movl	%eax, %fs
+	PUSH_FRAME
+	SET_KERNEL_SREGS
 
 	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 
-	movl	PCPU(CPUID), %eax
-	imull	$PCB_SIZE, %eax
-	leal	CNAME(stoppcbs)(%eax), %eax
-	pushl	%eax
-	call	CNAME(savectx)		/* Save process context */
-	addl	$4, %esp
-		
-	movl	PCPU(CPUID), %eax
-
-	lock
-	btsl	%eax, CNAME(stopped_cpus) /* stopped_cpus |= (1<<id) */
-1:
-	btl	%eax, CNAME(started_cpus) /* while (!(started_cpus & (1<<id))) */
-	jnc	1b
-
-	lock
-	btrl	%eax, CNAME(started_cpus) /* started_cpus &= ~(1<<id) */
-	lock
-	btrl	%eax, CNAME(stopped_cpus) /* stopped_cpus &= ~(1<<id) */
+	call	cpustop_handler
 
-	test	%eax, %eax
-	jnz	2f
-
-	movl	CNAME(cpustop_restartfunc), %eax
-	test	%eax, %eax
-	jz	2f
-	movl	$0, CNAME(cpustop_restartfunc)	/* One-shot */
-
-	call	*%eax
-2:
-	popl	%fs
-	popl	%es
-	popl	%ds			/* restore previous data segment */
-	popl	%edx
-	popl	%ecx
-	popl	%eax
-	movl	%ebp, %esp
-	popl	%ebp
+	POP_FRAME
 	iret
 
 /*
@@ -366,11 +311,7 @@
 	SUPERALIGN_TEXT
 IDTVEC(rendezvous)
 	PUSH_FRAME
-	movl	$KDSEL, %eax
-	movl	%eax, %ds		/* use KERNEL data segment */
-	movl	%eax, %es
-	movl	$KPSEL, %eax
-	movl	%eax, %fs
+	SET_KERNEL_SREGS
 
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
@@ -391,20 +332,11 @@
 	SUPERALIGN_TEXT
 IDTVEC(lazypmap)
 	PUSH_FRAME
-	movl	$KDSEL, %eax
-	movl	%eax, %ds		/* use KERNEL data segment */
-	movl	%eax, %es
-	movl	$KPSEL, %eax
-	movl	%eax, %fs
+	SET_KERNEL_SREGS
 
-#ifdef COUNT_IPIS
-	movl	PCPU(CPUID), %eax
-	movl	ipi_lazypmap_counts(,%eax,4), %eax
-	incl	(%eax)
-#endif
 	call	pmap_lazyfix_action
 
-	movl	lapic, %eax	
+	movl	lapic, %eax
 	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
 	POP_FRAME
 	iret
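
A hedged aside on the stub changes above: the handlers in local_apic.c now
receive a pointer to the trapframe (and, for device interrupts, the vector
from %eax).  Under the i386 cdecl convention arguments are pushed right to
left and popped by the caller, which is what the "addl $8, %esp" pays back.
A self-contained toy illustration of that convention (the struct is a
stand-in, not the kernel's real trapframe):

	#include <stdio.h>

	struct trapframe {			/* toy stand-in */
		int tf_eip;
	};

	/*
	 * Mirrors lapic_handle_intr(vector, frame): "frame" (pushl %esp)
	 * goes on the stack first and becomes the second argument;
	 * "vector" (pushl %eax) is pushed last and becomes the first.
	 */
	static void
	handle_intr(int vector, struct trapframe *frame)
	{
		printf("vector %d, frame at %p\n", vector, (void *)frame);
	}

	int
	main(void)
	{
		struct trapframe tf = { 0 };

		handle_intr(0x20, &tf);
		return (0);
	}
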
Index: nexus.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/nexus.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/nexus.c -L sys/i386/i386/nexus.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/nexus.c
+++ sys/i386/i386/nexus.c
@@ -12,7 +12,7 @@
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/nexus.c,v 1.62 2005/05/10 12:02:15 nyan Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/nexus.c,v 1.73 2007/05/08 21:29:14 jhb Exp $");
 
 /*
  * This code implements a `root nexus' for Intel Architecture
@@ -41,6 +41,7 @@
  * and I/O memory address space.
  */
 
+#include "opt_apic.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
@@ -61,6 +62,10 @@
 
 #include <machine/resource.h>
 
+#ifdef DEV_APIC
+#include "pcib_if.h"
+#endif
+
 #ifdef DEV_ISA
 #include <isa/isavar.h>
 #ifdef PC98
@@ -97,13 +102,21 @@
 static	int nexus_release_resource(device_t, device_t, int, int,
 				   struct resource *);
 static	int nexus_setup_intr(device_t, device_t, struct resource *, int flags,
-			     void (*)(void *), void *, void **);
+			     driver_filter_t filter, void (*)(void *), void *,
+			      void **);
 static	int nexus_teardown_intr(device_t, device_t, struct resource *,
 				void *);
 static struct resource_list *nexus_get_reslist(device_t dev, device_t child);
 static	int nexus_set_resource(device_t, device_t, int, int, u_long, u_long);
 static	int nexus_get_resource(device_t, device_t, int, int, u_long *, u_long *);
 static void nexus_delete_resource(device_t, device_t, int, int);
+#ifdef DEV_APIC
+static	int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
+static	int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
+static	int nexus_alloc_msix(device_t pcib, device_t dev, int *irq);
+static	int nexus_release_msix(device_t pcib, device_t dev, int irq);
+static	int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data);
+#endif
 
 static device_method_t nexus_methods[] = {
 	/* Device interface */
@@ -129,6 +142,15 @@
 	DEVMETHOD(bus_get_resource,	nexus_get_resource),
 	DEVMETHOD(bus_delete_resource,	nexus_delete_resource),
 
+	/* pcib interface */
+#ifdef DEV_APIC
+	DEVMETHOD(pcib_alloc_msi,	nexus_alloc_msi),
+	DEVMETHOD(pcib_release_msi,	nexus_release_msi),
+	DEVMETHOD(pcib_alloc_msix,	nexus_alloc_msix),
+	DEVMETHOD(pcib_release_msix,	nexus_release_msix),
+	DEVMETHOD(pcib_map_msi,		nexus_map_msi),
+#endif
+
 	{ 0, 0 }
 };
 
@@ -144,11 +166,11 @@
 static int
 nexus_probe(device_t dev)
 {
-	int irq, last;
+	int irq;
 
 	device_quiet(dev);	/* suppress attach message for neatness */
 
-	/* 
+	/*
 	 * XXX working notes:
 	 *
 	 * - IRQ resource creation should be moved to the PIC/APIC driver.
@@ -177,18 +199,10 @@
 	 * We search for regions of existing IRQs and add those to the IRQ
 	 * resource manager.
 	 */
-	last = -1;
 	for (irq = 0; irq < NUM_IO_INTS; irq++)
-		if (intr_lookup_source(irq) != NULL) {
-			if (last == -1)
-				last = irq;
-		} else if (last != -1) {
-			if (rman_manage_region(&irq_rman, last, irq - 1) != 0)
+		if (intr_lookup_source(irq) != NULL)
+			if (rman_manage_region(&irq_rman, irq, irq) != 0)
 				panic("nexus_probe irq_rman add");
-			last = -1;
-		}
-	if (last != -1 && rman_manage_region(&irq_rman, last, irq - 1) != 0)
-		panic("nexus_probe irq_rman add");
 
 	/*
 	 * ISA DMA on PCI systems is implemented in the ISA part of each
@@ -251,7 +265,7 @@
 
 	if (STAILQ_FIRST(rl))
 		retval += printf(" at");
-	
+
 	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
 	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx");
 	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
@@ -284,7 +298,7 @@
 		return(0);
 	resource_list_init(&ndev->nx_resources);
 
-	child = device_add_child_ordered(bus, order, name, unit); 
+	child = device_add_child_ordered(bus, order, name, unit);
 
 	/* should we free this in nexus_child_detached? */
 	device_set_ivars(child, ndev);
@@ -306,9 +320,6 @@
 	struct resource_list_entry *rle;
 	struct	rman *rm;
 	int needactivate = flags & RF_ACTIVE;
-#ifdef PC98
-	bus_space_handle_t bh;
-#endif
 
 	/*
 	 * If this is an allocation of the "default" range for a given RID, and
@@ -352,40 +363,15 @@
 	rv = rman_reserve_resource(rm, start, end, count, flags, child);
 	if (rv == 0)
 		return 0;
-
-	if (type == SYS_RES_MEMORY) {
-		rman_set_bustag(rv, I386_BUS_SPACE_MEM);
-	} else if (type == SYS_RES_IOPORT) {
-		rman_set_bustag(rv, I386_BUS_SPACE_IO);
-#ifndef PC98
-		rman_set_bushandle(rv, rman_get_start(rv));
-#endif
-	}
-
-#ifdef PC98
-	if ((type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) &&
-	    i386_bus_space_handle_alloc(rman_get_bustag(rv),
-	      rman_get_start(rv), count, &bh) != 0) {
-		rman_release_resource(rv);
-		return 0;
-	}
-	rman_set_bushandle(rv, bh);
-#endif
+	rman_set_rid(rv, *rid);
 
 	if (needactivate) {
 		if (bus_activate_resource(child, type, *rid, rv)) {
-#ifdef PC98
-			if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
-				bh = rman_get_bushandle(rv);
-				i386_bus_space_handle_free(rman_get_bustag(rv),
-				    bh, bh->bsh_sz);
-			}
-#endif
 			rman_release_resource(rv);
 			return 0;
 		}
 	}
-	
+
 	return rv;
 }
 
@@ -395,34 +381,40 @@
 {
 #ifdef PC98
 	bus_space_handle_t bh;
+	int error;
 #endif
+	void *vaddr;
+
 	/*
 	 * If this is a memory resource, map it into the kernel.
 	 */
-	if (rman_get_bustag(r) == I386_BUS_SPACE_MEM) {
-		caddr_t vaddr = 0;
-
-		if (rman_get_end(r) < 1024 * 1024) {
-			/*
-			 * The first 1Mb is mapped at KERNBASE.
-			 */
-			vaddr = (caddr_t)(uintptr_t)(KERNBASE + rman_get_start(r));
-		} else {
-			u_int32_t paddr;
-			u_int32_t psize;
-			u_int32_t poffs;
-
-			paddr = rman_get_start(r);
-			psize = rman_get_size(r);
-
-			poffs = paddr - trunc_page(paddr);
-			vaddr = (caddr_t) pmap_mapdev(paddr-poffs, psize+poffs) + poffs;
-		}
+	switch (type) {
+	case SYS_RES_IOPORT:
+#ifdef PC98
+		error = i386_bus_space_handle_alloc(I386_BUS_SPACE_IO,
+		    rman_get_start(r), rman_get_size(r), &bh);
+		if (error)
+			return (error);
+		rman_set_bushandle(r, bh);
+#else
+		rman_set_bushandle(r, rman_get_start(r));
+#endif
+		rman_set_bustag(r, I386_BUS_SPACE_IO);
+		break;
+	case SYS_RES_MEMORY:
+#ifdef PC98
+		error = i386_bus_space_handle_alloc(I386_BUS_SPACE_MEM,
+		    rman_get_start(r), rman_get_size(r), &bh);
+		if (error)
+			return (error);
+#endif
+		vaddr = pmap_mapdev(rman_get_start(r), rman_get_size(r));
 		rman_set_virtual(r, vaddr);
+		rman_set_bustag(r, I386_BUS_SPACE_MEM);
 #ifdef PC98
 		/* PC-98: the type of bus_space_handle_t is the structure. */
-		bh = rman_get_bushandle(r);
 		bh->bsh_base = (bus_addr_t) vaddr;
+		rman_set_bushandle(r, bh);
 #else
 		/* IBM-PC: the type of bus_space_handle_t is u_int */
 		rman_set_bushandle(r, (bus_space_handle_t) vaddr);
@@ -435,17 +427,22 @@
 nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
 			  struct resource *r)
 {
+
 	/*
 	 * If this is a memory resource, unmap it.
 	 */
-	if ((rman_get_bustag(r) == I386_BUS_SPACE_MEM) &&
-	    (rman_get_end(r) >= 1024 * 1024)) {
-		u_int32_t psize;
+	if (type == SYS_RES_MEMORY) {
+		pmap_unmapdev((vm_offset_t)rman_get_virtual(r),
+		    rman_get_size(r));
+	}
+#ifdef PC98
+	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
+		bus_space_handle_t bh;
 
-		psize = rman_get_size(r);
-		pmap_unmapdev((vm_offset_t)rman_get_virtual(r), psize);
+		bh = rman_get_bushandle(r);
+		i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz);
 	}
-		
+#endif
 	return (rman_deactivate_resource(r));
 }
 
@@ -458,14 +455,6 @@
 		if (error)
 			return error;
 	}
-#ifdef PC98
-	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
-		bus_space_handle_t bh;
-
-		bh = rman_get_bushandle(r);
-		i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz);
-	}
-#endif
 	return (rman_release_resource(r));
 }
 
@@ -477,7 +466,8 @@
  */
 static int
 nexus_setup_intr(device_t bus, device_t child, struct resource *irq,
-		 int flags, void (*ihand)(void *), void *arg, void **cookiep)
+		 int flags, driver_filter_t filter, void (*ihand)(void *),
+		 void *arg, void **cookiep)
 {
 	int		error;
 
@@ -497,7 +487,7 @@
 		return (error);
 
 	error = intr_add_handler(device_get_nameunit(child),
-	    rman_get_start(irq), ihand, arg, flags, cookiep);
+	    rman_get_start(irq), filter, ihand, arg, flags, cookiep);
 
 	return (error);
 }
@@ -560,9 +550,133 @@
 	resource_list_delete(rl, type, rid);
 }
 
+/* Called from the MSI code to add new IRQs to the IRQ rman. */
+void
+nexus_add_irq(u_long irq)
+{
+
+	if (rman_manage_region(&irq_rman, irq, irq) != 0)
+		panic("%s: failed", __func__);
+}
+
+#ifdef DEV_APIC
+static int
+nexus_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+
+	return (msix_alloc(dev, irq));
+}
+
+static int
+nexus_release_msix(device_t pcib, device_t dev, int irq)
+{
+
+	return (msix_release(irq));
+}
+
+static int
+nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
+{
+
+	return (msi_alloc(dev, count, maxcount, irqs));
+}
+
+static int
+nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+
+	return (msi_release(irqs, count));
+}
+
+static int
+nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data)
+{
+
+	return (msi_map(irq, addr, data));
+}
+#endif
+
+/* Placeholder for system RAM. */
+static void
+ram_identify(driver_t *driver, device_t parent)
+{
+
+	if (resource_disabled("ram", 0))
+		return;	
+	if (BUS_ADD_CHILD(parent, 0, "ram", 0) == NULL)
+		panic("ram_identify");
+}
+
+static int
+ram_probe(device_t dev)
+{
+
+	device_quiet(dev);
+	device_set_desc(dev, "System RAM");
+	return (0);
+}
+
+static int
+ram_attach(device_t dev)
+{
+	struct resource *res;
+	vm_paddr_t *p;
+	int error, i, rid;
+
+	/*
+	 * We use the dump_avail[] array rather than phys_avail[] for
+	 * the memory map as phys_avail[] contains holes for kernel
+	 * memory, page 0, the message buffer, and the dcons buffer.
+	 * We test the end address in the loop instead of the start
+	 * since the start address for the first segment is 0.
+	 *
+	 * XXX: It would be preferable to use the SMAP if it exists
+	 * instead since if the SMAP is very fragmented we may not
+	 * include some memory regions in dump_avail[] and phys_avail[].
+	 */
+	for (i = 0, p = dump_avail; p[1] != 0; i++, p += 2) {
+		rid = i;
+#ifdef PAE
+		/*
+		 * The resource code uses u_long to track ranges, so we
+		 * can't represent memory regions above 4GB here.
+		 */
+		if (p[0] >= ~0ul)
+			break;
+#endif
+		error = bus_set_resource(dev, SYS_RES_MEMORY, rid, p[0],
+		    p[1] - p[0]);
+		if (error)
+			panic("ram_attach: resource %d failed set with %d", i,
+			    error);
+		res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
+		if (res == NULL)
+			panic("ram_attach: resource %d failed to attach", i);
+	}
+	return (0);
+}
+
+static device_method_t ram_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_identify,	ram_identify),
+	DEVMETHOD(device_probe,		ram_probe),
+	DEVMETHOD(device_attach,	ram_attach),
+	{ 0, 0 }
+};
+
+static driver_t ram_driver = {
+	"ram",
+	ram_methods,
+	1,		/* no softc */
+};
+
+static devclass_t ram_devclass;
+
+DRIVER_MODULE(ram, nexus, ram_driver, ram_devclass, 0, 0);
+
 #ifdef DEV_ISA
 /*
- * Placeholder which claims PnP 'devices' which describe system 
+ * Placeholder which claims PnP 'devices' which describe system
  * resources.
  */
 static struct isa_pnp_id sysresource_ids[] = {
@@ -575,7 +689,7 @@
 sysresource_probe(device_t dev)
 {
 	int	result;
-	
+
 	if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, sysresource_ids)) <= 0) {
 		device_quiet(dev);
 	}
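
A hedged sketch of how the new pcib methods above get used: a PCI driver
asks pci(4) for MSI messages, the request bubbles up through the PCI
bridges to the nexus, and the nexus hands it to the MSI code added in
msi.c.  The driver name and error handling below are illustrative only;
pci_alloc_msi() and the rid conventions are the stock interfaces of this
vintage:

	#include <sys/param.h>
	#include <sys/bus.h>
	#include <sys/rman.h>
	#include <machine/resource.h>
	#include <dev/pci/pcivar.h>

	static int
	foo_attach(device_t dev)	/* hypothetical driver */
	{
		struct resource *irq;
		int count, rid;

		count = 1;
		if (pci_alloc_msi(dev, &count) == 0)
			rid = 1;	/* MSI IRQ resources start at rid 1 */
		else
			rid = 0;	/* fall back to the legacy INTx IRQ */
		irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
		    RF_ACTIVE);
		if (irq == NULL)
			return (ENXIO);
		/* ... bus_setup_intr() and the rest of attach ... */
		return (0);
	}
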
Index: genassym.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/genassym.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/genassym.c -L sys/i386/i386/genassym.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/genassym.c
+++ sys/i386/i386/genassym.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.151 2005/04/13 22:57:17 peter Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.160 2007/09/17 21:55:28 peter Exp $");
 
 #include "opt_apic.h"
 #include "opt_compat.h"
@@ -78,12 +78,13 @@
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
-ASSYM(P_SFLAG, offsetof(struct proc, p_sflag));
 
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_MD, offsetof(struct thread, td_md));
+ASSYM(TD_TID, offsetof(struct thread, td_tid));
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
@@ -140,7 +141,6 @@
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
-ASSYM(PCB_SWITCHOUT, offsetof(struct pcb, pcb_switchout));
 
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_VM86CALL, PCB_VM86CALL);
@@ -173,6 +173,7 @@
 ASSYM(ENOENT, ENOENT);
 ASSYM(EFAULT, EFAULT);
 ASSYM(ENAMETOOLONG, ENAMETOOLONG);
+ASSYM(MAXCPU, MAXCPU);
 ASSYM(MAXCOMLEN, MAXCOMLEN);
 ASSYM(MAXPATHLEN, MAXPATHLEN);
 ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo));
@@ -198,6 +199,7 @@
 ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
+ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss));
 
 #ifdef DEV_APIC
 ASSYM(LA_VER, offsetof(struct LAPIC, version));
--- /dev/null
+++ sys/i386/i386/minidump_machdep.c
@@ -0,0 +1,405 @@
+/*-
+ * Copyright (c) 2006 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/i386/i386/minidump_machdep.c,v 1.3.4.1 2008/01/30 21:21:50 ru Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/kernel.h>
+#include <sys/kerneldump.h>
+#include <sys/msgbuf.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/atomic.h>
+#include <machine/elf.h>
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+#include <machine/minidump.h>
+
+CTASSERT(sizeof(struct kerneldumpheader) == 512);
+
+/*
+ * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
+ * is to protect us from metadata and to protect metadata from us.
+ */
+#define	SIZEOF_METADATA		(64*1024)
+
+#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
+#define	DEV_ALIGN(x)	(((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
+
+uint32_t *vm_page_dump;
+int vm_page_dump_size;
+
+static struct kerneldumpheader kdh;
+static off_t dumplo;
+
+/* Handle chunked writes. */
+static size_t fragsz;
+static void *dump_va;
+static uint64_t counter, progress;
+
+CTASSERT(sizeof(*vm_page_dump) == 4);
+
+static int
+is_dumpable(vm_paddr_t pa)
+{
+	int i;
+
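+	/* dump_avail[] is (start, end) pairs, terminated by a pair of zeroes. */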
+	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
+		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
+			return (1);
+	}
+	return (0);
+}
+
+/* XXX should be MI */
+static void
+mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen,
+    uint32_t blksz)
+{
+
+	bzero(kdh, sizeof(*kdh));
+	strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic));
+	strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
+	kdh->version = htod32(KERNELDUMPVERSION);
+	kdh->architectureversion = htod32(archver);
+	kdh->dumplength = htod64(dumplen);
+	kdh->dumptime = htod64(time_second);
+	kdh->blocksize = htod32(blksz);
+	strncpy(kdh->hostname, hostname, sizeof(kdh->hostname));
+	strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
+	if (panicstr != NULL)
+		strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
+	kdh->parity = kerneldump_parity(kdh);
+}
+
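+/* Convert pages to MB, rounding up; 1 MB is 256 (1 << 8) 4 KB pages. */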
+#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)
+
+static int
+blk_flush(struct dumperinfo *di)
+{
+	int error;
+
+	if (fragsz == 0)
+		return (0);
+
+	error = dump_write(di, dump_va, 0, dumplo, fragsz);
+	dumplo += fragsz;
+	fragsz = 0;
+	return (error);
+}
+
+static int
+blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
+{
+	size_t len;
+	int error, i, c;
+
+	error = 0;
+	if ((sz % PAGE_SIZE) != 0) {
+		printf("size not page aligned\n");
+		return (EINVAL);
+	}
+	if (ptr != NULL && pa != 0) {
+		printf("can't have both va and pa!\n");
+		return (EINVAL);
+	}
+	if (pa != 0 && (pa % PAGE_SIZE) != 0) {
+		printf("address not page aligned\n");
+		return (EINVAL);
+	}
+	if (ptr != NULL) {
+		/* If we're doing a virtual dump, flush any pre-existing pa pages */
+		error = blk_flush(di);
+		if (error)
+			return (error);
+	}
+	while (sz) {
+		len = (MAXDUMPPGS * PAGE_SIZE) - fragsz;
+		if (len > sz)
+			len = sz;
+		counter += len;
+		progress -= len;
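+		/* Report progress in MB roughly every 16 MB (2^24 bytes). */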
+		if (counter >> 24) {
+			printf(" %llu",
+			    (unsigned long long)PG2MB(progress >> PAGE_SHIFT));
+			counter &= (1<<24) - 1;
+		}
+		if (ptr) {
+			error = dump_write(di, ptr, 0, dumplo, len);
+			if (error)
+				return (error);
+			dumplo += len;
+			ptr += len;
+			sz -= len;
+		} else {
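+			/*
+			 * pmap_kenter_temporary() maps the page into the
+			 * crashdump map at the given slot and returns the
+			 * map's base, so dump_va covers the whole fragment.
+			 */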
+			for (i = 0; i < len; i += PAGE_SIZE)
+				dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT);
+			fragsz += len;
+			pa += len;
+			sz -= len;
+			if (fragsz == (MAXDUMPPGS * PAGE_SIZE)) {
+				error = blk_flush(di);
+				if (error)
+					return (error);
+			}
+		}
+
+		/* Check for user abort. */
+		c = cncheckc();
+		if (c == 0x03)
+			return (ECANCELED);
+		if (c != -1)
+			printf(" (CTRL-C to abort) ");
+	}
+
+	return (0);
+}
+
+/* A fake page table page, to avoid having to handle both 4K and 2M pages */
+static pt_entry_t fakept[NPTEPG];
+
+void
+minidumpsys(struct dumperinfo *di)
+{
+	uint64_t dumpsize;
+	uint32_t ptesize;
+	vm_offset_t va;
+	int error;
+	uint32_t bits;
+	uint64_t pa;
+	pd_entry_t *pd;
+	pt_entry_t *pt;
+	int i, j, k, bit;
+	struct minidumphdr mdhdr;
+
+	counter = 0;
+	/* Walk page table pages, set bits in vm_page_dump */
+	ptesize = 0;
+	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+		/*
+		 * We always write a page, even if it is zero. Each
+		 * We always write a page, even if it is zero.  Each page
+		 * written covers NBPDR worth of kernel address space.
+		ptesize += PAGE_SIZE;
+		pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE);	/* always mapped! */
+		j = va >> PDRSHIFT;
+		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+			/* This is an entire 2M page. */
+			pa = pd[j] & PG_PS_FRAME;
+			for (k = 0; k < NPTEPG; k++) {
+				if (is_dumpable(pa))
+					dump_add_page(pa);
+				pa += PAGE_SIZE;
+			}
+			continue;
+		}
+		if ((pd[j] & PG_V) == PG_V) {
+			/* set bit for each valid page in this 2MB block */
+			pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0);
+			for (k = 0; k < NPTEPG; k++) {
+				if ((pt[k] & PG_V) == PG_V) {
+					pa = pt[k] & PG_FRAME;
+					if (is_dumpable(pa))
+						dump_add_page(pa);
+				}
+			}
+		} else {
+			/* nothing, we're going to dump a null page */
+		}
+	}
+
+	/* Calculate dump size. */
+	dumpsize = ptesize;
+	dumpsize += round_page(msgbufp->msg_size);
+	dumpsize += round_page(vm_page_dump_size);
+	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+		bits = vm_page_dump[i];
+		while (bits) {
+			bit = bsfl(bits);
+			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+			/* Clear out undumpable pages now if needed */
+			if (is_dumpable(pa)) {
+				dumpsize += PAGE_SIZE;
+			} else {
+				dump_drop_page(pa);
+			}
+			bits &= ~(1ul << bit);
+		}
+	}
+	dumpsize += PAGE_SIZE;
+
+	/* Determine dump offset on device. */
+	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
+		error = ENOSPC;
+		goto fail;
+	}
+	dumplo = di->mediaoffset + di->mediasize - dumpsize;
+	dumplo -= sizeof(kdh) * 2;
+	progress = dumpsize;
+
+	/* Initialize mdhdr */
+	bzero(&mdhdr, sizeof(mdhdr));
+	strcpy(mdhdr.magic, MINIDUMP_MAGIC);
+	mdhdr.version = MINIDUMP_VERSION;
+	mdhdr.msgbufsize = msgbufp->msg_size;
+	mdhdr.bitmapsize = vm_page_dump_size;
+	mdhdr.ptesize = ptesize;
+	mdhdr.kernbase = KERNBASE;
+#ifdef PAE
+	mdhdr.paemode = 1;
+#endif
+
+	mkdumpheader(&kdh, KERNELDUMP_I386_VERSION, dumpsize, di->blocksize);
+
+	printf("Physical memory: %ju MB\n", ptoa((uintmax_t)physmem) / 1048576);
+	printf("Dumping %llu MB:", (unsigned long long)dumpsize >> 20);
+
+	/* Dump leader */
+	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
+	if (error)
+		goto fail;
+	dumplo += sizeof(kdh);
+
+	/* Dump my header */
+	bzero(&fakept, sizeof(fakept));
+	bcopy(&mdhdr, &fakept, sizeof(mdhdr));
+	error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+	if (error)
+		goto fail;
+
+	/* Dump msgbuf up front */
+	error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size));
+	if (error)
+		goto fail;
+
+	/* Dump bitmap */
+	error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size));
+	if (error)
+		goto fail;
+
+	/* Dump kernel page table pages */
+	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+		/* We always write a page, even if it is zero */
+		pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE);	/* always mapped! */
+		j = va >> PDRSHIFT;
+		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+			/* This is a single 2M block. Generate a fake PTP */
+			pa = pd[j] & PG_PS_FRAME;
+			for (k = 0; k < NPTEPG; k++) {
+				fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
+			}
+			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+			/* flush, in case we reuse fakept in the same block */
+			error = blk_flush(di);
+			if (error)
+				goto fail;
+			continue;
+		}
+		if ((pd[j] & PG_V) == PG_V) {
+			pa = pd[j] & PG_FRAME;
+			error = blk_write(di, 0, pa, PAGE_SIZE);
+			if (error)
+				goto fail;
+		} else {
+			bzero(fakept, sizeof(fakept));
+			error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
+			if (error)
+				goto fail;
+			/* flush, in case we reuse fakept in the same block */
+			error = blk_flush(di);
+			if (error)
+				goto fail;
+		}
+	}
+
+	/* Dump memory chunks */
+	/* XXX cluster it up and use blk_dump() */
+	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
+		bits = vm_page_dump[i];
+		while (bits) {
+			bit = bsfl(bits);
+			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
+			error = blk_write(di, 0, pa, PAGE_SIZE);
+			if (error)
+				goto fail;
+			bits &= ~(1ul << bit);
+		}
+	}
+
+	error = blk_flush(di);
+	if (error)
+		goto fail;
+
+	/* Dump trailer */
+	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
+	if (error)
+		goto fail;
+	dumplo += sizeof(kdh);
+
+	/* Signal completion, signoff and exit stage left. */
+	dump_write(di, NULL, 0, 0, 0);
+	printf("\nDump complete\n");
+	return;
+
+ fail:
+	if (error < 0)
+		error = -error;
+
+	if (error == ECANCELED)
+		printf("\nDump aborted\n");
+	else if (error == ENOSPC)
+		printf("\nDump failed. Partition too small.\n");
+	else
+		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
+}
+
+void
+dump_add_page(vm_paddr_t pa)
+{
+	int idx, bit;
+
+	pa >>= PAGE_SHIFT;
+	idx = pa >> 5;		/* 2^5 = 32 */
+	bit = pa & 31;
+	atomic_set_int(&vm_page_dump[idx], 1ul << bit);
+}
+
+void
+dump_drop_page(vm_paddr_t pa)
+{
+	int idx, bit;
+
+	pa >>= PAGE_SHIFT;
+	idx = pa >> 5;		/* 2^5 = 32 */
+	bit = pa & 31;
+	atomic_clear_int(&vm_page_dump[idx], 1ul << bit);
+}
+
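
The bitmap above packs one bit per 4 KB physical page, 32 bits to each
vm_page_dump[] word.  A self-contained userland rendering of the same
index/bit arithmetic used by dump_add_page() and dump_drop_page() (the
sample address is arbitrary):

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12	/* 4 KB pages */

	int
	main(void)
	{
		uint64_t pa = 0x12345000;	/* arbitrary physical address */
		uint64_t page = pa >> PAGE_SHIFT;
		int idx = page >> 5;		/* 2^5 = 32 bits per word */
		int bit = page & 31;

		printf("pa %#jx -> vm_page_dump[%d], bit %d\n",
		    (uintmax_t)pa, idx, bit);
		return (0);
	}
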
Index: locore.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/locore.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/locore.s -L sys/i386/i386/locore.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/locore.s
+++ sys/i386/i386/locore.s
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)locore.s	7.3 (Berkeley) 5/13/91
- * $FreeBSD: src/sys/i386/i386/locore.s,v 1.186 2005/05/16 09:47:53 obrien Exp $
+ * $FreeBSD: src/sys/i386/i386/locore.s,v 1.188 2007/03/24 19:53:22 alc Exp $
  *
  *		originally from: locore.s, by William F. Jolitz
  *
@@ -777,21 +777,6 @@
 	movl	%esi, R(SMPpt)		/* relocated to KVM space */
 #endif	/* SMP */
 
-/* Map page zero read-write so bios32 calls can use it */
-	xorl	%eax, %eax
-	movl	$PG_RW,%edx
-	movl	$1,%ecx
-	fillkptphys(%edx)
-
-/* Map read-only from page 1 to the beginning of the kernel text section */
-	movl	$PAGE_SIZE, %eax
-	xorl	%edx,%edx
-	movl	$R(btext),%ecx
-	addl	$PAGE_MASK,%ecx
-	subl	%eax,%ecx
-	shrl	$PAGE_SHIFT,%ecx
-	fillkptphys(%edx)
-
 /*
  * Enable PSE and PGE.
  */
@@ -815,22 +800,21 @@
 #endif
 
 /*
- * Write page tables for the kernel starting at btext and
- * until the end.  Make sure to map read+write.  We do this even
+ * Initialize page table pages mapping physical address zero through the
+ * end of the kernel.  All of the page table entries allow read and write
+ * access.  Write access to the first physical page is required by bios32
+ * calls, and write access to the first 1 MB of physical memory is required
+ * by ACPI for implementing suspend and resume.  We do this even
  * if we've enabled PSE above, we'll just switch the corresponding kernel
  * PDEs before we turn on paging.
  *
  * XXX: We waste some pages here in the PSE case!  DON'T BLINDLY REMOVE
  * THIS!  SMP needs the page table to be there to map the kernel P==V.
  */
-	movl	$R(btext),%eax
-	addl	$PAGE_MASK, %eax
-	andl	$~PAGE_MASK, %eax
-	movl	$PG_RW,%edx
+	xorl	%eax, %eax
 	movl	R(KERNend),%ecx
-	subl	%eax,%ecx
 	shrl	$PAGE_SHIFT,%ecx
-	fillkptphys(%edx)
+	fillkptphys($PG_RW)
 
 /* Map page directory. */
 #ifdef PAE
@@ -901,17 +885,43 @@
 	fillkpt(R(SMPptpa), $PG_RW)
 #endif	/* SMP */
 
-/* install a pde for temporary double map of bottom of VA */
+/*
+ * Create an identity mapping for low physical memory, including the kernel.
+ * The part of this mapping that covers the first 1 MB of physical memory
+ * becomes a permanent part of the kernel's address space.  The rest of this
+ * mapping is destroyed in pmap_bootstrap().  Ordinarily, the same page table
+ * pages are shared by the identity mapping and the kernel's native mapping.
+ * However, the permanent identity mapping cannot contain PG_G mappings.
+ * Thus, if the kernel is loaded within the permanent identity mapping, that
+ * page table page must be duplicated and not shared.
+ *
+ * N.B. Due to errata concerning large pages and physical address zero,
+ * a PG_PS mapping is not used.
+ */
 	movl	R(KPTphys), %eax
 	xorl	%ebx, %ebx
 	movl	$NKPT, %ecx
 	fillkpt(R(IdlePTD), $PG_RW)
+#if KERNLOAD < (1 << PDRSHIFT)
+	testl	$PG_G, R(pgeflag)
+	jz	1f
+	ALLOCPAGES(1)
+	movl	%esi, %edi
+	movl	R(IdlePTD), %eax
+	movl	(%eax), %esi
+	movl	%edi, (%eax)
+	movl	$PAGE_SIZE, %ecx
+	cld
+	rep
+	movsb
+1:	
+#endif
 
 /*
- * For the non-PSE case, install PDEs for PTs covering the kernel.
+ * For the non-PSE case, install PDEs for PTs covering the KVA.
  * For the PSE case, do the same, but clobber the ones corresponding
- * to the kernel (from btext to KERNend) with 4M ('PS') PDEs immediately
- * after.
+ * to the kernel (from btext to KERNend) with 4M (2M for PAE) ('PS')
+ * PDEs immediately after.
  */
 	movl	R(KPTphys), %eax
 	movl	$KPTDI, %ebx
Index: elan-mmcr.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/elan-mmcr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/elan-mmcr.c -L sys/i386/i386/elan-mmcr.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/elan-mmcr.c
+++ sys/i386/i386/elan-mmcr.c
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/elan-mmcr.c,v 1.31.2.1 2005/08/16 22:47:14 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/elan-mmcr.c,v 1.35 2007/06/04 18:25:06 dwmalone Exp $");
 
 #include "opt_cpu.h"
 #include <sys/param.h>
@@ -313,7 +313,7 @@
 	int error;
 
 	f = elan_timecounter.tc_frequency * 4;
-	error = sysctl_handle_int(oidp, &f, sizeof(f), req);
+	error = sysctl_handle_int(oidp, &f, 0, req);
 	if (error == 0 && req->newptr != NULL) 
 		elan_timecounter.tc_frequency = (f + 3) / 4;
 	return (error);
@@ -367,11 +367,11 @@
 static void
 elan_watchdog(void *foo __unused, u_int spec, int *error)
 {
-	u_int u, v;
+	u_int u, v, w;
 	static u_int cur;
 
 	u = spec & WD_INTERVAL;
-	if (spec && u <= 35) {
+	if (u > 0 && u <= 35) {
 		u = imax(u - 5, 24);
 		v = 2 << (u - 24);
 		v |= 0xc000;
@@ -383,7 +383,7 @@
 		 * for other reasons.  Save and restore the GP echo mode
 		 * around our hardware tom-foolery.
 		 */
-		u = elan_mmcr->GPECHO;
+		w = elan_mmcr->GPECHO;
 		elan_mmcr->GPECHO = 0;
 		if (v != cur) {
 			/* Clear the ENB bit */
@@ -401,19 +401,17 @@
 			elan_mmcr->WDTMRCTL = 0xaaaa;
 			elan_mmcr->WDTMRCTL = 0x5555;
 		}
-		elan_mmcr->GPECHO = u;
+		elan_mmcr->GPECHO = w;
 		*error = 0;
-		return;
 	} else {
-		u = elan_mmcr->GPECHO;
+		w = elan_mmcr->GPECHO;
 		elan_mmcr->GPECHO = 0;
 		elan_mmcr->WDTMRCTL = 0x3333;
 		elan_mmcr->WDTMRCTL = 0xcccc;
 		elan_mmcr->WDTMRCTL = 0x4080;
-		elan_mmcr->WDTMRCTL = u;
-		elan_mmcr->GPECHO = u;
+		elan_mmcr->WDTMRCTL = w;		/* XXX What does this statement do? */
+		elan_mmcr->GPECHO = w;
 		cur = 0;
-		return;
 	}
 }
 
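
The watchdog interval math above, rendered as a hedged, self-contained
snippet.  It assumes, per watchdog(9), that the WD_INTERVAL bits carry
log2 of the timeout in nanoseconds (so 30 is roughly one second); the
mask value and the 0xc000 control bits are taken on faith from the
driver code above:

	#include <stdio.h>

	#define WD_INTERVAL	0xff	/* assumed mask for the log2(ns) bits */

	static int
	imax(int a, int b)
	{
		return (a > b ? a : b);
	}

	int
	main(void)
	{
		unsigned int spec = 30;	/* request roughly one second */
		unsigned int u, v;

		u = spec & WD_INTERVAL;
		if (u > 0 && u <= 35) {
			u = imax(u - 5, 24);	/* clamp into chip range */
			v = 2 << (u - 24);	/* spec 30 -> 4, 35 -> 128 */
			v |= 0xc000;		/* driver's control bits */
			printf("WDTMRCTL <- %#x\n", v);
		}
		return (0);
	}
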
--- /dev/null
+++ sys/i386/i386/bpf_jit_machdep.h
@@ -0,0 +1,404 @@
+/*-
+ * Copyright (c) 2002 - 2003 NetGroup, Politecnico di Torino (Italy)
+ * Copyright (c) 2005 Jung-uk Kim <jkim at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Politecnico di Torino nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/i386/i386/bpf_jit_machdep.h,v 1.3 2005/12/06 20:11:07 jkim Exp $
+ */
+
+#ifndef _BPF_JIT_MACHDEP_H_
+#define _BPF_JIT_MACHDEP_H_
+
+/*
+ * Registers
+ */
+#define EAX	0
+#define ECX	1
+#define EDX	2
+#define EBX	3
+#define ESP	4
+#define EBP	5
+#define ESI	6
+#define EDI	7
+
+#define AX	0
+#define CX	1
+#define DX	2
+#define BX	3
+#define SP	4
+#define BP	5
+#define SI	6
+#define DI	7
+
+#define AL	0
+#define CL	1
+#define DL	2
+#define BL	3
+
+/* A stream of native binary code.*/
+typedef struct bpf_bin_stream {
+	/* Current native instruction pointer. */
+	int		cur_ip;
+
+	/*
+	 * Current BPF instruction pointer, i.e. position in
+	 * the BPF program reached by the jitter.
+	 */
+	int		bpf_pc;
+
+	/* Instruction buffer, contains the generated native code. */
+	char		*ibuf;
+
+	/* Jumps reference table. */
+	u_int		*refs;
+} bpf_bin_stream;
+
+/*
+ * Prototype of the emit functions.
+ *
+ * Different emit functions are used to create the reference table and
+ * to generate the actual filtering code.  This keeps the individual
+ * instruction macros simpler.
+ * The first parameter is the stream that will receive the data.
+ * The second one is a variable containing the data.
+ * The third one is the length, which can be 1, 2, or 4 since it is
+ * possible to emit a byte, a short, or a word at a time.
+ */
+typedef void (*emit_func)(bpf_bin_stream *stream, u_int value, u_int n);
+
+/*
+ * Native instruction macros.
+ */
+
+/* mov r32,i32 */
+#define MOVid(r32, i32) do {						\
+	emitm(&stream, (11 << 4) | (1 << 3) | (r32 & 0x7), 1);		\
+	emitm(&stream, i32, 4);						\
+} while (0)
+
+/* mov dr32,sr32 */
+#define MOVrd(dr32, sr32) do {						\
+	emitm(&stream, (8 << 4) | 3 | (1 << 3), 1);			\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* mov dr32,sr32[off] */
+#define MOVodd(dr32, sr32, off) do {					\
+	emitm(&stream, (8 << 4) | 3 | (1 << 3), 1);			\
+	emitm(&stream,							\
+	    (1 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+	emitm(&stream, off, 1);						\
+} while (0)
+
+/* mov dr32,sr32[or32] */
+#define MOVobd(dr32, sr32, or32) do {					\
+	emitm(&stream, (8 << 4) | 3 | (1 << 3), 1);			\
+	emitm(&stream, ((dr32 & 0x7) << 3) | 4, 1);			\
+	emitm(&stream, ((or32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* mov dr16,sr32[or32] */
+#define MOVobw(dr32, sr32, or32) do {					\
+	emitm(&stream, 0x66, 1);					\
+	emitm(&stream, (8 << 4) | 3 | (1 << 3), 1);			\
+	emitm(&stream, ((dr32 & 0x7) << 3) | 4, 1);			\
+	emitm(&stream, ((or32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* mov dr8,sr32[or32] */
+#define MOVobb(dr8, sr32, or32) do {					\
+	emitm(&stream, 0x8a, 1);					\
+	emitm(&stream, ((dr8 & 0x7) << 3) | 4, 1);			\
+	emitm(&stream, ((or32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* mov [dr32][or32],sr32 */
+#define MOVomd(dr32, or32, sr32) do {					\
+	emitm(&stream, 0x89, 1);					\
+	emitm(&stream, ((sr32 & 0x7) << 3) | 4, 1);			\
+	emitm(&stream, ((or32 & 0x7) << 3) | (dr32 & 0x7), 1);		\
+} while (0)
+
+/* bswap dr32 */
+#define BSWAP(dr32) do {						\
+	emitm(&stream, 0xf, 1);						\
+	emitm(&stream, (0x19 << 3) | dr32, 1);				\
+} while (0)
+
+/* xchg al,ah */
+#define SWAP_AX() do {							\
+	emitm(&stream, 0x86, 1);					\
+	emitm(&stream, 0xc4, 1);					\
+} while (0)
+
+/* push r32 */
+#define PUSH(r32) do {							\
+	emitm(&stream, (5 << 4) | (0 << 3) | (r32 & 0x7), 1);		\
+} while (0)
+
+/* pop r32 */
+#define POP(r32) do {							\
+	emitm(&stream, (5 << 4) | (1 << 3) | (r32 & 0x7), 1);		\
+} while (0)
+
+/* leave/ret */
+#define LEAVE_RET() do {						\
+	emitm(&stream, 0xc9, 1);					\
+	emitm(&stream, 0xc3, 1);					\
+} while (0)
+
+/* add dr32,sr32 */
+#define ADDrd(dr32, sr32) do {						\
+	emitm(&stream, 0x03, 1);					\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);	\
+} while (0)
+
+/* add eax,i32 */
+#define ADD_EAXi(i32) do {						\
+	emitm(&stream, 0x05, 1);					\
+	emitm(&stream, i32, 4);						\
+} while (0)
+
+/* add r32,i32 */
+#define ADDid(r32, i32) do {						\
+	emitm(&stream, 0x81, 1);					\
+	emitm(&stream, (24 << 3) | r32, 1);				\
+	emitm(&stream, i32, 4);						\
+} while (0)
+
+/* add r32,i8 */
+#define ADDib(r32, i8) do {						\
+	emitm(&stream, 0x83, 1);					\
+	emitm(&stream, (24 << 3) | r32, 1);				\
+	emitm(&stream, i8, 1);						\
+} while (0)
+
+/* sub dr32,sr32 */
+#define SUBrd(dr32, sr32) do {						\
+	emitm(&stream, 0x2b, 1);					\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* sub eax,i32 */
+#define SUB_EAXi(i32) do {						\
+	emitm(&stream, 0x2d, 1);					\
+	emitm(&stream, i32, 4);						\
+} while (0)
+
+/* mul r32 */
+#define MULrd(r32) do {							\
+	emitm(&stream, 0xf7, 1);					\
+	emitm(&stream, (7 << 5) | (r32 & 0x7), 1);			\
+} while (0)
+
+/* div r32 */
+#define DIVrd(r32) do {							\
+	emitm(&stream, 0xf7, 1);					\
+	emitm(&stream, (15 << 4) | (r32 & 0x7), 1);			\
+} while (0)
+
+/* and r8,i8 */
+#define ANDib(r8, i8) do {						\
+	emitm(&stream, 0x80, 1);					\
+	emitm(&stream, (7 << 5) | r8, 1);				\
+	emitm(&stream, i8, 1);						\
+} while (0)
+
+/* and r32,i32 */
+#define ANDid(r32, i32) do {						\
+	if (r32 == EAX) {						\
+		emitm(&stream, 0x25, 1);				\
+		emitm(&stream, i32, 4);					\
+	} else {							\
+		emitm(&stream, 0x81, 1);				\
+		emitm(&stream, (7 << 5) | r32, 1);			\
+		emitm(&stream, i32, 4);					\
+	}								\
+} while (0)
+
+/* and dr32,sr32 */
+#define ANDrd(dr32, sr32) do {						\
+	emitm(&stream, 0x23, 1);					\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* or dr32,sr32 */
+#define ORrd(dr32, sr32) do {						\
+	emitm(&stream, 0x0b, 1);					\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* or r32,i32 */
+#define ORid(r32, i32) do {						\
+	if (r32 == EAX) {						\
+		emitm(&stream, 0x0d, 1);				\
+		emitm(&stream, i32, 4);					\
+	} else {							\
+		emitm(&stream, 0x81, 1);				\
+		emitm(&stream, (25 << 3) | r32, 1);			\
+		emitm(&stream, i32, 4);					\
+	}								\
+} while (0)
+
+/* shl r32,i8 */
+#define SHLib(r32, i8) do {						\
+	emitm(&stream, 0xc1, 1);					\
+	emitm(&stream, (7 << 5) | (r32 & 0x7), 1);			\
+	emitm(&stream, i8, 1);						\
+} while (0)
+
+/* shl dr32,cl */
+#define SHL_CLrb(dr32) do {						\
+	emitm(&stream, 0xd3, 1);					\
+	emitm(&stream, (7 << 5) | (dr32 & 0x7), 1);			\
+} while (0)
+
+/* shr r32,i8 */
+#define SHRib(r32, i8) do {						\
+	emitm(&stream, 0xc1, 1);					\
+	emitm(&stream, (29 << 3) | (r32 & 0x7), 1);			\
+	emitm(&stream, i8, 1);						\
+} while (0)
+
+/* shr dr32,cl */
+#define SHR_CLrb(dr32) do {						\
+	emitm(&stream, 0xd3, 1);					\
+	emitm(&stream, (29 << 3) | (dr32 & 0x7), 1);			\
+} while (0)
+
+/* neg r32 */
+#define NEGd(r32) do {							\
+	emitm(&stream, 0xf7, 1);					\
+	emitm(&stream, (27 << 3) | (r32 & 0x7), 1);			\
+} while (0)
+
+/* cmp dr32,sr32[off] */
+#define CMPodd(dr32, sr32, off) do {					\
+	emitm(&stream, (3 << 4) | 3 | (1 << 3), 1);			\
+	emitm(&stream,							\
+	    (1 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+	emitm(&stream, off, 1);						\
+} while (0)
+
+/* cmp dr32,sr32 */
+#define CMPrd(dr32, sr32) do {						\
+	emitm(&stream, 0x3b, 1);					\
+	emitm(&stream,							\
+	    (3 << 6) | ((dr32 & 0x7) << 3) | (sr32 & 0x7), 1);		\
+} while (0)
+
+/* cmp dr32,i32 */
+#define CMPid(dr32, i32) do {						\
+	if (dr32 == EAX){						\
+		emitm(&stream, 0x3d, 1);				\
+		emitm(&stream, i32, 4);					\
+	} else {							\
+		emitm(&stream, 0x81, 1);				\
+		emitm(&stream, (0x1f << 3) | (dr32 & 0x7), 1);		\
+		emitm(&stream, i32, 4);					\
+	}								\
+} while (0)
+
+/* jne off8 */
+#define JNEb(off8) do {							\
+	emitm(&stream, 0x75, 1);					\
+	emitm(&stream, off8, 1);					\
+} while (0)
+
+/* je off32 */
+#define JE(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x84, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jle off32 */
+#define JLE(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x8e, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jle off8 */
+#define JLEb(off8) do {							\
+	emitm(&stream, 0x7e, 1);					\
+	emitm(&stream, off8, 1);					\
+} while (0)
+
+/* ja off32 */
+#define JA(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x87, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jae off32 */
+#define JAE(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x83, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jg off32 */
+#define JG(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x8f, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jge off32 */
+#define JGE(off32) do {							\
+	emitm(&stream, 0x0f, 1);					\
+	emitm(&stream, 0x8d, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* jmp off32 */
+#define JMP(off32) do {							\
+	emitm(&stream, 0xe9, 1);					\
+	emitm(&stream, off32, 4);					\
+} while (0)
+
+/* xor eax,eax */
+#define ZERO_EAX() do {							\
+	emitm(&stream, 0x31, 1);					\
+	emitm(&stream, 0xc0, 1);					\
+} while (0)
+
+/* xor edx,edx */
+#define ZERO_EDX() do {							\
+	emitm(&stream, 0x31, 1);					\
+	emitm(&stream, 0xd2, 1);					\
+} while (0)
+
+#endif	/* _BPF_JIT_MACHDEP_H_ */
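
The emit_func comment above implies a two-pass jitter: one pass that only
measures, so the jump reference table and the output buffer size can be
computed, and one that actually emits bytes.  A hedged sketch of the pair
(the function names are illustrative; the real implementations live in
bpf_jit_machdep.c):

	/* Pass 1: count bytes only; nothing is written. */
	static void
	emit_length(bpf_bin_stream *stream, u_int value, u_int n)
	{
		(void)value;
		stream->cur_ip += n;
	}

	/* Pass 2: store 'n' bytes of 'value' little-endian into ibuf. */
	static void
	emit_code(bpf_bin_stream *stream, u_int value, u_int n)
	{
		u_int i;

		for (i = 0; i < n; i++)
			stream->ibuf[stream->cur_ip++] =
			    (value >> (i * 8)) & 0xff;
	}
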
--- /dev/null
+++ sys/i386/i386/msi.c
@@ -0,0 +1,506 @@
+/*-
+ * Copyright (c) 2006 Yahoo!, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb at FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Support for PCI Message Signalled Interrupts (MSI).  MSI interrupts on
+ * x86 are basically APIC messages that the northbridge delivers directly
+ * to the local APICs as if they had come from an I/O APIC.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/i386/i386/msi.c,v 1.6.2.1 2007/10/30 18:00:56 jhb Exp $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <machine/apicreg.h>
+#include <machine/md_var.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <dev/pci/pcivar.h>
+
+/* Fields in address for Intel MSI messages. */
+#define	MSI_INTEL_ADDR_DEST		0x000ff000
+#define	MSI_INTEL_ADDR_RH		0x00000008
+# define MSI_INTEL_ADDR_RH_ON		0x00000008
+# define MSI_INTEL_ADDR_RH_OFF		0x00000000
+#define	MSI_INTEL_ADDR_DM		0x00000004
+# define MSI_INTEL_ADDR_DM_PHYSICAL	0x00000000
+# define MSI_INTEL_ADDR_DM_LOGICAL	0x00000004
+
+/* Fields in data for Intel MSI messages. */
+#define	MSI_INTEL_DATA_TRGRMOD		IOART_TRGRMOD	/* Trigger mode. */
+# define MSI_INTEL_DATA_TRGREDG		IOART_TRGREDG
+# define MSI_INTEL_DATA_TRGRLVL		IOART_TRGRLVL
+#define	MSI_INTEL_DATA_LEVEL		0x00004000	/* Polarity. */
+# define MSI_INTEL_DATA_DEASSERT	0x00000000
+# define MSI_INTEL_DATA_ASSERT		0x00004000
+#define	MSI_INTEL_DATA_DELMOD		IOART_DELMOD	/* Delivery mode. */
+# define MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
+# define MSI_INTEL_DATA_DELLOPRI	IOART_DELLOPRI
+# define MSI_INTEL_DATA_DELSMI		IOART_DELSMI
+# define MSI_INTEL_DATA_DELNMI		IOART_DELNMI
+# define MSI_INTEL_DATA_DELINIT		IOART_DELINIT
+# define MSI_INTEL_DATA_DELEXINT	IOART_DELEXINT
+#define	MSI_INTEL_DATA_INTVEC		IOART_INTVEC	/* Interrupt vector. */
+
+/*
+ * Build Intel MSI message and data values from a source.  AMD64 systems
+ * seem to be compatible, so we use the same function for both.
+ */
+#define	INTEL_ADDR(msi)							\
+	(MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 |			\
+	    MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL)
+#define	INTEL_DATA(msi)							\
+	(MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector)
+
+static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI");
+
+/*
+ * MSI sources are bunched into groups.  This is because MSI forces
+ * all of the messages to share the address and data registers and
+ * thus certain properties (such as the local APIC ID target on x86).
+ * Each group has a 'first' source that contains information global to
+ * the group.  These fields are marked with (g) below.
+ *
+ * Note that local APIC ID is kind of special.  Each message will be
+ * assigned an ID by the system; however, a group will use the ID from
+ * the first message.
+ *
+ * For MSI-X, each message is isolated.
+ */
+struct msi_intsrc {
+	struct intsrc msi_intsrc;
+	device_t msi_dev;		/* Owning device. (g) */
+	struct msi_intsrc *msi_first;	/* First source in group. */
+	u_int msi_irq;			/* IRQ cookie. */
+	u_int msi_msix;			/* MSI-X message. */
+	u_int msi_vector:8;		/* IDT vector. */
+	u_int msi_cpu:8;		/* Local APIC ID. (g) */
+	u_int msi_count:8;		/* Messages in this group. (g) */
+};
+
+static void	msi_create_source(void);
+static void	msi_enable_source(struct intsrc *isrc);
+static void	msi_disable_source(struct intsrc *isrc, int eoi);
+static void	msi_eoi_source(struct intsrc *isrc);
+static void	msi_enable_intr(struct intsrc *isrc);
+static void	msi_disable_intr(struct intsrc *isrc);
+static int	msi_vector(struct intsrc *isrc);
+static int	msi_source_pending(struct intsrc *isrc);
+static int	msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
+		    enum intr_polarity pol);
+static void	msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
+
+struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source,
+		       msi_enable_intr, msi_disable_intr, msi_vector,
+		       msi_source_pending, NULL, NULL, msi_config_intr,
+		       msi_assign_cpu };
+
+static int msi_enabled;
+static int msi_last_irq;
+static struct mtx msi_lock;
+
+static void
+msi_enable_source(struct intsrc *isrc)
+{
+}
+
+static void
+msi_disable_source(struct intsrc *isrc, int eoi)
+{
+
+	if (eoi == PIC_EOI)
+		lapic_eoi();
+}
+
+static void
+msi_eoi_source(struct intsrc *isrc)
+{
+
+	lapic_eoi();
+}
+
+static void
+msi_enable_intr(struct intsrc *isrc)
+{
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
+
+	apic_enable_vector(msi->msi_vector);
+}
+
+static void
+msi_disable_intr(struct intsrc *isrc)
+{
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
+
+	apic_disable_vector(msi->msi_vector);
+}
+
+static int
+msi_vector(struct intsrc *isrc)
+{
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
+
+	return (msi->msi_irq);
+}
+
+static int
+msi_source_pending(struct intsrc *isrc)
+{
+
+	return (0);
+}
+
+static int
+msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
+    enum intr_polarity pol)
+{
+
+	return (ENODEV);
+}
+
+static void
+msi_assign_cpu(struct intsrc *isrc, u_int apic_id)
+{
+	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
+
+	msi->msi_cpu = apic_id;
+	if (bootverbose)
+		printf("msi: Assigning %s IRQ %d to local APIC %u\n",
+		    msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq,
+		    msi->msi_cpu);	
+	pci_remap_msi_irq(msi->msi_dev, msi->msi_irq);
+}
+
+void
+msi_init(void)
+{
+
+	/* Check if we have a supported CPU. */
+	if (!(strcmp(cpu_vendor, "GenuineIntel") == 0 ||
+	      strcmp(cpu_vendor, "AuthenticAMD") == 0))
+		return;
+
+	msi_enabled = 1;
+	intr_register_pic(&msi_pic);
+	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
+}
+
+void
+msi_create_source(void)
+{
+	struct msi_intsrc *msi;
+	u_int irq;
+
+	mtx_lock(&msi_lock);
+	if (msi_last_irq >= NUM_MSI_INTS) {
+		mtx_unlock(&msi_lock);
+		return;
+	}
+	irq = msi_last_irq + FIRST_MSI_INT;
+	msi_last_irq++;
+	mtx_unlock(&msi_lock);
+
+	msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO);	
+	msi->msi_intsrc.is_pic = &msi_pic;
+	msi->msi_irq = irq;
+	intr_register_source(&msi->msi_intsrc);
+	nexus_add_irq(irq);
+}
+
+/*
+ * Try to allocate 'count' interrupt sources with contiguous IDT values.  If
+ * we have to create any new sources, their IRQ values will be at the end of
+ * the irqs[] array.
+ */
+int
+msi_alloc(device_t dev, int count, int maxcount, int *irqs)
+{
+	struct msi_intsrc *msi, *fsrc;
+	int cnt, i, vector;
+
+	if (!msi_enabled)
+		return (ENXIO);
+
+again:
+	mtx_lock(&msi_lock);
+
+	/* Try to find 'count' free IRQs. */
+	cnt = 0;
+	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(i);
+
+		/* End of allocated sources, so break. */
+		if (msi == NULL)
+			break;
+
+		/* If this is a free one, save its IRQ in the array. */
+		if (msi->msi_dev == NULL) {
+			irqs[cnt] = i;
+			cnt++;
+			if (cnt == count)
+				break;
+		}
+	}
+
+	/* Do we need to create some new sources? */
+	if (cnt < count) {
+		/* If we would exceed the max, give up. */
+		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
+			mtx_unlock(&msi_lock);
+			return (ENXIO);
+		}
+		mtx_unlock(&msi_lock);
+
+		/* We need count - cnt more sources. */
+		while (cnt < count) {
+			msi_create_source();
+			cnt++;
+		}
+		goto again;
+	}
+
+	/* Ok, we now have the IRQs allocated. */
+	KASSERT(cnt == count, ("count mismatch"));
+
+	/* Allocate 'count' IDT vectors. */
+	vector = apic_alloc_vectors(irqs, count, maxcount);
+	if (vector == 0) {
+		mtx_unlock(&msi_lock);
+		return (ENOSPC);
+	}
+
+	/* Assign IDT vectors and make these messages owned by 'dev'. */
+	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
+	for (i = 0; i < count; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
+		msi->msi_dev = dev;
+		msi->msi_vector = vector + i;
+		if (bootverbose)
+			printf("msi: routing MSI IRQ %d to vector %u\n",
+			    msi->msi_irq, msi->msi_vector);
+		msi->msi_first = fsrc;
+		KASSERT(msi->msi_intsrc.is_handlers == 0,
+		    ("dead MSI has handlers"));
+	}
+	fsrc->msi_count = count;
+	mtx_unlock(&msi_lock);
+
+	return (0);
+}
+
+int
+msi_release(int *irqs, int count)
+{
+	struct msi_intsrc *msi, *first;
+	int i;
+
+	mtx_lock(&msi_lock);
+	first = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
+	if (first == NULL) {
+		mtx_unlock(&msi_lock);
+		return (ENOENT);
+	}
+
+	/* Make sure this isn't an MSI-X message. */
+	if (first->msi_msix) {
+		mtx_unlock(&msi_lock);
+		return (EINVAL);
+	}
+
+	/* Make sure this message is allocated to a group. */
+	if (first->msi_first == NULL) {
+		mtx_unlock(&msi_lock);
+		return (ENXIO);
+	}
+
+	/*
+	 * Make sure this is the start of a group and that we are releasing
+	 * the entire group.
+	 */
+	if (first->msi_first != first || first->msi_count != count) {
+		mtx_unlock(&msi_lock);
+		return (EINVAL);
+	}
+	KASSERT(first->msi_dev != NULL, ("unowned group"));
+
+	/* Clear all the extra messages in the group. */
+	for (i = 1; i < count; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
+		KASSERT(msi->msi_first == first, ("message not in group"));
+		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
+		msi->msi_first = NULL;
+		msi->msi_dev = NULL;
+		apic_free_vector(msi->msi_vector, msi->msi_irq);
+		msi->msi_vector = 0;
+	}
+
+	/* Clear out the first message. */
+	first->msi_first = NULL;
+	first->msi_dev = NULL;
+	apic_free_vector(first->msi_vector, first->msi_irq);
+	first->msi_vector = 0;
+	first->msi_count = 0;
+
+	mtx_unlock(&msi_lock);
+	return (0);
+}
+
+int
+msi_map(int irq, uint64_t *addr, uint32_t *data)
+{
+	struct msi_intsrc *msi;
+
+	mtx_lock(&msi_lock);
+	msi = (struct msi_intsrc *)intr_lookup_source(irq);
+	if (msi == NULL) {
+		mtx_unlock(&msi_lock);
+		return (ENOENT);
+	}
+
+	/* Make sure this message is allocated to a device. */
+	if (msi->msi_dev == NULL) {
+		mtx_unlock(&msi_lock);
+		return (ENXIO);
+	}
+
+	/*
+	 * If this message isn't an MSI-X message, make sure it's part
+	 * of a group, and switch to the first message in the
+	 * group.
+	 */
+	if (!msi->msi_msix) {
+		if (msi->msi_first == NULL) {
+			mtx_unlock(&msi_lock);
+			return (ENXIO);
+		}
+		msi = msi->msi_first;
+	}
+
+	*addr = INTEL_ADDR(msi);
+	*data = INTEL_DATA(msi);
+	mtx_unlock(&msi_lock);
+	return (0);
+}
+
+int
+msix_alloc(device_t dev, int *irq)
+{
+	struct msi_intsrc *msi;
+	int i, vector;
+
+	if (!msi_enabled)
+		return (ENXIO);
+
+again:
+	mtx_lock(&msi_lock);
+
+	/* Find a free IRQ. */
+	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(i);
+
+		/* End of allocated sources, so break. */
+		if (msi == NULL)
+			break;
+
+		/* Stop at the first free source. */
+		if (msi->msi_dev == NULL)
+			break;
+	}
+
+	/* Do we need to create a new source? */
+	if (msi == NULL) {
+		/* If we would exceed the max, give up. */
+		if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) {
+			mtx_unlock(&msi_lock);
+			return (ENXIO);
+		}
+		mtx_unlock(&msi_lock);
+
+		/* Create a new source. */
+		msi_create_source();
+		goto again;
+	}
+
+	/* Allocate an IDT vector. */
+	vector = apic_alloc_vector(i);
+	if (bootverbose)
+		printf("msi: routing MSI-X IRQ %d to vector %u\n", msi->msi_irq,
+		    vector);
+
+	/* Setup source. */
+	msi->msi_dev = dev;
+	msi->msi_vector = vector;
+	msi->msi_msix = 1;
+
+	KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers"));
+	mtx_unlock(&msi_lock);
+
+	*irq = i;
+	return (0);
+}
+
+int
+msix_release(int irq)
+{
+	struct msi_intsrc *msi;
+
+	mtx_lock(&msi_lock);
+	msi = (struct msi_intsrc *)intr_lookup_source(irq);
+	if (msi == NULL) {
+		mtx_unlock(&msi_lock);
+		return (ENOENT);
+	}
+
+	/* Make sure this is an MSI-X message. */
+	if (!msi->msi_msix) {
+		mtx_unlock(&msi_lock);
+		return (EINVAL);
+	}
+
+	KASSERT(msi->msi_dev != NULL, ("unowned message"));
+
+	/* Clear out the message. */
+	msi->msi_dev = NULL;
+	apic_free_vector(msi->msi_vector, msi->msi_irq);
+	msi->msi_vector = 0;
+	msi->msi_msix = 0;
+
+	mtx_unlock(&msi_lock);
+	return (0);
+}
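
Worked through concretely, the INTEL_ADDR()/INTEL_DATA() macros above
compose the message as follows.  This userland snippet assumes the usual
MSI_INTEL_ADDR_BASE of 0xfee00000 from apicreg.h; the APIC ID and vector
are arbitrary examples:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint32_t cpu = 1;	/* target local APIC ID */
		uint32_t vector = 0x52;	/* allocated IDT vector */

		/* RH off and physical destination mode are zero bits. */
		uint64_t addr = 0xfee00000u | (cpu << 12);
		/* Edge trigger and fixed delivery are zero bits too. */
		uint32_t data = vector;

		printf("MSI addr %#jx data %#x\n", (uintmax_t)addr, data);
		return (0);
	}
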
Index: dump_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/dump_machdep.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/dump_machdep.c -L sys/i386/i386/dump_machdep.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/dump_machdep.c
+++ sys/i386/i386/dump_machdep.c
@@ -25,12 +25,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/dump_machdep.c,v 1.11 2005/07/02 19:57:31 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/dump_machdep.c,v 1.12.4.1 2008/01/30 21:21:50 ru Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/cons.h>
+#include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <vm/vm.h>
@@ -40,6 +41,11 @@
 
 CTASSERT(sizeof(struct kerneldumpheader) == 512);
 
+int do_minidump = 1;
+TUNABLE_INT("debug.minidump", &do_minidump);
+SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RW, &do_minidump, 0,
+    "Enable mini crash dumps");
+
 /*
  * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
  * is to protect us from metadata and to protect metadata from us.
@@ -134,7 +140,7 @@
 		ptr += len;
 		sz -= len;
 		if (fragsz == DEV_BSIZE) {
-			error = di->dumper(di->priv, buffer, 0, dumplo,
+			error = dump_write(di, buffer, 0, dumplo,
 			    DEV_BSIZE);
 			if (error)
 				return error;
@@ -154,7 +160,7 @@
 	if (fragsz == 0)
 		return (0);
 
-	error = di->dumper(di->priv, buffer, 0, dumplo, DEV_BSIZE);
+	error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE);
 	dumplo += DEV_BSIZE;
 	fragsz = 0;
 	return (error);
@@ -195,7 +201,7 @@
 			a = pa + i * PAGE_SIZE;
 			va = pmap_kenter_temporary(trunc_page(a), i);
 		}
-		error = di->dumper(di->priv, va, 0, dumplo, sz);
+		error = dump_write(di, va, 0, dumplo, sz);
 		if (error)
 			break;
 		dumplo += sz;
@@ -272,6 +278,10 @@
 	size_t hdrsz;
 	int error;
 
+	if (do_minidump) {
+		minidumpsys(di);
+		return;
+	}
 	bzero(&ehdr, sizeof(ehdr));
 	ehdr.e_ident[EI_MAG0] = ELFMAG0;
 	ehdr.e_ident[EI_MAG1] = ELFMAG1;
@@ -317,7 +327,7 @@
 	    ehdr.e_phnum);
 
 	/* Dump leader */
-	error = di->dumper(di->priv, &kdh, 0, dumplo, sizeof(kdh));
+	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
 	if (error)
 		goto fail;
 	dumplo += sizeof(kdh);
@@ -348,12 +358,12 @@
 		goto fail;
 
 	/* Dump trailer */
-	error = di->dumper(di->priv, &kdh, 0, dumplo, sizeof(kdh));
+	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
 	if (error)
 		goto fail;
 
 	/* Signal completion, signoff and exit stage left. */
-	di->dumper(di->priv, NULL, 0, 0, 0);
+	dump_write(di, NULL, 0, 0, 0);
 	printf("\nDump complete\n");
 	return;
 
Index: ptrace_machdep.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/ptrace_machdep.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/i386/i386/ptrace_machdep.c -L sys/i386/i386/ptrace_machdep.c -u -r1.3 -r1.4
--- sys/i386/i386/ptrace_machdep.c
+++ sys/i386/i386/ptrace_machdep.c
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/ptrace_machdep.c,v 1.3.2.1 2005/08/11 14:28:42 tobez Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/ptrace_machdep.c,v 1.6 2006/05/30 23:44:21 davidxu Exp $");
 
 #include "opt_cpu.h"
 
Index: exception.s
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/exception.s,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/exception.s -L sys/i386/i386/exception.s -u -r1.1.1.1 -r1.2
--- sys/i386/i386/exception.s
+++ sys/i386/i386/exception.s
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/i386/i386/exception.s,v 1.113.2.1 2005/07/28 03:30:53 jkoshy Exp $
+ * $FreeBSD: src/sys/i386/i386/exception.s,v 1.117 2006/12/17 05:07:00 kmacy Exp $
  */
 
 #include "opt_apic.h"
@@ -74,6 +74,8 @@
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
+#define	TRAP(a)		pushl $(a) ; jmp alltraps
+
 IDTVEC(div)
 	pushl $0; TRAP(T_DIVIDE)
 IDTVEC(dbg)
@@ -116,8 +118,9 @@
 	/*
 	 * alltraps entry point.  Interrupts are enabled if this was a trap
 	 * gate (TGT), else disabled if this was an interrupt gate (IGT).
-	 * Note that int0x80_syscall is a trap gate.  Only page faults
-	 * use an interrupt gate.
+	 * Note that int0x80_syscall is a trap gate.   Interrupt gates are
+	 * used by page faults, non-maskable interrupts, debug and breakpoint
+	 * exceptions.
 	 */
 
 	SUPERALIGN_TEXT
@@ -129,15 +132,13 @@
 	pushl	%es
 	pushl	%fs
 alltraps_with_regs_pushed:
-	movl	$KDSEL,%eax
-	movl	%eax,%ds
-	movl	%eax,%es
-	movl	$KPSEL,%eax
-	movl	%eax,%fs
+	SET_KERNEL_SREGS
 	FAKE_MCOUNT(TF_EIP(%esp))
 calltrap:
+	pushl	%esp
 	call	trap
-
+	add	$4, %esp
+	
 	/*
 	 * Return via doreti to handle ASTs.
 	 */
@@ -166,13 +167,11 @@
 	pushl	%ds
 	pushl	%es
 	pushl	%fs
-	movl	$KDSEL,%eax		/* switch to kernel segments */
-	movl	%eax,%ds
-	movl	%eax,%es
-	movl	$KPSEL,%eax
-	movl	%eax,%fs
+	SET_KERNEL_SREGS
 	FAKE_MCOUNT(TF_EIP(%esp))
+	pushl	%esp
 	call	syscall
+	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
@@ -191,13 +190,11 @@
 	pushl	%ds
 	pushl	%es
 	pushl	%fs
-	movl	$KDSEL,%eax		/* switch to kernel segments */
-	movl	%eax,%ds
-	movl	%eax,%es
-	movl	$KPSEL,%eax
-	movl	%eax,%fs
+	SET_KERNEL_SREGS
 	FAKE_MCOUNT(TF_EIP(%esp))
+	pushl	%esp
 	call	syscall
+	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
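The functional change in this file is the pushl %esp / add $4, %esp pair
around each call: instead of the handler receiving a copy of the saved
register frame on the stack, it now receives a pointer to the live frame.
(The repeated segment-register loads are also folded into the new
SET_KERNEL_SREGS macro.)  On the C side, the matching prototype change is
roughly:

/* Before: the trapframe was passed by value, so trap() saw a copy. */
void	trap(struct trapframe frame);
void	syscall(struct trapframe frame);

/*
 * After: pushl %esp passes the address of the saved registers, so the
 * handlers take a pointer and edit the frame that doreti will restore.
 */
void	trap(struct trapframe *frame);
void	syscall(struct trapframe *frame);
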
Index: mptable_pci.c
===================================================================
RCS file: /home/cvs/src/sys/i386/i386/mptable_pci.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/i386/i386/mptable_pci.c -L sys/i386/i386/mptable_pci.c -u -r1.1.1.1 -r1.2
--- sys/i386/i386/mptable_pci.c
+++ sys/i386/i386/mptable_pci.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/mptable_pci.c,v 1.2.8.1 2005/09/18 02:55:10 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/mptable_pci.c,v 1.8 2007/05/02 17:50:35 jhb Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -72,6 +72,37 @@
 	return (bus_generic_attach(dev));
 }
 
+/* Pass MSI requests up to the nexus. */
+static int
+mptable_hostb_alloc_msi(device_t pcib, device_t dev, int count, int maxcount,
+    int *irqs)
+{
+	device_t bus;
+
+	bus = device_get_parent(pcib);
+	return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount,
+	    irqs));
+}
+
+static int
+mptable_hostb_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+	device_t bus;
+
+	bus = device_get_parent(pcib);
+	return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq));
+}
+
+static int
+mptable_hostb_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr,
+    uint32_t *data)
+{
+	device_t bus;
+
+	bus = device_get_parent(pcib);
+	return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data));
+}
+
 static device_method_t mptable_hostb_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		mptable_hostb_probe),
@@ -96,17 +127,19 @@
 	DEVMETHOD(pcib_read_config,	legacy_pcib_read_config),
 	DEVMETHOD(pcib_write_config,	legacy_pcib_write_config),
 	DEVMETHOD(pcib_route_interrupt,	mptable_pci_route_interrupt),
+	DEVMETHOD(pcib_alloc_msi,	mptable_hostb_alloc_msi),
+	DEVMETHOD(pcib_release_msi,	pcib_release_msi),
+	DEVMETHOD(pcib_alloc_msix,	mptable_hostb_alloc_msix),
+	DEVMETHOD(pcib_release_msix,	pcib_release_msix),
+	DEVMETHOD(pcib_map_msi,		mptable_hostb_map_msi),
 
 	{ 0, 0 }
 };
 
-static driver_t mptable_hostb_driver = {
-	"pcib",
-	mptable_hostb_methods,
-	1,
-};
+static devclass_t hostb_devclass;
 
-DRIVER_MODULE(mptable_pcib, legacy, mptable_hostb_driver, pcib_devclass, 0, 0);
+DEFINE_CLASS_0(pcib, mptable_hostb_driver, mptable_hostb_methods, 1);
+DRIVER_MODULE(mptable_pcib, legacy, mptable_hostb_driver, hostb_devclass, 0, 0);
 
 /* PCI to PCI bridge driver. */
 
@@ -151,15 +184,17 @@
 	DEVMETHOD(pcib_read_config,	pcib_read_config),
 	DEVMETHOD(pcib_write_config,	pcib_write_config),
 	DEVMETHOD(pcib_route_interrupt,	mptable_pci_route_interrupt),
+	DEVMETHOD(pcib_alloc_msi,	pcib_alloc_msi),
+	DEVMETHOD(pcib_release_msi,	pcib_release_msi),
+	DEVMETHOD(pcib_alloc_msix,	pcib_alloc_msix),
+	DEVMETHOD(pcib_release_msix,	pcib_release_msix),
+	DEVMETHOD(pcib_map_msi,		pcib_map_msi),
 
 	{0, 0}
 };
 
-static driver_t mptable_pcib_driver = {
-	"pcib",
-	mptable_pcib_pci_methods,
-	sizeof(struct pcib_softc),
-};
+static devclass_t pcib_devclass;
 
+DEFINE_CLASS_0(pcib, mptable_pcib_driver, mptable_pcib_pci_methods,
+    sizeof(struct pcib_softc));
 DRIVER_MODULE(mptable_pcib, pci, mptable_pcib_driver, pcib_devclass, 0, 0);
-
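
Besides wiring up the new MSI pcib methods (the host bridge forwards
allocation requests up to the nexus via its grandparent, while the
PCI-PCI bridge uses the generic pcib_alloc_msi() family), the open-coded
driver_t initializers are replaced with DEFINE_CLASS_0().  Since driver_t
is a typedef of struct kobj_class, the macro from <sys/kobj.h> expands to
roughly the same initializer, no longer static; a sketch from memory of
the expansion of the first one:

struct kobj_class mptable_hostb_driver = {
	"pcib",				/* class (driver) name */
	mptable_hostb_methods,		/* method table */
	1,				/* softc size */
	NULL				/* no base classes */
};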