[Midnightbsd-cvs] src [10177] trunk/sys/amd64/amd64: sync with freebsd
laffer1 at midnightbsd.org
Fri Jun 1 19:00:13 EDT 2018
Revision: 10177
http://svnweb.midnightbsd.org/src/?rev=10177
Author: laffer1
Date: 2018-06-01 19:00:12 -0400 (Fri, 01 Jun 2018)
Log Message:
-----------
Sync with FreeBSD stable/10
Modified Paths:
--------------
trunk/sys/amd64/amd64/amd64_mem.c
trunk/sys/amd64/amd64/apic_vector.S
trunk/sys/amd64/amd64/atomic.c
trunk/sys/amd64/amd64/atpic_vector.S
trunk/sys/amd64/amd64/autoconf.c
trunk/sys/amd64/amd64/bios.c
trunk/sys/amd64/amd64/bpf_jit_machdep.c
trunk/sys/amd64/amd64/bpf_jit_machdep.h
trunk/sys/amd64/amd64/cpu_switch.S
trunk/sys/amd64/amd64/db_disasm.c
trunk/sys/amd64/amd64/db_interface.c
trunk/sys/amd64/amd64/db_trace.c
trunk/sys/amd64/amd64/elf_machdep.c
trunk/sys/amd64/amd64/exception.S
trunk/sys/amd64/amd64/fpu.c
trunk/sys/amd64/amd64/gdb_machdep.c
trunk/sys/amd64/amd64/genassym.c
trunk/sys/amd64/amd64/in_cksum.c
trunk/sys/amd64/amd64/initcpu.c
trunk/sys/amd64/amd64/io.c
trunk/sys/amd64/amd64/locore.S
trunk/sys/amd64/amd64/machdep.c
trunk/sys/amd64/amd64/mem.c
trunk/sys/amd64/amd64/minidump_machdep.c
trunk/sys/amd64/amd64/mp_machdep.c
trunk/sys/amd64/amd64/mp_watchdog.c
trunk/sys/amd64/amd64/mpboot.S
trunk/sys/amd64/amd64/pmap.c
trunk/sys/amd64/amd64/prof_machdep.c
trunk/sys/amd64/amd64/ptrace_machdep.c
trunk/sys/amd64/amd64/sigtramp.S
trunk/sys/amd64/amd64/stack_machdep.c
trunk/sys/amd64/amd64/support.S
trunk/sys/amd64/amd64/sys_machdep.c
trunk/sys/amd64/amd64/trap.c
trunk/sys/amd64/amd64/uio_machdep.c
trunk/sys/amd64/amd64/uma_machdep.c
trunk/sys/amd64/amd64/vm_machdep.c
Property Changed:
----------------
trunk/sys/amd64/amd64/apic_vector.S
trunk/sys/amd64/amd64/atpic_vector.S
trunk/sys/amd64/amd64/cpu_switch.S
trunk/sys/amd64/amd64/exception.S
trunk/sys/amd64/amd64/locore.S
trunk/sys/amd64/amd64/mpboot.S
trunk/sys/amd64/amd64/sigtramp.S
trunk/sys/amd64/amd64/support.S
Modified: trunk/sys/amd64/amd64/amd64_mem.c
===================================================================
--- trunk/sys/amd64/amd64/amd64_mem.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/amd64_mem.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/amd64_mem.c 217506 2011-01-17 17:30:35Z jkim $");
#include <sys/param.h>
#include <sys/kernel.h>
Modified: trunk/sys/amd64/amd64/apic_vector.S
===================================================================
--- trunk/sys/amd64/amd64/apic_vector.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/apic_vector.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -28,7 +29,7 @@
* SUCH DAMAGE.
*
* from: vector.s, 386BSD 0.1 unknown origin
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/apic_vector.S 302041 2016-06-21 04:51:55Z sephe $
*/
/*
@@ -43,6 +44,12 @@
#include "assym.s"
+#ifdef SMP
+#define LK lock ;
+#else
+#define LK
+#endif
+
/*
* I/O Interrupt Entry Point. Rather than having one entry point for
* each interrupt source, we use one entry point for each 32-bit word
@@ -128,73 +135,68 @@
MEXITCOUNT
jmp doreti
-#ifdef SMP
+#ifdef XENHVM
/*
- * Global address space TLB shootdown.
+ * Xen event channel upcall interrupt handler.
+ * Only used when the hypervisor supports direct vector callbacks.
*/
.text
SUPERALIGN_TEXT
-IDTVEC(invltlb)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+IDTVEC(xen_intr_upcall)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_gbl(,%rax,4)
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rdi
+ call xen_intr_handle_upcall
+ MEXITCOUNT
+ jmp doreti
#endif
-#ifdef COUNT_IPIS
- movq ipi_invltlb_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
+#ifdef SMP
+/*
+ * Global address space TLB shootdown.
+ */
+ .text
- movq %cr3, %rax /* invalidate the TLB */
- movq %rax, %cr3
-
+ SUPERALIGN_TEXT
+invltlb_ret:
movq lapic, %rax
movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ POP_FRAME
+ jmp doreti_iret
- lock
- incl smp_tlb_wait
+ SUPERALIGN_TEXT
+IDTVEC(invltlb_pcid)
+ PUSH_FRAME
- popq %rax
- jmp doreti_iret
+ call invltlb_pcid_handler
+ jmp invltlb_ret
+
+ SUPERALIGN_TEXT
+IDTVEC(invltlb)
+ PUSH_FRAME
+
+ call invltlb_handler
+ jmp invltlb_ret
+
/*
* Single page TLB shootdown
*/
.text
SUPERALIGN_TEXT
-IDTVEC(invlpg)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+IDTVEC(invlpg_pcid)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_pg(,%rax,4)
-#endif
-#ifdef COUNT_IPIS
- movq ipi_invlpg_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
+ call invlpg_pcid_handler
+ jmp invltlb_ret
- movq smp_tlb_addr1, %rax
- invlpg (%rax) /* invalidate single page */
+ SUPERALIGN_TEXT
+IDTVEC(invlpg)
+ PUSH_FRAME
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ call invlpg_handler
+ jmp invltlb_ret
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
-
/*
* Page range TLB shootdown.
*/
@@ -201,39 +203,11 @@
.text
SUPERALIGN_TEXT
IDTVEC(invlrng)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_rng(,%rax,4)
-#endif
-#ifdef COUNT_IPIS
- movq ipi_invlrng_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
- pushq %rdx
+ call invlrng_handler
+ jmp invltlb_ret
- movq smp_tlb_addr1, %rdx
- movq smp_tlb_addr2, %rax
-1: invlpg (%rdx) /* invalidate single page */
- addq $PAGE_SIZE, %rdx
- cmpq %rax, %rdx
- jb 1b
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rdx
- popq %rax
- jmp doreti_iret
-
/*
* Invalidate cache.
*/
@@ -240,27 +214,11 @@
.text
SUPERALIGN_TEXT
IDTVEC(invlcache)
-#ifdef COUNT_IPIS
PUSH_FRAME
- movl PCPU(CPUID), %eax
- movq ipi_invlcache_counts(,%rax,8),%rax
- incq (%rax)
- POP_FRAME
-#endif
- pushq %rax
+ call invlcache_handler
+ jmp invltlb_ret
- wbinvd
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
-
/*
* Handler for IPIs sent via the per-cpu IPI bitmap.
*/
Property changes on: trunk/sys/amd64/amd64/apic_vector.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/atomic.c
===================================================================
--- trunk/sys/amd64/amd64/atomic.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/atomic.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Peter Jeremy
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/atomic.c 118031 2003-07-25 21:19:19Z obrien $");
/* This file creates publically callable functions to perform various
* simple arithmetic on memory which is atomic in the presence of
Modified: trunk/sys/amd64/amd64/atpic_vector.S
===================================================================
--- trunk/sys/amd64/amd64/atpic_vector.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/atpic_vector.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -28,7 +29,7 @@
* SUCH DAMAGE.
*
* from: vector.s, 386BSD 0.1 unknown origin
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/atpic_vector.S 204309 2010-02-25 14:13:39Z attilio $
*/
/*
Property changes on: trunk/sys/amd64/amd64/atpic_vector.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/autoconf.c
===================================================================
--- trunk/sys/amd64/amd64/autoconf.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/autoconf.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/autoconf.c 146794 2005-05-29 23:44:22Z marcel $");
/*
* Setup the system to run on the current machine.
Modified: trunk/sys/amd64/amd64/bios.c
===================================================================
--- trunk/sys/amd64/amd64/bios.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bios.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1997 Michael Smith
* Copyright (c) 1998 Jonathan Lemon
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/bios.c 148231 2005-07-21 09:48:37Z phk $");
/*
* Subset of the i386 bios support code. We cannot make bios16 nor bios32
Modified: trunk/sys/amd64/amd64/bpf_jit_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/bpf_jit_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bpf_jit_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
* Copyright (C) 2005-2009 Jung-uk Kim <jkim at FreeBSD.org>
@@ -30,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/bpf_jit_machdep.c 207081 2010-04-22 23:47:19Z jkim $");
#ifdef _KERNEL
#include "opt_bpf.h"
Modified: trunk/sys/amd64/amd64/bpf_jit_machdep.h
===================================================================
--- trunk/sys/amd64/amd64/bpf_jit_machdep.h 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bpf_jit_machdep.h 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
* Copyright (C) 2005-2009 Jung-uk Kim <jkim at FreeBSD.org>
@@ -28,7 +29,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/bpf_jit_machdep.h 207081 2010-04-22 23:47:19Z jkim $
*/
#ifndef _BPF_JIT_MACHDEP_H_
Modified: trunk/sys/amd64/amd64/cpu_switch.S
===================================================================
--- trunk/sys/amd64/amd64/cpu_switch.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/cpu_switch.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1990 The Regents of the University of California.
@@ -30,7 +31,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/cpu_switch.S 271999 2014-09-22 20:34:36Z jhb $
*/
#include <machine/asmacros.h>
@@ -76,9 +77,8 @@
movq PCPU(CURPMAP),%rdx
LK btrl %eax,PM_ACTIVE(%rdx) /* clear old */
1:
- movq TD_PCB(%rsi),%r8 /* newtd->td_proc */
- movq PCB_CR3(%r8),%rdx
- movq %rdx,%cr3 /* new address space */
+ movq TD_PCB(%rsi),%r8 /* newtd->td_pcb */
+ movq PCB_CR3(%r8),%rcx /* new address space */
jmp swact
END(cpu_throw)
@@ -145,20 +145,41 @@
SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
jmp sw1
swinact:
- movq %rcx,%cr3 /* new address space */
- movl PCPU(CPUID), %eax
+ movl PCPU(CPUID),%eax
/* Release bit from old pmap->pm_active */
- movq PCPU(CURPMAP),%rcx
- LK btrl %eax,PM_ACTIVE(%rcx) /* clear old */
- SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
+ movq PCPU(CURPMAP),%r12
+ LK btrl %eax,PM_ACTIVE(%r12) /* clear old */
+ SETLK %rdx,TD_LOCK(%rdi) /* Release the old thread */
swact:
/* Set bit in new pmap->pm_active */
movq TD_PROC(%rsi),%rdx /* newproc */
movq P_VMSPACE(%rdx), %rdx
addq $VM_PMAP,%rdx
+ cmpl $-1,PM_PCID(%rdx)
+ je 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jnc 1f
+ btsq $63,%rcx /* CR3_PCID_SAVE */
+ incq PCPU(PM_SAVE_CNT)
+1:
+ movq %rcx,%cr3 /* new address space */
LK btsl %eax,PM_ACTIVE(%rdx) /* set new */
movq %rdx,PCPU(CURPMAP)
+ /*
+ * We might lose the race and other CPU might have changed
+ * the pmap after we set our bit in pmap->pm_save. Recheck.
+ * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was
+ * modified, causing TLB flush for this pcid.
+ */
+ btrq $63,%rcx
+ jnc 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jc 1f
+ decq PCPU(PM_SAVE_CNT)
+ movq %rcx,%cr3
+1:
+
sw1:
#if defined(SCHED_ULE) && defined(SMP)
/* Wait for the new thread to become unblocked */
@@ -325,8 +346,8 @@
movq %r14,PCB_R14(%rdi)
movq %r15,PCB_R15(%rdi)
- movq %cr0,%rsi
- movq %rsi,PCB_CR0(%rdi)
+ movq %cr0,%rax
+ movq %rax,PCB_CR0(%rdi)
movq %cr2,%rax
movq %rax,PCB_CR2(%rdi)
movq %cr3,%rax
@@ -359,6 +380,26 @@
rdmsr
movl %eax,PCB_KGSBASE(%rdi)
movl %edx,PCB_KGSBASE+4(%rdi)
+ movl $MSR_EFER,%ecx
+ rdmsr
+ movl %eax,PCB_EFER(%rdi)
+ movl %edx,PCB_EFER+4(%rdi)
+ movl $MSR_STAR,%ecx
+ rdmsr
+ movl %eax,PCB_STAR(%rdi)
+ movl %edx,PCB_STAR+4(%rdi)
+ movl $MSR_LSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_LSTAR(%rdi)
+ movl %edx,PCB_LSTAR+4(%rdi)
+ movl $MSR_CSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_CSTAR(%rdi)
+ movl %edx,PCB_CSTAR+4(%rdi)
+ movl $MSR_SF_MASK,%ecx
+ rdmsr
+ movl %eax,PCB_SFMASK(%rdi)
+ movl %edx,PCB_SFMASK+4(%rdi)
sgdt PCB_GDT(%rdi)
sidt PCB_IDT(%rdi)
@@ -365,19 +406,117 @@
sldt PCB_LDT(%rdi)
str PCB_TR(%rdi)
-2: movq %rsi,%cr0 /* The previous %cr0 is saved in %rsi. */
-
movl $1,%eax
ret
END(savectx)
/*
- * Wrapper around fpusave to care about TS0_CR.
- */
-ENTRY(ctx_fpusave)
- movq %cr0,%rsi
- clts
- call fpusave
- movq %rsi,%cr0
+ * resumectx(pcb)
+ * Resuming processor state from pcb.
+ */
+ENTRY(resumectx)
+ /* Switch to KPML4phys. */
+ movq KPML4phys,%rax
+ movq %rax,%cr3
+
+ /* Force kernel segment registers. */
+ movl $KDSEL,%eax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%ss
+ movl $KUF32SEL,%eax
+ movw %ax,%fs
+ movl $KUG32SEL,%eax
+ movw %ax,%gs
+
+ movl $MSR_FSBASE,%ecx
+ movl PCB_FSBASE(%rdi),%eax
+ movl 4 + PCB_FSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_GSBASE,%ecx
+ movl PCB_GSBASE(%rdi),%eax
+ movl 4 + PCB_GSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_KGSBASE,%ecx
+ movl PCB_KGSBASE(%rdi),%eax
+ movl 4 + PCB_KGSBASE(%rdi),%edx
+ wrmsr
+
+ /* Restore EFER. */
+ movl $MSR_EFER,%ecx
+ movl PCB_EFER(%rdi),%eax
+ wrmsr
+
+ /* Restore fast syscall stuff. */
+ movl $MSR_STAR,%ecx
+ movl PCB_STAR(%rdi),%eax
+ movl 4 + PCB_STAR(%rdi),%edx
+ wrmsr
+ movl $MSR_LSTAR,%ecx
+ movl PCB_LSTAR(%rdi),%eax
+ movl 4 + PCB_LSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_CSTAR,%ecx
+ movl PCB_CSTAR(%rdi),%eax
+ movl 4 + PCB_CSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_SF_MASK,%ecx
+ movl PCB_SFMASK(%rdi),%eax
+ wrmsr
+
+ /* Restore CR0, CR2, CR4 and CR3. */
+ movq PCB_CR0(%rdi),%rax
+ movq %rax,%cr0
+ movq PCB_CR2(%rdi),%rax
+ movq %rax,%cr2
+ movq PCB_CR4(%rdi),%rax
+ movq %rax,%cr4
+ movq PCB_CR3(%rdi),%rax
+ movq %rax,%cr3
+
+ /* Restore descriptor tables. */
+ lidt PCB_IDT(%rdi)
+ lldt PCB_LDT(%rdi)
+
+#define SDT_SYSTSS 9
+#define SDT_SYSBSY 11
+
+ /* Clear "task busy" bit and reload TR. */
+ movq PCPU(TSS),%rax
+ andb $(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
+ movw PCB_TR(%rdi),%ax
+ ltr %ax
+
+#undef SDT_SYSTSS
+#undef SDT_SYSBSY
+
+ /* Restore debug registers. */
+ movq PCB_DR0(%rdi),%rax
+ movq %rax,%dr0
+ movq PCB_DR1(%rdi),%rax
+ movq %rax,%dr1
+ movq PCB_DR2(%rdi),%rax
+ movq %rax,%dr2
+ movq PCB_DR3(%rdi),%rax
+ movq %rax,%dr3
+ movq PCB_DR6(%rdi),%rax
+ movq %rax,%dr6
+ movq PCB_DR7(%rdi),%rax
+ movq %rax,%dr7
+
+ /* Restore other callee saved registers. */
+ movq PCB_R15(%rdi),%r15
+ movq PCB_R14(%rdi),%r14
+ movq PCB_R13(%rdi),%r13
+ movq PCB_R12(%rdi),%r12
+ movq PCB_RBP(%rdi),%rbp
+ movq PCB_RSP(%rdi),%rsp
+ movq PCB_RBX(%rdi),%rbx
+
+ /* Restore return address. */
+ movq PCB_RIP(%rdi),%rax
+ movq %rax,(%rsp)
+
+ xorl %eax,%eax
ret
-END(ctx_fpusave)
+END(resumectx)
Property changes on: trunk/sys/amd64/amd64/cpu_switch.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/db_disasm.c
===================================================================
--- trunk/sys/amd64/amd64/db_disasm.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_disasm.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_disasm.c 280272 2015-03-19 23:13:19Z markj $");
/*
* Instruction disassembler.
@@ -127,7 +128,7 @@
static const struct inst db_inst_0f388x[] = {
/*80*/ { "", TRUE, SDEP, op2(E, Rq), "invept" },
/*81*/ { "", TRUE, SDEP, op2(E, Rq), "invvpid" },
-/*82*/ { "", FALSE, NONE, 0, 0 },
+/*82*/ { "", TRUE, SDEP, op2(E, Rq), "invpcid" },
/*83*/ { "", FALSE, NONE, 0, 0 },
/*84*/ { "", FALSE, NONE, 0, 0 },
/*85*/ { "", FALSE, NONE, 0, 0 },
@@ -249,6 +250,26 @@
/*0f*/ { "", FALSE, NONE, 0, 0 },
};
+static const struct inst db_inst_0f1x[] = {
+/*10*/ { "", FALSE, NONE, 0, 0 },
+/*11*/ { "", FALSE, NONE, 0, 0 },
+/*12*/ { "", FALSE, NONE, 0, 0 },
+/*13*/ { "", FALSE, NONE, 0, 0 },
+/*14*/ { "", FALSE, NONE, 0, 0 },
+/*15*/ { "", FALSE, NONE, 0, 0 },
+/*16*/ { "", FALSE, NONE, 0, 0 },
+/*17*/ { "", FALSE, NONE, 0, 0 },
+
+/*18*/ { "", FALSE, NONE, 0, 0 },
+/*19*/ { "", FALSE, NONE, 0, 0 },
+/*1a*/ { "", FALSE, NONE, 0, 0 },
+/*1b*/ { "", FALSE, NONE, 0, 0 },
+/*1c*/ { "", FALSE, NONE, 0, 0 },
+/*1d*/ { "", FALSE, NONE, 0, 0 },
+/*1e*/ { "", FALSE, NONE, 0, 0 },
+/*1f*/ { "nopl", TRUE, SDEP, 0, "nopw" },
+};
+
static const struct inst db_inst_0f2x[] = {
/*20*/ { "mov", TRUE, LONG, op2(CR,El), 0 },
/*21*/ { "mov", TRUE, LONG, op2(DR,El), 0 },
@@ -430,7 +451,7 @@
static const struct inst * const db_inst_0f[] = {
db_inst_0f0x,
- 0,
+ db_inst_0f1x,
db_inst_0f2x,
db_inst_0f3x,
db_inst_0f4x,
Modified: trunk/sys/amd64/amd64/db_interface.c
===================================================================
--- trunk/sys/amd64/amd64/db_interface.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_interface.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_interface.c 208392 2010-05-21 17:17:56Z jhb $");
/*
* Interface to new debugger.
Modified: trunk/sys/amd64/amd64/db_trace.c
===================================================================
--- trunk/sys/amd64/amd64/db_trace.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_trace.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_trace.c 330132 2018-02-28 22:33:07Z jhb $");
#include "opt_compat.h"
@@ -52,17 +53,8 @@
#include <ddb/db_sym.h>
#include <ddb/db_variables.h>
-static db_varfcn_t db_dr0;
-static db_varfcn_t db_dr1;
-static db_varfcn_t db_dr2;
-static db_varfcn_t db_dr3;
-static db_varfcn_t db_dr4;
-static db_varfcn_t db_dr5;
-static db_varfcn_t db_dr6;
-static db_varfcn_t db_dr7;
static db_varfcn_t db_frame;
-static db_varfcn_t db_rsp;
-static db_varfcn_t db_ss;
+static db_varfcn_t db_frame_seg;
CTASSERT(sizeof(struct dbreg) == sizeof(((struct pcpu *)NULL)->pc_dbreg));
@@ -71,17 +63,17 @@
*/
#define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x)
struct db_variable db_regs[] = {
- { "cs", DB_OFFSET(tf_cs), db_frame },
- { "ds", DB_OFFSET(tf_ds), db_frame },
- { "es", DB_OFFSET(tf_es), db_frame },
- { "fs", DB_OFFSET(tf_fs), db_frame },
- { "gs", DB_OFFSET(tf_gs), db_frame },
- { "ss", NULL, db_ss },
+ { "cs", DB_OFFSET(tf_cs), db_frame_seg },
+ { "ds", DB_OFFSET(tf_ds), db_frame_seg },
+ { "es", DB_OFFSET(tf_es), db_frame_seg },
+ { "fs", DB_OFFSET(tf_fs), db_frame_seg },
+ { "gs", DB_OFFSET(tf_gs), db_frame_seg },
+ { "ss", DB_OFFSET(tf_ss), db_frame_seg },
{ "rax", DB_OFFSET(tf_rax), db_frame },
{ "rcx", DB_OFFSET(tf_rcx), db_frame },
{ "rdx", DB_OFFSET(tf_rdx), db_frame },
{ "rbx", DB_OFFSET(tf_rbx), db_frame },
- { "rsp", NULL, db_rsp },
+ { "rsp", DB_OFFSET(tf_rsp), db_frame },
{ "rbp", DB_OFFSET(tf_rbp), db_frame },
{ "rsi", DB_OFFSET(tf_rsi), db_frame },
{ "rdi", DB_OFFSET(tf_rdi), db_frame },
@@ -95,57 +87,18 @@
{ "r15", DB_OFFSET(tf_r15), db_frame },
{ "rip", DB_OFFSET(tf_rip), db_frame },
{ "rflags", DB_OFFSET(tf_rflags), db_frame },
-#define DB_N_SHOW_REGS 24 /* Don't show registers after here. */
- { "dr0", NULL, db_dr0 },
- { "dr1", NULL, db_dr1 },
- { "dr2", NULL, db_dr2 },
- { "dr3", NULL, db_dr3 },
- { "dr4", NULL, db_dr4 },
- { "dr5", NULL, db_dr5 },
- { "dr6", NULL, db_dr6 },
- { "dr7", NULL, db_dr7 },
};
-struct db_variable *db_eregs = db_regs + DB_N_SHOW_REGS;
+struct db_variable *db_eregs = db_regs + nitems(db_regs);
-#define DB_DRX_FUNC(reg) \
-static int \
-db_ ## reg (vp, valuep, op) \
- struct db_variable *vp; \
- db_expr_t * valuep; \
- int op; \
-{ \
- if (op == DB_VAR_GET) \
- *valuep = r ## reg (); \
- else \
- load_ ## reg (*valuep); \
- return (1); \
-}
-
-DB_DRX_FUNC(dr0)
-DB_DRX_FUNC(dr1)
-DB_DRX_FUNC(dr2)
-DB_DRX_FUNC(dr3)
-DB_DRX_FUNC(dr4)
-DB_DRX_FUNC(dr5)
-DB_DRX_FUNC(dr6)
-DB_DRX_FUNC(dr7)
-
-static __inline long
-get_rsp(struct trapframe *tf)
-{
- return ((ISPL(tf->tf_cs)) ? tf->tf_rsp :
- (db_expr_t)tf + offsetof(struct trapframe, tf_rsp));
-}
-
static int
-db_frame(struct db_variable *vp, db_expr_t *valuep, int op)
+db_frame_seg(struct db_variable *vp, db_expr_t *valuep, int op)
{
- long *reg;
+ uint16_t *reg;
if (kdb_frame == NULL)
return (0);
- reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
+ reg = (uint16_t *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
if (op == DB_VAR_GET)
*valuep = *reg;
else
@@ -154,33 +107,21 @@
}
static int
-db_rsp(struct db_variable *vp, db_expr_t *valuep, int op)
+db_frame(struct db_variable *vp, db_expr_t *valuep, int op)
{
+ long *reg;
if (kdb_frame == NULL)
return (0);
+ reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
if (op == DB_VAR_GET)
- *valuep = get_rsp(kdb_frame);
- else if (ISPL(kdb_frame->tf_cs))
- kdb_frame->tf_rsp = *valuep;
+ *valuep = *reg;
+ else
+ *reg = *valuep;
return (1);
}
-static int
-db_ss(struct db_variable *vp, db_expr_t *valuep, int op)
-{
-
- if (kdb_frame == NULL)
- return (0);
-
- if (op == DB_VAR_GET)
- *valuep = (ISPL(kdb_frame->tf_cs)) ? kdb_frame->tf_ss : rss();
- else if (ISPL(kdb_frame->tf_cs))
- kdb_frame->tf_ss = *valuep;
- return (1);
-}
-
#define NORMAL 0
#define TRAP 1
#define INTERRUPT 2
@@ -188,9 +129,7 @@
#define TRAP_INTERRUPT 5
static void db_nextframe(struct amd64_frame **, db_addr_t *, struct thread *);
-static int db_numargs(struct amd64_frame *);
-static void db_print_stack_entry(const char *, int, char **, long *, db_addr_t,
- void *);
+static void db_print_stack_entry(const char *, db_addr_t, void *);
static void decode_syscall(int, struct thread *);
static const char * watchtype_str(int type);
@@ -198,62 +137,11 @@
int access, struct dbreg *d);
int amd64_clr_watch(int watchnum, struct dbreg *d);
-/*
- * Figure out how many arguments were passed into the frame at "fp".
- */
-static int
-db_numargs(fp)
- struct amd64_frame *fp;
+static void
+db_print_stack_entry(const char *name, db_addr_t callpc, void *frame)
{
-#if 1
- return (0); /* regparm, needs dwarf2 info */
-#else
- long *argp;
- int inst;
- int args;
- argp = (long *)db_get_value((long)&fp->f_retaddr, 8, FALSE);
- /*
- * XXX etext is wrong for LKMs. We should attempt to interpret
- * the instruction at the return address in all cases. This
- * may require better fault handling.
- */
- if (argp < (long *)btext || argp >= (long *)etext) {
- args = 5;
- } else {
- inst = db_get_value((long)argp, 4, FALSE);
- if ((inst & 0xff) == 0x59) /* popl %ecx */
- args = 1;
- else if ((inst & 0xffff) == 0xc483) /* addl $Ibs, %esp */
- args = ((inst >> 16) & 0xff) / 4;
- else
- args = 5;
- }
- return (args);
-#endif
-}
-
-static void
-db_print_stack_entry(name, narg, argnp, argp, callpc, frame)
- const char *name;
- int narg;
- char **argnp;
- long *argp;
- db_addr_t callpc;
- void *frame;
-{
- db_printf("%s(", name);
-#if 0
- while (narg) {
- if (argnp)
- db_printf("%s=", *argnp++);
- db_printf("%lr", (long)db_get_value((long)argp, 8, FALSE));
- argp++;
- if (--narg != 0)
- db_printf(",");
- }
-#endif
- db_printf(") at ");
+ db_printf("%s() at ", name != NULL ? name : "??");
db_printsym(callpc, DB_STGY_PROC);
if (frame != NULL)
db_printf("/frame 0x%lx", (register_t)frame);
@@ -348,7 +236,7 @@
return;
}
- db_print_stack_entry(name, 0, 0, 0, rip, &(*fp)->f_frame);
+ db_print_stack_entry(name, rip, &(*fp)->f_frame);
/*
* Point to base of trapframe which is just above the
@@ -357,7 +245,7 @@
tf = (struct trapframe *)((long)*fp + 16);
if (INKERNEL((long) tf)) {
- rsp = get_rsp(tf);
+ rsp = tf->tf_rsp;
rip = tf->tf_rip;
rbp = tf->tf_rbp;
switch (frame_type) {
@@ -384,17 +272,13 @@
}
static int
-db_backtrace(struct thread *td, struct trapframe *tf,
- struct amd64_frame *frame, db_addr_t pc, int count)
+db_backtrace(struct thread *td, struct trapframe *tf, struct amd64_frame *frame,
+ db_addr_t pc, register_t sp, int count)
{
struct amd64_frame *actframe;
-#define MAXNARG 16
- char *argnames[MAXNARG], **argnp = NULL;
const char *name;
- long *argp;
db_expr_t offset;
c_db_sym_t sym;
- int narg;
boolean_t first;
if (count == -1)
@@ -418,16 +302,29 @@
*/
actframe = frame;
if (first) {
- if (tf != NULL) {
+ first = FALSE;
+ if (sym == C_DB_SYM_NULL && sp != 0) {
+ /*
+ * If a symbol couldn't be found, we've probably
+ * jumped to a bogus location, so try and use
+ * the return address to find our caller.
+ */
+ db_print_stack_entry(name, pc, NULL);
+ pc = db_get_value(sp, 8, FALSE);
+ if (db_search_symbol(pc, DB_STGY_PROC,
+ &offset) == C_DB_SYM_NULL)
+ break;
+ continue;
+ } else if (tf != NULL) {
int instr;
instr = db_get_value(pc, 4, FALSE);
if ((instr & 0xffffffff) == 0xe5894855) {
/* pushq %rbp; movq %rsp, %rbp */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
} else if ((instr & 0xffffff) == 0xe58948) {
/* movq %rsp, %rbp */
- actframe = (void *)get_rsp(tf);
+ actframe = (void *)tf->tf_rsp;
if (tf->tf_rbp == 0) {
/* Fake frame better. */
frame = actframe;
@@ -434,33 +331,24 @@
}
} else if ((instr & 0xff) == 0xc3) {
/* ret */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
} else if (offset == 0) {
/* Probably an assembler symbol. */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
}
- } else if (strcmp(name, "fork_trampoline") == 0) {
+ } else if (name != NULL &&
+ strcmp(name, "fork_trampoline") == 0) {
/*
* Don't try to walk back on a stack for a
* process that hasn't actually been run yet.
*/
- db_print_stack_entry(name, 0, 0, 0, pc,
- actframe);
+ db_print_stack_entry(name, pc, actframe);
break;
}
- first = FALSE;
}
- argp = &actframe->f_arg0;
- narg = MAXNARG;
- if (sym != NULL && db_sym_numargs(sym, &narg, argnames)) {
- argnp = argnames;
- } else {
- narg = db_numargs(frame);
- }
+ db_print_stack_entry(name, pc, actframe);
- db_print_stack_entry(name, narg, argnp, argp, pc, actframe);
-
if (actframe != frame) {
/* `frame' belongs to caller. */
pc = (db_addr_t)
@@ -473,7 +361,7 @@
if (INKERNEL((long)pc) && !INKERNEL((long)frame)) {
sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
db_symbol_values(sym, &name, NULL);
- db_print_stack_entry(name, 0, 0, 0, pc, frame);
+ db_print_stack_entry(name, pc, frame);
break;
}
if (!INKERNEL((long) frame)) {
@@ -495,7 +383,7 @@
frame = (struct amd64_frame *)rbp;
callpc = (db_addr_t)db_get_value((long)&frame->f_retaddr, 8, FALSE);
frame = frame->f_frame;
- db_backtrace(curthread, NULL, frame, callpc, -1);
+ db_backtrace(curthread, NULL, frame, callpc, 0, -1);
}
int
@@ -502,10 +390,12 @@
db_trace_thread(struct thread *thr, int count)
{
struct pcb *ctx;
+ struct trapframe *tf;
ctx = kdb_thr_ctx(thr);
- return (db_backtrace(thr, NULL, (struct amd64_frame *)ctx->pcb_rbp,
- ctx->pcb_rip, count));
+ tf = thr == kdb_thread ? kdb_frame : NULL;
+ return (db_backtrace(thr, tf, (struct amd64_frame *)ctx->pcb_rbp,
+ ctx->pcb_rip, ctx->pcb_rsp, count));
}
int
Modified: trunk/sys/amd64/amd64/elf_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/elf_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/elf_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright 1996-1998 John D. Polstra.
* All rights reserved.
@@ -24,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/elf_machdep.c 294136 2016-01-16 07:56:49Z dchagin $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -36,7 +37,6 @@
#include <sys/sysent.h>
#include <sys/imgact_elf.h>
#include <sys/syscall.h>
-#include <sys/sysent.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
@@ -45,6 +45,7 @@
#include <vm/vm_param.h>
#include <machine/elf.h>
+#include <machine/fpu.h>
#include <machine/md_var.h>
struct sysentvec elf64_freebsd_sysvec = {
@@ -82,28 +83,11 @@
.sv_shared_page_base = SHAREDPAGE,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
+ .sv_thread_detach = NULL,
+ .sv_trap = NULL,
};
INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
-void
-amd64_lower_shared_page(struct sysentvec *sv)
-{
- if (hw_lower_amd64_sharedpage != 0) {
- sv->sv_maxuser -= PAGE_SIZE;
- sv->sv_shared_page_base -= PAGE_SIZE;
- sv->sv_usrstack -= PAGE_SIZE;
- sv->sv_psstrings -= PAGE_SIZE;
- }
-}
-
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) amd64_lower_shared_page,
- &elf64_freebsd_sysvec);
-
static Elf64_Brandinfo freebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
@@ -136,13 +120,44 @@
(sysinit_cfunc_t) elf64_insert_brand_entry,
&freebsd_brand_oinfo);
+static Elf64_Brandinfo kfreebsd_brand_info = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
+ .sysvec = &elf64_freebsd_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_kfreebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
+};
+
+SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t) elf64_insert_brand_entry,
+ &kfreebsd_brand_info);
+
void
-elf64_dump_thread(struct thread *td __unused, void *dst __unused,
- size_t *off __unused)
+elf64_dump_thread(struct thread *td, void *dst, size_t *off)
{
+ void *buf;
+ size_t len;
+
+ len = 0;
+ if (use_xsave) {
+ if (dst != NULL) {
+ fpugetregs(td);
+ len += elf64_populate_note(NT_X86_XSTATE,
+ get_pcb_user_save_td(td), dst,
+ cpu_max_ext_state_size, &buf);
+ *(uint64_t *)((char *)buf + X86_XSTATE_XCR0_OFFSET) =
+ xsave_mask;
+ } else
+ len += elf64_populate_note(NT_X86_XSTATE, NULL, NULL,
+ cpu_max_ext_state_size, NULL);
+ }
+ *off = len;
}
-
/* Process one elf relocation with addend. */
static int
elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data,
@@ -155,6 +170,7 @@
Elf_Size rtype, symidx;
const Elf_Rel *rel;
const Elf_Rela *rela;
+ int error;
switch (type) {
case ELF_RELOC_REL:
@@ -190,9 +206,9 @@
break;
case R_X86_64_64: /* S + A */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
val = addr + addend;
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where != val)
*where = val;
@@ -199,10 +215,10 @@
break;
case R_X86_64_PC32: /* S + A - P */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
where32 = (Elf32_Addr *)where;
val32 = (Elf32_Addr)(addr + addend - (Elf_Addr)where);
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where32 != val32)
*where32 = val32;
@@ -209,10 +225,10 @@
break;
case R_X86_64_32S: /* S + A sign extend */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
val32 = (Elf32_Addr)(addr + addend);
where32 = (Elf32_Addr *)where;
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where32 != val32)
*where32 = val32;
@@ -229,8 +245,8 @@
case R_X86_64_GLOB_DAT: /* S */
case R_X86_64_JMP_SLOT: /* XXX need addend + offset */
- addr = lookup(lf, symidx, 1);
- if (addr == 0)
+ error = lookup(lf, symidx, 1, &addr);
+ if (error != 0)
return -1;
if (*where != addr)
*where = addr;
Modified: trunk/sys/amd64/amd64/exception.S
===================================================================
--- trunk/sys/amd64/amd64/exception.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/exception.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -31,7 +32,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/exception.S 333370 2018-05-08 17:05:39Z emaste $
*/
#include "opt_atpic.h"
@@ -80,18 +81,22 @@
* This is equivalent to the i386 port's use of SDT_SYS386TGT.
*
* The cpu will push a certain amount of state onto the kernel stack for
- * the current process. See amd64/include/frame.h.
- * This includes the current RFLAGS (status register, which includes
+ * the current process. See amd64/include/frame.h.
+ * This includes the current RFLAGS (status register, which includes
* the interrupt disable state prior to the trap), the code segment register,
- * and the return instruction pointer are pushed by the cpu. The cpu
- * will also push an 'error' code for certain traps. We push a dummy
- * error code for those traps where the cpu doesn't in order to maintain
+ * and the return instruction pointer are pushed by the cpu. The cpu
+ * will also push an 'error' code for certain traps. We push a dummy
+ * error code for those traps where the cpu doesn't in order to maintain
* a consistent frame. We also push a contrived 'trap number'.
*
- * The cpu does not push the general registers, we must do that, and we
- * must restore them prior to calling 'iret'. The cpu adjusts the %cs and
- * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
- * must load them with appropriate values for supervisor mode operation.
+ * The CPU does not push the general registers, so we must do that, and we
+ * must restore them prior to calling 'iret'. The CPU adjusts %cs and %ss
+ * but does not mess with %ds, %es, %gs or %fs. We swap the %gs base for
+ * for the kernel mode operation shortly, without changes to the selector
+ * loaded. Since superuser long mode works with any selectors loaded into
+ * segment registers other then %cs, which makes them mostly unused in long
+ * mode, and kernel does not reference %fs, leave them alone. The segment
+ * registers are reloaded on return to the usermode.
*/
MCOUNT_LABEL(user)
@@ -104,8 +109,6 @@
movq $0,TF_ADDR(%rsp) ; \
movq $0,TF_ERR(%rsp) ; \
jmp alltraps_noen
-IDTVEC(dbg)
- TRAP_NOEN(T_TRCTRAP)
IDTVEC(bpt)
TRAP_NOEN(T_BPTFLT)
#ifdef KDTRACE_HOOKS
@@ -208,6 +211,8 @@
* interrupt. For all other trap types, just handle them in
* the usual way.
*/
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jnz calltrap /* ignore userland traps */
cmpl $T_BPTFLT,TF_TRAPNO(%rsp)
jne calltrap
@@ -430,6 +435,101 @@
sysret
/*
+ * DB# handler is very similar to NM#, because 'mov/pop %ss' delay
+ * generation of exception until the next instruction is executed,
+ * which might be a kernel entry. So we must execute the handler
+ * on IST stack and be ready for non-kernel GSBASE.
+ */
+IDTVEC(dbg)
+ subq $TF_RIP,%rsp
+ movl $(T_TRCTRAP),TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
+ jnz dbg_fromuserspace
+ /*
+ * We've interrupted the kernel. Preserve GS.base in %r12.
+ */
+ movl $MSR_GSBASE,%ecx
+ rdmsr
+ movq %rax,%r12
+ shlq $32,%rdx
+ orq %rdx,%r12
+ /* Retrieve and load the canonical value for GS.base. */
+ movq TF_SIZE(%rsp),%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp,%rdi
+ call trap
+ MEXITCOUNT
+ /*
+ * Put back the preserved MSR_GSBASE value.
+ */
+ movl $MSR_GSBASE,%ecx
+ movq %r12,%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+ movq TF_RDI(%rsp),%rdi
+ movq TF_RSI(%rsp),%rsi
+ movq TF_RDX(%rsp),%rdx
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R8(%rsp),%r8
+ movq TF_R9(%rsp),%r9
+ movq TF_RAX(%rsp),%rax
+ movq TF_RBX(%rsp),%rbx
+ movq TF_RBP(%rsp),%rbp
+ movq TF_R10(%rsp),%r10
+ movq TF_R11(%rsp),%r11
+ movq TF_R12(%rsp),%r12
+ movq TF_R13(%rsp),%r13
+ movq TF_R14(%rsp),%r14
+ movq TF_R15(%rsp),%r15
+ addq $TF_RIP,%rsp
+ jmp doreti_iret
+dbg_fromuserspace:
+ /*
+ * Switch to kernel GSBASE and kernel page table, and copy frame
+ * from the IST stack to the normal kernel stack, since trap()
+ * re-enables interrupts, and since we might trap on DB# while
+ * in trap().
+ */
+ swapgs
+ movq PCPU(RSP0),%rax
+ movl $TF_SIZE,%ecx
+ subq %rcx,%rax
+ movq %rax,%rdi
+ movq %rsp,%rsi
+ rep;movsb
+ movq %rax,%rsp
+ movq PCPU(CURPCB),%rdi
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
+ jmp calltrap
+
+/*
* NMI handling is special.
*
* First, NMIs do not respect the state of the processor's RFLAGS.IF
@@ -508,7 +608,7 @@
#ifdef HWPMC_HOOKS
/*
* Capture a userspace callchain if needed.
- *
+ *
* - Check if the current trap was from user mode.
* - Check if the current thread is valid.
* - Check if the thread requires a user call chain to be
@@ -554,7 +654,7 @@
* At this point the processor has exited NMI mode and is running
* with interrupts turned off on the normal kernel stack.
*
- * If a pending NMI gets recognized at or after this point, it
+ * If a pending NMI gets recognized at or after this point, it
* will cause a kernel callchain to be traced.
*
* We turn interrupts back on, and call the user callchain capture hook.
@@ -572,7 +672,7 @@
#endif
testl %ebx,%ebx
jnz doreti_exit
-nmi_kernelexit:
+nmi_kernelexit:
/*
* Put back the preserved MSR_GSBASE value.
*/
@@ -655,6 +755,7 @@
.text
SUPERALIGN_TEXT
	.type	doreti,@function
+ .globl doreti
doreti:
FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */
/*
@@ -723,21 +824,38 @@
pushfq
cli
movl $MSR_GSBASE,%ecx
+ /* Save current kernel %gs base into %r12d:%r13d */
rdmsr
+ movl %eax,%r12d
+ movl %edx,%r13d
.globl ld_gs
ld_gs:
movw %si,%gs
+ /* Save user %gs base into %r14d:%r15d */
+ rdmsr
+ movl %eax,%r14d
+ movl %edx,%r15d
+ /* Restore kernel %gs base */
+ movl %r12d,%eax
+ movl %r13d,%edx
wrmsr
popfq
+ /*
+ * Restore user %gs base, either from PCB if used for TLS, or
+ * from the previously saved msr read.
+ */
+ movl $MSR_KGSBASE,%ecx
cmpw $KUG32SEL,%si
jne 1f
- movl $MSR_KGSBASE,%ecx
movl PCB_GSBASE(%r8),%eax
movl PCB_GSBASE+4(%r8),%edx
+ jmp ld_gsbase
+1:
+ movl %r14d,%eax
+ movl %r15d,%edx
.globl ld_gsbase
ld_gsbase:
- wrmsr
-1:
+ wrmsr /* May trap if non-canonical, but only for TLS. */
.globl ld_es
ld_es:
movw TF_ES(%rsp),%es
Property changes on: trunk/sys/amd64/amd64/exception.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/fpu.c
===================================================================
--- trunk/sys/amd64/amd64/fpu.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/fpu.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 William Jolitz.
* Copyright (c) 1991 The Regents of the University of California.
@@ -31,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/fpu.c 325543 2017-11-08 11:39:42Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -127,10 +128,17 @@
*/
CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
+/*
+ * Ensure the copy of XCR0 saved in a core is contained in the padding
+ * area.
+ */
+CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savefpu, sv_pad) &&
+ X86_XSTATE_XCR0_OFFSET + sizeof(uint64_t) <= sizeof(struct savefpu));
+
static void fpu_clean_state(void);
SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
- NULL, 1, "Floating point instructions executed in hardware");
+ SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
int use_xsave; /* non-static for cpu_switch.S */
uint64_t xsave_mask; /* the same */
@@ -162,6 +170,31 @@
fxrstor((char *)addr);
}
+void
+fpususpend(void *addr)
+{
+ u_long cr0;
+
+ cr0 = rcr0();
+ stop_emulating();
+ fpusave(addr);
+ load_cr0(cr0);
+}
+
+void
+fpuresume(void *addr)
+{
+ u_long cr0;
+
+ cr0 = rcr0();
+ stop_emulating();
+ fninit();
+ if (use_xsave)
+ load_xcr(XCR0, xsave_mask);
+ fpurestore(addr);
+ load_cr0(cr0);
+}
+
/*
* Enable XSAVE if supported and allowed by user.
* Calculate the xsave_mask.
@@ -188,6 +221,10 @@
TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
xsave_mask &= xsave_mask_user;
+ if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512)
+ xsave_mask &= ~XFEATURE_AVX512;
+ if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX)
+ xsave_mask &= ~XFEATURE_MPX;
cpuid_count(0xd, 0x1, cp);
if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
@@ -282,13 +319,15 @@
cpu_mxcsr_mask = 0xFFBF;
/*
- * The fninit instruction does not modify XMM registers. The
- * fpusave call dumped the garbage contained in the registers
- * after reset to the initial state saved. Clear XMM
- * registers file image to make the startup program state and
- * signal handler XMM register content predictable.
+ * The fninit instruction does not modify XMM registers or x87
+ * registers (MM/ST). The fpusave call dumped the garbage
+ * contained in the registers after reset to the initial state
+ * saved. Clear XMM and x87 registers file image to make the
+ * startup program state and signal handler XMM/x87 register
+ * content predictable.
*/
- bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
+ bzero(fpu_initialstate->sv_fp, sizeof(fpu_initialstate->sv_fp));
+ bzero(fpu_initialstate->sv_xmm, sizeof(fpu_initialstate->sv_xmm));
/*
* Create a table describing the layout of the CPU Extended
@@ -333,13 +372,13 @@
stop_emulating();
fpusave(curpcb->pcb_save);
start_emulating();
- PCPU_SET(fpcurthread, 0);
+ PCPU_SET(fpcurthread, NULL);
}
critical_exit();
}
int
-fpuformat()
+fpuformat(void)
{
return (_MC_FPFMT_XMM);
@@ -574,33 +613,37 @@
}
/*
- * Implement device not available (DNA) exception
+ * Device Not Available (DNA, #NM) exception handler.
*
- * It would be better to switch FP context here (if curthread != fpcurthread)
- * and not necessarily for every context switch, but it is too hard to
- * access foreign pcb's.
+ * It would be better to switch FP context here (if curthread !=
+ * fpcurthread) and not necessarily for every context switch, but it
+ * is too hard to access foreign pcb's.
*/
-
-static int err_count = 0;
-
void
fpudna(void)
{
+ /*
+ * This handler is entered with interrupts enabled, so context
+ * switches may occur before critical_enter() is executed. If
+ * a context switch occurs, then when we regain control, our
+ * state will have been completely restored. The CPU may
+ * change underneath us, but the only part of our context that
+ * lives in the CPU is CR0.TS and that will be "restored" by
+ * setting it on the new CPU.
+ */
critical_enter();
+
if (PCPU_GET(fpcurthread) == curthread) {
- printf("fpudna: fpcurthread == curthread %d times\n",
- ++err_count);
+ printf("fpudna: fpcurthread == curthread\n");
stop_emulating();
critical_exit();
return;
}
if (PCPU_GET(fpcurthread) != NULL) {
- printf("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
- PCPU_GET(fpcurthread),
- PCPU_GET(fpcurthread)->td_proc->p_pid,
- curthread, curthread->td_proc->p_pid);
- panic("fpudna");
+ panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
+ PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid,
+ curthread, curthread->td_tid);
}
stop_emulating();
/*
@@ -621,7 +664,8 @@
* fpu_initialstate, to ignite the XSAVEOPT
* tracking engine.
*/
- bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
+ bcopy(fpu_initialstate, curpcb->pcb_save,
+ cpu_max_ext_state_size);
fpurestore(curpcb->pcb_save);
if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
fldcw(curpcb->pcb_initial_fpucw);
@@ -636,7 +680,7 @@
}
void
-fpudrop()
+fpudrop(void)
{
struct thread *td;
@@ -761,6 +805,7 @@
struct pcb *pcb;
int error;
+ addr->sv_env.en_mxcsr &= cpu_mxcsr_mask;
pcb = td->td_pcb;
critical_enter();
if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
@@ -875,6 +920,7 @@
"Kernel contexts for FPU state");
#define FPU_KERN_CTX_FPUINITDONE 0x01
+#define FPU_KERN_CTX_DUMMY 0x02 /* avoided save for the kern thread */
struct fpu_kern_ctx {
struct savefpu *prev;
@@ -918,6 +964,10 @@
{
struct pcb *pcb;
+ if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) {
+ ctx->flags = FPU_KERN_CTX_DUMMY;
+ return (0);
+ }
pcb = td->td_pcb;
KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save ==
get_pcb_user_save_pcb(pcb), ("mangled pcb_save"));
@@ -937,6 +987,9 @@
{
struct pcb *pcb;
+ if (is_fpu_kern_thread(0) && (ctx->flags & FPU_KERN_CTX_DUMMY) != 0)
+ return (0);
+ KASSERT((ctx->flags & FPU_KERN_CTX_DUMMY) == 0, ("dummy ctx"));
pcb = td->td_pcb;
critical_enter();
if (curthread == PCPU_GET(fpcurthread))
Modified: trunk/sys/amd64/amd64/gdb_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/gdb_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/gdb_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Marcel Moolenaar
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/gdb_machdep.c 290734 2015-11-13 00:50:34Z jhb $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,6 +49,8 @@
void *
gdb_cpu_getreg(int regnum, size_t *regsz)
{
+ static uint32_t _kcodesel = GSEL(GCODE_SEL, SEL_KPL);
+ static uint32_t _kdatasel = GSEL(GDATA_SEL, SEL_KPL);
*regsz = gdb_cpu_regsz(regnum);
@@ -76,6 +79,8 @@
case 14: return (&kdb_thrctx->pcb_r14);
case 15: return (&kdb_thrctx->pcb_r15);
case 16: return (&kdb_thrctx->pcb_rip);
+ case 18: return (&_kcodesel);
+ case 19: return (&_kdatasel);
}
return (NULL);
}
Modified: trunk/sys/amd64/amd64/genassym.c
===================================================================
--- trunk/sys/amd64/amd64/genassym.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/genassym.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1990 The Regents of the University of California.
* All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/genassym.c 286308 2015-08-05 07:35:34Z kib $");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
@@ -76,6 +77,8 @@
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_SAVE, offsetof(struct pmap, pm_save));
+ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid));
ASSYM(P_MD, offsetof(struct proc, p_md));
ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
@@ -109,7 +112,6 @@
ASSYM(addr_PML4pml4e, addr_PML4pml4e);
ASSYM(PDESIZE, sizeof(pd_entry_t));
ASSYM(PTESIZE, sizeof(pt_entry_t));
-ASSYM(PTESHIFT, PTESHIFT);
ASSYM(PAGE_SHIFT, PAGE_SHIFT);
ASSYM(PAGE_MASK, PAGE_MASK);
ASSYM(PDRSHIFT, PDRSHIFT);
@@ -152,11 +154,13 @@
ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
-ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd));
ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
-ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu));
-ASSYM(PCB_USERFPU, sizeof(struct pcb));
+ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
+ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
+ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
+ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
+ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
ASSYM(PCB_SIZE, sizeof(struct pcb));
ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
ASSYM(PCB_DBREGS, PCB_DBREGS);
@@ -219,6 +223,7 @@
ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
ASSYM(LA_VER, offsetof(struct LAPIC, version));
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
Modified: trunk/sys/amd64/amd64/in_cksum.c
===================================================================
--- trunk/sys/amd64/amd64/in_cksum.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/in_cksum.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/* $NetBSD: in_cksum.c,v 1.7 1997/09/02 13:18:15 thorpej Exp $ */
/*-
@@ -38,7 +39,7 @@
*/
#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/in_cksum.c 139731 2005-01-05 20:17:21Z imp $");
#include <sys/param.h>
#include <sys/mbuf.h>
Modified: trunk/sys/amd64/amd64/initcpu.c
===================================================================
--- trunk/sys/amd64/amd64/initcpu.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/initcpu.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) KATO Takenori, 1997, 1998.
*
@@ -28,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/initcpu.c 313150 2017-02-03 12:20:44Z kib $");
#include "opt_cpu.h"
@@ -48,11 +49,6 @@
static int hw_instruction_sse;
SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
&hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
-static int lower_sharedpage_init;
-int hw_lower_amd64_sharedpage;
-SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN,
- &hw_lower_amd64_sharedpage, 0,
- "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory");
/*
* -1: automatic (default)
* 0: keep enable CLFLUSH
@@ -79,7 +75,12 @@
u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
u_int cpu_clflush_line_size = 32;
u_int cpu_stdext_feature;
+u_int cpu_stdext_feature2;
u_int cpu_max_ext_state_size;
+u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */
+u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */
+u_int cpu_mon_max_size; /* MONITOR minimum range size, bytes */
+u_int cpu_maxphyaddr; /* Max phys addr width in bits */
SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
&via_feature_rng, 0, "VIA RNG feature available in CPU");
@@ -89,6 +90,7 @@
static void
init_amd(void)
{
+ uint64_t msr;
/*
* Work around Erratum 721 for Family 10h and 12h processors.
@@ -113,26 +115,45 @@
}
/*
- * Work around a problem on Ryzen that is triggered by executing
- * code near the top of user memory, in our case the signal
- * trampoline code in the shared page on amd64.
- *
- * This function is executed once for the BSP before tunables take
- * effect so the value determined here can be overridden by the
- * tunable. This function is then executed again for each AP and
- * also on resume. Set a flag the first time so that value set by
- * the tunable is not overwritten.
- *
- * The stepping and/or microcode versions should be checked after
- * this issue is fixed by AMD so that we don't use this mode if not
- * needed.
+ * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG.
+ * So, do it here or otherwise some tools could be confused by
+ * Initial Local APIC ID reported with CPUID Function 1 in EBX.
*/
- if (lower_sharedpage_init == 0) {
- lower_sharedpage_init = 1;
- if (CPUID_TO_FAMILY(cpu_id) == 0x17) {
- hw_lower_amd64_sharedpage = 1;
+ if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(MSR_NB_CFG1);
+ msr |= (uint64_t)1 << 54;
+ wrmsr(MSR_NB_CFG1, msr);
}
}
+
+ /*
+ * BIOS may configure Family 10h processors to convert WC+ cache type
+ * to CD. That can hurt performance of guest VMs using nested paging.
+ * The relevant MSR bit is not documented in the BKDG,
+ * the fix is borrowed from Linux.
+ */
+ if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(0xc001102a);
+ msr &= ~((uint64_t)1 << 24);
+ wrmsr(0xc001102a, msr);
+ }
+ }
+
+ /*
+ * Work around Erratum 793: Specific Combination of Writes to Write
+ * Combined Memory Types and Locked Instructions May Cause Core Hang.
+ * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors,
+ * revision 3.04 or later, publication 51810.
+ */
+ if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(0xc0011020);
+ msr |= (uint64_t)1 << 15;
+ wrmsr(0xc0011020, msr);
+ }
+ }
}
/*
@@ -216,7 +237,7 @@
}
void
-initializecpucache()
+initializecpucache(void)
{
/*
@@ -233,12 +254,17 @@
* CPUID_SS feature even though the native CPU supports it.
*/
TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
- if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1)
+ if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) {
cpu_feature &= ~CPUID_CLFSH;
+ cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
+ }
+
/*
- * Allow to disable CLFLUSH feature manually by
- * hw.clflush_disable tunable.
+ * The kernel's use of CLFLUSH{,OPT} can be disabled manually
+ * by setting the hw.clflush_disable tunable.
*/
- if (hw_clflush_disable == 1)
+ if (hw_clflush_disable == 1) {
cpu_feature &= ~CPUID_CLFSH;
+ cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
+ }
}
Modified: trunk/sys/amd64/amd64/io.c
===================================================================
--- trunk/sys/amd64/amd64/io.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/io.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Mark R V Murray
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/io.c 207329 2010-04-28 15:38:01Z attilio $");
#include <sys/param.h>
#include <sys/proc.h>
Modified: trunk/sys/amd64/amd64/locore.S
===================================================================
--- trunk/sys/amd64/amd64/locore.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/locore.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm <peter at FreeBSD.org>
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/locore.S 115431 2003-05-31 06:54:29Z peter $
*/
#include <machine/asmacros.h>
Property changes on: trunk/sys/amd64/amd64/locore.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/machdep.c
===================================================================
--- trunk/sys/amd64/amd64/machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
@@ -39,7 +40,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/machdep.c 333370 2018-05-08 17:05:39Z emaste $");
#include "opt_atalk.h"
#include "opt_atpic.h"
@@ -53,6 +54,7 @@
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
+#include "opt_platform.h"
#include "opt_sched.h"
#include "opt_kdtrace.h"
@@ -65,6 +67,7 @@
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
+#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
@@ -80,6 +83,7 @@
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
@@ -132,6 +136,9 @@
#ifdef SMP
#include <machine/smp.h>
#endif
+#ifdef FDT
+#include <x86/fdt.h>
+#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
@@ -147,10 +154,6 @@
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
-extern void printcpuinfo(void); /* XXX header file */
-extern void identify_cpu(void);
-extern void panicifcpuunsupported(void);
-
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
@@ -157,7 +160,7 @@
static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
char *xfpusave, size_t xfpusave_len);
-static int set_fpcontext(struct thread *td, const mcontext_t *mcp,
+static int set_fpcontext(struct thread *td, mcontext_t *mcp,
char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
@@ -211,6 +214,8 @@
struct mtx dt_lock; /* lock for GDT and LDT */
+void (*vmm_resume_p)(void);
+
static void
cpu_startup(dummy)
void *dummy;
@@ -230,9 +235,11 @@
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
+ strncmp(sysenv, "MacBook4,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
+ strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
@@ -251,7 +258,6 @@
#ifdef PERFMON
perfmon_init();
#endif
- realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports reasonable amount.
@@ -265,6 +271,7 @@
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
+ realmem = atop(memsize);
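Note: realmem is now derived from the SMBIOS/physmap-based memsize (a byte count) via atop() instead of being copied from Maxmem; for example, with 4 KiB pages a memsize of 8 GiB gives realmem = 8589934592 / 4096 = 2097152 pages.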
/*
* Display any holes after the first chunk of extended memory.
@@ -298,11 +305,6 @@
vm_pager_bufferinit();
cpu_setregs();
-
- /*
- * Add BSP as an interrupt target.
- */
- intr_add_cpu(0);
}
/*
@@ -383,10 +385,6 @@
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
- /* Translate the signal if appropriate. */
- if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
- sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
-
/* Build the argument list for the signal handler. */
regs->tf_rdi = sig; /* arg 1 in %rdi */
regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
@@ -487,17 +485,7 @@
/*
* Don't allow users to change privileged or reserved flags.
*/
- /*
- * XXX do allow users to change the privileged flag PSL_RF.
- * The cpu sets PSL_RF in tf_rflags for faults. Debuggers
- * should sometimes set it there too. tf_rflags is kept in
- * the signal context during signal handling and there is no
- * other place to remember it, so the PSL_RF bit may be
- * corrupted by the signal handler without us knowing.
- * Corruption of the PSL_RF bit at worst causes one more or
- * one less debugger trap, so allowing it is fairly harmless.
- */
- if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ if (!EFL_SECURE(rflags, regs->tf_rflags)) {
uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
td->td_name, rflags);
return (EINVAL);
@@ -661,10 +649,10 @@
cpu_halt(void)
{
for (;;)
- __asm__ ("hlt");
+ halt();
}
-void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
+void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
@@ -676,17 +664,19 @@
#define STATE_SLEEPING 0x2
static void
-cpu_idle_acpi(int busy)
+cpu_idle_acpi(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
+
+ /* See comments in cpu_idle_hlt(). */
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
- cpu_idle_hook();
+ cpu_idle_hook(sbt);
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
@@ -693,15 +683,28 @@
}
static void
-cpu_idle_hlt(int busy)
+cpu_idle_hlt(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
+
/*
- * We must absolutely guarentee that hlt is the next instruction
- * after sti or we introduce a timing window.
+ * Since we may be in a critical section from cpu_idle(), if
+ * an interrupt fires during that critical section we may have
+ * a pending preemption. If the CPU halts, then that thread
+ * may not execute until a later interrupt awakens the CPU.
+ * To handle this race, check for a runnable thread after
+ * disabling interrupts and immediately return if one is
+ * found. Also, we must absolutely guarantee that hlt is
+ * the next instruction after sti. This ensures that any
+ * interrupt that fires after the call to disable_intr() will
+ * immediately awaken the CPU from hlt. Finally, please note
+ * that on x86 this works fine because interrupts are enabled
+ * only after the instruction following sti executes, while IF
+ * is set to 1 immediately, allowing the hlt instruction to
+ * acknowledge the interrupt.
*/
disable_intr();
if (sched_runnable())
@@ -711,32 +714,31 @@
*state = STATE_RUNNING;
}
-/*
- * MWAIT cpu power states. Lower 4 bits are sub-states.
- */
-#define MWAIT_C0 0xf0
-#define MWAIT_C1 0x00
-#define MWAIT_C2 0x10
-#define MWAIT_C3 0x20
-#define MWAIT_C4 0x30
-
static void
-cpu_idle_mwait(int busy)
+cpu_idle_mwait(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
- if (!sched_runnable()) {
- cpu_monitor(state, 0, 0);
- if (*state == STATE_MWAIT)
- cpu_mwait(0, MWAIT_C1);
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable()) {
+ enable_intr();
+ *state = STATE_RUNNING;
+ return;
}
+ cpu_monitor(state, 0, 0);
+ if (*state == STATE_MWAIT)
+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+ else
+ enable_intr();
*state = STATE_RUNNING;
}
static void
-cpu_idle_spin(int busy)
+cpu_idle_spin(sbintime_t sbt)
{
int *state;
int i;
@@ -743,6 +745,12 @@
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
+
+ /*
+ * The sched_runnable() call is racy, but since it sits in a
+ * loop, missing it once has little impact, if any (and it is
+ * much better than not checking at all).
+ */
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
@@ -779,12 +787,13 @@
}
}
-void (*cpu_idle_fn)(int) = cpu_idle_acpi;
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
uint64_t msr;
+ sbintime_t sbt = -1;
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
@@ -802,11 +811,11 @@
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
- cpu_idleclock();
+ sbt = cpu_idleclock();
}
/* Apply AMD APIC timer C1E workaround. */
- if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
+ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
@@ -813,7 +822,7 @@
}
/* Call main idle method. */
- cpu_idle_fn(busy);
+ cpu_idle_fn(sbt);
/* Switch timers back into active mode. */
if (!busy) {
@@ -1015,6 +1024,7 @@
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
+static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
@@ -1146,12 +1156,7 @@
};
void
-setidt(idx, func, typ, dpl, ist)
- int idx;
- inthand_t *func;
- int typ;
- int dpl;
- int ist;
+setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
struct gate_descriptor *ip;
@@ -1175,6 +1180,9 @@
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
+#ifdef XENHVM
+ IDTVEC(xen_intr_upcall),
+#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
#ifdef DDB
@@ -1199,6 +1207,51 @@
ip++;
}
}
+
+/* Show privileged registers. */
+DB_SHOW_COMMAND(sysregs, db_show_sysregs)
+{
+ struct {
+ uint16_t limit;
+ uint64_t base;
+ } __packed idtr, gdtr;
+ uint16_t ldt, tr;
+
+ __asm __volatile("sidt %0" : "=m" (idtr));
+ db_printf("idtr\t0x%016lx/%04x\n",
+ (u_long)idtr.base, (u_int)idtr.limit);
+ __asm __volatile("sgdt %0" : "=m" (gdtr));
+ db_printf("gdtr\t0x%016lx/%04x\n",
+ (u_long)gdtr.base, (u_int)gdtr.limit);
+ __asm __volatile("sldt %0" : "=r" (ldt));
+ db_printf("ldtr\t0x%04x\n", ldt);
+ __asm __volatile("str %0" : "=r" (tr));
+ db_printf("tr\t0x%04x\n", tr);
+ db_printf("cr0\t0x%016lx\n", rcr0());
+ db_printf("cr2\t0x%016lx\n", rcr2());
+ db_printf("cr3\t0x%016lx\n", rcr3());
+ db_printf("cr4\t0x%016lx\n", rcr4());
+ if (rcr4() & CR4_XSAVE)
+ db_printf("xcr0\t0x%016lx\n", rxcr(0));
+ db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
+ if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
+ db_printf("FEATURES_CTL\t%016lx\n",
+ rdmsr(MSR_IA32_FEATURE_CONTROL));
+ db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
+ db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
+ db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
+}
+
+DB_SHOW_COMMAND(dbregs, db_show_dbregs)
+{
+
+ db_printf("dr0\t0x%016lx\n", rdr0());
+ db_printf("dr1\t0x%016lx\n", rdr1());
+ db_printf("dr2\t0x%016lx\n", rdr2());
+ db_printf("dr3\t0x%016lx\n", rdr3());
+ db_printf("dr6\t0x%016lx\n", rdr6());
+ db_printf("dr7\t0x%016lx\n", rdr7());
+}
#endif
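Note: commands declared with DB_SHOW_COMMAND() are reached from the ddb prompt as subcommands of "show", so the two additions above are invoked as "show sysregs" and "show dbregs". The register values in the following transcript are illustrative only:

db> show sysregs
idtr	0xffffffff81ff3590/0fff
cr0	0x0000000080050033
cr4	0x00000000000406e0
EFER	0x0000000000000d01
db> show dbregs
dr6	0x00000000ffff0ff0
dr7	0x0000000000000400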
void
@@ -1276,30 +1329,26 @@
u_int basemem;
static int
-add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
+add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
+ int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
- if (boothowto & RB_VERBOSE)
- printf("SMAP type=%02x base=%016lx len=%016lx\n",
- smap->type, smap->base, smap->length);
-
- if (smap->type != SMAP_TYPE_MEMORY)
+ if (length == 0)
return (1);
- if (smap->length == 0)
- return (0);
-
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
+ *
+ * NB: physmap_idx points to the next free slot.
*/
- insert_idx = physmap_idx + 2;
+ insert_idx = physmap_idx;
for (i = 0; i <= physmap_idx; i += 2) {
- if (smap->base < physmap[i + 1]) {
- if (smap->base + smap->length <= physmap[i]) {
+ if (base < physmap[i + 1]) {
+ if (base + length <= physmap[i]) {
insert_idx = i;
break;
}
@@ -1311,15 +1360,14 @@
}
/* See if we can prepend to the next entry. */
- if (insert_idx <= physmap_idx &&
- smap->base + smap->length == physmap[insert_idx]) {
- physmap[insert_idx] = smap->base;
+ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
+ physmap[insert_idx] = base;
return (1);
}
/* See if we can append to the previous entry. */
- if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
- physmap[insert_idx - 1] += smap->length;
+ if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
+ physmap[insert_idx - 1] += length;
return (1);
}
@@ -1335,17 +1383,148 @@
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
- for (i = physmap_idx; i > insert_idx; i -= 2) {
+ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
- physmap[insert_idx] = smap->base;
- physmap[insert_idx + 1] = smap->base + smap->length;
+ physmap[insert_idx] = base;
+ physmap[insert_idx + 1] = base + length;
return (1);
}
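Note on the merge logic above: a new range is coalesced with a neighbour whenever they abut. For example, if physmap already holds the pair [0x100000, 0x9f000000) and add_physmap_entry() is called with base = 0x9f000000 and length = 0x1000000, the "append to the previous entry" test matches and the existing pair simply grows to [0x100000, 0xa0000000) without consuming another physmap slot.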
+static void
+add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
+ int *physmap_idx)
+{
+ struct bios_smap *smap, *smapend;
+ u_int32_t smapsize;
+
+ /*
+ * Memory map from INT 15:E820.
+ *
+ * subr_module.c says:
+ * "Consumer may safely assume that size value precedes data."
+ * ie: an int32_t immediately precedes smap.
+ */
+ smapsize = *((u_int32_t *)smapbase - 1);
+ smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+
+ for (smap = smapbase; smap < smapend; smap++) {
+ if (boothowto & RB_VERBOSE)
+ printf("SMAP type=%02x base=%016lx len=%016lx\n",
+ smap->type, smap->base, smap->length);
+
+ if (smap->type != SMAP_TYPE_MEMORY)
+ continue;
+
+ if (!add_physmap_entry(smap->base, smap->length, physmap,
+ physmap_idx))
+ break;
+ }
+}
+
+#define efi_next_descriptor(ptr, size) \
+ ((struct efi_md *)(((uint8_t *) ptr) + size))
+
+static void
+add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
+ int *physmap_idx)
+{
+ struct efi_md *map, *p;
+ const char *type;
+ size_t efisz;
+ int ndesc, i;
+
+ static const char *types[] = {
+ "Reserved",
+ "LoaderCode",
+ "LoaderData",
+ "BootServicesCode",
+ "BootServicesData",
+ "RuntimeServicesCode",
+ "RuntimeServicesData",
+ "ConventionalMemory",
+ "UnusableMemory",
+ "ACPIReclaimMemory",
+ "ACPIMemoryNVS",
+ "MemoryMappedIO",
+ "MemoryMappedIOPortSpace",
+ "PalCode"
+ };
+
+ /*
+ * Memory map data provided by UEFI via the GetMemoryMap
+ * Boot Services API.
+ */
+ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
+ map = (struct efi_md *)((uint8_t *)efihdr + efisz);
+
+ if (efihdr->descriptor_size == 0)
+ return;
+ ndesc = efihdr->memory_size / efihdr->descriptor_size;
+
+ if (boothowto & RB_VERBOSE)
+ printf("%23s %12s %12s %8s %4s\n",
+ "Type", "Physical", "Virtual", "#Pages", "Attr");
+
+ for (i = 0, p = map; i < ndesc; i++,
+ p = efi_next_descriptor(p, efihdr->descriptor_size)) {
+ if (boothowto & RB_VERBOSE) {
+ if (p->md_type <= EFI_MD_TYPE_PALCODE)
+ type = types[p->md_type];
+ else
+ type = "<INVALID>";
+ printf("%23s %012lx %12p %08lx ", type, p->md_phys,
+ p->md_virt, p->md_pages);
+ if (p->md_attr & EFI_MD_ATTR_UC)
+ printf("UC ");
+ if (p->md_attr & EFI_MD_ATTR_WC)
+ printf("WC ");
+ if (p->md_attr & EFI_MD_ATTR_WT)
+ printf("WT ");
+ if (p->md_attr & EFI_MD_ATTR_WB)
+ printf("WB ");
+ if (p->md_attr & EFI_MD_ATTR_UCE)
+ printf("UCE ");
+ if (p->md_attr & EFI_MD_ATTR_WP)
+ printf("WP ");
+ if (p->md_attr & EFI_MD_ATTR_RP)
+ printf("RP ");
+ if (p->md_attr & EFI_MD_ATTR_XP)
+ printf("XP ");
+ if (p->md_attr & EFI_MD_ATTR_RT)
+ printf("RUNTIME");
+ printf("\n");
+ }
+
+ switch (p->md_type) {
+ case EFI_MD_TYPE_CODE:
+ case EFI_MD_TYPE_DATA:
+ case EFI_MD_TYPE_BS_CODE:
+ case EFI_MD_TYPE_BS_DATA:
+ case EFI_MD_TYPE_FREE:
+ /*
+ * We're allowed to use any entry with these types.
+ */
+ break;
+ default:
+ continue;
+ }
+
+ if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
+ physmap, physmap_idx))
+ break;
+ }
+}
+
+static char bootmethod[16] = "";
+SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
+ "System firmware boot method");
+
+#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
+
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
@@ -1363,32 +1542,30 @@
vm_paddr_t pa, physmap[PHYSMAP_SIZE];
u_long physmem_start, physmem_tunable, memtest;
pt_entry_t *pte;
- struct bios_smap *smapbase, *smap, *smapend;
- u_int32_t smapsize;
+ struct bios_smap *smapbase;
+ struct efi_map_header *efihdr;
quad_t dcons_addr, dcons_size;
+ int page_counter;
bzero(physmap, sizeof(physmap));
- basemem = 0;
physmap_idx = 0;
- /*
- * get memory map from INT 15:E820, kindly supplied by the loader.
- *
- * subr_module.c says:
- * "Consumer may safely assume that size value precedes data."
- * ie: an int32_t immediately precedes smap.
- */
+ efihdr = (struct efi_map_header *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP);
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
- if (smapbase == NULL)
- panic("No BIOS smap info from loader!");
- smapsize = *((u_int32_t *)smapbase - 1);
- smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+ if (efihdr != NULL) {
+ add_efi_map_entries(efihdr, physmap, &physmap_idx);
+ strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
+ } else if (smapbase != NULL) {
+ add_smap_entries(smapbase, physmap, &physmap_idx);
+ strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
+ } else {
+ panic("No BIOS smap or EFI map info from loader!");
+ }
- for (smap = smapbase; smap < smapend; smap++)
- if (!add_smap_entry(smap, physmap, &physmap_idx))
- break;
+ physmap_idx -= 2;
/*
* Find the 'base memory' segment for SMP
@@ -1395,16 +1572,22 @@
*/
basemem = 0;
for (i = 0; i <= physmap_idx; i += 2) {
- if (physmap[i] == 0x00000000) {
+ if (physmap[i] <= 0xA0000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
- if (basemem == 0)
- panic("BIOS smap did not include a basemem segment!");
+ if (basemem == 0 || basemem > 640) {
+ if (bootverbose)
+ printf(
+ "Memory map doesn't contain a basemem segment, faking it");
+ basemem = 640;
+ }
#ifdef SMP
/* make hole for AP bootstrap code */
+ if (physmap[1] >= 0x100000000)
+ panic("Basemem segment is not suitable for AP bootstrap code!");
physmap[1] = mp_bootaddress(physmap[1] / 1024);
#endif
@@ -1424,13 +1607,15 @@
Maxmem = atop(physmem_tunable);
/*
- * By default enable the memory test on real hardware, and disable
- * it if we appear to be running in a VM. This avoids touching all
- * pages unnecessarily, which doesn't matter on real hardware but is
- * bad for shared VM hosts. Use a general name so that
- * one could eventually do more with the code than just disable it.
+ * The boot memory test is disabled by default, as it takes a
+ * significant amount of time on large-memory systems, and is
+ * unfriendly to virtual machines as it unnecessarily touches all
+ * pages.
+ *
+ * A general name is used as the code may be extended to support
+ * additional tests beyond the current "page present" test.
*/
- memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
+ memtest = 0;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
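Note: with the new default the exhaustive test only runs when requested; setting hw.memtest.tests=1 (for example in /boot/loader.conf) restores the old behaviour. When the test does run, the loop below prints one '.' per PAGES_PER_GB pages, i.e. every 1 GiB / 4 KiB = 262144 pages tested.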
/*
@@ -1456,12 +1641,14 @@
*/
physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
- if (physmem_start < PAGE_SIZE)
- physmap[0] = PAGE_SIZE;
- else if (physmem_start >= physmap[1])
- physmap[0] = round_page(physmap[1] - PAGE_SIZE);
- else
- physmap[0] = round_page(physmem_start);
+ if (physmap[0] < physmem_start) {
+ if (physmem_start < PAGE_SIZE)
+ physmap[0] = PAGE_SIZE;
+ else if (physmem_start >= physmap[1])
+ physmap[0] = round_page(physmap[1] - PAGE_SIZE);
+ else
+ physmap[0] = round_page(physmem_start);
+ }
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
@@ -1480,6 +1667,9 @@
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
+ page_counter = 0;
+ if (memtest != 0)
+ printf("Testing system memory");
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
@@ -1510,9 +1700,17 @@
goto skip_memtest;
/*
+ * Print a "." every GB to show we're making
+ * progress.
+ */
+ page_counter++;
+ if ((page_counter % PAGES_PER_GB) == 0)
+ printf(".");
+
+ /*
* map page into kernel: valid, read/write,non-cacheable
*/
- *pte = pa | PG_V | PG_RW | PG_N;
+ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;
@@ -1596,6 +1794,8 @@
}
*pte = 0;
invltlb();
+ if (memtest != 0)
+ printf("\n");
/*
* XXX
@@ -1649,12 +1849,15 @@
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
- kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
+ init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE, 0);
#ifdef DDB
ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
+ identify_cpu();
+ identify_hypervisor();
+
/* Init basic tunables, hz etc */
init_param1();
@@ -1707,7 +1910,7 @@
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
@@ -1728,6 +1931,9 @@
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
+#ifdef XENHVM
+ setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
+#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
@@ -1740,38 +1946,14 @@
i8254_init();
/*
- * Initialize the console before we print anything out.
+ * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
+ * transition).
*/
- cninit();
+ if (kmdp != NULL && preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL)
+ vty_set_preferred(VTY_VT);
-#ifdef DEV_ISA
-#ifdef DEV_ATPIC
- elcr_probe();
- atpic_startup();
-#else
- /* Reset and mask the atpics and leave them shut down. */
- atpic_reset();
-
- /*
- * Point the ICU spurious interrupt vectors at the APIC spurious
- * interrupt handler.
- */
- setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
-#endif
-#else
-#error "have you forgotten the isa device?";
-#endif
-
- kdb_init();
-
-#ifdef KDB
- if (boothowto & RB_KDB)
- kdb_enter(KDB_WHY_BOOTFLAGS,
- "Boot flags requested debugger");
-#endif
-
- identify_cpu(); /* Final stage of CPU initialization */
+ finishidentcpu(); /* Final stage of CPU initialization */
initializecpu(); /* Initialize CPU registers */
initializecpucache();
@@ -1786,6 +1968,13 @@
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist2 = (long) np;
+ /*
+ * DB# stack, runs on ist4.
+ */
+ np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
+ np->np_pcpu = (register_t) pc;
+ common_tss[0].tss_ist4 = (long) np;
+
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) +
IOPAGES * PAGE_SIZE;
@@ -1808,6 +1997,35 @@
/* now running on new page tables, configured, and u/iom is accessible */
+ cninit();
+
+#ifdef DEV_ISA
+#ifdef DEV_ATPIC
+ elcr_probe();
+ atpic_startup();
+#else
+ /* Reset and mask the atpics and leave them shut down. */
+ atpic_reset();
+
+ /*
+ * Point the ICU spurious interrupt vectors at the APIC spurious
+ * interrupt handler.
+ */
+ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+#endif
+#else
+#error "have you forgotten the isa device?";
+#endif
+
+ kdb_init();
+
+#ifdef KDB
+ if (boothowto & RB_KDB)
+ kdb_enter(KDB_WHY_BOOTFLAGS,
+ "Boot flags requested debugger");
+#endif
+
msgbufinit(msgbufp, msgbufsize);
fpuinit();
@@ -1817,6 +2035,7 @@
* area.
*/
thread0.td_pcb = get_pcb_td(&thread0);
+ thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
if (use_xsave) {
xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
@@ -1844,7 +2063,7 @@
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
- thread0.td_pcb->pcb_cr3 = KPML4phys;
+ thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
thread0.td_frame = &proc0_tf;
env = getenv("kernelname");
@@ -1851,16 +2070,12 @@
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
-#ifdef XENHVM
- if (inw(0x10) == 0x49d2) {
- if (bootverbose)
- printf("Xen detected: disabling emulated block and network devices\n");
- outw(0x10, 3);
- }
+ cpu_probe_amdc1e();
+
+#ifdef FDT
+ x86_init_fdt();
#endif
- cpu_probe_amdc1e();
-
/* Location of kernel stack for locore */
return ((u_int64_t)thread0.td_pcb);
}
@@ -1872,6 +2087,62 @@
pcpu->pc_acpi_id = 0xffffffff;
}
+static int
+smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct bios_smap *smapbase;
+ struct bios_smap_xattr smap;
+ caddr_t kmdp;
+ uint32_t *smapattr;
+ int count, error, i;
+
+ /* Retrieve the system memory map from the loader. */
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ smapbase = (struct bios_smap *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP);
+ if (smapbase == NULL)
+ return (0);
+ smapattr = (uint32_t *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
+ count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
+ error = 0;
+ for (i = 0; i < count; i++) {
+ smap.base = smapbase[i].base;
+ smap.length = smapbase[i].length;
+ smap.type = smapbase[i].type;
+ if (smapattr != NULL)
+ smap.xattr = smapattr[i];
+ else
+ smap.xattr = 0;
+ error = SYSCTL_OUT(req, &smap, sizeof(smap));
+ }
+ return (error);
+}
+SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
+ smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
+
+static int
+efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct efi_map_header *efihdr;
+ caddr_t kmdp;
+ uint32_t efisize;
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ efihdr = (struct efi_map_header *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP);
+ if (efihdr == NULL)
+ return (0);
+ efisize = *((uint32_t *)efihdr - 1);
+ return (SYSCTL_OUT(req, efihdr, efisize));
+}
+SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
+ efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
+
void
spinlock_enter(void)
{
@@ -1926,7 +2197,9 @@
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
+
td->td_frame->tf_rip = addr;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
return (0);
}
@@ -2026,8 +2299,8 @@
tp->tf_fs = regs->r_fs;
tp->tf_gs = regs->r_gs;
tp->tf_flags = TF_HASSEGS;
- set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
return (0);
}
@@ -2175,7 +2448,7 @@
* touch the cs selector.
*/
int
-set_mcontext(struct thread *td, const mcontext_t *mcp)
+set_mcontext(struct thread *td, mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tp;
@@ -2262,10 +2535,9 @@
}
static int
-set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate,
+set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
size_t xfpustate_len)
{
- struct savefpu *fpstate;
int error;
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
@@ -2278,9 +2550,8 @@
error = 0;
} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
- fpstate = (struct savefpu *)&mcp->mc_fpstate;
- fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
- error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
+ error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
+ xfpustate, xfpustate_len);
} else
return (EINVAL);
return (error);
Modified: trunk/sys/amd64/amd64/mem.c
===================================================================
--- trunk/sys/amd64/amd64/mem.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mem.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1982, 1986, 1990 The Regents of the University of California.
@@ -37,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/mem.c 309426 2016-12-02 19:02:12Z jhb $");
/*
* Memory special file
@@ -58,6 +59,7 @@
#include <sys/systm.h>
#include <sys/uio.h>
+#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>
@@ -76,14 +78,15 @@
int
memrw(struct cdev *dev, struct uio *uio, int flags)
{
- int o;
- u_long c = 0, v;
struct iovec *iov;
- int error = 0;
- vm_offset_t addr, eaddr;
+ void *p;
+ ssize_t orig_resid;
+ u_long v, vd;
+ u_int c;
+ int error;
- GIANT_REQUIRED;
-
+ error = 0;
+ orig_resid = uio->uio_resid;
while (uio->uio_resid > 0 && error == 0) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
@@ -93,48 +96,67 @@
panic("memrw");
continue;
}
- if (dev2unit(dev) == CDEV_MINOR_MEM) {
- v = uio->uio_offset;
-kmemphys:
- o = v & PAGE_MASK;
- c = min(uio->uio_resid, (u_int)(PAGE_SIZE - o));
- error = uiomove((void *)PHYS_TO_DMAP(v), (int)c, uio);
- continue;
- }
- else if (dev2unit(dev) == CDEV_MINOR_KMEM) {
- v = uio->uio_offset;
+ v = uio->uio_offset;
+ c = ulmin(iov->iov_len, PAGE_SIZE - (u_int)(v & PAGE_MASK));
- if (v >= DMAP_MIN_ADDRESS && v < DMAP_MAX_ADDRESS) {
- v = DMAP_TO_PHYS(v);
- goto kmemphys;
+ switch (dev2unit(dev)) {
+ case CDEV_MINOR_KMEM:
+ /*
+ * Since c is clamped to be less than or equal to
+ * PAGE_SIZE, the uiomove() call does not
+ * access past the end of the direct map.
+ */
+ if (v >= DMAP_MIN_ADDRESS &&
+ v < DMAP_MIN_ADDRESS + dmaplimit) {
+ error = uiomove((void *)v, c, uio);
+ break;
}
- c = iov->iov_len;
+ if (!kernacc((void *)v, c, uio->uio_rw == UIO_READ ?
+ VM_PROT_READ : VM_PROT_WRITE)) {
+ error = EFAULT;
+ break;
+ }
/*
- * Make sure that all of the pages are currently
- * resident so that we don't create any zero-fill
- * pages.
+ * If the extracted address is not accessible
+ * through the direct map, then we make a
+ * private (uncached) mapping because we can't
+ * depend on the existing kernel mapping
+ * remaining valid until the completion of
+ * uiomove().
+ *
+ * XXX We cannot provide access to the
+ * physical page 0 mapped into KVA.
*/
- addr = trunc_page(v);
- eaddr = round_page(v + c);
-
- if (addr < VM_MIN_KERNEL_ADDRESS)
- return (EFAULT);
- for (; addr < eaddr; addr += PAGE_SIZE)
- if (pmap_extract(kernel_pmap, addr) == 0)
- return (EFAULT);
-
- if (!kernacc((caddr_t)(long)v, c,
- uio->uio_rw == UIO_READ ?
- VM_PROT_READ : VM_PROT_WRITE))
- return (EFAULT);
-
- error = uiomove((caddr_t)(long)v, (int)c, uio);
- continue;
+ v = pmap_extract(kernel_pmap, v);
+ if (v == 0) {
+ error = EFAULT;
+ break;
+ }
+ /* FALLTHROUGH */
+ case CDEV_MINOR_MEM:
+ if (v < dmaplimit) {
+ vd = PHYS_TO_DMAP(v);
+ error = uiomove((void *)vd, c, uio);
+ break;
+ }
+ if (v > cpu_getmaxphyaddr()) {
+ error = EFAULT;
+ break;
+ }
+ p = pmap_mapdev(v, PAGE_SIZE);
+ error = uiomove(p, c, uio);
+ pmap_unmapdev((vm_offset_t)p, PAGE_SIZE);
+ break;
}
- /* else panic! */
}
+ /*
+ * Don't return an error if any byte was written; read and write
+ * can return an error only if no I/O was performed.
+ */
+ if (uio->uio_resid != orig_resid)
+ error = 0;
return (error);
}
@@ -147,9 +169,11 @@
memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
int prot __unused, vm_memattr_t *memattr __unused)
{
- if (dev2unit(dev) == CDEV_MINOR_MEM)
+ if (dev2unit(dev) == CDEV_MINOR_MEM) {
+ if (offset > cpu_getmaxphyaddr())
+ return (-1);
*paddr = offset;
- else if (dev2unit(dev) == CDEV_MINOR_KMEM)
+ } else if (dev2unit(dev) == CDEV_MINOR_KMEM)
*paddr = vtophys(offset);
/* else panic! */
return (0);
Modified: trunk/sys/amd64/amd64/minidump_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/minidump_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/minidump_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2006 Peter Wemm
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/minidump_machdep.c 257575 2013-11-03 16:03:19Z kib $");
#include "opt_pmap.h"
#include "opt_watchdog.h"
@@ -39,7 +40,9 @@
#include <sys/msgbuf.h>
#include <sys/watchdog.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
#include <vm/pmap.h>
#include <machine/atomic.h>
#include <machine/elf.h>
@@ -125,8 +128,9 @@
int sofar, i;
sofar = 100 - ((progress * 100) / dumpsize);
- for (i = 0; i < 10; i++) {
- if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per)
+ for (i = 0; i < nitems(progress_track); i++) {
+ if (sofar < progress_track[i].min_per ||
+ sofar > progress_track[i].max_per)
continue;
if (progress_track[i].visited)
return;
@@ -155,8 +159,8 @@
printf("cant have both va and pa!\n");
return (EINVAL);
}
- if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) {
- printf("address not page aligned\n");
+ if ((((uintptr_t)pa) % PAGE_SIZE) != 0) {
+ printf("address not page aligned %p\n", ptr);
return (EINVAL);
}
if (ptr != NULL) {
@@ -219,8 +223,8 @@
vm_offset_t va;
int error;
uint64_t bits;
- uint64_t *pdp, *pd, *pt, pa;
- int i, j, k, n, bit;
+ uint64_t *pml4, *pdp, *pd, *pt, pa;
+ int i, ii, j, k, n, bit;
int retry_count;
struct minidumphdr mdhdr;
@@ -228,10 +232,11 @@
retry:
retry_count++;
counter = 0;
+ for (i = 0; i < nitems(progress_track); i++)
+ progress_track[i].visited = 0;
/* Walk page table pages, set bits in vm_page_dump */
pmapsize = 0;
- pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
- for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
kernel_vm_end); ) {
/*
* We always write a page, even if it is zero. Each
@@ -238,6 +243,9 @@
* page written corresponds to 1GB of space
*/
pmapsize += PAGE_SIZE;
+ ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
+ pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
if ((pdp[i] & PG_V) == 0) {
va += NBPDP;
@@ -362,9 +370,11 @@
/* Dump kernel page directory pages */
bzero(fakepd, sizeof(fakepd));
- pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
- for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
kernel_vm_end); va += NBPDP) {
+ ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
+ pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
/* We always write a page, even if it is zero */
Modified: trunk/sys/amd64/amd64/mp_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/mp_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mp_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1996, by Steve Passe
* Copyright (c) 2003, by Peter Wemm
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/mp_machdep.c 333370 2018-05-08 17:05:39Z emaste $");
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -69,6 +70,7 @@
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>
+#include <machine/cpu.h>
#define WARMBOOT_TARGET 0
#define WARMBOOT_OFF (KERNBASE + 0x0467)
@@ -97,16 +99,19 @@
/* Temporary variables for init_secondary() */
char *doublefault_stack;
char *nmi_stack;
+char *dbg_stack;
void *dpcpu;
struct pcb stoppcbs[MAXCPU];
-struct pcb **susppcbs;
-void **suspfpusave;
+struct susppcb **susppcbs;
/* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
+struct invpcid_descr smp_tlb_invpcid;
volatile int smp_tlb_wait;
+uint64_t pcid_cr3;
+pmap_t smp_tlb_pmap;
+extern int invpcid_works;
#ifdef COUNT_IPIS
/* Interrupt counts. */
@@ -120,8 +125,15 @@
static u_long *ipi_hardclock_counts[MAXCPU];
#endif
+/* Default cpu_ops implementation. */
+struct cpu_ops cpu_ops = {
+ .ipi_vectored = lapic_ipi_vectored
+};
+
extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+extern int pmap_pcid_enabled;
+
/*
* Local data and functions.
*/
@@ -128,6 +140,9 @@
static volatile cpuset_t ipi_nmi_pending;
+volatile cpuset_t resuming_cpus;
+volatile cpuset_t toresume_cpus;
+
/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;
@@ -148,7 +163,7 @@
int apic_cpuids[MAX_APIC_ID + 1];
/* Holds pending bitmap based IPIs per CPU */
-static volatile u_int cpu_ipi_pending[MAXCPU];
+volatile u_int cpu_ipi_pending[MAXCPU];
static u_int boot_address;
static int cpu_logical; /* logical cpus per core */
@@ -519,8 +534,15 @@
}
/* Install an inter-CPU IPI for TLB invalidation */
- setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
+ if (pmap_pcid_enabled) {
+ setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IPI_INVLPG, IDTVEC(invlpg_pcid), SDT_SYSIGT,
+ SEL_KPL, 0);
+ } else {
+ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
+ }
setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for cache invalidation. */
@@ -568,7 +590,7 @@
const char *hyperthread;
int i;
- printf("MidnightBSD/SMP: %d package(s) x %d core(s)",
+ printf("FreeBSD/SMP: %d package(s) x %d core(s)",
mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
if (hyperthreading_cpus > 1)
printf(" x %d HTT threads", cpu_logical);
@@ -627,6 +649,10 @@
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist2 = (long) np;
+ /* The DB# stack runs on IST4. */
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+ common_tss[cpu].tss_ist4 = (long) np;
+
/* Prepare private GDT */
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
for (x = 0; x < NGDT; x++) {
@@ -662,9 +688,14 @@
/* Save the per-cpu pointer for use by the NMI handler. */
np->np_pcpu = (register_t) pc;
+ /* Save the per-cpu pointer for use by the DB# handler. */
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+ np->np_pcpu = (register_t) pc;
+
wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
+ fix_cpuid();
lidt(&r_idt);
@@ -706,12 +737,15 @@
/* set up CPU registers and state */
cpu_setregs();
- /* set up SSE/NX registers */
+ /* set up SSE/NX */
initializecpu();
/* set up FPU state on the AP */
fpuinit();
+ if (cpu_ops.cpu_init)
+ cpu_ops.cpu_init();
+
/* A quick check from sanity claus */
cpuid = PCPU_GET(cpuid);
if (PCPU_GET(apic_id) != lapic_id()) {
@@ -751,7 +785,6 @@
if (smp_cpus == mp_ncpus) {
/* enable IPI's, tlb shootdown, freezes etc */
atomic_store_rel_int(&smp_started, 1);
- smp_active = 1; /* historic */
}
/*
@@ -760,6 +793,8 @@
*/
load_cr4(rcr4() | CR4_PGE);
+ if (pmap_pcid_enabled)
+ load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
@@ -786,6 +821,8 @@
* We tell the I/O APIC code about all the CPUs we want to receive
* interrupts. If we don't want certain CPUs to receive IRQs we
* can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
*/
static void
set_interrupt_apic_ids(void)
@@ -796,6 +833,8 @@
apic_id = cpu_apic_ids[i];
if (apic_id == -1)
continue;
+ if (cpu_info[apic_id].cpu_bsp)
+ continue;
if (cpu_info[apic_id].cpu_disabled)
continue;
@@ -935,10 +974,16 @@
apic_id = cpu_apic_ids[cpu];
/* allocate and set up an idle stack data page */
- bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
- doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
- nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
- dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
+ bootstacks[cpu] = (void *)kmem_malloc(kernel_arena,
+ KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO);
+ doublefault_stack = (char *)kmem_malloc(kernel_arena,
+ PAGE_SIZE, M_WAITOK | M_ZERO);
+ nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+ M_WAITOK | M_ZERO);
+ dbg_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+ M_WAITOK | M_ZERO);
+ dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
+ M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
bootAP = cpu;
@@ -983,57 +1028,8 @@
/* used as a watchpoint to signal AP startup */
cpus = mp_naps;
- /*
- * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
- * and running the target CPU. OR this INIT IPI might be latched (P5
- * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
- * ignored.
- */
+ ipi_startup(apic_id, vector);
- /* do an INIT IPI: assert RESET */
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
-
- /* wait for pending status end */
- lapic_ipi_wait(-1);
-
- /* do an INIT IPI: deassert RESET */
- lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
-
- /* wait for pending status end */
- DELAY(10000); /* wait ~10mS */
- lapic_ipi_wait(-1);
-
- /*
- * next we do a STARTUP IPI: the previous INIT IPI might still be
- * latched, (P5 bug) this 1st STARTUP would then terminate
- * immediately, and the previously started INIT IPI would continue. OR
- * the previous INIT IPI has already run. and this STARTUP IPI will
- * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
- * will run.
- */
-
- /* do a STARTUP IPI */
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
- vector, apic_id);
- lapic_ipi_wait(-1);
- DELAY(200); /* wait ~200uS */
-
- /*
- * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
- * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
- * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
- * recognized after hardware RESET or INIT IPI.
- */
-
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
- vector, apic_id);
- lapic_ipi_wait(-1);
- DELAY(200); /* wait ~200uS */
-
/* Wait up to 5 seconds for it to start. */
for (ms = 0; ms < 5000; ms++) {
if (mp_naps > cpus)
@@ -1080,6 +1076,69 @@
#endif /* COUNT_XINVLTLB_HITS */
/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
+ /*
+ * This attempts to follow the algorithm described in the
+ * Intel Multiprocessor Specification v1.4 in section B.4.
+ * For each IPI, we allow the local APIC ~20us to deliver the
+ * IPI. If that times out, we panic.
+ */
+
+ /*
+ * first we do an INIT IPI: this INIT IPI might be run, resetting
+ * and running the target CPU. OR this INIT IPI might be latched (P5
+ * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
+ * ignored.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+ lapic_ipi_wait(100);
+
+ /* Explicitly deassert the INIT IPI. */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
+ apic_id);
+
+ DELAY(10000); /* wait ~10mS */
+
+ /*
+ * next we do a STARTUP IPI: the previous INIT IPI might still be
+ * latched, (P5 bug) this 1st STARTUP would then terminate
+ * immediately, and the previously started INIT IPI would continue. OR
+ * the previous INIT IPI has already run. and this STARTUP IPI will
+ * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
+ * will run.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver first STARTUP IPI to APIC %d",
+ apic_id);
+ DELAY(200); /* wait ~200uS */
+
+ /*
+ * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+ * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+ * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+ * recognized after hardware RESET or INIT IPI.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver second STARTUP IPI to APIC %d",
+ apic_id);
+
+ DELAY(200); /* wait ~200uS */
+}
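Note: ignoring the bounded lapic_ipi_wait() spins, the fixed cost of this sequence is DELAY(10000) + 2 * DELAY(200), roughly 10.4 ms per AP; the caller then polls for up to a further five seconds for the AP to increment mp_naps before giving up.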
+
+/*
* Send an IPI to specified CPU handling the bitmap logic.
*/
static void
@@ -1100,7 +1159,7 @@
if (old_pending)
return;
}
- lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+ cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]);
}
/*
@@ -1107,7 +1166,8 @@
* Flush the TLB on all other CPU's
*/
static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
u_int ncpu;
@@ -1117,8 +1177,15 @@
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = pmap->pm_cr3;
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
ipi_all_but_self(vector);
while (smp_tlb_wait < ncpu)
@@ -1127,7 +1194,8 @@
}
static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+ vm_offset_t addr1, vm_offset_t addr2)
{
int cpu, ncpu, othercpus;
@@ -1143,8 +1211,15 @@
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = pmap->pm_cr3;
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
if (CPU_ISFULLSET(&mask)) {
ncpu = othercpus;
@@ -1151,7 +1226,7 @@
ipi_all_but_self(vector);
} else {
ncpu = 0;
- while ((cpu = cpusetobj_ffs(&mask)) != 0) {
+ while ((cpu = CPU_FFS(&mask)) != 0) {
cpu--;
CPU_CLR(cpu, &mask);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
@@ -1170,15 +1245,15 @@
{
if (smp_started)
- smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+ smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
}
void
-smp_invltlb(void)
+smp_invltlb(pmap_t pmap)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+ smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_global++;
#endif
@@ -1186,11 +1261,11 @@
}
void
-smp_invlpg(vm_offset_t addr)
+smp_invlpg(pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+ smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_page++;
#endif
@@ -1198,11 +1273,11 @@
}
void
-smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+ smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_range++;
ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1211,11 +1286,11 @@
}
void
-smp_masked_invltlb(cpuset_t mask)
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_global++;
#endif
@@ -1223,11 +1298,11 @@
}
void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_page++;
#endif
@@ -1235,11 +1310,13 @@
}
void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1,
+ addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_range++;
ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1300,7 +1377,7 @@
if (ipi == IPI_STOP_HARD)
CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);
- while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
+ while ((cpu = CPU_FFS(&cpus)) != 0) {
cpu--;
CPU_CLR(cpu, &cpus);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
@@ -1352,7 +1429,7 @@
CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);
CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
- lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+ cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}
int
@@ -1417,33 +1494,223 @@
{
u_int cpu;
+ mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
cpu = PCPU_GET(cpuid);
-
- if (savectx(susppcbs[cpu])) {
- ctx_fpusave(suspfpusave[cpu]);
+ if (savectx(&susppcbs[cpu]->sp_pcb)) {
+ fpususpend(susppcbs[cpu]->sp_fpususpend);
wbinvd();
- CPU_SET_ATOMIC(cpu, &stopped_cpus);
+ CPU_SET_ATOMIC(cpu, &suspended_cpus);
+ /*
+ * Hack for xen, which does not use resumectx() so never
+ * uses the next clause: set resuming_cpus early so that
+ * resume_cpus() can wait on the same bitmap for acpi and
+ * xen. resuming_cpus now means eventually_resumable_cpus.
+ */
+ CPU_SET_ATOMIC(cpu, &resuming_cpus);
} else {
+ fpuresume(susppcbs[cpu]->sp_fpususpend);
pmap_init_pat();
- load_cr3(susppcbs[cpu]->pcb_cr3);
initializecpu();
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
+
+ /* Indicate that we are resuming */
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
}
- /* Wait for resume */
- while (!CPU_ISSET(cpu, &started_cpus))
+ /* Wait for resume directive */
+ while (!CPU_ISSET(cpu, &toresume_cpus))
ia32_pause();
+ if (cpu_ops.cpu_resume)
+ cpu_ops.cpu_resume();
+ if (vmm_resume_p)
+ vmm_resume_p();
+
/* Resume MCA and local APIC */
mca_resume();
lapic_setup(0);
- CPU_CLR_ATOMIC(cpu, &started_cpus);
- CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+ /* Indicate that we are resumed */
+ CPU_CLR_ATOMIC(cpu, &resuming_cpus);
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+ CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}
/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ invltlb();
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invltlb_pcid_handler(void)
+{
+ uint64_t cr3;
+ u_int cpuid;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ if (smp_tlb_invpcid.pcid != (uint64_t)-1 &&
+ smp_tlb_invpcid.pcid != 0) {
+ if (invpcid_works) {
+ invpcid(&smp_tlb_invpcid, INVPCID_CTX);
+ } else {
+ /* Otherwise reload %cr3 twice. */
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3) {
+ load_cr3(pcid_cr3);
+ cr3 |= CR3_PCID_SAVE;
+ }
+ load_cr3(cr3);
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ if (smp_tlb_pmap != NULL) {
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &smp_tlb_pmap->pm_save);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlpg_handler(void)
+{
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ invlpg(smp_tlb_invpcid.addr);
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlpg_pcid_handler(void)
+{
+ uint64_t cr3;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ if (smp_tlb_invpcid.pcid == (uint64_t)-1) {
+ invltlb_globpcid();
+ } else if (smp_tlb_invpcid.pcid == 0) {
+ invlpg(smp_tlb_invpcid.addr);
+ } else if (invpcid_works) {
+ invpcid(&smp_tlb_invpcid, INVPCID_ADDR);
+ } else {
+ /*
+ * PCID supported, but INVPCID is not.
+ * Temporarily switch to the target address
+ * space and do INVLPG.
+ */
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3)
+ load_cr3(pcid_cr3 | CR3_PCID_SAVE);
+ invlpg(smp_tlb_invpcid.addr);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+static inline void
+invlpg_range(vm_offset_t start, vm_offset_t end)
+{
+
+ do {
+ invlpg(start);
+ start += PAGE_SIZE;
+ } while (start < end);
+}
+
+void
+invlrng_handler(void)
+{
+ struct invpcid_descr d;
+ vm_offset_t addr;
+ uint64_t cr3;
+ u_int cpuid;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ addr = smp_tlb_invpcid.addr;
+ if (pmap_pcid_enabled) {
+ if (smp_tlb_invpcid.pcid == 0) {
+ /*
+ * kernel pmap - use invlpg to invalidate
+ * global mapping.
+ */
+ invlpg_range(addr, smp_tlb_addr2);
+ } else if (smp_tlb_invpcid.pcid == (uint64_t)-1) {
+ invltlb_globpcid();
+ if (smp_tlb_pmap != NULL) {
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid,
+ &smp_tlb_pmap->pm_save);
+ }
+ } else if (invpcid_works) {
+ d = smp_tlb_invpcid;
+ do {
+ invpcid(&d, INVPCID_ADDR);
+ d.addr += PAGE_SIZE;
+ } while (d.addr <= smp_tlb_addr2);
+ } else {
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3)
+ load_cr3(pcid_cr3 | CR3_PCID_SAVE);
+ invlpg_range(addr, smp_tlb_addr2);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ }
+ } else {
+ invlpg_range(addr, smp_tlb_addr2);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlcache_handler(void)
+{
+#ifdef COUNT_IPIS
+ (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ wbinvd();
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+/*
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
Modified: trunk/sys/amd64/amd64/mp_watchdog.c
===================================================================
--- trunk/sys/amd64/amd64/mp_watchdog.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mp_watchdog.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Robert N. M. Watson
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/mp_watchdog.c 314667 2017-03-04 13:03:31Z avg $
*/
#include "opt_mp_watchdog.h"
@@ -86,7 +87,7 @@
watchdog_init(void *arg)
{
- callout_init(&watchdog_callout, CALLOUT_MPSAFE);
+ callout_init(&watchdog_callout, 1);
if (watchdog_cpu != -1)
watchdog_change(watchdog_cpu);
}
Modified: trunk/sys/amd64/amd64/mpboot.S
===================================================================
--- trunk/sys/amd64/amd64/mpboot.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mpboot.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/mpboot.S 130224 2004-06-08 01:02:52Z peter $
*/
#include <machine/asmacros.h> /* miscellaneous asm macros */
Property changes on: trunk/sys/amd64/amd64/mpboot.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/pmap.c
===================================================================
--- trunk/sys/amd64/amd64/pmap.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/pmap.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
@@ -76,19 +77,14 @@
* SUCH DAMAGE.
*/
+#define AMD64_NPT_AWARE
+
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 322063 2017-08-04 21:38:34Z marius $");
/*
* Manages physical address maps.
*
- * In addition to hardware address maps, this
- * module is called upon to provide software-use-only
- * maps which may or may not be stored in the same
- * form as hardware maps. These pseudo-maps are
- * used to store intermediate results from copy
- * operations to and from address spaces.
- *
* Since the information managed by this module is
* also stored by the logical address mapping module,
* this module may throw away valid virtual-to-physical
@@ -123,11 +119,8 @@
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
-#ifdef SMP
+#include <sys/_unrhdr.h>
#include <sys/smp.h>
-#else
-#include <sys/cpuset.h>
-#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -138,6 +131,8 @@
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>
@@ -152,6 +147,132 @@
#include <machine/smp.h>
#endif
+static __inline boolean_t
+pmap_type_guest(pmap_t pmap)
+{
+
+ return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
+}
+
+static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+ return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline pt_entry_t
+pmap_valid_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_V;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_V;
+ else
+ mask = EPT_PG_READ;
+ break;
+ default:
+ panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_rw_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_RW;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_RW;
+ else
+ mask = EPT_PG_WRITE;
+ break;
+ default:
+ panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_global_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_G;
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ mask = 0;
+ break;
+ default:
+ panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_accessed_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_A;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_READ;
+ else
+ mask = EPT_PG_A;
+ break;
+ default:
+ panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_modified_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_M;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_WRITE;
+ else
+ mask = EPT_PG_M;
+ break;
+ default:
+ panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
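The helpers added above replace the old compile-time PG_V/PG_RW/PG_G/PG_A/PG_M constants with per-pmap lookups, so the same page-table walking code works for plain x86 tables as well as the RVI and EPT nested formats, which encode the valid/accessed/dirty bits differently or not at all. The calling pattern used throughout the rest of this diff looks like the following sketch (pte_is_writable is a hypothetical helper, not part of the change):

	static boolean_t
	pte_is_writable(pmap_t pmap, pt_entry_t pte)
	{
		pt_entry_t PG_RW, PG_V;

		/* Resolve the bit encodings for this pmap's table format. */
		PG_V = pmap_valid_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		return ((pte & (PG_V | PG_RW)) == (PG_V | PG_RW));
	}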
@@ -209,8 +330,12 @@
vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
+int nkpt;
+SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
+ "Number of kernel page table pages allocated on bootup");
+
static int ndmpdp;
-static vm_paddr_t dmaplimit;
+vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;
@@ -234,17 +359,21 @@
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
+static int ndmpdpphys; /* number of DMPDPphys pages */
/*
- * Isolate the global pv list lock from data and other locks to prevent false
- * sharing within the cache.
+ * pmap_mapdev support pre initialization (i.e. console)
*/
-static struct {
- struct rwlock lock;
- char padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
-} pvh_global __aligned(CACHE_LINE_SIZE);
+#define PMAP_PREINIT_MAPPING_COUNT 8
+static struct pmap_preinit_mapping {
+ vm_paddr_t pa;
+ vm_offset_t va;
+ vm_size_t sz;
+ int mode;
+} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
+static int pmap_initialized;
-#define pvh_global_lock pvh_global.lock
+static struct rwlock_padalign pvh_global_lock;
/*
* Data for the pv entry allocation mechanism
@@ -253,6 +382,7 @@
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
+static struct md_page pv_dummy;
/*
* All those kernel PT submaps that BSD is so fond of
@@ -260,6 +390,38 @@
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
+
+static struct unrhdr pcid_unr;
+static struct mtx pcid_mtx;
+int pmap_pcid_enabled = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
+ 0, "Is TLB Context ID enabled ?");
+int invpcid_works = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
+ "Is the invpcid instruction available ?");
+
+static int
+pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
+{
+ int i;
+ uint64_t res;
+
+ res = 0;
+ CPU_FOREACH(i) {
+ res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
+ }
+ return (sysctl_handle_64(oidp, &res, 0, req));
+}
+SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
+ "Count of saved TLB context on switch");
+
+/* pmap_copy_pages() over non-DMAP */
+static struct mtx cpage_lock;
+static vm_offset_t cpage_a;
+static vm_offset_t cpage_b;
+
/*
* Crashdump maps.
*/
@@ -281,7 +443,6 @@
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
vm_offset_t va);
-static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
@@ -294,30 +455,29 @@
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
-static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
-static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
+ pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
-static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
+static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
vm_prot_t prot);
-static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
+static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- vm_page_t *free, struct rwlock **lockp);
-static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
- vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
- struct rwlock **lockp);
+ struct spglist *free, struct rwlock **lockp);
+static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
+ pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
- vm_page_t *free);
+ struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
-static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
@@ -327,13 +487,10 @@
struct rwlock **lockp);
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
- vm_page_t *free);
-static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
+ struct spglist *free);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
-CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
-CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
-
/*
* Move the kernel virtual free pointer to the next
* 2MB. This is used to help improve performance
@@ -413,7 +570,9 @@
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
if ((*pml4e & PG_V) == 0)
return (NULL);
@@ -435,7 +594,9 @@
pmap_pde(pmap_t pmap, vm_offset_t va)
{
pdp_entry_t *pdpe;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe == NULL || (*pdpe & PG_V) == 0)
return (NULL);
@@ -457,7 +618,9 @@
pmap_pte(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *pde;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pde = pmap_pde(pmap, va);
if (pde == NULL || (*pde & PG_V) == 0)
return (NULL);
@@ -479,6 +642,9 @@
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(pmap->pm_stats.resident_count >= count,
+ ("pmap %p resident count underflow %ld %d", pmap,
+ pmap->pm_stats.resident_count, count));
pmap->pm_stats.resident_count -= count;
}
@@ -487,6 +653,8 @@
{
u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
+
return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
@@ -495,6 +663,8 @@
{
u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
+
return (PDmap + ((va >> PDRSHIFT) & mask));
}
@@ -511,21 +681,67 @@
CTASSERT(powerof2(NDMPML4E));
+/* number of kernel PDP slots */
+#define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
+
static void
+nkpt_init(vm_paddr_t addr)
+{
+ int pt_pages;
+
+#ifdef NKPT
+ pt_pages = NKPT;
+#else
+ pt_pages = howmany(addr, 1 << PDRSHIFT);
+ pt_pages += NKPDPE(pt_pages);
+
+ /*
+ * Add some slop beyond the bare minimum required for bootstrapping
+ * the kernel.
+ *
+ * This is quite important when allocating KVA for kernel modules.
+ * The modules are required to be linked in the negative 2GB of
+ * the address space. If we run out of KVA in this region then
+ * pmap_growkernel() will need to allocate page table pages to map
+ * the entire 512GB of KVA space which is an unnecessary tax on
+ * physical memory.
+ *
+ * Secondly, device memory mapped as part of setting up the low-
+ * level console(s) is taken from KVA, starting at virtual_avail.
+ * This is because cninit() is called after pmap_bootstrap() but
+ * before vm_init() and pmap_init(). 20MB for a frame buffer is
+ * not uncommon.
+ */
+ pt_pages += 32; /* 64MB additional slop. */
+#endif
+ nkpt = pt_pages;
+}
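A rough worked example of the sizing done by nkpt_init() above, assuming the usual amd64 constants (a PT page covers 2MB, PDRSHIFT == 21, NPDEPG == 512) and no NKPT override; the 128MB starting figure is only illustrative:

	/*
	 * addr ~= 128MB of early allocations:
	 *   pt_pages  = howmany(addr, 1 << PDRSHIFT)  ->  64 PT pages
	 *   pt_pages += NKPDPE(64) = howmany(64, 512) ->  65
	 *   pt_pages += 32 (64MB slop)                ->  nkpt = 97
	 * i.e. enough page table pages to map roughly 194MB of kernel VA.
	 */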
+
+static void
create_pagetables(vm_paddr_t *firstaddr)
{
- int i, j, ndm1g;
+ int i, j, ndm1g, nkpdpe;
+ pt_entry_t *pt_p;
+ pd_entry_t *pd_p;
+ pdp_entry_t *pdp_p;
+ pml4_entry_t *p4_p;
- /* Allocate pages */
- KPTphys = allocpages(firstaddr, NKPT);
- KPML4phys = allocpages(firstaddr, 1);
- KPDPphys = allocpages(firstaddr, NKPML4E);
- KPDphys = allocpages(firstaddr, NKPDPE);
-
+ /* Allocate page table pages for the direct map */
ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
if (ndmpdp < 4) /* Minimum 4GB of dirmap */
ndmpdp = 4;
- DMPDPphys = allocpages(firstaddr, NDMPML4E);
+ ndmpdpphys = howmany(ndmpdp, NPDPEPG);
+ if (ndmpdpphys > NDMPML4E) {
+ /*
+ * Each NDMPML4E allows 512 GB, so limit to that,
+ * and then readjust ndmpdp and ndmpdpphys.
+ */
+ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
+ Maxmem = atop(NDMPML4E * NBPML4);
+ ndmpdpphys = NDMPML4E;
+ ndmpdp = NDMPML4E * NPDEPG;
+ }
+ DMPDPphys = allocpages(firstaddr, ndmpdpphys);
ndm1g = 0;
if ((amd_feature & AMDID_PAGE1GB) != 0)
ndm1g = ptoa(Maxmem) >> PDPSHIFT;
@@ -533,33 +749,49 @@
DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
+ /* Allocate pages */
+ KPML4phys = allocpages(firstaddr, 1);
+ KPDPphys = allocpages(firstaddr, NKPML4E);
+
+ /*
+ * Allocate the initial number of kernel page table pages required to
+ * bootstrap. We defer this until after all memory-size dependent
+ * allocations are done (e.g. direct map), so that we don't have to
+ * build in too much slop in our estimate.
+ *
+ * Note that when NKPML4E > 1, we have an empty page underneath
+ * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+ * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
+ */
+ nkpt_init(*firstaddr);
+ nkpdpe = NKPDPE(nkpt);
+
+ KPTphys = allocpages(firstaddr, nkpt);
+ KPDphys = allocpages(firstaddr, nkpdpe);
+
/* Fill in the underlying page table pages */
- /* Read-only from zero to physfree */
+ /* Nominally read-only (but really R/W) from zero to physfree */
/* XXX not fully used, underneath 2M pages */
- for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
- ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
- ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
- }
+ pt_p = (pt_entry_t *)KPTphys;
+ for (i = 0; ptoa(i) < *firstaddr; i++)
+ pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
/* Now map the page tables at their location within PTmap */
- for (i = 0; i < NKPT; i++) {
- ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
- }
+ pd_p = (pd_entry_t *)KPDphys;
+ for (i = 0; i < nkpt; i++)
+ pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
/* Map from zero to end of allocations under 2M pages */
/* This replaces some of the KPTphys entries above */
- for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
- ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
- }
+ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+ pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
+ X86_PG_G;
- /* And connect up the PD to the PDP */
- for (i = 0; i < NKPDPE; i++) {
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
- }
+ /* And connect up the PD to the PDP (leaving room for L4 pages) */
+ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+ for (i = 0; i < nkpdpe; i++)
+ pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+ PG_U;
/*
* Now, set up the direct map region using 2MB and/or 1GB pages. If
@@ -569,37 +801,41 @@
* memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
* that are partially used.
*/
+ pd_p = (pd_entry_t *)DMPDphys;
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
- ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+ pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
+ pdp_p = (pdp_entry_t *)DMPDPphys;
for (i = 0; i < ndm1g; i++) {
- ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+ pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
for (j = 0; i < ndmpdp; i++, j++) {
- ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+ pdp_p[i] = DMPDphys + ptoa(j);
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
}
/* And recursively map PML4 to itself in order to get PTmap */
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+ p4_p = (pml4_entry_t *)KPML4phys;
+ p4_p[PML4PML4I] = KPML4phys;
+ p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
/* Connect the Direct Map slot(s) up to the PML4. */
- for (i = 0; i < NDMPML4E; i++) {
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+ for (i = 0; i < ndmpdpphys; i++) {
+ p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+ p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
}
- /* Connect the KVA slot up to the PML4 */
- ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
- ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+ /* Connect the KVA slots up to the PML4 */
+ for (i = 0; i < NKPML4E; i++) {
+ p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+ p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
+ }
}
/*
@@ -616,7 +852,7 @@
pmap_bootstrap(vm_paddr_t *firstaddr)
{
vm_offset_t va;
- pt_entry_t *pte, *unused;
+ pt_entry_t *pte;
/*
* Create an initial set of page tables to run the kernel in.
@@ -623,6 +859,15 @@
*/
create_pagetables(firstaddr);
+ /*
+ * Add a physical memory segment (vm_phys_seg) corresponding to the
+ * preallocated kernel page table pages so that vm_page structures
+ * representing these pages will be created. The vm_page structures
+ * are required for promotion of the corresponding kernel virtual
+ * addresses to superpage mappings.
+ */
+ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
+
virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
virtual_avail = pmap_kmem_choose(virtual_avail);
@@ -640,9 +885,11 @@
*/
PMAP_LOCK_INIT(kernel_pmap);
kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
- kernel_pmap->pm_root = NULL;
+ kernel_pmap->pm_cr3 = KPML4phys;
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
+ CPU_FILL(&kernel_pmap->pm_save); /* always superset of pm_active */
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+ kernel_pmap->pm_flags = pmap_flags;
/*
* Initialize the global pv list lock.
@@ -660,19 +907,37 @@
pte = vtopte(va);
/*
- * CMAP1 is only used for the memory test.
+ * Crashdump maps. The first page is reused as CMAP1 for the
+ * memory test.
*/
- SYSMAP(caddr_t, CMAP1, CADDR1, 1)
+ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
+ CADDR1 = crashdumpmap;
+ virtual_avail = va;
+
/*
- * Crashdump maps.
+ * Initialize the PAT MSR.
+ * pmap_init_pat() clears and sets CR4_PGE, which, as a
+ * side-effect, invalidates stale PG_G TLB entries that might
+ * have been created in our pre-boot environment.
*/
- SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
+ pmap_init_pat();
- virtual_avail = va;
-
- /* Initialize the PAT MSR. */
- pmap_init_pat();
+ /* Initialize TLB Context Id. */
+ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
+ if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
+ load_cr4(rcr4() | CR4_PCIDE);
+ mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
+ init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
+ /* Check for INVPCID support */
+ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
+ != 0;
+ kernel_pmap->pm_pcid = 0;
+#ifndef SMP
+ pmap_pcid_enabled = 0;
+#endif
+ } else
+ pmap_pcid_enabled = 0;
}
/*
@@ -776,6 +1041,7 @@
void
pmap_init(void)
{
+ struct pmap_preinit_mapping *ppim;
vm_page_t mpte;
vm_size_t s;
int i, pv_npg;
@@ -784,7 +1050,7 @@
* Initialize the vm page array entries for the kernel pmap's
* page table pages.
*/
- for (i = 0; i < NKPT; i++) {
+ for (i = 0; i < nkpt; i++) {
mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
KASSERT(mpte >= vm_page_array &&
mpte < &vm_page_array[vm_page_array_size],
@@ -794,12 +1060,18 @@
}
/*
- * If the kernel is running in a virtual machine on an AMD Family 10h
- * processor, then it must assume that MCA is enabled by the virtual
- * machine monitor.
+ * If the kernel is running on a virtual machine, then it must assume
+ * that MCA is enabled by the hypervisor. Moreover, the kernel must
+ * be prepared for the hypervisor changing the vendor and family that
+ * are reported by CPUID. Consequently, the workaround for AMD Family
+ * 10h Erratum 383 is enabled if the processor's feature set does not
+ * include at least one feature that is only supported by older Intel
+ * or newer AMD processors.
*/
- if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
- CPUID_TO_FAMILY(cpu_id) == 0x10)
+ if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
+ (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
+ CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
+ AMDID2_FMA4)) == 0)
workaround_erratum383 = 1;
/*
@@ -826,8 +1098,7 @@
/*
* Calculate the size of the pv head table for superpages.
*/
- for (i = 0; phys_avail[i + 1]; i += 2);
- pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
+ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
/*
* Allocate memory for the pv head table for superpages.
@@ -834,9 +1105,31 @@
*/
s = (vm_size_t)(pv_npg * sizeof(struct md_page));
s = round_page(s);
- pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
+ pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
+ M_WAITOK | M_ZERO);
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
+ TAILQ_INIT(&pv_dummy.pv_list);
+
+ mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
+ cpage_a = kva_alloc(PAGE_SIZE);
+ cpage_b = kva_alloc(PAGE_SIZE);
+
+ pmap_initialized = 1;
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == 0)
+ continue;
+ /* Make the direct map consistent */
+ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
+ (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
+ ppim->sz, ppim->mode);
+ }
+ if (!bootverbose)
+ continue;
+ printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
+ ppim->pa, ppim->va, ppim->sz, ppim->mode);
+ }
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -869,12 +1162,41 @@
* Low level helper routines.....
***************************************************/
+static pt_entry_t
+pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
+{
+ int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ /* Verify that both PAT bits are not set at the same time */
+ KASSERT((entry & x86_pat_bits) != x86_pat_bits,
+ ("Invalid PAT bits in entry %#lx", entry));
+
+ /* Swap the PAT bits if one of them is set */
+ if ((entry & x86_pat_bits) != 0)
+ entry ^= x86_pat_bits;
+ break;
+ case PT_EPT:
+ /*
+ * Nothing to do - the memory attributes are represented
+ * the same way for regular pages and superpages.
+ */
+ break;
+ default:
+ panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
+ }
+
+ return (entry);
+}
+
/*
* Determine the appropriate bits to set in a PTE or PDE for a specified
* caching mode.
*/
static int
-pmap_cache_bits(int mode, boolean_t is_pde)
+pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
int cache_bits, pat_flag, pat_idx;
@@ -881,23 +1203,94 @@
if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
panic("Unknown caching mode %d\n", mode);
- /* The PAT bit is different for PTE's and PDE's. */
- pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ /* The PAT bit is different for PTE's and PDE's. */
+ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
- /* Map the caching mode to a PAT index. */
- pat_idx = pat_index[mode];
+ /* Map the caching mode to a PAT index. */
+ pat_idx = pat_index[mode];
- /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
- cache_bits = 0;
- if (pat_idx & 0x4)
- cache_bits |= pat_flag;
- if (pat_idx & 0x2)
- cache_bits |= PG_NC_PCD;
- if (pat_idx & 0x1)
- cache_bits |= PG_NC_PWT;
+ /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+ cache_bits = 0;
+ if (pat_idx & 0x4)
+ cache_bits |= pat_flag;
+ if (pat_idx & 0x2)
+ cache_bits |= PG_NC_PCD;
+ if (pat_idx & 0x1)
+ cache_bits |= PG_NC_PWT;
+ break;
+
+ case PT_EPT:
+ cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
+ break;
+
+ default:
+ panic("unsupported pmap type %d", pmap->pm_type);
+ }
+
return (cache_bits);
}
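As a worked example of the PT_X86/PT_RVI branch above (the concrete pat_index[] contents are established at boot by pmap_init_pat(), so the index values here are only illustrative):

	/*
	 *   pat_idx = 3 (0b011) -> cache_bits = PG_NC_PCD | PG_NC_PWT
	 *   pat_idx = 4 (0b100) -> cache_bits = X86_PG_PTE_PAT, or
	 *                          X86_PG_PDE_PAT when is_pde is TRUE
	 */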
+static int
+pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
+{
+ int mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
+ break;
+ case PT_EPT:
+ mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
+ break;
+ default:
+ panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline boolean_t
+pmap_ps_enabled(pmap_t pmap)
+{
+
+ return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
+}
+
+static void
+pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
+{
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ /*
+ * XXX
+ * This is a little bogus since the generation number is
+ * supposed to be bumped up when a region of the address
+ * space is invalidated in the page tables.
+ *
+ * In this case the old PDE entry is valid but yet we want
+ * to make sure that any mappings using the old entry are
+ * invalidated in the TLB.
+ *
+ * The reason this works as expected is because we rendezvous
+ * "all" host cpus and force any vcpu context to exit as a
+ * side-effect.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+ break;
+ default:
+ panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
+ }
+ pde_store(pde, newpde);
+}
+
/*
* After changing the page size for the specified virtual address in the page
* table, flush the corresponding entries from the processor's TLB. Only the
@@ -906,10 +1299,18 @@
* The calling thread must be pinned to a processor.
*/
static void
-pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
+pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
- u_long cr4;
+ pt_entry_t PG_G;
+ if (pmap_type_guest(pmap))
+ return;
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
+
+ PG_G = pmap_global_bit(pmap);
+
if ((newpde & PG_PS) == 0)
/* Demotion: flush a specific 2MB page mapping. */
invlpg(va);
@@ -924,19 +1325,33 @@
* Promotion: flush every 4KB page mapping from the TLB,
* including any global (PG_G) mappings.
*/
- cr4 = rcr4();
- load_cr4(cr4 & ~CR4_PGE);
- /*
- * Although preemption at this point could be detrimental to
- * performance, it would not lead to an error. PG_G is simply
- * ignored if CR4.PGE is clear. Moreover, in case this block
- * is re-entered, the load_cr4() either above or below will
- * modify CR4.PGE flushing the TLB.
- */
- load_cr4(cr4 | CR4_PGE);
+ invltlb_globpcid();
}
}
#ifdef SMP
+
+static void
+pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = va;
+ invpcid(&d, INVPCID_ADDR);
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
+ invlpg(va);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
/*
* For SMP, these functions have to use the IPI mechanism for coherence.
*
@@ -956,6 +1371,46 @@
* immutable. The kernel page table is always active on every
* processor.
*/
+
+/*
+ * Interrupt the cpus that are executing in the guest context.
+ * This will force the vcpu to exit and the cached EPT mappings
+ * will be invalidated by the host before the next vmresume.
+ */
+static __inline void
+pmap_invalidate_ept(pmap_t pmap)
+{
+ int ipinum;
+
+ sched_pin();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("pmap_invalidate_ept: absurd pm_active"));
+
+ /*
+ * The TLB mappings associated with a vcpu context are not
+ * flushed each time a different vcpu is chosen to execute.
+ *
+ * This is in contrast with a process's vtop mappings that
+ * are flushed from the TLB on each context switch.
+ *
+ * Therefore we need to do more than just a TLB shootdown on
+ * the active cpus in 'pmap->pm_active'. To do this we keep
+ * track of the number of invalidations performed on this pmap.
+ *
+ * Each vcpu keeps a cache of this counter and compares it
+ * just before a vmresume. If the counter is out-of-date an
+ * invept will be done to flush stale mappings from the TLB.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+
+ /*
+ * Force the vcpu to exit and trap back into the hypervisor.
+ */
+ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
+ ipi_selected(pmap->pm_active, ipinum);
+ sched_unpin();
+}
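The comment above describes the consumer side of pm_eptgen without showing it; a hedged sketch of that check, as a vcpu might perform it just before vmresume (vcpu_eptgen and ept_flush_guest_mappings() are hypothetical names, and the real hypervisor code is not part of this diff):

	/* Hypothetical vcpu-side check; names are illustrative only. */
	long gen;

	gen = atomic_load_acq_long(&pmap->pm_eptgen);
	if (vcpu_eptgen != gen) {
		ept_flush_guest_mappings();	/* e.g. an INVEPT of this context */
		vcpu_eptgen = gen;
	}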
+
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
@@ -962,10 +1417,29 @@
cpuset_t other_cpus;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invlpg(va);
- smp_invlpg(va);
+ if (!pmap_pcid_enabled) {
+ invlpg(va);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap))
+ invlpg(va);
+ else
+ pmap_invalidate_page_pcid(pmap, va);
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg(pmap, va);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
@@ -972,13 +1446,48 @@
CPU_CLR(cpuid, &other_cpus);
if (CPU_ISSET(cpuid, &pmap->pm_active))
invlpg(va);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_page_pcid(pmap, va);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg(other_cpus, va);
+ smp_masked_invlpg(other_cpus, pmap, va);
}
sched_unpin();
}
+static void
+pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+ vm_offset_t addr;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ for (addr = sva; addr < eva; addr += PAGE_SIZE) {
+ d.addr = addr;
+ invpcid(&d, INVPCID_ADDR);
+ }
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
@@ -986,21 +1495,53 @@
vm_offset_t addr;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- smp_invlpg_range(sva, eva);
+ if (!pmap_pcid_enabled) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap)) {
+ for (addr = sva; addr < eva;
+ addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ pmap_invalidate_range_pcid(pmap,
+ sva, eva);
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg_range(pmap, sva, eva);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+ if (CPU_ISSET(cpuid, &pmap->pm_active)) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ } else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_range_pcid(pmap, sva, eva);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg_range(other_cpus, sva, eva);
+ smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
}
sched_unpin();
}
@@ -1009,21 +1550,72 @@
pmap_invalidate_all(pmap_t pmap)
{
cpuset_t other_cpus;
+ struct invpcid_descr d;
+ uint64_t cr3;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+
sched_pin();
- if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invltlb();
- smp_invltlb();
+ cpuid = PCPU_GET(cpuid);
+ if (pmap == kernel_pmap ||
+ (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
+ !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ if (invpcid_works) {
+ bzero(&d, sizeof(d));
+ invpcid(&d, INVPCID_CTXGLOB);
+ } else {
+ invltlb_globpcid();
+ }
+ if (!CPU_ISSET(cpuid, &pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ smp_invltlb(pmap);
} else {
- cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+
+ /*
+ * This logic is duplicated in the Xinvltlb shootdown
+ * IPI handler.
+ */
+ if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = 0;
+ invpcid(&d, INVPCID_CTX);
+ } else {
+ cr3 = rcr3();
+ critical_enter();
+
+ /*
+ * Bit 63 is clear, pcid TLB
+ * entries are invalidated.
+ */
+ load_cr3(pmap->pm_cr3);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ } else if (CPU_ISSET(cpuid, &pmap->pm_active))
invltlb();
- CPU_AND(&other_cpus, &pmap->pm_active);
+ if (!CPU_ISSET(cpuid, &pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invltlb(other_cpus);
+ smp_masked_invltlb(other_cpus, pmap);
}
sched_unpin();
}
@@ -1040,6 +1632,7 @@
struct pde_action {
cpuset_t invalidate; /* processors that invalidate their TLB */
+ pmap_t pmap;
vm_offset_t va;
pd_entry_t *pde;
pd_entry_t newpde;
@@ -1052,7 +1645,7 @@
struct pde_action *act = arg;
if (act->store == PCPU_GET(cpuid))
- pde_store(act->pde, act->newpde);
+ pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}
static void
@@ -1061,7 +1654,7 @@
struct pde_action *act = arg;
if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
- pmap_update_pde_invalidate(act->va, act->newpde);
+ pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}
/*
@@ -1083,14 +1676,17 @@
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap || pmap_type_guest(pmap))
active = all_cpus;
- else
+ else {
active = pmap->pm_active;
+ CPU_AND_ATOMIC(&pmap->pm_save, &active);
+ }
if (CPU_OVERLAP(&active, &other_cpus)) {
act.store = cpuid;
act.invalidate = active;
act.va = va;
+ act.pmap = pmap;
act.pde = pde;
act.newpde = newpde;
CPU_SET(cpuid, &active);
@@ -1098,9 +1694,9 @@
smp_no_rendevous_barrier, pmap_update_pde_action,
pmap_update_pde_teardown, &act);
} else {
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (CPU_ISSET(cpuid, &active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
}
sched_unpin();
}
@@ -1113,8 +1709,18 @@
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invlpg(va);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invlpg(va);
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1122,9 +1728,19 @@
{
vm_offset_t addr;
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1131,8 +1747,18 @@
pmap_invalidate_all(pmap_t pmap)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invltlb();
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invltlb();
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1146,28 +1772,54 @@
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
+ else
+ CPU_ZERO(&pmap->pm_save);
}
#endif /* !SMP */
+static void
+pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
+{
+
+ /*
+ * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
+ * by a promotion that did not invalidate the 512 4KB page mappings
+ * that might exist in the TLB. Consequently, at this point, the TLB
+ * may hold both 4KB and 2MB page mappings for the address range [va,
+ * va + NBPDR). Therefore, the entire range must be invalidated here.
+ * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
+ * 4KB page mappings for the address range [va, va + NBPDR), and so a
+ * single INVLPG suffices to invalidate the 2MB page mapping from the
+ * TLB.
+ */
+ if ((pde & PG_PROMOTED) != 0)
+ pmap_invalidate_range(pmap, va, va + NBPDR - 1);
+ else
+ pmap_invalidate_page(pmap, va);
+}
+
#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
void
-pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{
- KASSERT((sva & PAGE_MASK) == 0,
- ("pmap_invalidate_cache_range: sva not page-aligned"));
- KASSERT((eva & PAGE_MASK) == 0,
- ("pmap_invalidate_cache_range: eva not page-aligned"));
+ if (force) {
+ sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
+ } else {
+ KASSERT((sva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: sva not page-aligned"));
+ KASSERT((eva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: eva not page-aligned"));
+ }
- if (cpu_feature & CPUID_SS)
- ; /* If "Self Snoop" is supported, do nothing. */
- else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+ if ((cpu_feature & CPUID_SS) != 0 && !force)
+ ; /* If "Self Snoop" is supported and allowed, do nothing. */
+ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
eva - sva < PMAP_CLFLUSH_THRESHOLD) {
-
/*
* XXX: Some CPUs fault, hang, or trash the local APIC
* registers if we use CLFLUSH on the local APIC
@@ -1178,16 +1830,29 @@
return;
/*
- * Otherwise, do per-cache line flush. Use the mfence
+ * Otherwise, do per-cache line flush. Use the sfence
* instruction to insure that previous stores are
* included in the write-back. The processor
* propagates flush to other processors in the cache
* coherence domain.
*/
- mfence();
+ sfence();
for (; sva < eva; sva += cpu_clflush_line_size)
+ clflushopt(sva);
+ sfence();
+ } else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+ eva - sva < PMAP_CLFLUSH_THRESHOLD) {
+ if (pmap_kextract(sva) == lapic_paddr)
+ return;
+ /*
+ * Writes are ordered by CLFLUSH on Intel CPUs.
+ */
+ if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
+ for (; sva < eva; sva += cpu_clflush_line_size)
clflush(sva);
- mfence();
+ if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
} else {
/*
@@ -1211,33 +1876,35 @@
{
vm_offset_t daddr, eva;
int i;
+ bool useclflushopt;
+ useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
- (cpu_feature & CPUID_CLFSH) == 0)
+ ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
pmap_invalidate_cache();
else {
- mfence();
+ if (useclflushopt)
+ sfence();
+ else if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
for (i = 0; i < count; i++) {
daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
eva = daddr + PAGE_SIZE;
- for (; daddr < eva; daddr += cpu_clflush_line_size)
- clflush(daddr);
+ for (; daddr < eva; daddr += cpu_clflush_line_size) {
+ if (useclflushopt)
+ clflushopt(daddr);
+ else
+ clflush(daddr);
+ }
}
- mfence();
+ if (useclflushopt)
+ sfence();
+ else if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
}
}
/*
- * Are we current address space or kernel?
- */
-static __inline int
-pmap_is_current(pmap_t pmap)
-{
- return (pmap == kernel_pmap ||
- (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
-}
-
-/*
* Routine: pmap_extract
* Function:
* Extract the physical page address associated
@@ -1248,10 +1915,11 @@
{
pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
pa = 0;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1286,12 +1954,14 @@
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
pd_entry_t pde, *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_RW, PG_V;
vm_paddr_t pa;
vm_page_t m;
pa = 0;
m = NULL;
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, va);
@@ -1364,7 +2034,7 @@
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
}
static __inline void
@@ -1371,9 +2041,11 @@
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
pt_entry_t *pte;
+ int cache_bits;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
+ cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
}
/*
@@ -1422,6 +2094,7 @@
{
pt_entry_t *endpte, oldpte, pa, *pte;
vm_page_t m;
+ int cache_bits;
oldpte = 0;
pte = vtopte(sva);
@@ -1428,14 +2101,15 @@
endpte = pte + count;
while (pte < endpte) {
m = *ma++;
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
- if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
+ cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | cache_bits;
+ if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
oldpte |= *pte;
- pte_store(pte, pa | PG_G | PG_RW | PG_V);
+ pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
}
pte++;
}
- if (__predict_false((oldpte & PG_V) != 0))
+ if (__predict_false((oldpte & X86_PG_V) != 0))
pmap_invalidate_range(kernel_pmap, sva, sva + count *
PAGE_SIZE);
}
@@ -1463,13 +2137,12 @@
* Page table page management routines.....
***************************************************/
static __inline void
-pmap_free_zero_pages(vm_page_t free)
+pmap_free_zero_pages(struct spglist *free)
{
vm_page_t m;
- while (free != NULL) {
- m = free;
- free = m->right;
+ while ((m = SLIST_FIRST(free)) != NULL) {
+ SLIST_REMOVE_HEAD(free, plinks.s.ss);
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
@@ -1481,7 +2154,8 @@
* physical memory manager after the TLB has been updated.
*/
static __inline void
-pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
+pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
+ boolean_t set_PG_ZERO)
{
if (set_PG_ZERO)
@@ -1488,8 +2162,7 @@
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
- m->right = *free;
- *free = m;
+ SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}
/*
@@ -1498,31 +2171,12 @@
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
-static void
+static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
- vm_page_t root;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- root = pmap->pm_root;
- if (root == NULL) {
- mpte->left = NULL;
- mpte->right = NULL;
- } else {
- root = vm_page_splay(mpte->pindex, root);
- if (mpte->pindex < root->pindex) {
- mpte->left = root->left;
- mpte->right = root;
- root->left = NULL;
- } else if (mpte->pindex == root->pindex)
- panic("pmap_insert_pt_page: pindex already inserted");
- else {
- mpte->right = root->right;
- mpte->left = root;
- root->right = NULL;
- }
- }
- pmap->pm_root = mpte;
+ return (vm_radix_insert(&pmap->pm_root, mpte));
}
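Note that pmap_insert_pt_page() now returns the vm_radix_insert() status instead of being void (matching the prototype change earlier in this diff), so a failed insert into the radix trie is reported to the caller rather than hidden inside the helper. A hypothetical caller-side sketch, not code from this diff:

	if (pmap_insert_pt_page(pmap, mpte) != 0) {
		/* Radix insert failed: undo the preceding set-up and bail. */
	}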
/*
@@ -1530,19 +2184,12 @@
* specified pmap's collection of idle page table pages. Returns NULL if there
* is no page table page corresponding to the specified virtual address.
*/
-static vm_page_t
+static __inline vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{
- vm_page_t mpte;
- vm_pindex_t pindex = pmap_pde_pindex(va);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
- mpte = vm_page_splay(pindex, mpte);
- if ((pmap->pm_root = mpte)->pindex != pindex)
- mpte = NULL;
- }
- return (mpte);
+ return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
}
/*
@@ -1550,25 +2197,12 @@
* of idle page table pages. The specified page table page must be a member of
* the pmap's collection.
*/
-static void
+static __inline void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{
- vm_page_t root;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- if (mpte != pmap->pm_root) {
- root = vm_page_splay(mpte->pindex, pmap->pm_root);
- KASSERT(mpte == root,
- ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
- mpte, pmap));
- }
- if (mpte->left == NULL)
- root = mpte->right;
- else {
- root = vm_page_splay(mpte->pindex, mpte->left);
- root->right = mpte->right;
- }
- pmap->pm_root = root;
+ vm_radix_remove(&pmap->pm_root, mpte->pindex);
}
/*
@@ -1578,7 +2212,7 @@
* page table page was unmapped and FALSE otherwise.
*/
static inline boolean_t
-pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
--m->wire_count;
@@ -1590,7 +2224,7 @@
}
static void
-_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -1648,7 +2282,8 @@
* conditionally free the page, and manage the hold/wire counts.
*/
static int
-pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+ struct spglist *free)
{
vm_page_t mpte;
@@ -1665,11 +2300,15 @@
PMAP_LOCK_INIT(pmap);
pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
- pmap->pm_root = NULL;
+ pmap->pm_cr3 = KPML4phys;
+ pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
+ CPU_ZERO(&pmap->pm_save);
PCPU_SET(curpmap, pmap);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+ pmap->pm_flags = pmap_flags;
}
/*
@@ -1677,13 +2316,12 @@
* such as one in a vmspace structure.
*/
int
-pmap_pinit(pmap_t pmap)
+pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
vm_page_t pml4pg;
+ vm_paddr_t pml4phys;
int i;
- PMAP_LOCK_INIT(pmap);
-
/*
* allocate the page directory page
*/
@@ -1691,29 +2329,61 @@
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
VM_WAIT;
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ pml4phys = VM_PAGE_TO_PHYS(pml4pg);
+ pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmap->pm_pcid = -1;
+ pmap->pm_cr3 = ~0; /* initialize to an invalid value */
if ((pml4pg->flags & PG_ZERO) == 0)
pagezero(pmap->pm_pml4);
- /* Wire in kernel global address entries. */
- pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
- for (i = 0; i < NDMPML4E; i++) {
- pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
- PG_RW | PG_V | PG_U;
+ /*
+ * Do not install the host kernel mappings in the nested page
+ * tables. These mappings are meaningless in the guest physical
+ * address space.
+ */
+ if ((pmap->pm_type = pm_type) == PT_X86) {
+ pmap->pm_cr3 = pml4phys;
+
+ /* Wire in kernel global address entries. */
+ for (i = 0; i < NKPML4E; i++) {
+ pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+ for (i = 0; i < ndmpdpphys; i++) {
+ pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+
+ /* install self-referential address mapping entry(s) */
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+
+ if (pmap_pcid_enabled) {
+ pmap->pm_pcid = alloc_unr(&pcid_unr);
+ if (pmap->pm_pcid != -1)
+ pmap->pm_cr3 |= pmap->pm_pcid;
+ }
}
- /* install self-referential address mapping entry(s) */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
-
- pmap->pm_root = NULL;
+ pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_flags = flags;
+ pmap->pm_eptgen = 0;
+ CPU_ZERO(&pmap->pm_save);
return (1);
}
+int
+pmap_pinit(pmap_t pmap)
+{
+
+ return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
+}
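The wrapper above preserves the historical pmap_pinit() behaviour (ordinary PT_X86 tables), while the new pmap_pinit_type() lets a hypervisor request a nested format. A hedged sketch of such a request (the flag combination a real consumer passes is not shown in this diff; PMAP_EMULATE_AD_BITS is the flag tested by pmap_emulate_ad_bits() earlier):

	/* Hypothetical: create a pmap backed by EPT-format page tables. */
	if (pmap_pinit_type(pmap, PT_EPT, PMAP_EMULATE_AD_BITS) == 0)
		return (ENOMEM);	/* pmap_pinit_type() returns 1 on success */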
+
/*
* This routine is called if the desired page table page does not exist.
*
@@ -1729,9 +2399,15 @@
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
vm_page_t m, pdppg, pdpg;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
/*
* Allocate a page table page.
*/
@@ -1859,9 +2535,11 @@
pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t pdpindex, ptepindex;
- pdp_entry_t *pdpe;
+ pdp_entry_t *pdpe, PG_V;
vm_page_t pdpg;
+ PG_V = pmap_valid_bit(pmap);
+
retry:
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1883,9 +2561,11 @@
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t ptepindex;
- pd_entry_t *pd;
+ pd_entry_t *pd, PG_V;
vm_page_t m;
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Calculate pagetable page index
*/
@@ -1948,13 +2628,22 @@
KASSERT(pmap->pm_stats.resident_count == 0,
("pmap_release: pmap resident count %ld != 0",
pmap->pm_stats.resident_count));
- KASSERT(pmap->pm_root == NULL,
+ KASSERT(vm_radix_is_empty(&pmap->pm_root),
("pmap_release: pmap has reserved page table page(s)"));
- m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
+ if (pmap_pcid_enabled) {
+ /*
+ * Invalidate any left TLB entries, to allow the reuse
+ * of the pcid.
+ */
+ pmap_invalidate_all(pmap);
+ }
- pmap->pm_pml4[KPML4I] = 0; /* KVA */
- for (i = 0; i < NDMPML4E; i++) /* Direct Map */
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
+
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pml4[KPML4BASE + i] = 0;
+ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
pmap->pm_pml4[DMPML4I + i] = 0;
pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
@@ -1961,7 +2650,8 @@
m->wire_count--;
atomic_subtract_int(&cnt.v_wire_count, 1);
vm_page_free_zero(m);
- PMAP_LOCK_DESTROY(pmap);
+ if (pmap->pm_pcid != -1)
+ free_unr(&pcid_unr, pmap->pm_pcid);
}
static int
@@ -2003,7 +2693,7 @@
* "kernel_vm_end" and the kernel page table as they were.
*
* The correctness of this action is based on the following
- * argument: vm_map_findspace() allocates contiguous ranges of the
+ * argument: vm_map_insert() allocates contiguous ranges of the
* kernel virtual address space. It calls this function if a range
* ends after "kernel_vm_end". If the kernel is mapped between
* "kernel_vm_end" and "addr", then the range cannot begin at
@@ -2012,7 +2702,7 @@
* any new kernel page table pages between "kernel_vm_end" and
* "KERNBASE".
*/
- if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
+ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
return;
addr = roundup2(addr, NBPDR);
@@ -2020,7 +2710,7 @@
addr = kernel_map->max_offset;
while (kernel_vm_end < addr) {
pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
- if ((*pdpe & PG_V) == 0) {
+ if ((*pdpe & X86_PG_V) == 0) {
/* We need a new PDP entry */
nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
@@ -2030,12 +2720,12 @@
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- *pdpe = (pdp_entry_t)
- (paddr | PG_V | PG_RW | PG_A | PG_M);
+ *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M);
continue; /* try again */
}
pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
- if ((*pde & PG_V) != 0) {
+ if ((*pde & X86_PG_V) != 0) {
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
@@ -2052,7 +2742,7 @@
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
+ newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
pde_store(pde, newpdir);
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
@@ -2132,20 +2822,24 @@
pd_entry_t *pde;
pmap_t pmap;
pt_entry_t *pte, tpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
pv_entry_t pv;
vm_offset_t va;
- vm_page_t free, m, m_pc;
+ vm_page_t m, m_pc;
+ struct spglist free;
uint64_t inuse;
int bit, field, freed;
-
+
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
pmap = NULL;
- free = m_pc = NULL;
+ m_pc = NULL;
+ PG_G = PG_A = PG_M = PG_RW = 0;
+ SLIST_INIT(&free);
TAILQ_INIT(&new_tail);
mtx_lock(&pv_chunks_mutex);
- while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
+ while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != pc->pc_pmap) {
@@ -2166,6 +2860,10 @@
mtx_lock(&pv_chunks_mutex);
continue;
}
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
}
/*
@@ -2193,7 +2891,8 @@
if ((tpte & PG_A) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2203,7 +2902,7 @@
}
}
pc->pc_map[field] |= 1UL << bit;
- pmap_unuse_pt(pmap, va, *pde, &free);
+ pmap_unuse_pt(pmap, va, *pde, &free);
freed++;
}
}
@@ -2243,14 +2942,14 @@
if (pmap != locked_pmap)
PMAP_UNLOCK(pmap);
}
- if (m_pc == NULL && free != NULL) {
- m_pc = free;
- free = m_pc->right;
+ if (m_pc == NULL && !SLIST_EMPTY(&free)) {
+ m_pc = SLIST_FIRST(&free);
+ SLIST_REMOVE_HEAD(&free, plinks.s.ss);
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&cnt.v_wire_count, 1);
}
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
return (m_pc);
}
@@ -2477,9 +3176,10 @@
pv_entry_t pv;
rw_assert(&pvh_global_lock, RA_LOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
break;
}
}
@@ -2518,7 +3218,8 @@
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
/* Instantiate the remaining NPTEPG - 1 pv entries. */
PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
va_last = va + NBPDR - PAGE_SIZE;
@@ -2536,7 +3237,8 @@
m++;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_pv_demote_pde: page %p is not managed", m));
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if (va == va_last)
goto out;
}
@@ -2584,7 +3286,8 @@
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + NBPDR - PAGE_SIZE;
do {
@@ -2625,7 +3328,8 @@
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
return (TRUE);
} else
return (FALSE);
@@ -2649,7 +3353,8 @@
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
return (TRUE);
} else
return (FALSE);
@@ -2692,15 +3397,26 @@
{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
+ pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
vm_paddr_t mptepa;
- vm_page_t free, mpte;
+ vm_page_t mpte;
+ struct spglist free;
+ vm_offset_t sva;
+ int PG_PTE_CACHE;
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpde = *pde;
KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
- mpte = pmap_lookup_pt_page(pmap, va);
- if (mpte != NULL)
+ if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
+ NULL)
pmap_remove_pt_page(pmap, mpte);
else {
KASSERT((oldpde & PG_W) == 0,
@@ -2723,11 +3439,12 @@
pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
- free = NULL;
- pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
- lockp);
- pmap_invalidate_page(pmap, trunc_2mpage(va));
- pmap_free_zero_pages(free);
+ SLIST_INIT(&free);
+ sva = trunc_2mpage(va);
+ pmap_remove_pde(pmap, pde, sva, &free, lockp);
+ if ((oldpde & PG_G) == 0)
+ pmap_invalidate_pde_page(pmap, sva, oldpde);
+ pmap_free_zero_pages(&free);
CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
" in pmap %p", va, pmap);
return (FALSE);
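Another pattern visible throughout these hunks (illustrative aside, not from the patch): PG_A, PG_M, PG_RW, PG_V and PG_G become local variables filled in from per-pmap accessors such as pmap_accessed_bit() and pmap_valid_bit(), because the EPT page-table format used for virtual-machine pmaps (pm_type PT_EPT, mentioned later in this diff) keeps those attributes in different bit positions than native x86 tables. The dispatch idea, modelled stand-alone with invented bit values:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pt_entry_t;

enum demo_pm_type { PT_X86, PT_EPT };	/* PT_EPT mirrors the bhyve case */

struct demo_pmap {
	enum demo_pm_type pm_type;
};

/* Invented bit positions for the demo; only the dispatch pattern matters. */
static pt_entry_t
demo_valid_bit(struct demo_pmap *pmap)
{
	return (pmap->pm_type == PT_X86 ? 1ULL << 0 : 1ULL << 3);
}

static pt_entry_t
demo_accessed_bit(struct demo_pmap *pmap)
{
	return (pmap->pm_type == PT_X86 ? 1ULL << 5 : 1ULL << 8);
}

int
main(void)
{
	struct demo_pmap x86 = { PT_X86 }, ept = { PT_EPT };
	pt_entry_t PG_V, PG_A;

	/* Same local-variable idiom as the patched functions. */
	PG_V = demo_valid_bit(&ept);
	PG_A = demo_accessed_bit(&ept);
	printf("EPT: PG_V=%#llx PG_A=%#llx\n",
	    (unsigned long long)PG_V, (unsigned long long)PG_A);
	PG_V = demo_valid_bit(&x86);
	PG_A = demo_accessed_bit(&x86);
	printf("x86: PG_V=%#llx PG_A=%#llx\n",
	    (unsigned long long)PG_V, (unsigned long long)PG_A);
	return (0);
}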
@@ -2743,8 +3460,7 @@
KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
("pmap_demote_pde: oldpde is missing PG_M"));
newpte = oldpde & ~PG_PS;
- if ((newpte & PG_PDE_PAT) != 0)
- newpte ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpte = pmap_swap_pat(pmap, newpte);
/*
* If the page table page is new, initialize it.
@@ -2806,17 +3522,62 @@
}
/*
+ * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
+ */
+static void
+pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+{
+ pd_entry_t newpde;
+ vm_paddr_t mptepa;
+ vm_page_t mpte;
+
+ KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ mpte = pmap_lookup_pt_page(pmap, va);
+ if (mpte == NULL)
+ panic("pmap_remove_kernel_pde: Missing pt page.");
+
+ pmap_remove_pt_page(pmap, mpte);
+ mptepa = VM_PAGE_TO_PHYS(mpte);
+ newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
+
+ /*
+ * Initialize the page table page.
+ */
+ pagezero((void *)PHYS_TO_DMAP(mptepa));
+
+ /*
+ * Demote the mapping.
+ */
+ if (workaround_erratum383)
+ pmap_update_pde(pmap, va, pde, newpde);
+ else
+ pde_store(pde, newpde);
+
+ /*
+ * Invalidate a stale recursive mapping of the page table page.
+ */
+ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
+}
+
+/*
* pmap_remove_pde: do the things to unmap a superpage in a process
*/
static int
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- vm_page_t *free, struct rwlock **lockp)
+ struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pd_entry_t oldpde;
vm_offset_t eva, va;
vm_page_t m, mpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
("pmap_remove_pde: sva is not 2mpage aligned"));
@@ -2823,13 +3584,8 @@
oldpde = pte_load_clear(pdq);
if (oldpde & PG_W)
pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
-
- /*
- * Machines that don't support invlpg, also don't support
- * PG_G.
- */
- if (oldpde & PG_G)
- pmap_invalidate_page(kernel_pmap, sva);
+ if ((oldpde & PG_G) != 0)
+ pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
if (oldpde & PG_MANAGED) {
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
@@ -2848,8 +3604,7 @@
}
}
if (pmap == kernel_pmap) {
- if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
- panic("pmap_remove_pde: failed demotion");
+ pmap_remove_kernel_pde(pmap, pdq, sva);
} else {
mpte = pmap_lookup_pt_page(pmap, sva);
if (mpte != NULL) {
@@ -2870,12 +3625,16 @@
*/
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
- pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
+ pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
- pt_entry_t oldpte;
+ pt_entry_t oldpte, PG_A, PG_M, PG_RW;
vm_page_t m;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpte = pte_load_clear(ptq);
if (oldpte & PG_W)
@@ -2903,11 +3662,13 @@
* Remove a single page from a process address space
*/
static void
-pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
+pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
+ struct spglist *free)
{
struct rwlock *lock;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if ((*pde & PG_V) == 0)
return;
@@ -2935,10 +3696,13 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
- vm_page_t free = NULL;
+ pt_entry_t *pte, PG_G, PG_V;
+ struct spglist free;
int anyvalid;
+ PG_G = pmap_global_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Perform an unsynchronized read. This is, however, safe.
*/
@@ -2946,6 +3710,7 @@
return;
anyvalid = 0;
+ SLIST_INIT(&free);
rw_rlock(&pvh_global_lock);
PMAP_LOCK(pmap);
@@ -3064,7 +3829,7 @@
pmap_invalidate_all(pmap);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
/*
@@ -3086,18 +3851,17 @@
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
- pt_entry_t *pte, tpte;
+ pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
pd_entry_t *pde;
vm_offset_t va;
- vm_page_t free;
+ struct spglist free;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_all: page %p is not managed", m));
- free = NULL;
+ SLIST_INIT(&free);
rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -3106,10 +3870,12 @@
(void)pmap_demote_pde(pmap, pde, va);
PMAP_UNLOCK(pmap);
}
-small_mappings:
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pmap_resident_count_dec(pmap, 1);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
@@ -3128,13 +3894,14 @@
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
pmap_invalidate_page(pmap, pv->pv_va);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
vm_page_aflag_clear(m, PGA_WRITEABLE);
rw_wunlock(&pvh_global_lock);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
/*
@@ -3147,7 +3914,12 @@
vm_offset_t eva, va;
vm_page_t m;
boolean_t anychanged;
+ pt_entry_t PG_G, PG_M, PG_RW;
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
("pmap_protect_pde: sva is not 2mpage aligned"));
@@ -3166,10 +3938,15 @@
if ((prot & VM_PROT_EXECUTE) == 0)
newpde |= pg_nx;
if (newpde != oldpde) {
- if (!atomic_cmpset_long(pde, oldpde, newpde))
+ /*
+ * As an optimization to future operations on this PDE, clear
+ * PG_PROMOTED. The impending invalidation will remove any
+ * lingering 4KB page mappings from the TLB.
+ */
+ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
goto retry;
- if (oldpde & PG_G)
- pmap_invalidate_page(pmap, sva);
+ if ((oldpde & PG_G) != 0)
+ pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
else
anychanged = TRUE;
}
@@ -3187,10 +3964,11 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
boolean_t anychanged, pv_lists_locked;
- if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
+ KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
+ if (prot == VM_PROT_NONE) {
pmap_remove(pmap, sva, eva);
return;
}
@@ -3199,6 +3977,10 @@
(VM_PROT_WRITE|VM_PROT_EXECUTE))
return;
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pv_lists_locked = FALSE;
resume:
anychanged = FALSE;
@@ -3327,9 +4109,17 @@
{
pd_entry_t newpde;
pt_entry_t *firstpte, oldpte, pa, *pte;
- vm_offset_t oldpteva;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
vm_page_t mpte;
+ int PG_PTE_CACHE;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
@@ -3379,10 +4169,9 @@
if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
goto setpte;
oldpte &= ~PG_RW;
- oldpteva = (oldpte & PG_FRAME & PDRMASK) |
- (va & ~PDRMASK);
CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
- " in pmap %p", oldpteva, pmap);
+ " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
+ (va & ~PDRMASK), pmap);
}
if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
atomic_add_long(&pmap_pde_p_failures, 1);
@@ -3404,7 +4193,13 @@
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == pmap_pde_pindex(va),
("pmap_promote_pde: page table page's pindex is wrong"));
- pmap_insert_pt_page(pmap, mpte);
+ if (pmap_insert_pt_page(pmap, mpte)) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP,
+ "pmap_promote_pde: failure for va %#lx in pmap %p", va,
+ pmap);
+ return;
+ }
/*
* Promote the pv entries.
@@ -3415,8 +4210,7 @@
/*
* Propagate the PAT index to its proper position.
*/
- if ((newpde & PG_PTE_PAT) != 0)
- newpde ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpde = pmap_swap_pat(pmap, newpde);
/*
* Map the superpage.
@@ -3424,7 +4218,7 @@
if (workaround_erratum383)
pmap_update_pde(pmap, va, pde, PG_PS | newpde);
else
- pde_store(pde, PG_PS | newpde);
+ pde_store(pde, PG_PROMOTED | PG_PS | newpde);
atomic_add_long(&pmap_pde_promotions, 1);
CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
@@ -3443,18 +4237,25 @@
* or lose information. That is, this routine must actually
* insert this page into the given map NOW.
*/
-void
-pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
- vm_prot_t prot, boolean_t wired)
+int
+pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+ u_int flags, int8_t psind __unused)
{
struct rwlock *lock;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
pt_entry_t newpte, origpte;
pv_entry_t pv;
vm_paddr_t opa, pa;
vm_page_t mpte, om;
+ boolean_t nosleep;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
va = trunc_page(va);
KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
@@ -3463,27 +4264,36 @@
KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
va >= kmi.clean_eva,
("pmap_enter: managed mapping within the clean submap"));
- KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
- VM_OBJECT_LOCKED(m->object),
- ("pmap_enter: page %p is not busy", m));
+ if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
+ VM_OBJECT_ASSERT_LOCKED(m->object);
pa = VM_PAGE_TO_PHYS(m);
newpte = (pt_entry_t)(pa | PG_A | PG_V);
- if ((access & VM_PROT_WRITE) != 0)
+ if ((flags & VM_PROT_WRITE) != 0)
newpte |= PG_M;
if ((prot & VM_PROT_WRITE) != 0)
newpte |= PG_RW;
KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
- ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
+ ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
if ((prot & VM_PROT_EXECUTE) == 0)
newpte |= pg_nx;
- if (wired)
+ if ((flags & PMAP_ENTER_WIRED) != 0)
newpte |= PG_W;
if (va < VM_MAXUSER_ADDRESS)
newpte |= PG_U;
if (pmap == kernel_pmap)
newpte |= PG_G;
- newpte |= pmap_cache_bits(m->md.pat_mode, 0);
+ newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
+ /*
+ * Set modified bit gratuitously for writeable mappings if
+ * the page is unmanaged. We do not want to take a fault
+ * to do the dirty bit accounting for these mappings.
+ */
+ if ((m->oflags & VPO_UNMANAGED) != 0) {
+ if ((newpte & PG_RW) != 0)
+ newpte |= PG_M;
+ }
+
mpte = NULL;
lock = NULL;
@@ -3508,7 +4318,16 @@
* Here if the pte page isn't mapped, or if it has been
* deallocated.
*/
- mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
+ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
+ mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
+ nosleep ? NULL : &lock);
+ if (mpte == NULL && nosleep) {
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
goto retry;
} else
panic("pmap_enter: invalid page directory va=%#lx", va);
@@ -3574,7 +4393,8 @@
pv = get_pv_entry(pmap, &lock);
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if ((newpte & PG_RW) != 0)
vm_page_aflag_set(m, PGA_WRITEABLE);
}
@@ -3630,7 +4450,8 @@
* populated, then attempt promotion.
*/
if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
- pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
vm_reserv_level_iffullpop(m) == 0)
pmap_promote_pde(pmap, pde, va, &lock);
@@ -3638,6 +4459,7 @@
rw_wunlock(lock);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
+ return (KERN_SUCCESS);
}
/*
@@ -3651,10 +4473,14 @@
struct rwlock **lockp)
{
pd_entry_t *pde, newpde;
- vm_page_t free, mpde;
+ pt_entry_t PG_V;
+ vm_page_t mpde;
+ struct spglist free;
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
" in pmap %p", va, pmap);
@@ -3670,7 +4496,7 @@
" in pmap %p", va, pmap);
return (FALSE);
}
- newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
+ newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
PG_PS | PG_V;
if ((m->oflags & VPO_UNMANAGED) == 0) {
newpde |= PG_MANAGED;
@@ -3680,10 +4506,10 @@
*/
if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
lockp)) {
- free = NULL;
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
pmap_invalidate_page(pmap, va);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
" in pmap %p", va, pmap);
@@ -3701,7 +4527,8 @@
pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
/*
- * Map the superpage.
+ * Map the superpage. (This is not a promoted mapping; there will not
+ * be any lingering 4KB page mappings in the TLB.)
*/
pde_store(pde, newpde);
@@ -3732,7 +4559,8 @@
vm_page_t m, mpte;
vm_pindex_t diff, psize;
- VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(m_start->object);
+
psize = atop(end - start);
mpte = NULL;
m = m_start;
@@ -3742,8 +4570,7 @@
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
va = start + ptoa(diff);
if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
- (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
- pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
+ m->psind == 1 && pmap_ps_enabled(pmap) &&
pmap_enter_pde(pmap, va, m, prot, &lock))
m = &m[NBPDR / PAGE_SIZE - 1];
else
@@ -3785,13 +4612,14 @@
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
- vm_page_t free;
- pt_entry_t *pte;
+ struct spglist free;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
(m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -3856,10 +4684,10 @@
if ((m->oflags & VPO_UNMANAGED) == 0 &&
!pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
if (mpte != NULL) {
- free = NULL;
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
pmap_invalidate_page(pmap, va);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
mpte = NULL;
}
@@ -3871,7 +4699,7 @@
*/
pmap_resident_count_inc(pmap, 1);
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
if ((prot & VM_PROT_EXECUTE) == 0)
pa |= pg_nx;
@@ -3910,14 +4738,22 @@
vm_pindex_t pindex, vm_size_t size)
{
pd_entry_t *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa, ptepa;
vm_page_t p, pdpg;
int pat_mode;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
("pmap_object_init_pt: non-device object"));
if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+ if (!pmap_ps_enabled(pmap))
+ return;
if (!vm_object_populate(object, pindex, pindex + atop(size)))
return;
p = vm_page_lookup(object, pindex);
@@ -3955,8 +4791,8 @@
* will not affect the termination of this loop.
*/
PMAP_LOCK(pmap);
- for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
- size; pa += NBPDR) {
+ for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
+ pa < ptepa + size; pa += NBPDR) {
pdpg = pmap_allocpde(pmap, addr, NULL);
if (pdpg == NULL) {
/*
@@ -3990,52 +4826,96 @@
}
/*
- * Routine: pmap_change_wiring
- * Function: Change the wiring attribute for a map/virtual-address
- * pair.
- * In/out conditions:
- * The mapping must already exist in the pmap.
+ * Clear the wired attribute from the mappings for the specified range of
+ * addresses in the given pmap. Every valid mapping within that range
+ * must have the wired attribute set. In contrast, invalid mappings
+ * cannot have the wired attribute set, so they are ignored.
+ *
+ * The wired attribute of the page table entry is not a hardware feature,
+ * so there is no need to invalidate any TLB entries.
*/
void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
+ vm_offset_t va_next;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
boolean_t pv_lists_locked;
+ PG_V = pmap_valid_bit(pmap);
pv_lists_locked = FALSE;
+resume:
+ PMAP_LOCK(pmap);
+ for (; sva < eva; sva = va_next) {
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ if ((*pde & PG_V) == 0)
+ continue;
+ if ((*pde & PG_PS) != 0) {
+ if ((*pde & PG_W) == 0)
+ panic("pmap_unwire: pde %#jx is missing PG_W",
+ (uintmax_t)*pde);
- /*
- * Wiring is not a hardware characteristic so there is no need to
- * invalidate TLB.
- */
-retry:
- PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, va);
- if ((*pde & PG_PS) != 0) {
- if (!wired != ((*pde & PG_W) == 0)) {
- if (!pv_lists_locked) {
- pv_lists_locked = TRUE;
- if (!rw_try_rlock(&pvh_global_lock)) {
- PMAP_UNLOCK(pmap);
- rw_rlock(&pvh_global_lock);
- goto retry;
+ /*
+ * Are we unwiring the entire large page? If not,
+ * demote the mapping and fall through.
+ */
+ if (sva + NBPDR == va_next && eva >= va_next) {
+ atomic_clear_long(pde, PG_W);
+ pmap->pm_stats.wired_count -= NBPDR /
+ PAGE_SIZE;
+ continue;
+ } else {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ /* Repeat sva. */
+ goto resume;
+ }
}
+ if (!pmap_demote_pde(pmap, pde, sva))
+ panic("pmap_unwire: demotion failed");
}
- if (!pmap_demote_pde(pmap, pde, va))
- panic("pmap_change_wiring: demotion failed");
- } else
- goto out;
+ }
+ if (va_next > eva)
+ va_next = eva;
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ if ((*pte & PG_V) == 0)
+ continue;
+ if ((*pte & PG_W) == 0)
+ panic("pmap_unwire: pte %#jx is missing PG_W",
+ (uintmax_t)*pte);
+
+ /*
+ * PG_W must be cleared atomically. Although the pmap
+ * lock synchronizes access to PG_W, another processor
+ * could be setting PG_M and/or PG_A concurrently.
+ */
+ atomic_clear_long(pte, PG_W);
+ pmap->pm_stats.wired_count--;
+ }
}
- pte = pmap_pde_to_pte(pde, va);
- if (wired && (*pte & PG_W) == 0) {
- pmap->pm_stats.wired_count++;
- atomic_set_long(pte, PG_W);
- } else if (!wired && (*pte & PG_W) != 0) {
- pmap->pm_stats.wired_count--;
- atomic_clear_long(pte, PG_W);
- }
-out:
if (pv_lists_locked)
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
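Aside on the loop structure above (illustrative, not from the patch): pmap_unwire() now walks [sva, eva) one paging level at a time, computing va_next as the next boundary of the current level, e.g. (sva + NBPDR) & ~PDRMASK for a 2MB page-directory entry, clamping it to eva and guarding against wrap-around. The rounding can be checked in a few lines of stand-alone C (constants restated here, not taken from a header):

#include <stdint.h>
#include <stdio.h>

#define	NBPDR		(2UL * 1024 * 1024)	/* bytes mapped by a PDE */
#define	PDRMASK		(NBPDR - 1)

int
main(void)
{
	uint64_t sva = 0x00007f00002ab000ULL;	/* arbitrary, unaligned start */
	uint64_t eva = sva + 5 * NBPDR;
	uint64_t va_next;

	while (sva < eva) {
		va_next = (sva + NBPDR) & ~PDRMASK;	/* next 2MB boundary */
		if (va_next < sva)			/* overflow clamp */
			va_next = eva;
		if (va_next > eva)
			va_next = eva;
		printf("chunk [%#llx, %#llx)\n",
		    (unsigned long long)sva, (unsigned long long)va_next);
		sva = va_next;
	}
	return (0);
}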
@@ -4054,14 +4934,29 @@
vm_offset_t src_addr)
{
struct rwlock *lock;
- vm_page_t free;
+ struct spglist free;
vm_offset_t addr;
vm_offset_t end_addr = src_addr + len;
vm_offset_t va_next;
+ pt_entry_t PG_A, PG_M, PG_V;
if (dst_addr != src_addr)
return;
+ if (dst_pmap->pm_type != src_pmap->pm_type)
+ return;
+
+ /*
+ * EPT page table entries that require emulation of A/D bits are
+ * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
+ * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
+ * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
+ * implementations flag an EPT misconfiguration for exec-only
+ * mappings we skip this function entirely for emulated pmaps.
+ */
+ if (pmap_emulate_ad_bits(dst_pmap))
+ return;
+
lock = NULL;
rw_rlock(&pvh_global_lock);
if (dst_pmap < src_pmap) {
@@ -4071,6 +4966,11 @@
PMAP_LOCK(src_pmap);
PMAP_LOCK(dst_pmap);
}
+
+ PG_A = pmap_accessed_bit(dst_pmap);
+ PG_M = pmap_modified_bit(dst_pmap);
+ PG_V = pmap_valid_bit(dst_pmap);
+
for (addr = src_addr; addr < end_addr; addr = va_next) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpde, dstmpte, srcmpte;
@@ -4107,6 +5007,8 @@
continue;
if (srcptepaddr & PG_PS) {
+ if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
+ continue;
dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
if (dstmpde == NULL)
break;
@@ -4162,13 +5064,13 @@
*dst_pte = ptetemp & ~(PG_W | PG_M |
PG_A);
pmap_resident_count_inc(dst_pmap, 1);
- } else {
- free = NULL;
+ } else {
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(dst_pmap, addr,
dstmpte, &free)) {
- pmap_invalidate_page(dst_pmap,
- addr);
- pmap_free_zero_pages(free);
+ pmap_invalidate_page(dst_pmap,
+ addr);
+ pmap_free_zero_pages(&free);
}
goto out;
}
@@ -4185,10 +5087,10 @@
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(src_pmap);
PMAP_UNLOCK(dst_pmap);
-}
+}
/*
- * pmap_zero_page zeros the specified hardware page by mapping
+ * pmap_zero_page zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*/
void
@@ -4245,24 +5147,73 @@
pagecopy((void *)src, (void *)dst);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
{
void *a_cp, *b_cp;
+ vm_page_t m_a, m_b;
+ vm_paddr_t p_a, p_b;
+ pt_entry_t *pte;
vm_offset_t a_pg_offset, b_pg_offset;
int cnt;
+ boolean_t pinned;
+ /*
+ * NB: The sequence of updating a page table followed by accesses
+ * to the corresponding pages used in the !DMAP case is subject to
+ * the situation described in the "AMD64 Architecture Programmer's
+ * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
+ * Coherency Considerations". Therefore, issuing the INVLPG right
+ * after modifying the PTE bits is crucial.
+ */
+ pinned = FALSE;
while (xfersize > 0) {
a_pg_offset = a_offset & PAGE_MASK;
+ m_a = ma[a_offset >> PAGE_SHIFT];
+ p_a = m_a->phys_addr;
+ b_pg_offset = b_offset & PAGE_MASK;
+ m_b = mb[b_offset >> PAGE_SHIFT];
+ p_b = m_b->phys_addr;
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
- a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
- phys_addr) + a_pg_offset;
- b_pg_offset = b_offset & PAGE_MASK;
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
- b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
- phys_addr) + b_pg_offset;
+ if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
+ p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
+ mtx_lock(&cpage_lock);
+ sched_pin();
+ pinned = TRUE;
+ pte = vtopte(cpage_a);
+ *pte = p_a | X86_PG_A | X86_PG_V |
+ pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
+ invlpg(cpage_a);
+ a_cp = (char *)cpage_a + a_pg_offset;
+ } else {
+ a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
+ }
+ if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
+ p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
+ if (!pinned) {
+ mtx_lock(&cpage_lock);
+ sched_pin();
+ pinned = TRUE;
+ }
+ pte = vtopte(cpage_b);
+ *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
+ X86_PG_V | pmap_cache_bits(kernel_pmap,
+ m_b->md.pat_mode, 0);
+ invlpg(cpage_b);
+ b_cp = (char *)cpage_b + b_pg_offset;
+ } else {
+ b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
+ }
bcopy(a_cp, b_cp, cnt);
+ if (__predict_false(pinned)) {
+ sched_unpin();
+ mtx_unlock(&cpage_lock);
+ pinned = FALSE;
+ }
a_offset += cnt;
b_offset += cnt;
xfersize -= cnt;
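Aside on the copy loop above (illustrative, not from the patch): whether a page comes from the direct map or from the new pinned fallback mapping, each iteration still copies at most the remainder of the current source page and the current destination page, so misaligned offsets are handled by naturally shrinking cnt. A stand-alone rehearsal of that chunking over plain buffers:

#include <stdio.h>
#include <string.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	PAGE_SHIFT	12
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	static char a[4 * PAGE_SIZE], b[4 * PAGE_SIZE];
	size_t a_offset = 100, b_offset = 3000;	/* deliberately misaligned */
	size_t xfersize = 2 * PAGE_SIZE;
	size_t a_pg_offset, b_pg_offset, cnt;

	memset(a, 'x', sizeof(a));
	while (xfersize > 0) {
		a_pg_offset = a_offset & PAGE_MASK;
		b_pg_offset = b_offset & PAGE_MASK;
		cnt = MIN(xfersize, PAGE_SIZE - a_pg_offset);
		cnt = MIN(cnt, PAGE_SIZE - b_pg_offset);
		/* In the kernel this is a bcopy() between mapped pages. */
		memcpy(b + b_offset, a + a_offset, cnt);
		printf("page %zu+%zu -> page %zu+%zu, %zu bytes\n",
		    a_offset >> PAGE_SHIFT, a_pg_offset,
		    b_offset >> PAGE_SHIFT, b_pg_offset, cnt);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
	return (0);
}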
@@ -4291,7 +5242,7 @@
rw_rlock(&pvh_global_lock);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4302,7 +5253,7 @@
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4326,42 +5277,61 @@
int
pmap_page_wired_mappings(vm_page_t m)
{
- int count;
-
- count = 0;
- if ((m->oflags & VPO_UNMANAGED) != 0)
- return (count);
- rw_wlock(&pvh_global_lock);
- count = pmap_pvh_wired_mappings(&m->md, count);
- if ((m->flags & PG_FICTITIOUS) == 0) {
- count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
- count);
- }
- rw_wunlock(&pvh_global_lock);
- return (count);
-}
-
-/*
- * pmap_pvh_wired_mappings:
- *
- * Return the updated number "count" of managed mappings that are wired.
- */
-static int
-pmap_pvh_wired_mappings(struct md_page *pvh, int count)
-{
+ struct rwlock *lock;
+ struct md_page *pvh;
pmap_t pmap;
pt_entry_t *pte;
pv_entry_t pv;
+ int count, md_gen, pvh_gen;
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ return (0);
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+restart:
+ count = 0;
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
pte = pmap_pte(pmap, pv->pv_va);
if ((*pte & PG_W) != 0)
count++;
PMAP_UNLOCK(pmap);
}
+ if ((m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen ||
+ pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pde(pmap, pv->pv_va);
+ if ((*pte & PG_W) != 0)
+ count++;
+ PMAP_UNLOCK(pmap);
+ }
+ }
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
return (count);
}
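Aside on the locking above (illustrative, not from the patch): pmap_page_wired_mappings() now only trylocks each pmap while holding the pv list lock; on failure it records the list's generation counter (pv_gen), drops the list lock, blocks on the pmap lock, retakes the list lock, and restarts the scan if the generation moved. The same revalidation pattern in stand-alone form (lock and counter names invented; the first trylock failure is simulated):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pmap_lock = PTHREAD_MUTEX_INITIALIZER;
static int list_gen;		/* bumped whenever the list is modified */
static int force_fail = 1;	/* make the first trylock "fail" for the demo */

static int
demo_trylock(pthread_mutex_t *m)
{
	if (force_fail) {
		force_fail = 0;
		return (-1);	/* pretend another thread holds the lock */
	}
	return (pthread_mutex_trylock(m));
}

int
main(void)
{
	int gen;

	pthread_rwlock_rdlock(&list_lock);
restart:
	printf("scanning list at generation %d\n", list_gen);
	if (demo_trylock(&pmap_lock) != 0) {
		gen = list_gen;			/* remember the generation */
		pthread_rwlock_unlock(&list_lock);
		pthread_mutex_lock(&pmap_lock);	/* block, in the safe order */
		pthread_rwlock_rdlock(&list_lock);
		if (gen != list_gen) {		/* list changed while unlocked */
			pthread_mutex_unlock(&pmap_lock);
			goto restart;
		}
	}
	/* ... examine the entry while holding both locks ... */
	pthread_mutex_unlock(&pmap_lock);
	pthread_rwlock_unlock(&list_lock);
	return (0);
}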
@@ -4389,12 +5359,20 @@
}
/*
- * Remove all pages from specified address space
- * this aids process exit speeds. Also, this code
- * is special cased for current process only, but
- * can have the more generic (and slightly slower)
- * mode enabled. This is much faster than pmap_remove
- * in the case of running down an entire address space.
+ * Destroy all managed, non-wired mappings in the given user-space
+ * pmap. This pmap cannot be active on any processor besides the
+ * caller.
+ *
+ * This function cannot be applied to the kernel pmap. Moreover, it
+ * is not intended for general use. It is only to be used during
+ * process termination. Consequently, it can be implemented in ways
+ * that make it faster than pmap_remove(). First, it can more quickly
+ * destroy mappings by iterating over the pmap's collection of PV
+ * entries, rather than searching the page table. Second, it doesn't
+ * have to test and clear the page table entries atomically, because
+ * no processor is currently accessing the user address space. In
+ * particular, a page table entry's dirty bit won't change state once
+ * this function starts.
*/
void
pmap_remove_pages(pmap_t pmap)
@@ -4401,7 +5379,8 @@
{
pd_entry_t ptepde;
pt_entry_t *pte, tpte;
- vm_page_t free = NULL;
+ pt_entry_t PG_M, PG_RW, PG_V;
+ struct spglist free;
vm_page_t m, mpte, mt;
pv_entry_t pv;
struct md_page *pvh;
@@ -4410,12 +5389,34 @@
int64_t bit;
uint64_t inuse, bitmask;
int allfree, field, freed, idx;
+ boolean_t superpage;
+ vm_paddr_t pa;
- if (pmap != PCPU_GET(curpmap)) {
- printf("warning: pmap_remove_pages called with non-current pmap\n");
- return;
+ /*
+ * Assert that the given pmap is only active on the current
+ * CPU. Unfortunately, we cannot block another CPU from
+ * activating the pmap while this function is executing.
+ */
+ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
+#ifdef INVARIANTS
+ {
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ critical_enter();
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ CPU_AND(&other_cpus, &pmap->pm_active);
+ critical_exit();
+ KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
}
+#endif
+
lock = NULL;
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ SLIST_INIT(&free);
rw_rlock(&pvh_global_lock);
PMAP_LOCK(pmap);
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
@@ -4435,12 +5436,26 @@
pte = pmap_pdpe_to_pde(pte, pv->pv_va);
tpte = *pte;
if ((tpte & (PG_PS | PG_V)) == PG_V) {
+ superpage = FALSE;
ptepde = tpte;
pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
PG_FRAME);
pte = &pte[pmap_pte_index(pv->pv_va)];
- tpte = *pte & ~PG_PTE_PAT;
+ tpte = *pte;
+ } else {
+ /*
+ * Keep track whether 'tpte' is a
+ * superpage explicitly instead of
+ * relying on PG_PS being set.
+ *
+ * This is because PG_PS is numerically
+ * identical to PG_PTE_PAT and thus a
+ * regular page could be mistaken for
+ * a superpage.
+ */
+ superpage = TRUE;
}
+
if ((tpte & PG_V) == 0) {
panic("bad pte va %lx pte %lx",
pv->pv_va, tpte);
@@ -4454,8 +5469,13 @@
continue;
}
- m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
- KASSERT(m->phys_addr == (tpte & PG_FRAME),
+ if (superpage)
+ pa = tpte & PG_PS_FRAME;
+ else
+ pa = tpte & PG_FRAME;
+
+ m = PHYS_TO_VM_PAGE(pa);
+ KASSERT(m->phys_addr == pa,
("vm_page_t %p phys_addr mismatch %016jx %016jx",
m, (uintmax_t)m->phys_addr,
(uintmax_t)tpte));
@@ -4471,7 +5491,7 @@
* Update the vm_page_t clean/reference bits.
*/
if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
- if ((tpte & PG_PS) != 0) {
+ if (superpage) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
vm_page_dirty(mt);
} else
@@ -4482,10 +5502,11 @@
/* Mark free */
pc->pc_map[field] |= bitmask;
- if ((tpte & PG_PS) != 0) {
+ if (superpage) {
pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
if ((mt->aflags & PGA_WRITEABLE) != 0 &&
@@ -4504,7 +5525,8 @@
}
} else {
pmap_resident_count_dec(pmap, 1);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if ((m->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
@@ -4530,9 +5552,95 @@
pmap_invalidate_all(pmap);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
+static boolean_t
+pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
+{
+ struct rwlock *lock;
+ pv_entry_t pv;
+ struct md_page *pvh;
+ pt_entry_t *pte, mask;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
+ pmap_t pmap;
+ int md_gen, pvh_gen;
+ boolean_t rv;
+
+ rv = FALSE;
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+restart:
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pte(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
+ rv = (*pte & mask) == mask;
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ goto out;
+ }
+ if ((m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen ||
+ pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pde(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
+ rv = (*pte & mask) == mask;
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ goto out;
+ }
+ }
+out:
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
+ return (rv);
+}
+
/*
* pmap_is_modified:
*
@@ -4542,59 +5650,25 @@
boolean_t
pmap_is_modified(vm_page_t m)
{
- boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_modified: page %p is not managed", m));
/*
- * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
+ * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* concurrently set while the object is locked. Thus, if PGA_WRITEABLE
* is clear, no PTEs can have PG_M set.
*/
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->oflags & VPO_BUSY) == 0 &&
- (m->aflags & PGA_WRITEABLE) == 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return (FALSE);
- rw_wlock(&pvh_global_lock);
- rv = pmap_is_modified_pvh(&m->md) ||
- ((m->flags & PG_FICTITIOUS) == 0 &&
- pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
- rw_wunlock(&pvh_global_lock);
- return (rv);
+ return (pmap_page_test_mappings(m, FALSE, TRUE));
}
/*
- * Returns TRUE if any of the given mappings were used to modify
- * physical memory. Otherwise, returns FALSE. Both page and 2mpage
- * mappings are supported.
- */
-static boolean_t
-pmap_is_modified_pvh(struct md_page *pvh)
-{
- pv_entry_t pv;
- pt_entry_t *pte;
- pmap_t pmap;
- boolean_t rv;
-
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pte = pmap_pte(pmap, pv->pv_va);
- rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
- PMAP_UNLOCK(pmap);
- if (rv)
- break;
- }
- return (rv);
-}
-
-/*
* pmap_is_prefaultable:
*
- * Return whether or not the specified virtual address is elgible
+ * Return whether or not the specified virtual address is eligible
* for prefault.
*/
boolean_t
@@ -4601,9 +5675,10 @@
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
boolean_t rv;
+ PG_V = pmap_valid_bit(pmap);
rv = FALSE;
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, addr);
@@ -4624,45 +5699,13 @@
boolean_t
pmap_is_referenced(vm_page_t m)
{
- boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_referenced: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- rv = pmap_is_referenced_pvh(&m->md) ||
- ((m->flags & PG_FICTITIOUS) == 0 &&
- pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
- rw_wunlock(&pvh_global_lock);
- return (rv);
+ return (pmap_page_test_mappings(m, TRUE, FALSE));
}
/*
- * Returns TRUE if any of the given mappings were referenced and FALSE
- * otherwise. Both page and 2mpage mappings are supported.
- */
-static boolean_t
-pmap_is_referenced_pvh(struct md_page *pvh)
-{
- pv_entry_t pv;
- pt_entry_t *pte;
- pmap_t pmap;
- boolean_t rv;
-
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pte = pmap_pte(pmap, pv->pv_va);
- rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
- PMAP_UNLOCK(pmap);
- if (rv)
- break;
- }
- return (rv);
-}
-
-/*
* Clear the write and modified bits in each of the given page's mappings.
*/
void
@@ -4670,40 +5713,70 @@
{
struct md_page *pvh;
pmap_t pmap;
+ struct rwlock *lock;
pv_entry_t next_pv, pv;
pd_entry_t *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW;
vm_offset_t va;
+ int pvh_gen, md_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_write: page %p is not managed", m));
/*
- * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
- * another thread while the object is locked. Thus, if PGA_WRITEABLE
- * is clear, no page table entries need updating.
+ * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+ * set by another thread while the object is locked. Thus,
+ * if PGA_WRITEABLE is clear, no page table entries need updating.
*/
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->oflags & VPO_BUSY) == 0 &&
- (m->aflags & PGA_WRITEABLE) == 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return;
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
+retry_pv_loop:
+ rw_wlock(lock);
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ rw_wunlock(lock);
+ goto retry_pv_loop;
+ }
+ }
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
if ((*pde & PG_RW) != 0)
- (void)pmap_demote_pde(pmap, pde, va);
+ (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
PMAP_UNLOCK(pmap);
}
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ md_gen = m->md.pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen ||
+ md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ rw_wunlock(lock);
+ goto retry_pv_loop;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0,
("pmap_remove_write: found a 2mpage in page %p's pv list",
@@ -4721,10 +5794,40 @@
}
PMAP_UNLOCK(pmap);
}
+ rw_wunlock(lock);
vm_page_aflag_clear(m, PGA_WRITEABLE);
- rw_wunlock(&pvh_global_lock);
+ rw_runlock(&pvh_global_lock);
}
+static __inline boolean_t
+safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
+{
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (TRUE);
+
+ KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
+
+ /*
+ * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
+ * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
+ * if the EPT_PG_WRITE bit is set.
+ */
+ if ((pte & EPT_PG_WRITE) != 0)
+ return (FALSE);
+
+ /*
+ * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
+ */
+ if ((pte & EPT_PG_EXECUTE) == 0 ||
+ ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+#define PMAP_TS_REFERENCED_MAX 5
+
/*
* pmap_ts_referenced:
*
@@ -4741,28 +5844,77 @@
pmap_ts_referenced(vm_page_t m)
{
struct md_page *pvh;
- pv_entry_t pv, pvf, pvn;
+ pv_entry_t pv, pvf;
pmap_t pmap;
+ struct rwlock *lock;
pd_entry_t oldpde, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_A;
vm_offset_t va;
- int rtval = 0;
+ vm_paddr_t pa;
+ int cleared, md_gen, not_cleared, pvh_gen;
+ struct spglist free;
+ boolean_t demoted;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_ts_referenced: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
+ SLIST_INIT(&free);
+ cleared = 0;
+ pa = VM_PAGE_TO_PHYS(m);
+ lock = PHYS_TO_PV_LIST_LOCK(pa);
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
+ rw_rlock(&pvh_global_lock);
+ rw_wlock(lock);
+retry:
+ not_cleared = 0;
+ if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
+ pv = pvf;
+ do {
+ if (pvf == NULL)
+ pvf = pv;
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto retry;
+ }
+ }
+ PG_A = pmap_accessed_bit(pmap);
va = pv->pv_va;
- pde = pmap_pde(pmap, va);
+ pde = pmap_pde(pmap, pv->pv_va);
oldpde = *pde;
- if ((oldpde & PG_A) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
- if ((oldpde & PG_W) == 0) {
+ if ((*pde & PG_A) != 0) {
+ /*
+ * Since this reference bit is shared by 512 4KB
+ * pages, it should not be cleared every time it is
+ * tested. Apply a simple "hash" function on the
+ * physical page number, the virtual superpage number,
+ * and the pmap address to select one 4KB page out of
+ * the 512 on which testing the reference bit will
+ * result in clearing that reference bit. This
+ * function is designed to avoid the selection of the
+ * same 4KB page for every 2MB page mapping.
+ *
+ * On demotion, a mapping that hasn't been referenced
+ * is simply destroyed. To avoid the possibility of a
+ * subsequent page fault on a demoted wired mapping,
+ * always leave its reference bit set. Moreover,
+ * since the superpage is wired, the current state of
+ * its reference bit won't affect page replacement.
+ */
+ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
+ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
+ (*pde & PG_W) == 0) {
+ if (safe_to_clear_referenced(pmap, oldpde)) {
+ atomic_clear_long(pde, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ demoted = FALSE;
+ } else if (pmap_demote_pde_locked(pmap, pde,
+ pv->pv_va, &lock)) {
/*
* Remove the mapping to a single page
* so that a subsequent access may
@@ -4771,45 +5923,245 @@
* this removal never frees a page
* table page.
*/
+ demoted = TRUE;
va += VM_PAGE_TO_PHYS(m) - (oldpde &
PG_PS_FRAME);
- pmap_remove_page(pmap, va, pde, NULL);
- rtval++;
- if (rtval > 4) {
- PMAP_UNLOCK(pmap);
- goto out;
- }
+ pte = pmap_pde_to_pte(pde, va);
+ pmap_remove_pte(pmap, pte, va, *pde,
+ NULL, &lock);
+ pmap_invalidate_page(pmap, va);
+ } else
+ demoted = TRUE;
+
+ if (demoted) {
+ /*
+ * The superpage mapping was removed
+ * entirely and therefore 'pv' is no
+ * longer valid.
+ */
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
}
- }
+ cleared++;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ } else
+ not_cleared++;
}
PMAP_UNLOCK(pmap);
- }
+ /* Rotate the PV list if it has more than one entry. */
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
+ }
+ if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
+ goto out;
+ } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
- if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
- pvf = pv;
- do {
- pvn = TAILQ_NEXT(pv, pv_list);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
- pmap = PV_PMAP(pv);
+ if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
+ goto out;
+ pv = pvf;
+ do {
+ if (pvf == NULL)
+ pvf = pv;
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ md_gen = m->md.pv_gen;
+ rw_wunlock(lock);
PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, pv->pv_va);
- KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
- " found a 2mpage in page %p's pv list", m));
- pte = pmap_pde_to_pte(pde, pv->pv_va);
- if ((*pte & PG_A) != 0) {
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto retry;
+ }
+ }
+ PG_A = pmap_accessed_bit(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0,
+ ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
+ m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if ((*pte & PG_A) != 0) {
+ if (safe_to_clear_referenced(pmap, *pte)) {
atomic_clear_long(pte, PG_A);
pmap_invalidate_page(pmap, pv->pv_va);
- rtval++;
- if (rtval > 4)
- pvn = NULL;
+ cleared++;
+ } else if ((*pte & PG_W) == 0) {
+ /*
+ * Wired pages cannot be paged out so
+ * doing accessed bit emulation for
+ * them is wasted effort. We do the
+ * hard work for unwired pages only.
+ */
+ pmap_remove_pte(pmap, pte, pv->pv_va,
+ *pde, &free, &lock);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ cleared++;
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ } else
+ not_cleared++;
+ }
+ PMAP_UNLOCK(pmap);
+ /* Rotate the PV list if it has more than one entry. */
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
+ }
+ } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
+ not_cleared < PMAP_TS_REFERENCED_MAX);
+out:
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ pmap_free_zero_pages(&free);
+ return (cleared + not_cleared);
+}
+
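Aside on the comment in pmap_ts_referenced() above (illustrative, not from the patch): for a 2MB mapping the accessed bit is shared by 512 4KB pages, so it is cleared only when a simple hash of the physical page number, the virtual superpage number, and the pmap address selects the page being queried (and only for unwired mappings). The selection expression can be evaluated on its own; constants are restated and the pmap address is invented:

#include <stdint.h>
#include <stdio.h>

#define	PAGE_SHIFT	12
#define	PDRSHIFT	21		/* log2 of the 2MB superpage size */
#define	NPTEPG		512		/* 4KB PTEs per 2MB superpage */

int
main(void)
{
	uint64_t pa = 0x12345000ULL;		/* base physical page of interest */
	uint64_t va = 0x00007f0000200000ULL;	/* base of the 2MB mapping */
	uint64_t pmap_addr = 0xfffffe0011fe4000ULL;	/* invented pmap address */
	unsigned selected;
	int i;

	/* Walk the 512 pages of the superpage; exactly one should hash to 0. */
	selected = 0;
	for (i = 0; i < NPTEPG; i++) {
		uint64_t p = pa + (uint64_t)i * 4096;
		if ((((p >> PAGE_SHIFT) ^ (va >> PDRSHIFT) ^ pmap_addr) &
		    (NPTEPG - 1)) == 0)
			selected++;
	}
	printf("pages whose reference bit would be cleared: %u\n", selected);
	return (0);
}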
+/*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap. Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+ struct rwlock *lock;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
+ vm_offset_t va_next;
+ vm_page_t m;
+ boolean_t anychanged, pv_lists_locked;
+
+ if (advice != MADV_DONTNEED && advice != MADV_FREE)
+ return;
+
+ /*
+ * A/D bit emulation requires an alternate code path when clearing
+ * the modified and accessed bits below. Since this function is
+ * advisory in nature we skip it entirely for pmaps that require
+ * A/D bit emulation.
+ */
+ if (pmap_emulate_ad_bits(pmap))
+ return;
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ pv_lists_locked = FALSE;
+resume:
+ anychanged = FALSE;
+ PMAP_LOCK(pmap);
+ for (; sva < eva; sva = va_next) {
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ oldpde = *pde;
+ if ((oldpde & PG_V) == 0)
+ continue;
+ else if ((oldpde & PG_PS) != 0) {
+ if ((oldpde & PG_MANAGED) == 0)
+ continue;
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ if (anychanged)
+ pmap_invalidate_all(pmap);
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto resume;
+ }
}
- PMAP_UNLOCK(pmap);
- } while ((pv = pvn) != NULL && pv != pvf);
+ lock = NULL;
+ if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
+ if (lock != NULL)
+ rw_wunlock(lock);
+
+ /*
+ * The large page mapping was destroyed.
+ */
+ continue;
+ }
+
+ /*
+ * Unless the page mappings are wired, remove the
+ * mapping to a single page so that a subsequent
+ * access may repromote. Since the underlying page
+ * table page is fully populated, this removal never
+ * frees a page table page.
+ */
+ if ((oldpde & PG_W) == 0) {
+ pte = pmap_pde_to_pte(pde, sva);
+ KASSERT((*pte & PG_V) != 0,
+ ("pmap_advise: invalid PTE"));
+ pmap_remove_pte(pmap, pte, sva, *pde, NULL,
+ &lock);
+ anychanged = TRUE;
+ }
+ if (lock != NULL)
+ rw_wunlock(lock);
+ }
+ if (va_next > eva)
+ va_next = eva;
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
+ PG_V))
+ continue;
+ else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+ if (advice == MADV_DONTNEED) {
+ /*
+ * Future calls to pmap_is_modified()
+ * can be avoided by making the page
+ * dirty now.
+ */
+ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+ vm_page_dirty(m);
+ }
+ atomic_clear_long(pte, PG_M | PG_A);
+ } else if ((*pte & PG_A) != 0)
+ atomic_clear_long(pte, PG_A);
+ else
+ continue;
+ if ((*pte & PG_G) != 0)
+ pmap_invalidate_page(pmap, sva);
+ else
+ anychanged = TRUE;
+ }
}
-out:
- rw_wunlock(&pvh_global_lock);
- return (rtval);
+ if (anychanged)
+ pmap_invalidate_all(pmap);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
}
/*
@@ -4822,34 +6174,50 @@
pmap_t pmap;
pv_entry_t next_pv, pv;
pd_entry_t oldpde, *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
+ struct rwlock *lock;
vm_offset_t va;
+ int md_gen, pvh_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_clear_modify: page %p is not managed", m));
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT((m->oflags & VPO_BUSY) == 0,
- ("pmap_clear_modify: page %p is busy", m));
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ KASSERT(!vm_page_xbusied(m),
+ ("pmap_clear_modify: page %p is exclusive busied", m));
/*
* If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
* If the object containing the page is locked and the page is not
- * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
+ * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
*/
if ((m->aflags & PGA_WRITEABLE) == 0)
return;
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_wlock(lock);
+restart:
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
oldpde = *pde;
if ((oldpde & PG_RW) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
+ if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
if ((oldpde & PG_W) == 0) {
/*
* Write protect the mapping to a
@@ -4873,10 +6241,21 @@
}
PMAP_UNLOCK(pmap);
}
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
" a 2mpage in page %p's pv list", m));
@@ -4887,76 +6266,17 @@
}
PMAP_UNLOCK(pmap);
}
- rw_wunlock(&pvh_global_lock);
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
}
/*
- * pmap_clear_reference:
- *
- * Clear the reference bit on the specified physical page.
- */
-void
-pmap_clear_reference(vm_page_t m)
-{
- struct md_page *pvh;
- pmap_t pmap;
- pv_entry_t next_pv, pv;
- pd_entry_t oldpde, *pde;
- pt_entry_t *pte;
- vm_offset_t va;
-
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("pmap_clear_reference: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- va = pv->pv_va;
- pde = pmap_pde(pmap, va);
- oldpde = *pde;
- if ((oldpde & PG_A) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
- /*
- * Remove the mapping to a single page so
- * that a subsequent access may repromote.
- * Since the underlying page table page is
- * fully populated, this removal never frees
- * a page table page.
- */
- va += VM_PAGE_TO_PHYS(m) - (oldpde &
- PG_PS_FRAME);
- pmap_remove_page(pmap, va, pde, NULL);
- }
- }
- PMAP_UNLOCK(pmap);
- }
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, pv->pv_va);
- KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
- " a 2mpage in page %p's pv list", m));
- pte = pmap_pde_to_pte(pde, pv->pv_va);
- if (*pte & PG_A) {
- atomic_clear_long(pte, PG_A);
- pmap_invalidate_page(pmap, pv->pv_va);
- }
- PMAP_UNLOCK(pmap);
- }
- rw_wunlock(&pvh_global_lock);
-}
-
-/*
* Miscellaneous support routines follow
*/
/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
-pmap_pte_attr(pt_entry_t *pte, int cache_bits)
+pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
{
u_int opte, npte;
@@ -4966,7 +6286,7 @@
*/
do {
opte = *(u_int *)pte;
- npte = opte & ~PG_PTE_CACHE;
+ npte = opte & ~mask;
npte |= cache_bits;
} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}
@@ -4973,7 +6293,7 @@
/* Adjust the cache mode for a 2MB page mapped via a PDE. */
static __inline void
-pmap_pde_attr(pd_entry_t *pde, int cache_bits)
+pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
{
u_int opde, npde;
@@ -4983,7 +6303,7 @@
*/
do {
opde = *(u_int *)pde;
- npde = opde & ~PG_PDE_CACHE;
+ npde = opde & ~mask;
npde |= cache_bits;
} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}
@@ -4997,28 +6317,58 @@
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
+ struct pmap_preinit_mapping *ppim;
vm_offset_t va, offset;
vm_size_t tmpsize;
+ int i;
- /*
- * If the specified range of physical addresses fits within the direct
- * map window, use the direct map.
- */
- if (pa < dmaplimit && pa + size < dmaplimit) {
- va = PHYS_TO_DMAP(pa);
- if (!pmap_change_attr(va, size, mode))
- return ((void *)va);
- }
offset = pa & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
- va = kmem_alloc_nofault(kernel_map, size);
- if (!va)
- panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
+ size = round_page(offset + size);
pa = trunc_page(pa);
+
+ if (!pmap_initialized) {
+ va = 0;
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == 0) {
+ ppim->pa = pa;
+ ppim->sz = size;
+ ppim->mode = mode;
+ ppim->va = virtual_avail;
+ virtual_avail += size;
+ va = ppim->va;
+ break;
+ }
+ }
+ if (va == 0)
+ panic("%s: too many preinit mappings", __func__);
+ } else {
+ /*
+ * If we have a preinit mapping, re-use it.
+ */
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->pa == pa && ppim->sz == size &&
+ ppim->mode == mode)
+ return ((void *)(ppim->va + offset));
+ }
+ /*
+ * If the specified range of physical addresses fits within
+ * the direct map window, use the direct map.
+ */
+ if (pa < dmaplimit && pa + size < dmaplimit) {
+ va = PHYS_TO_DMAP(pa);
+ if (!pmap_change_attr(va, size, mode))
+ return ((void *)(va + offset));
+ }
+ va = kva_alloc(size);
+ if (va == 0)
+ panic("%s: Couldn't allocate KVA", __func__);
+ }
for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
- pmap_invalidate_cache_range(va, va + tmpsize);
+ pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
return ((void *)(va + offset));
}
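Aside on the argument normalization above (illustrative, not from the patch): before picking a preinit slot, the direct map, or kva_alloc(), pmap_mapdev_attr() splits off the sub-page offset, rounds the size up to whole pages including that offset, and truncates pa to a page boundary; the caller gets back va + offset. The arithmetic spelled out stand-alone:

#include <stdint.h>
#include <stdio.h>

#define	PAGE_SIZE	4096UL
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)
#define	trunc_page(x)	((x) & ~PAGE_MASK)

int
main(void)
{
	uint64_t pa = 0xfed00321ULL;	/* device register, not page aligned */
	uint64_t size = 0x1000ULL;	/* caller asks for 4KB */
	uint64_t offset;

	offset = pa & PAGE_MASK;		/* 0x321 */
	size = round_page(offset + size);	/* two pages: 0x2000 */
	pa = trunc_page(pa);			/* 0xfed00000 */
	printf("map %#llx bytes at %#llx, hand back va + %#llx\n",
	    (unsigned long long)size, (unsigned long long)pa,
	    (unsigned long long)offset);
	return (0);
}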
@@ -5039,15 +6389,32 @@
void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
- vm_offset_t base, offset;
+ struct pmap_preinit_mapping *ppim;
+ vm_offset_t offset;
+ int i;
/* If we gave a direct map region in pmap_mapdev, do nothing */
if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
return;
- base = trunc_page(va);
offset = va & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
- kmem_free(kernel_map, base, size);
+ size = round_page(offset + size);
+ va = trunc_page(va);
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == va && ppim->sz == size) {
+ if (pmap_initialized)
+ return;
+ ppim->pa = 0;
+ ppim->va = 0;
+ ppim->sz = 0;
+ ppim->mode = 0;
+ if (va + size == virtual_avail)
+ virtual_avail = va;
+ return;
+ }
+ }
+ if (pmap_initialized)
+ kva_free(va, size);
}
/*
@@ -5058,9 +6425,15 @@
{
pdp_entry_t newpdpe, oldpdpe;
pd_entry_t *firstpde, newpde, *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t mpdepa;
vm_page_t mpde;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpdpe = *pdpe;
KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
@@ -5158,7 +6531,7 @@
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
vm_offset_t base, offset, tmpva;
- vm_paddr_t pa_start, pa_end;
+ vm_paddr_t pa_start, pa_end, pa_end1;
pdp_entry_t *pdpe;
pd_entry_t *pde;
pt_entry_t *pte;
@@ -5168,7 +6541,7 @@
PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
base = trunc_page(va);
offset = va & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
+ size = round_page(offset + size);
/*
* Only supported on kernel virtual addresses, including the direct
@@ -5177,8 +6550,8 @@
if (base < DMAP_MIN_ADDRESS)
return (EINVAL);
- cache_bits_pde = pmap_cache_bits(mode, 1);
- cache_bits_pte = pmap_cache_bits(mode, 0);
+ cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
+ cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
changed = FALSE;
/*
@@ -5187,7 +6560,7 @@
*/
for (tmpva = base; tmpva < base + size; ) {
pdpe = pmap_pdpe(kernel_pmap, tmpva);
- if (*pdpe == 0)
+ if (pdpe == NULL || *pdpe == 0)
return (EINVAL);
if (*pdpe & PG_PS) {
/*
@@ -5195,7 +6568,7 @@
* memory type, then we need not demote this page. Just
* increment tmpva to the next 1GB page frame.
*/
- if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_1gpage(tmpva) + NBPDP;
continue;
}
@@ -5222,7 +6595,7 @@
* memory type, then we need not demote this page. Just
* increment tmpva to the next 2MB page frame.
*/
- if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_2mpage(tmpva) + NBPDR;
continue;
}
@@ -5255,11 +6628,13 @@
for (tmpva = base; tmpva < base + size; ) {
pdpe = pmap_pdpe(kernel_pmap, tmpva);
if (*pdpe & PG_PS) {
- if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pdpe, cache_bits_pde);
+ if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pdpe, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pdpe & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pdpe & PG_PS_FRAME;
@@ -5283,11 +6658,13 @@
}
pde = pmap_pdpe_to_pde(pdpe, tmpva);
if (*pde & PG_PS) {
- if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pde, cache_bits_pde);
+ if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pde, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pde & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pde & PG_PS_FRAME;
@@ -5309,11 +6686,13 @@
tmpva = trunc_2mpage(tmpva) + NBPDR;
} else {
pte = pmap_pde_to_pte(pde, tmpva);
- if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
- pmap_pte_attr(pte, cache_bits_pte);
+ if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
+ pmap_pte_attr(pte, cache_bits_pte,
+ X86_PG_PTE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pte & PG_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pte & PG_FRAME;
@@ -5335,9 +6714,12 @@
tmpva += PAGE_SIZE;
}
}
- if (error == 0 && pa_start != pa_end)
- error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
- pa_end - pa_start, mode);
+ if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
+ pa_end1 = MIN(pa_end, dmaplimit);
+ if (pa_start != pa_end1)
+ error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+ pa_end1 - pa_start, mode);
+ }
/*
* Flush CPU caches if required to make sure any data isn't cached that
@@ -5345,7 +6727,7 @@
*/
if (changed) {
pmap_invalidate_range(kernel_pmap, base, tmpva);
- pmap_invalidate_cache_range(base, tmpva);
+ pmap_invalidate_cache_range(base, tmpva, FALSE);
}
return (error);
}
@@ -5375,7 +6757,7 @@
changed = FALSE;
PMAP_LOCK(kernel_pmap);
pdpe = pmap_pdpe(kernel_pmap, va);
- if ((*pdpe & PG_V) == 0)
+ if ((*pdpe & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDPE");
if ((*pdpe & PG_PS) != 0) {
if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
@@ -5384,7 +6766,7 @@
}
if (len < NBPDR) {
pde = pmap_pdpe_to_pde(pdpe, va);
- if ((*pde & PG_V) == 0)
+ if ((*pde & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDE");
if ((*pde & PG_PS) != 0) {
if (!pmap_demote_pde(kernel_pmap, pde, va))
@@ -5405,10 +6787,15 @@
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
pd_entry_t *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa;
int val;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, addr);
@@ -5453,7 +6840,6 @@
{
pmap_t pmap, oldpmap;
u_int cpuid;
- u_int64_t cr3;
critical_enter();
pmap = vmspace_pmap(td->td_proc->p_vmspace);
@@ -5462,13 +6848,14 @@
#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+ CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
#else
CPU_CLR(cpuid, &oldpmap->pm_active);
CPU_SET(cpuid, &pmap->pm_active);
+ CPU_SET(cpuid, &pmap->pm_save);
#endif
- cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
- td->td_pcb->pcb_cr3 = cr3;
- load_cr3(cr3);
+ td->td_pcb->pcb_cr3 = pmap->pm_cr3;
+ load_cr3(pmap->pm_cr3);
PCPU_SET(curpmap, pmap);
critical_exit();
}
@@ -5502,6 +6889,164 @@
*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}
+#ifdef INVARIANTS
+static unsigned long num_dirty_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
+ &num_dirty_emulations, 0, NULL);
+
+static unsigned long num_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
+ &num_accessed_emulations, 0, NULL);
+
+static unsigned long num_superpage_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
+ &num_superpage_accessed_emulations, 0, NULL);
+
+static unsigned long ad_emulation_superpage_promotions;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
+ &ad_emulation_superpage_promotions, 0, NULL);
+#endif /* INVARIANTS */
+
+int
+pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
+{
+ int rv;
+ struct rwlock *lock;
+ vm_page_t m, mpte;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
+ boolean_t pv_lists_locked;
+
+ KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
+ ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (-1);
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ rv = -1;
+ lock = NULL;
+ pv_lists_locked = FALSE;
+retry:
+ PMAP_LOCK(pmap);
+
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL || (*pde & PG_V) == 0)
+ goto done;
+
+ if ((*pde & PG_PS) != 0) {
+ if (ftype == VM_PROT_READ) {
+#ifdef INVARIANTS
+ atomic_add_long(&num_superpage_accessed_emulations, 1);
+#endif
+ *pde |= PG_A;
+ rv = 0;
+ }
+ goto done;
+ }
+
+ pte = pmap_pde_to_pte(pde, va);
+ if ((*pte & PG_V) == 0)
+ goto done;
+
+ if (ftype == VM_PROT_WRITE) {
+ if ((*pte & PG_RW) == 0)
+ goto done;
+ /*
+ * Set the modified and accessed bits simultaneously.
+ *
+ * Intel EPT PTEs that do software emulation of A/D bits map
+ * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
+ * An EPT misconfiguration is triggered if the PTE is writable
+ * but not readable (WR=10). This is avoided by setting PG_A
+ * and PG_M simultaneously.
+ */
+ *pte |= PG_M | PG_A;
+ } else {
+ *pte |= PG_A;
+ }
+
+ /* try to promote the mapping */
+ if (va < VM_MAXUSER_ADDRESS)
+ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+ else
+ mpte = NULL;
+
+ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+
+ if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
+ vm_reserv_level_iffullpop(m) == 0) {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto retry;
+ }
+ }
+ pmap_promote_pde(pmap, pde, va, &lock);
+#ifdef INVARIANTS
+ atomic_add_long(&ad_emulation_superpage_promotions, 1);
+#endif
+ }
+#ifdef INVARIANTS
+ if (ftype == VM_PROT_WRITE)
+ atomic_add_long(&num_dirty_emulations, 1);
+ else
+ atomic_add_long(&num_accessed_emulations, 1);
+#endif
+ rv = 0; /* success */
+done:
+ if (lock != NULL)
+ rw_wunlock(lock);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ return (rv);
+}
+
+void
+pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
+{
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_V;
+ int idx;
+
+ idx = 0;
+ PG_V = pmap_valid_bit(pmap);
+ PMAP_LOCK(pmap);
+
+ pml4 = pmap_pml4e(pmap, va);
+ ptr[idx++] = *pml4;
+ if ((*pml4 & PG_V) == 0)
+ goto done;
+
+ pdp = pmap_pml4e_to_pdpe(pml4, va);
+ ptr[idx++] = *pdp;
+ if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
+ goto done;
+
+ pde = pmap_pdpe_to_pde(pdp, va);
+ ptr[idx++] = *pde;
+ if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
+ goto done;
+
+ pte = pmap_pde_to_pte(pde, va);
+ ptr[idx++] = *pte;
+
+done:
+ PMAP_UNLOCK(pmap);
+ *num = idx;
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
@@ -5512,7 +7057,7 @@
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_offset_t va;
if (have_addr) {
@@ -5522,6 +7067,7 @@
db_printf("show pte addr\n");
return;
}
+ PG_V = pmap_valid_bit(pmap);
pml4 = pmap_pml4e(pmap, va);
db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
if ((*pml4 & PG_V) == 0) {
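For reference, the new pmap_get_mapping() helper above returns the PML4, PDP, PD and PT entries selected by the 4-level walk for a given virtual address. The index arithmetic behind such a walk is architectural (9 index bits per level over a 12-bit page offset); the standalone sketch below only illustrates which slot each level uses and is not taken from the kernel sources.

    #include <stdio.h>
    #include <stdint.h>

    /* x86-64 long mode: 9 index bits per paging level, 12-bit page offset. */
    #define PT_IDX(va)   (((va) >> 12) & 0x1ff)   /* page table       */
    #define PD_IDX(va)   (((va) >> 21) & 0x1ff)   /* page directory   */
    #define PDP_IDX(va)  (((va) >> 30) & 0x1ff)   /* page dir pointer */
    #define PML4_IDX(va) (((va) >> 39) & 0x1ff)   /* PML4             */

    int
    main(void)
    {
            uint64_t va = 0x00007fffdeadb000ULL;    /* arbitrary user VA */

            printf("PML4 %ju PDP %ju PD %ju PT %ju offset %#jx\n",
                (uintmax_t)PML4_IDX(va), (uintmax_t)PDP_IDX(va),
                (uintmax_t)PD_IDX(va), (uintmax_t)PT_IDX(va),
                (uintmax_t)(va & 0xfff));
            return (0);
    }
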
Modified: trunk/sys/amd64/amd64/prof_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/prof_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/prof_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1996 Bruce D. Evans.
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/prof_machdep.c 220433 2011-04-07 23:28:28Z jkim $");
#ifdef GUPROF
#if 0
Modified: trunk/sys/amd64/amd64/ptrace_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/ptrace_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/ptrace_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2011 Konstantin Belousov <kib at FreeBSD.org>
* All rights reserved.
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/ptrace_machdep.c 332069 2018-04-05 13:39:53Z kib $");
#include "opt_compat.h"
@@ -36,12 +37,27 @@
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/sysent.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
+#include <machine/frame.h>
+#include <machine/vmparam.h>
+#ifdef COMPAT_FREEBSD32
+struct ptrace_xstate_info32 {
+ uint32_t xsave_mask1, xsave_mask2;
+ uint32_t xsave_len;
+};
+#endif
+
static int
cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data)
{
+ struct ptrace_xstate_info info;
+#ifdef COMPAT_FREEBSD32
+ struct ptrace_xstate_info32 info32;
+#endif
char *savefpu;
int error;
@@ -49,7 +65,7 @@
return (EOPNOTSUPP);
switch (req) {
- case PT_GETXSTATE:
+ case PT_GETXSTATE_OLD:
fpugetregs(td);
savefpu = (char *)(get_pcb_user_save_td(td) + 1);
error = copyout(savefpu, addr,
@@ -56,7 +72,7 @@
cpu_max_ext_state_size - sizeof(struct savefpu));
break;
- case PT_SETXSTATE:
+ case PT_SETXSTATE_OLD:
if (data > cpu_max_ext_state_size - sizeof(struct savefpu)) {
error = EINVAL;
break;
@@ -70,6 +86,52 @@
free(savefpu, M_TEMP);
break;
+ case PT_GETXSTATE_INFO:
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ if (data != sizeof(info32)) {
+ error = EINVAL;
+ } else {
+ info32.xsave_len = cpu_max_ext_state_size;
+ info32.xsave_mask1 = xsave_mask;
+ info32.xsave_mask2 = xsave_mask >> 32;
+ error = copyout(&info32, addr, data);
+ }
+ } else
+#endif
+ {
+ if (data != sizeof(info)) {
+ error = EINVAL;
+ } else {
+ bzero(&info, sizeof(info));
+ info.xsave_len = cpu_max_ext_state_size;
+ info.xsave_mask = xsave_mask;
+ error = copyout(&info, addr, data);
+ }
+ }
+ break;
+
+ case PT_GETXSTATE:
+ fpugetregs(td);
+ savefpu = (char *)(get_pcb_user_save_td(td));
+ error = copyout(savefpu, addr, cpu_max_ext_state_size);
+ break;
+
+ case PT_SETXSTATE:
+ if (data < sizeof(struct savefpu) ||
+ data > cpu_max_ext_state_size) {
+ error = EINVAL;
+ break;
+ }
+ savefpu = malloc(data, M_TEMP, M_WAITOK);
+ error = copyin(addr, savefpu, data);
+ if (error == 0)
+ error = fpusetregs(td, (struct savefpu *)savefpu,
+ savefpu + sizeof(struct savefpu), data -
+ sizeof(struct savefpu));
+ free(savefpu, M_TEMP);
+ break;
+
default:
error = EINVAL;
break;
@@ -78,16 +140,29 @@
return (error);
}
+static void
+cpu_ptrace_setbase(struct thread *td, int req, register_t r)
+{
+
+ if (req == PT_SETFSBASE) {
+ td->td_pcb->pcb_fsbase = r;
+ td->td_frame->tf_fs = _ufssel;
+ } else {
+ td->td_pcb->pcb_gsbase = r;
+ td->td_frame->tf_gs = _ugssel;
+ }
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+}
+
#ifdef COMPAT_FREEBSD32
#define PT_I386_GETXMMREGS (PT_FIRSTMACH + 0)
#define PT_I386_SETXMMREGS (PT_FIRSTMACH + 1)
-#define PT_I386_GETXSTATE (PT_FIRSTMACH + 2)
-#define PT_I386_SETXSTATE (PT_FIRSTMACH + 3)
static int
cpu32_ptrace(struct thread *td, int req, void *addr, int data)
{
struct savefpu *fpstate;
+ uint32_t r;
int error;
switch (req) {
@@ -104,14 +179,37 @@
fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
break;
- case PT_I386_GETXSTATE:
- error = cpu_ptrace_xstate(td, PT_GETXSTATE, addr, data);
+ case PT_GETXSTATE_OLD:
+ case PT_SETXSTATE_OLD:
+ case PT_GETXSTATE_INFO:
+ case PT_GETXSTATE:
+ case PT_SETXSTATE:
+ error = cpu_ptrace_xstate(td, req, addr, data);
break;
- case PT_I386_SETXSTATE:
- error = cpu_ptrace_xstate(td, PT_SETXSTATE, addr, data);
+ case PT_GETFSBASE:
+ case PT_GETGSBASE:
+ if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ error = EINVAL;
+ break;
+ }
+ r = req == PT_GETFSBASE ? td->td_pcb->pcb_fsbase :
+ td->td_pcb->pcb_gsbase;
+ error = copyout(&r, addr, sizeof(r));
break;
+ case PT_SETFSBASE:
+ case PT_SETGSBASE:
+ if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ error = EINVAL;
+ break;
+ }
+ error = copyin(addr, &r, sizeof(r));
+ if (error != 0)
+ break;
+ cpu_ptrace_setbase(td, req, r);
+ break;
+
default:
error = EINVAL;
break;
@@ -124,6 +222,7 @@
int
cpu_ptrace(struct thread *td, int req, void *addr, int data)
{
+ register_t *r, rv;
int error;
#ifdef COMPAT_FREEBSD32
@@ -131,18 +230,40 @@
return (cpu32_ptrace(td, req, addr, data));
#endif
- /* Support old values of PT_GETXSTATE and PT_SETXSTATE. */
+ /* Support old values of PT_GETXSTATE_OLD and PT_SETXSTATE_OLD. */
if (req == PT_FIRSTMACH + 0)
- req = PT_GETXSTATE;
+ req = PT_GETXSTATE_OLD;
if (req == PT_FIRSTMACH + 1)
- req = PT_SETXSTATE;
+ req = PT_SETXSTATE_OLD;
switch (req) {
+ case PT_GETXSTATE_OLD:
+ case PT_SETXSTATE_OLD:
+ case PT_GETXSTATE_INFO:
case PT_GETXSTATE:
case PT_SETXSTATE:
error = cpu_ptrace_xstate(td, req, addr, data);
break;
+ case PT_GETFSBASE:
+ case PT_GETGSBASE:
+ r = req == PT_GETFSBASE ? &td->td_pcb->pcb_fsbase :
+ &td->td_pcb->pcb_gsbase;
+ error = copyout(r, addr, sizeof(*r));
+ break;
+
+ case PT_SETFSBASE:
+ case PT_SETGSBASE:
+ error = copyin(addr, &rv, sizeof(rv));
+ if (error != 0)
+ break;
+ if (rv >= td->td_proc->p_sysent->sv_maxuser) {
+ error = EINVAL;
+ break;
+ }
+ cpu_ptrace_setbase(td, req, rv);
+ break;
+
default:
error = EINVAL;
break;
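The new PT_GETFSBASE/PT_SETFSBASE (and GS equivalents) requests make the segment bases reachable from a debugger once the target is stopped. Below is a minimal userland sketch, assuming the request codes are exposed by <sys/ptrace.h> on the running system and that addr points at a register_t as in the kernel code above; consult ptrace(2) before relying on the exact calling convention.

    #include <sys/types.h>
    #include <sys/ptrace.h>

    #include <err.h>
    #include <stdio.h>

    /* Print the fs base of an already-attached, stopped process. */
    static void
    show_fsbase(pid_t pid)
    {
            register_t base;

            if (ptrace(PT_GETFSBASE, pid, (caddr_t)&base, 0) == -1)
                    err(1, "PT_GETFSBASE");
            printf("pid %d fsbase %#lx\n", (int)pid, (unsigned long)base);
    }
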
Modified: trunk/sys/amd64/amd64/sigtramp.S
===================================================================
--- trunk/sys/amd64/amd64/sigtramp.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/sigtramp.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm <peter at freeBSD.org>
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/sigtramp.S 114349 2003-05-01 01:05:25Z peter $
*/
#include <sys/syscall.h>
Property changes on: trunk/sys/amd64/amd64/sigtramp.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/stack_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/stack_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/stack_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2005 Antoine Brodin
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/stack_machdep.c 286396 2015-08-07 04:31:02Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -40,7 +41,7 @@
#include <vm/pmap.h>
static void
-stack_capture(struct stack *st, register_t rbp)
+stack_capture(struct thread *td, struct stack *st, register_t rbp)
{
struct amd64_frame *frame;
vm_offset_t callpc;
@@ -56,8 +57,8 @@
if (stack_put(st, callpc) == -1)
break;
if (frame->f_frame <= frame ||
- (vm_offset_t)frame->f_frame >=
- (vm_offset_t)rbp + KSTACK_PAGES * PAGE_SIZE)
+ (vm_offset_t)frame->f_frame >= td->td_kstack +
+ td->td_kstack_pages * PAGE_SIZE)
break;
frame = frame->f_frame;
}
@@ -74,7 +75,7 @@
panic("stack_save_td: running");
rbp = td->td_pcb->pcb_rbp;
- stack_capture(st, rbp);
+ stack_capture(td, st, rbp);
}
void
@@ -83,5 +84,5 @@
register_t rbp;
__asm __volatile("movq %%rbp,%0" : "=r" (rbp));
- stack_capture(st, rbp);
+ stack_capture(curthread, st, rbp);
}
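stack_capture() now takes the owning thread and bounds the frame-pointer walk by that thread's kernel stack (td_kstack .. td_kstack + td_kstack_pages * PAGE_SIZE) instead of deriving the limit from the starting %rbp. The underlying technique is the usual saved-frame-pointer chain walk with an explicit stack-range check; the sketch below is an illustrative userland rendering, not the kernel's struct layout.

    #include <stddef.h>
    #include <stdint.h>

    struct frame {
            struct frame    *f_frame;       /* saved frame pointer */
            void            *f_retaddr;     /* saved return address */
    };

    /* Collect return addresses while the chain stays inside [base, base+size). */
    static int
    walk_frames(struct frame *fp, uintptr_t base, size_t size,
        void **pcs, int npcs)
    {
            int n = 0;

            while (n < npcs) {
                    if ((uintptr_t)fp < base || (uintptr_t)fp >= base + size)
                            break;                  /* left the stack */
                    pcs[n++] = fp->f_retaddr;
                    if (fp->f_frame <= fp)          /* chain must move upward */
                            break;
                    fp = fp->f_frame;
            }
            return (n);
    }
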
Modified: trunk/sys/amd64/amd64/support.S
===================================================================
--- trunk/sys/amd64/amd64/support.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/support.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1993 The Regents of the University of California.
@@ -27,7 +28,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/support.S 310975 2016-12-31 13:23:28Z mjg $
*/
#include "opt_ddb.h"
@@ -47,6 +48,7 @@
/* done */
ENTRY(bzero)
+ PUSH_FRAME_POINTER
movq %rsi,%rcx
xorl %eax,%eax
shrq $3,%rcx
@@ -57,26 +59,23 @@
andq $7,%rcx
rep
stosb
+ POP_FRAME_POINTER
ret
END(bzero)
-
+
/* Address: %rdi */
ENTRY(pagezero)
- movq $-PAGE_SIZE,%rdx
- subq %rdx,%rdi
+ PUSH_FRAME_POINTER
+ movq $PAGE_SIZE/8,%rcx
xorl %eax,%eax
-1:
- movnti %rax,(%rdi,%rdx)
- movnti %rax,8(%rdi,%rdx)
- movnti %rax,16(%rdi,%rdx)
- movnti %rax,24(%rdi,%rdx)
- addq $32,%rdx
- jne 1b
- sfence
+ rep
+ stosq
+ POP_FRAME_POINTER
ret
END(pagezero)
ENTRY(bcmp)
+ PUSH_FRAME_POINTER
movq %rdx,%rcx
shrq $3,%rcx
cld /* compare forwards */
@@ -91,6 +90,7 @@
1:
setne %al
movsbl %al,%eax
+ POP_FRAME_POINTER
ret
END(bcmp)
@@ -100,6 +100,7 @@
* ws at tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
ENTRY(bcopy)
+ PUSH_FRAME_POINTER
xchgq %rsi,%rdi
movq %rdx,%rcx
@@ -116,6 +117,7 @@
andq $7,%rcx /* any bytes left? */
rep
movsb
+ POP_FRAME_POINTER
ret
/* ALIGN_TEXT */
@@ -135,13 +137,16 @@
rep
movsq
cld
+ POP_FRAME_POINTER
ret
END(bcopy)
-
+
/*
* Note: memcpy does not support overlapping copies
*/
ENTRY(memcpy)
+ PUSH_FRAME_POINTER
+ movq %rdi,%rax
movq %rdx,%rcx
shrq $3,%rcx /* copy by 64-bit words */
cld /* copy forwards */
@@ -151,6 +156,7 @@
andq $7,%rcx /* any bytes left? */
rep
movsb
+ POP_FRAME_POINTER
ret
END(memcpy)
@@ -158,6 +164,7 @@
* pagecopy(%rdi=from, %rsi=to)
*/
ENTRY(pagecopy)
+ PUSH_FRAME_POINTER
movq $-PAGE_SIZE,%rax
movq %rax,%rdx
subq %rax,%rdi
@@ -178,18 +185,21 @@
addq $32,%rdx
jne 2b
sfence
+ POP_FRAME_POINTER
ret
END(pagecopy)
-/* fillw(pat, base, cnt) */
+/* fillw(pat, base, cnt) */
/* %rdi,%rsi, %rdx */
ENTRY(fillw)
- movq %rdi,%rax
+ PUSH_FRAME_POINTER
+ movq %rdi,%rax
movq %rsi,%rdi
movq %rdx,%rcx
cld
rep
stosw
+ POP_FRAME_POINTER
ret
END(fillw)
@@ -210,6 +220,7 @@
* %rdi, %rsi, %rdx
*/
ENTRY(copyout)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rax
movq $copyout_fault,PCB_ONFAULT(%rax)
testq %rdx,%rdx /* anything to do? */
@@ -255,6 +266,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rdx
movq %rax,PCB_ONFAULT(%rdx)
+ POP_FRAME_POINTER
ret
ALIGN_TEXT
@@ -262,6 +274,7 @@
movq PCPU(CURPCB),%rdx
movq $0,PCB_ONFAULT(%rdx)
movq $EFAULT,%rax
+ POP_FRAME_POINTER
ret
END(copyout)
@@ -270,6 +283,7 @@
* %rdi, %rsi, %rdx
*/
ENTRY(copyin)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rax
movq $copyin_fault,PCB_ONFAULT(%rax)
testq %rdx,%rdx /* anything to do? */
@@ -301,6 +315,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rdx
movq %rax,PCB_ONFAULT(%rdx)
+ POP_FRAME_POINTER
ret
ALIGN_TEXT
@@ -308,16 +323,19 @@
movq PCPU(CURPCB),%rdx
movq $0,PCB_ONFAULT(%rdx)
movq $EFAULT,%rax
+ POP_FRAME_POINTER
ret
END(copyin)
/*
- * casuword32. Compare and set user integer. Returns -1 or the current value.
- * dst = %rdi, old = %rsi, new = %rdx
+ * casueword32. Compare and set user integer. Returns -1 on fault,
+ * 0 if access was successful. Old value is written to *oldp.
+ * dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
*/
-ENTRY(casuword32)
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
+ENTRY(casueword32)
+ PUSH_FRAME_POINTER
+ movq PCPU(CURPCB),%r8
+ movq $fusufault,PCB_ONFAULT(%r8)
movq $VM_MAXUSER_ADDRESS-4,%rax
cmpq %rax,%rdi /* verify address is valid */
@@ -327,26 +345,36 @@
#ifdef SMP
lock
#endif
- cmpxchgl %edx,(%rdi) /* new = %edx */
+ cmpxchgl %ecx,(%rdi) /* new = %ecx */
/*
* The old value is in %eax. If the store succeeded it will be the
* value we expected (old) from before the store, otherwise it will
- * be the current value.
+ * be the current value. Save %eax into %esi to prepare the return
+ * value.
*/
+ movl %eax,%esi
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
- movq PCPU(CURPCB),%rcx
- movq $0,PCB_ONFAULT(%rcx)
+ /*
+ * Access the oldp after the pcb_onfault is cleared, to correctly
+ * catch corrupted pointer.
+ */
+ movl %esi,(%rdx) /* oldp = %rdx */
+ POP_FRAME_POINTER
ret
-END(casuword32)
+END(casueword32)
/*
- * casuword. Compare and set user word. Returns -1 or the current value.
- * dst = %rdi, old = %rsi, new = %rdx
+ * casueword. Compare and set user long. Returns -1 on fault,
+ * 0 if access was successful. Old value is written to *oldp.
+ * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
*/
-ENTRY(casuword)
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
+ENTRY(casueword)
+ PUSH_FRAME_POINTER
+ movq PCPU(CURPCB),%r8
+ movq $fusufault,PCB_ONFAULT(%r8)
movq $VM_MAXUSER_ADDRESS-4,%rax
cmpq %rax,%rdi /* verify address is valid */
@@ -356,28 +384,30 @@
#ifdef SMP
lock
#endif
- cmpxchgq %rdx,(%rdi) /* new = %rdx */
+ cmpxchgq %rcx,(%rdi) /* new = %rcx */
/*
- * The old value is in %eax. If the store succeeded it will be the
+ * The old value is in %rax. If the store succeeded it will be the
* value we expected (old) from before the store, otherwise it will
* be the current value.
*/
-
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
- movq $0,PCB_ONFAULT(%rcx)
+ movq %rax,%rsi
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ movq %rsi,(%rdx)
+ POP_FRAME_POINTER
ret
-END(casuword)
+END(casueword)
/*
* Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
- * byte from user memory. All these functions are MPSAFE.
- * addr = %rdi
+ * byte from user memory.
+ * addr = %rdi, valp = %rsi
*/
-ALTENTRY(fuword64)
-ENTRY(fuword)
+ALTENTRY(fueword64)
+ENTRY(fueword)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -385,13 +415,17 @@
cmpq %rax,%rdi /* verify address is valid */
ja fusufault
- movq (%rdi),%rax
- movq $0,PCB_ONFAULT(%rcx)
+ xorl %eax,%eax
+ movq (%rdi),%r11
+ movq %rax,PCB_ONFAULT(%rcx)
+ movq %r11,(%rsi)
+ POP_FRAME_POINTER
ret
-END(fuword64)
-END(fuword)
+END(fueword64)
+END(fueword)
-ENTRY(fuword32)
+ENTRY(fueword32)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -399,10 +433,13 @@
cmpq %rax,%rdi /* verify address is valid */
ja fusufault
- movl (%rdi),%eax
- movq $0,PCB_ONFAULT(%rcx)
+ xorl %eax,%eax
+ movl (%rdi),%r11d
+ movq %rax,PCB_ONFAULT(%rcx)
+ movl %r11d,(%rsi)
+ POP_FRAME_POINTER
ret
-END(fuword32)
+END(fueword32)
/*
* fuswintr() and suswintr() are specialized variants of fuword16() and
@@ -419,6 +456,7 @@
END(fuswintr)
ENTRY(fuword16)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -428,10 +466,12 @@
movzwl (%rdi),%eax
movq $0,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(fuword16)
ENTRY(fubyte)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -441,6 +481,7 @@
movzbl (%rdi),%eax
movq $0,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(fubyte)
@@ -450,6 +491,7 @@
xorl %eax,%eax
movq %rax,PCB_ONFAULT(%rcx)
decq %rax
+ POP_FRAME_POINTER
ret
/*
@@ -459,6 +501,7 @@
*/
ALTENTRY(suword64)
ENTRY(suword)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -470,11 +513,13 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword64)
END(suword)
ENTRY(suword32)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -486,10 +531,12 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword32)
ENTRY(suword16)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -501,10 +548,12 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx /* restore trashed register */
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword16)
ENTRY(subyte)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -517,6 +566,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx /* restore trashed register */
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(subyte)
@@ -530,6 +580,7 @@
* return the actual length in *lencopied.
*/
ENTRY(copyinstr)
+ PUSH_FRAME_POINTER
movq %rdx,%r8 /* %r8 = maxlen */
movq %rcx,%r9 /* %r9 = *len */
xchgq %rdi,%rsi /* %rdi = from, %rsi = to */
@@ -586,6 +637,7 @@
subq %rdx,%r8
movq %r8,(%r9)
1:
+ POP_FRAME_POINTER
ret
END(copyinstr)
@@ -594,6 +646,7 @@
* %rdi, %rsi, %rdx, %rcx
*/
ENTRY(copystr)
+ PUSH_FRAME_POINTER
movq %rdx,%r8 /* %r8 = maxlen */
xchgq %rdi,%rsi
@@ -623,6 +676,7 @@
subq %rdx,%r8
movq %r8,(%rcx)
7:
+ POP_FRAME_POINTER
ret
END(copystr)
@@ -692,6 +746,7 @@
*/
ENTRY(rdmsr_safe)
/* int rdmsr_safe(u_int msr, uint64_t *data) */
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%r8
movq $msr_onfault,PCB_ONFAULT(%r8)
movl %edi,%ecx
@@ -703,6 +758,7 @@
movq %rax,(%rsi)
xorq %rax,%rax
movq %rax,PCB_ONFAULT(%r8)
+ POP_FRAME_POINTER
ret
/*
@@ -710,6 +766,7 @@
*/
ENTRY(wrmsr_safe)
/* int wrmsr_safe(u_int msr, uint64_t data) */
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%r8
movq $msr_onfault,PCB_ONFAULT(%r8)
movl %edi,%ecx
@@ -720,6 +777,7 @@
hi byte in edx, lo in %eax. */
xorq %rax,%rax
movq %rax,PCB_ONFAULT(%r8)
+ POP_FRAME_POINTER
ret
/*
@@ -729,4 +787,5 @@
msr_onfault:
movq $0,PCB_ONFAULT(%r8)
movl $EFAULT,%eax
+ POP_FRAME_POINTER
ret
Property changes on: trunk/sys/amd64/amd64/support.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
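The fuword()/casuword() to fueword()/casueword() rename above also changes the calling convention: the new routines return 0 on success or -1 on fault and hand the fetched or observed value back through a pointer, so -1 is no longer ambiguous between a fault and a legitimate value. A hedged caller-side sketch of the new convention follows; the function names match the diff, while the surrounding variables and error handling are illustrative only.

    long val;
    uint32_t expected, observed;

    /* Fetch a user long; -1 now unambiguously means a fault. */
    if (fueword(uaddr, &val) == -1)
            return (EFAULT);

    /* Compare-and-set a user 32-bit word. */
    expected = 0;
    if (casueword32(uaddr32, expected, &observed, newval) == -1)
            return (EFAULT);
    if (observed != expected) {
            /* CAS did not take effect; 'observed' is what was actually there. */
    }
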
Modified: trunk/sys/amd64/amd64/sys_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/sys_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/sys_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1990 The Regents of the University of California.
@@ -31,13 +32,13 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/sys_machdep.c 307940 2016-10-25 17:16:08Z glebius $");
#include "opt_capsicum.h"
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -207,6 +208,10 @@
case I386_SET_IOPERM:
default:
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_SYSCALL, NULL, NULL);
+#endif
return (ECAPMODE);
}
}
@@ -316,7 +321,7 @@
fpugetregs(td);
error = copyout((char *)(get_pcb_user_save_td(td) + 1),
a64xfpu.addr, a64xfpu.len);
- return (error);
+ break;
default:
error = EINVAL;
@@ -330,18 +335,20 @@
struct thread *td;
struct i386_ioperm_args *uap;
{
- int i, error;
char *iomap;
struct amd64tss *tssp;
struct system_segment_descriptor *tss_sd;
u_long *addr;
struct pcb *pcb;
+ u_int i;
+ int error;
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
- if (uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
+ if (uap->start > uap->start + uap->length ||
+ uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
return (EINVAL);
/*
@@ -352,8 +359,8 @@
*/
pcb = td->td_pcb;
if (pcb->pcb_tssp == NULL) {
- tssp = (struct amd64tss *)kmem_alloc(kernel_map,
- ctob(IOPAGES+1));
+ tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
+ ctob(IOPAGES+1), M_WAITOK);
if (tssp == NULL)
return (ENOMEM);
iomap = (char *)&tssp[1];
@@ -459,8 +466,9 @@
return (mdp->md_ldt);
mtx_unlock(&dt_lock);
new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
- new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map,
- max_ldt_segment * sizeof(struct user_segment_descriptor));
+ new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
+ max_ldt_segment * sizeof(struct user_segment_descriptor),
+ M_WAITOK | M_ZERO);
if (new_ldt->ldt_base == NULL) {
FREE(new_ldt, M_SUBPROC);
mtx_lock(&dt_lock);
@@ -479,7 +487,7 @@
mtx_lock(&dt_lock);
pldt = mdp->md_ldt;
if (pldt != NULL && !force) {
- kmem_free(kernel_map, (vm_offset_t)new_ldt->ldt_base,
+ kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
max_ldt_segment * sizeof(struct user_segment_descriptor));
free(new_ldt, M_SUBPROC);
return (pldt);
@@ -524,7 +532,7 @@
{
if (--pldt->ldt_refcnt == 0) {
- kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base,
+ kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
max_ldt_segment * sizeof(struct user_segment_descriptor));
free(pldt, M_SUBPROC);
}
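The TSS/io-permission bitmap and LDT backing store now come from kmem_malloc(kernel_arena, ...) with an explicit M_WAITOK and are released through kmem_free(kernel_arena, ...), replacing the older kmem_alloc(kernel_map, ...) calls. A short sketch of the updated allocate/free pairing, with sizes as in the diff and error handling trimmed:

    struct amd64tss *tssp;

    tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
        ctob(IOPAGES + 1), M_WAITOK);
    if (tssp == NULL)
            return (ENOMEM);
    /* ... install the TSS and io bitmap ... */
    kmem_free(kernel_arena, (vm_offset_t)tssp, ctob(IOPAGES + 1));
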
Modified: trunk/sys/amd64/amd64/trap.c
===================================================================
--- trunk/sys/amd64/amd64/trap.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/trap.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
@@ -38,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/trap.c 333370 2018-05-08 17:05:39Z emaste $");
/*
* AMD64 Trap and System call handling
@@ -45,6 +46,7 @@
*/
#include "opt_clock.h"
+#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
@@ -96,31 +98,11 @@
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
+#endif
-/*
- * This is a hook which is initialised by the dtrace module
- * to handle traps which might occur during DTrace probe
- * execution.
- */
-dtrace_trap_func_t dtrace_trap_func;
+extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(fast_syscall),
+ IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall);
-dtrace_doubletrap_func_t dtrace_doubletrap_func;
-
-/*
- * This is a hook which is initialised by the systrace module
- * when it is loaded. This keeps the DTrace syscall provider
- * implementation opaque.
- */
-systrace_probe_func_t systrace_probe_func;
-
-/*
- * These hooks are necessary for the pid, usdt and fasttrap providers.
- */
-dtrace_fasttrap_probe_ptr_t dtrace_fasttrap_probe_ptr;
-dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr;
-dtrace_return_probe_ptr_t dtrace_return_probe_ptr;
-#endif
-
extern void trap(struct trapframe *frame);
extern void syscall(struct trapframe *frame);
void dblfault_handler(struct trapframe *frame);
@@ -128,7 +110,7 @@
static int trap_pfault(struct trapframe *, int);
static void trap_fatal(struct trapframe *, vm_offset_t);
-#define MAX_TRAP_MSG 33
+#define MAX_TRAP_MSG 32
static char *trap_msg[] = {
"", /* 0 unused */
"privileged instruction fault", /* 1 T_PRIVINFLT */
@@ -163,7 +145,6 @@
"reserved (unknown) fault", /* 30 T_RESERVED */
"", /* 31 unused (reserved) */
"DTrace pid return trap", /* 32 T_DTRACE_RET */
- "DTrace fasttrap probe trap", /* 33 T_DTRACE_PROBE */
};
#ifdef KDB
@@ -195,6 +176,9 @@
void
trap(struct trapframe *frame)
{
+#ifdef KDTRACE_HOOKS
+ struct reg regs;
+#endif
struct thread *td = curthread;
struct proc *p = td->td_proc;
int i = 0, ucode = 0, code;
@@ -246,33 +230,10 @@
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
- * a flag in it's per-cpu flags to indicate that it doesn't
+ * a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
- *
- * If the DTrace kernel module has registered a trap handler,
- * call it and if it returns non-zero, assume that it has
- * handled the trap and modified the trap frame so that this
- * function can return normally.
*/
- if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
- type == T_BPTFLT) {
- struct reg regs;
-
- fill_frame_regs(frame, &regs);
- if (type == T_DTRACE_PROBE &&
- dtrace_fasttrap_probe_ptr != NULL &&
- dtrace_fasttrap_probe_ptr(&regs) == 0)
- goto out;
- else if (type == T_BPTFLT &&
- dtrace_pid_probe_ptr != NULL &&
- dtrace_pid_probe_ptr(&regs) == 0)
- goto out;
- else if (type == T_DTRACE_RET &&
- dtrace_return_probe_ptr != NULL &&
- dtrace_return_probe_ptr(&regs) == 0)
- goto out;
- }
if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
goto out;
#endif
@@ -327,6 +288,14 @@
case T_BPTFLT: /* bpt instruction fault */
case T_TRCTRAP: /* trace trap */
enable_intr();
+#ifdef KDTRACE_HOOKS
+ if (type == T_BPTFLT) {
+ fill_frame_regs(frame, &regs);
+ if (dtrace_pid_probe_ptr != NULL &&
+ dtrace_pid_probe_ptr(&regs) == 0)
+ goto out;
+ }
+#endif
frame->tf_rflags &= ~PSL_T;
i = SIGTRAP;
ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
@@ -352,6 +321,10 @@
i = SIGBUS;
ucode = BUS_OBJERR;
break;
+ case T_ALIGNFLT:
+ i = SIGBUS;
+ ucode = BUS_ADRALN;
+ break;
case T_DOUBLEFLT: /* double fault */
default:
i = SIGBUS;
@@ -359,6 +332,13 @@
break;
case T_PAGEFLT: /* page fault */
+ /*
+ * Emulator can take care about this trap?
+ */
+ if (*p->p_sysent->sv_trap != NULL &&
+ (*p->p_sysent->sv_trap)(td) == 0)
+ goto userout;
+
addr = frame->tf_addr;
i = trap_pfault(frame, TRUE);
if (i == -1)
@@ -421,7 +401,7 @@
goto userout;
} else if (panic_on_nmi)
panic("NMI indicates hardware failure");
- break;
+ goto out;
#endif /* DEV_ISA */
case T_OFLOW: /* integer overflow fault */
@@ -452,6 +432,15 @@
goto userout;
i = SIGFPE;
break;
+#ifdef KDTRACE_HOOKS
+ case T_DTRACE_RET:
+ enable_intr();
+ fill_frame_regs(frame, &regs);
+ if (dtrace_return_probe_ptr != NULL &&
+ dtrace_return_probe_ptr(&regs) == 0)
+ goto out;
+ goto userout;
+#endif
}
} else {
/* kernel trap */
@@ -464,8 +453,8 @@
goto out;
case T_DNA:
- KASSERT(!PCB_USER_FPU(td->td_pcb),
- ("Unregistered use of FPU in kernel"));
+ if (PCB_USER_FPU(td->td_pcb))
+ panic("Unregistered use of FPU in kernel");
fpudna();
goto out;
@@ -473,8 +462,8 @@
case T_XMMFLT: /* SIMD floating-point exception */
case T_FPOPFLT: /* FPU operand fetch fault */
/*
- * XXXKIB for now disable any FPU traps in kernel
- * handler registration seems to be overkill
+ * For now, supporting kernel handler
+ * registration for FPU traps is overkill.
*/
trap_fatal(frame, 0);
goto out;
@@ -565,7 +554,40 @@
load_dr6(rdr6() & 0xfffffff0);
goto out;
}
+
/*
+ * Malicious user code can configure a debug
+ * register watchpoint to trap on data access
+ * to the top of stack and then execute 'pop
+ * %ss; int 3'. Due to exception deferral for
+ * 'pop %ss', the CPU will not interrupt 'int
+ * 3' to raise the DB# exception for the debug
+ * register but will postpone the DB# until
+ * execution of the first instruction of the
+ * BP# handler (in kernel mode). Normally the
+ * previous check would ignore DB# exceptions
+ * for watchpoints on user addresses raised in
+ * kernel mode. However, some CPU errata
+ * include cases where DB# exceptions do not
+ * properly set bits in %dr6, e.g. Haswell
+ * HSD23 and Skylake-X SKZ24.
+ *
+ * A deferred DB# can also be raised on the
+ * first instructions of system call entry
+ * points or single-step traps via similar use
+ * of 'pop %ss' or 'mov xxx, %ss'.
+ */
+ if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) ||
+#ifdef COMPAT_FREEBSD32
+ frame->tf_rip ==
+ (uintptr_t)IDTVEC(int0x80_syscall) ||
+#endif
+ frame->tf_rip == (uintptr_t)IDTVEC(bpt) ||
+ frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
+ /* Needed for AMD. */
+ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
+ return;
+ /*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/
case T_BPTFLT:
@@ -633,7 +655,6 @@
user:
userret(td, frame);
- mtx_assert(&Giant, MA_NOTOWNED);
KASSERT(PCB_USER_FPU(td->td_pcb),
("Return from trap with kernel FPU ctx leaked"));
userout:
@@ -647,7 +668,7 @@
int usermode;
{
vm_offset_t va;
- struct vmspace *vm = NULL;
+ struct vmspace *vm;
vm_map_t map;
int rv = 0;
vm_prot_t ftype;
@@ -710,14 +731,10 @@
map = kernel_map;
} else {
/*
- * This is a fault on non-kernel virtual memory.
- * vm is initialized above to NULL. If curproc is NULL
- * or curproc->p_vmspace is NULL the fault is fatal.
+ * This is a fault on non-kernel virtual memory. If either
+ * p or p->p_vmspace is NULL, then the fault is fatal.
*/
- if (p != NULL)
- vm = p->p_vmspace;
-
- if (vm == NULL)
+ if (p == NULL || (vm = p->p_vmspace) == NULL)
goto nogo;
map = &vm->vm_map;
@@ -737,6 +754,14 @@
}
/*
+ * If the trap was caused by errant bits in the PTE then panic.
+ */
+ if (frame->tf_err & PGEX_RSV) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+
+ /*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
*/
@@ -793,8 +818,7 @@
trap_fatal(frame, eva);
return (-1);
}
-
- return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+ return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
static void
@@ -807,6 +831,9 @@
long esp;
struct soft_segment_descriptor softseg;
char *msg;
+#ifdef KDB
+ bool handled;
+#endif
code = frame->tf_err;
type = frame->tf_trapno;
@@ -830,6 +857,7 @@
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
+ code & PGEX_RSV ? "reserved bits in PTE" :
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",
@@ -858,19 +886,17 @@
if (frame->tf_rflags & PSL_RF)
printf("resume, ");
printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
- printf("current process = ");
- if (curproc) {
- printf("%lu (%s)\n",
- (u_long)curproc->p_pid, curthread->td_name ?
- curthread->td_name : "");
- } else {
- printf("Idle\n");
- }
+ printf("current process = %d (%s)\n",
+ curproc->p_pid, curthread->td_name);
#ifdef KDB
- if (debugger_on_panic || kdb_active)
- if (kdb_trap(type, 0, frame))
+ if (debugger_on_panic) {
+ kdb_why = KDB_WHY_TRAP;
+ handled = kdb_trap(type, 0, frame);
+ kdb_why = KDB_WHY_UNSET;
+ if (handled)
return;
+ }
#endif
printf("trap number = %d\n", type);
if (type <= MAX_TRAP_MSG)
@@ -990,7 +1016,7 @@
}
KASSERT(PCB_USER_FPU(td->td_pcb),
- ("System call %s returing with kernel FPU ctx leaked",
+ ("System call %s returning with kernel FPU ctx leaked",
syscallname(td->td_proc, sa.code)));
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
("System call %s returning with mangled pcb_save",
Modified: trunk/sys/amd64/amd64/uio_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/uio_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/uio_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Alan L. Cox <alc at cs.rice.edu>
* Copyright (c) 1982, 1986, 1991, 1993
@@ -16,7 +17,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -36,7 +37,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/uio_machdep.c 266312 2014-05-17 13:59:11Z ian $");
#include <sys/param.h>
#include <sys/kernel.h>
Modified: trunk/sys/amd64/amd64/uma_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/uma_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/uma_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Alan L. Cox <alc at cs.rice.edu>
* All rights reserved.
@@ -25,10 +26,11 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/uma_machdep.c 287945 2015-09-17 23:31:44Z rstone $");
#include <sys/param.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <vm/vm.h>
@@ -40,7 +42,7 @@
#include <machine/vmparam.h>
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
vm_page_t m;
vm_paddr_t pa;
@@ -48,12 +50,7 @@
int pflags;
*flags = UMA_SLAB_PRIV;
- if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
- pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
- else
- pflags = VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
- if (wait & M_ZERO)
- pflags |= VM_ALLOC_ZERO;
+ pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
for (;;) {
m = vm_page_alloc(NULL, 0, pflags);
if (m == NULL) {
@@ -74,7 +71,7 @@
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
vm_paddr_t pa;
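uma_small_alloc() now derives its vm_page_alloc() flags with malloc2vm_flags() rather than open-coding the translation. Going by the lines removed above, the effect in this caller is roughly the following mapping; this is a sketch reconstructed from the old code, not the canonical definition of the helper.

    int pflags;

    if ((wait & (M_NOWAIT | M_USE_RESERVE)) == M_NOWAIT)
            pflags = VM_ALLOC_INTERRUPT;
    else
            pflags = VM_ALLOC_SYSTEM;
    if (wait & M_ZERO)
            pflags |= VM_ALLOC_ZERO;
    pflags |= VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;      /* still added by the caller */
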
Modified: trunk/sys/amd64/amd64/vm_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/vm_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/vm_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
* Copyright (c) 1989, 1990 William Jolitz
@@ -41,7 +42,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/vm_machdep.c 332759 2018-04-19 06:20:53Z avg $");
#include "opt_isa.h"
#include "opt_cpu.h"
@@ -59,7 +60,6 @@
#include <sys/mutex.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
-#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -90,9 +90,10 @@
static volatile u_int cpu_reset_proxy_active;
#endif
-CTASSERT((struct thread **)OFFSETOF_CURTHREAD ==
- &((struct pcpu *)NULL)->pc_curthread);
-CTASSERT((struct pcb **)OFFSETOF_CURPCB == &((struct pcpu *)NULL)->pc_curpcb);
+_Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread),
+ "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
+_Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
+ "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
struct savefpu *
get_pcb_user_save_td(struct thread *td)
@@ -100,8 +101,8 @@
vm_offset_t p;
p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
- cpu_max_ext_state_size;
- KASSERT((p % 64) == 0, ("Unaligned pcb_user_save area"));
+ roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
+ KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area"));
return ((struct savefpu *)p);
}
@@ -120,7 +121,8 @@
vm_offset_t p;
p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
- cpu_max_ext_state_size - sizeof(struct pcb);
+ roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) -
+ sizeof(struct pcb);
return ((struct pcb *)p);
}
@@ -127,7 +129,7 @@
void *
alloc_fpusave(int flags)
{
- struct pcb *res;
+ void *res;
struct savefpu_ymm *sf;
res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
@@ -219,7 +221,7 @@
* return address on stack. These are the kernel mode register values.
*/
pmap2 = vmspace_pmap(p2->p_vmspace);
- pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+ pcb2->pcb_cr3 = pmap2->pm_cr3;
pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */
pcb2->pcb_rbp = 0;
pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
@@ -341,7 +343,7 @@
* Clean TSS/iomap
*/
if (pcb->pcb_tssp != NULL) {
- kmem_free(kernel_map, (vm_offset_t)pcb->pcb_tssp,
+ kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
ctob(IOPAGES + 1));
pcb->pcb_tssp = NULL;
}
@@ -400,9 +402,13 @@
* for the next iteration.
* %r10 restore is only required for freebsd/amd64 processes,
* but shall be innocent for any ia32 ABI.
+ *
+ * Require full context restore to get the arguments
+ * in the registers reloaded at return to usermode.
*/
td->td_frame->tf_rip -= td->td_frame->tf_err;
td->td_frame->tf_r10 = td->td_frame->tf_rcx;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
break;
case EJUSTRETURN:
@@ -409,13 +415,7 @@
break;
default:
- if (td->td_proc->p_sysent->sv_errsize) {
- if (error >= td->td_proc->p_sysent->sv_errsize)
- error = -1; /* XXX */
- else
- error = td->td_proc->p_sysent->sv_errtbl[error];
- }
- td->td_frame->tf_rax = error;
+ td->td_frame->tf_rax = SV_ABI_ERRNO(td->td_proc, error);
td->td_frame->tf_rflags |= PSL_C;
break;
}
@@ -442,7 +442,8 @@
* values here.
*/
bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
- clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
+ PCB_KERNFPU);
pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
cpu_max_ext_state_size);
@@ -571,13 +572,11 @@
static void
cpu_reset_proxy()
{
- cpuset_t tcrp;
cpu_reset_proxy_active = 1;
while (cpu_reset_proxy_active == 1)
- ; /* Wait for other cpu to see that we've started */
- CPU_SETOF(cpu_reset_proxyid, &tcrp);
- stop_cpus(tcrp);
+ ia32_pause(); /* Wait for other cpu to see that we've started */
+
printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
DELAY(1000000);
cpu_reset_real();
@@ -591,7 +590,7 @@
cpuset_t map;
u_int cnt;
- if (smp_active) {
+ if (smp_started) {
map = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &map);
CPU_NAND(&map, &stopped_cpus);
@@ -611,15 +610,18 @@
wmb();
cnt = 0;
- while (cpu_reset_proxy_active == 0 && cnt < 10000000)
+ while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+ ia32_pause();
cnt++; /* Wait for BSP to announce restart */
- if (cpu_reset_proxy_active == 0)
+ }
+ if (cpu_reset_proxy_active == 0) {
printf("cpu_reset: Failed to restart BSP\n");
- enable_intr();
- cpu_reset_proxy_active = 2;
-
- while (1);
- /* NOTREACHED */
+ } else {
+ cpu_reset_proxy_active = 2;
+ while (1)
+ ia32_pause();
+ /* NOTREACHED */
+ }
}
DELAY(1000000);
@@ -690,27 +692,6 @@
}
/*
- * Allocate an sf_buf for the given vm_page. On this machine, however, there
- * is no sf_buf object. Instead, an opaque pointer to the given vm_page is
- * returned.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int pri)
-{
-
- return ((struct sf_buf *)m);
-}
-
-/*
- * Free the sf_buf. In fact, do nothing because there are no resources
- * associated with the sf_buf.
- */
-void
-sf_buf_free(struct sf_buf *sf)
-{
-}
-
-/*
* Software interrupt handler for queued VM system processing.
*/
void
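get_pcb_user_save_td() and get_pcb_td() now round the extended-state save area up to XSAVE_AREA_ALIGN before carving it off the top of the kernel stack, so the save area stays aligned and the PCB sits immediately below it. A standalone sketch of the arithmetic follows; the stack top, state size and PCB size are illustrative values, not the kernel's.

    #include <stdio.h>
    #include <stdint.h>

    #define ROUNDUP2(x, y)  (((x) + ((y) - 1)) & ~((uintptr_t)(y) - 1))

    int
    main(void)
    {
            uintptr_t kstack_top = 0xfffffe0000400000UL;    /* illustrative */
            size_t ext_state_size = 832;                    /* illustrative */
            size_t align = 64;                              /* XSAVE_AREA_ALIGN */
            uintptr_t user_save, pcb;

            user_save = kstack_top - ROUNDUP2(ext_state_size, align);
            pcb = user_save - 256;          /* sizeof(struct pcb), illustrative */

            printf("user save area %#lx (64-byte aligned: %s)\n",
                (unsigned long)user_save,
                (user_save % align) == 0 ? "yes" : "no");
            printf("pcb            %#lx\n", (unsigned long)pcb);
            return (0);
    }
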