[Midnightbsd-cvs] src [10177] trunk/sys/amd64/amd64: sync with freebsd
laffer1 at midnightbsd.org
Fri Jun 1 19:00:13 EDT 2018
Revision: 10177
http://svnweb.midnightbsd.org/src/?rev=10177
Author: laffer1
Date: 2018-06-01 19:00:12 -0400 (Fri, 01 Jun 2018)
Log Message:
-----------
Sync with FreeBSD stable/10
Modified Paths:
--------------
trunk/sys/amd64/amd64/amd64_mem.c
trunk/sys/amd64/amd64/apic_vector.S
trunk/sys/amd64/amd64/atomic.c
trunk/sys/amd64/amd64/atpic_vector.S
trunk/sys/amd64/amd64/autoconf.c
trunk/sys/amd64/amd64/bios.c
trunk/sys/amd64/amd64/bpf_jit_machdep.c
trunk/sys/amd64/amd64/bpf_jit_machdep.h
trunk/sys/amd64/amd64/cpu_switch.S
trunk/sys/amd64/amd64/db_disasm.c
trunk/sys/amd64/amd64/db_interface.c
trunk/sys/amd64/amd64/db_trace.c
trunk/sys/amd64/amd64/elf_machdep.c
trunk/sys/amd64/amd64/exception.S
trunk/sys/amd64/amd64/fpu.c
trunk/sys/amd64/amd64/gdb_machdep.c
trunk/sys/amd64/amd64/genassym.c
trunk/sys/amd64/amd64/in_cksum.c
trunk/sys/amd64/amd64/initcpu.c
trunk/sys/amd64/amd64/io.c
trunk/sys/amd64/amd64/locore.S
trunk/sys/amd64/amd64/machdep.c
trunk/sys/amd64/amd64/mem.c
trunk/sys/amd64/amd64/minidump_machdep.c
trunk/sys/amd64/amd64/mp_machdep.c
trunk/sys/amd64/amd64/mp_watchdog.c
trunk/sys/amd64/amd64/mpboot.S
trunk/sys/amd64/amd64/pmap.c
trunk/sys/amd64/amd64/prof_machdep.c
trunk/sys/amd64/amd64/ptrace_machdep.c
trunk/sys/amd64/amd64/sigtramp.S
trunk/sys/amd64/amd64/stack_machdep.c
trunk/sys/amd64/amd64/support.S
trunk/sys/amd64/amd64/sys_machdep.c
trunk/sys/amd64/amd64/trap.c
trunk/sys/amd64/amd64/uio_machdep.c
trunk/sys/amd64/amd64/uma_machdep.c
trunk/sys/amd64/amd64/vm_machdep.c
Property Changed:
----------------
trunk/sys/amd64/amd64/apic_vector.S
trunk/sys/amd64/amd64/atpic_vector.S
trunk/sys/amd64/amd64/cpu_switch.S
trunk/sys/amd64/amd64/exception.S
trunk/sys/amd64/amd64/locore.S
trunk/sys/amd64/amd64/mpboot.S
trunk/sys/amd64/amd64/sigtramp.S
trunk/sys/amd64/amd64/support.S
Modified: trunk/sys/amd64/amd64/amd64_mem.c
===================================================================
--- trunk/sys/amd64/amd64/amd64_mem.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/amd64_mem.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/amd64_mem.c 217506 2011-01-17 17:30:35Z jkim $");
#include <sys/param.h>
#include <sys/kernel.h>
Modified: trunk/sys/amd64/amd64/apic_vector.S
===================================================================
--- trunk/sys/amd64/amd64/apic_vector.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/apic_vector.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -28,7 +29,7 @@
* SUCH DAMAGE.
*
* from: vector.s, 386BSD 0.1 unknown origin
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/apic_vector.S 302041 2016-06-21 04:51:55Z sephe $
*/
/*
@@ -43,6 +44,12 @@
#include "assym.s"
+#ifdef SMP
+#define LK lock ;
+#else
+#define LK
+#endif
+
/*
* I/O Interrupt Entry Point. Rather than having one entry point for
* each interrupt source, we use one entry point for each 32-bit word
@@ -128,73 +135,68 @@
MEXITCOUNT
jmp doreti
-#ifdef SMP
+#ifdef XENHVM
/*
- * Global address space TLB shootdown.
+ * Xen event channel upcall interrupt handler.
+ * Only used when the hypervisor supports direct vector callbacks.
*/
.text
SUPERALIGN_TEXT
-IDTVEC(invltlb)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+IDTVEC(xen_intr_upcall)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_gbl(,%rax,4)
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rdi
+ call xen_intr_handle_upcall
+ MEXITCOUNT
+ jmp doreti
#endif
-#ifdef COUNT_IPIS
- movq ipi_invltlb_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
+#ifdef SMP
+/*
+ * Global address space TLB shootdown.
+ */
+ .text
- movq %cr3, %rax /* invalidate the TLB */
- movq %rax, %cr3
-
+ SUPERALIGN_TEXT
+invltlb_ret:
movq lapic, %rax
movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ POP_FRAME
+ jmp doreti_iret
- lock
- incl smp_tlb_wait
+ SUPERALIGN_TEXT
+IDTVEC(invltlb_pcid)
+ PUSH_FRAME
- popq %rax
- jmp doreti_iret
+ call invltlb_pcid_handler
+ jmp invltlb_ret
+
+ SUPERALIGN_TEXT
+IDTVEC(invltlb)
+ PUSH_FRAME
+
+ call invltlb_handler
+ jmp invltlb_ret
+
/*
* Single page TLB shootdown
*/
.text
SUPERALIGN_TEXT
-IDTVEC(invlpg)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
+IDTVEC(invlpg_pcid)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_pg(,%rax,4)
-#endif
-#ifdef COUNT_IPIS
- movq ipi_invlpg_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
+ call invlpg_pcid_handler
+ jmp invltlb_ret
- movq smp_tlb_addr1, %rax
- invlpg (%rax) /* invalidate single page */
+ SUPERALIGN_TEXT
+IDTVEC(invlpg)
+ PUSH_FRAME
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ call invlpg_handler
+ jmp invltlb_ret
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
-
/*
* Page range TLB shootdown.
*/
@@ -201,39 +203,11 @@
.text
SUPERALIGN_TEXT
IDTVEC(invlrng)
-#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
PUSH_FRAME
- movl PCPU(CPUID), %eax
-#ifdef COUNT_XINVLTLB_HITS
- incl xhits_rng(,%rax,4)
-#endif
-#ifdef COUNT_IPIS
- movq ipi_invlrng_counts(,%rax,8),%rax
- incq (%rax)
-#endif
- POP_FRAME
-#endif
- pushq %rax
- pushq %rdx
+ call invlrng_handler
+ jmp invltlb_ret
- movq smp_tlb_addr1, %rdx
- movq smp_tlb_addr2, %rax
-1: invlpg (%rdx) /* invalidate single page */
- addq $PAGE_SIZE, %rdx
- cmpq %rax, %rdx
- jb 1b
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rdx
- popq %rax
- jmp doreti_iret
-
/*
* Invalidate cache.
*/
@@ -240,27 +214,11 @@
.text
SUPERALIGN_TEXT
IDTVEC(invlcache)
-#ifdef COUNT_IPIS
PUSH_FRAME
- movl PCPU(CPUID), %eax
- movq ipi_invlcache_counts(,%rax,8),%rax
- incq (%rax)
- POP_FRAME
-#endif
- pushq %rax
+ call invlcache_handler
+ jmp invltlb_ret
- wbinvd
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
-
/*
* Handler for IPIs sent via the per-cpu IPI bitmap.
*/
Property changes on: trunk/sys/amd64/amd64/apic_vector.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/atomic.c
===================================================================
--- trunk/sys/amd64/amd64/atomic.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/atomic.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Peter Jeremy
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/atomic.c 118031 2003-07-25 21:19:19Z obrien $");
/* This file creates publically callable functions to perform various
* simple arithmetic on memory which is atomic in the presence of
Modified: trunk/sys/amd64/amd64/atpic_vector.S
===================================================================
--- trunk/sys/amd64/amd64/atpic_vector.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/atpic_vector.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -28,7 +29,7 @@
* SUCH DAMAGE.
*
* from: vector.s, 386BSD 0.1 unknown origin
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/atpic_vector.S 204309 2010-02-25 14:13:39Z attilio $
*/
/*
Property changes on: trunk/sys/amd64/amd64/atpic_vector.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/autoconf.c
===================================================================
--- trunk/sys/amd64/amd64/autoconf.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/autoconf.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/autoconf.c 146794 2005-05-29 23:44:22Z marcel $");
/*
* Setup the system to run on the current machine.
Modified: trunk/sys/amd64/amd64/bios.c
===================================================================
--- trunk/sys/amd64/amd64/bios.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bios.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1997 Michael Smith
* Copyright (c) 1998 Jonathan Lemon
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/bios.c 148231 2005-07-21 09:48:37Z phk $");
/*
* Subset of the i386 bios support code. We cannot make bios16 nor bios32
Modified: trunk/sys/amd64/amd64/bpf_jit_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/bpf_jit_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bpf_jit_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
* Copyright (C) 2005-2009 Jung-uk Kim <jkim at FreeBSD.org>
@@ -30,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/bpf_jit_machdep.c 207081 2010-04-22 23:47:19Z jkim $");
#ifdef _KERNEL
#include "opt_bpf.h"
Modified: trunk/sys/amd64/amd64/bpf_jit_machdep.h
===================================================================
--- trunk/sys/amd64/amd64/bpf_jit_machdep.h 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/bpf_jit_machdep.h 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy)
* Copyright (C) 2005-2009 Jung-uk Kim <jkim at FreeBSD.org>
@@ -28,7 +29,7 @@
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/bpf_jit_machdep.h 207081 2010-04-22 23:47:19Z jkim $
*/
#ifndef _BPF_JIT_MACHDEP_H_
Modified: trunk/sys/amd64/amd64/cpu_switch.S
===================================================================
--- trunk/sys/amd64/amd64/cpu_switch.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/cpu_switch.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1990 The Regents of the University of California.
@@ -30,7 +31,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/cpu_switch.S 271999 2014-09-22 20:34:36Z jhb $
*/
#include <machine/asmacros.h>
@@ -76,9 +77,8 @@
movq PCPU(CURPMAP),%rdx
LK btrl %eax,PM_ACTIVE(%rdx) /* clear old */
1:
- movq TD_PCB(%rsi),%r8 /* newtd->td_proc */
- movq PCB_CR3(%r8),%rdx
- movq %rdx,%cr3 /* new address space */
+ movq TD_PCB(%rsi),%r8 /* newtd->td_pcb */
+ movq PCB_CR3(%r8),%rcx /* new address space */
jmp swact
END(cpu_throw)
@@ -145,20 +145,41 @@
SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
jmp sw1
swinact:
- movq %rcx,%cr3 /* new address space */
- movl PCPU(CPUID), %eax
+ movl PCPU(CPUID),%eax
/* Release bit from old pmap->pm_active */
- movq PCPU(CURPMAP),%rcx
- LK btrl %eax,PM_ACTIVE(%rcx) /* clear old */
- SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
+ movq PCPU(CURPMAP),%r12
+ LK btrl %eax,PM_ACTIVE(%r12) /* clear old */
+ SETLK %rdx,TD_LOCK(%rdi) /* Release the old thread */
swact:
/* Set bit in new pmap->pm_active */
movq TD_PROC(%rsi),%rdx /* newproc */
movq P_VMSPACE(%rdx), %rdx
addq $VM_PMAP,%rdx
+ cmpl $-1,PM_PCID(%rdx)
+ je 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jnc 1f
+ btsq $63,%rcx /* CR3_PCID_SAVE */
+ incq PCPU(PM_SAVE_CNT)
+1:
+ movq %rcx,%cr3 /* new address space */
LK btsl %eax,PM_ACTIVE(%rdx) /* set new */
movq %rdx,PCPU(CURPMAP)
+ /*
+ * We might lose the race and other CPU might have changed
+ * the pmap after we set our bit in pmap->pm_save. Recheck.
+ * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was
+ * modified, causing TLB flush for this pcid.
+ */
+ btrq $63,%rcx
+ jnc 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jc 1f
+ decq PCPU(PM_SAVE_CNT)
+ movq %rcx,%cr3
+1:
+
sw1:
#if defined(SCHED_ULE) && defined(SMP)
/* Wait for the new thread to become unblocked */
@@ -325,8 +346,8 @@
movq %r14,PCB_R14(%rdi)
movq %r15,PCB_R15(%rdi)
- movq %cr0,%rsi
- movq %rsi,PCB_CR0(%rdi)
+ movq %cr0,%rax
+ movq %rax,PCB_CR0(%rdi)
movq %cr2,%rax
movq %rax,PCB_CR2(%rdi)
movq %cr3,%rax
@@ -359,6 +380,26 @@
rdmsr
movl %eax,PCB_KGSBASE(%rdi)
movl %edx,PCB_KGSBASE+4(%rdi)
+ movl $MSR_EFER,%ecx
+ rdmsr
+ movl %eax,PCB_EFER(%rdi)
+ movl %edx,PCB_EFER+4(%rdi)
+ movl $MSR_STAR,%ecx
+ rdmsr
+ movl %eax,PCB_STAR(%rdi)
+ movl %edx,PCB_STAR+4(%rdi)
+ movl $MSR_LSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_LSTAR(%rdi)
+ movl %edx,PCB_LSTAR+4(%rdi)
+ movl $MSR_CSTAR,%ecx
+ rdmsr
+ movl %eax,PCB_CSTAR(%rdi)
+ movl %edx,PCB_CSTAR+4(%rdi)
+ movl $MSR_SF_MASK,%ecx
+ rdmsr
+ movl %eax,PCB_SFMASK(%rdi)
+ movl %edx,PCB_SFMASK+4(%rdi)
sgdt PCB_GDT(%rdi)
sidt PCB_IDT(%rdi)
@@ -365,19 +406,117 @@
sldt PCB_LDT(%rdi)
str PCB_TR(%rdi)
-2: movq %rsi,%cr0 /* The previous %cr0 is saved in %rsi. */
-
movl $1,%eax
ret
END(savectx)
/*
- * Wrapper around fpusave to care about TS0_CR.
- */
-ENTRY(ctx_fpusave)
- movq %cr0,%rsi
- clts
- call fpusave
- movq %rsi,%cr0
+ * resumectx(pcb)
+ * Resuming processor state from pcb.
+ */
+ENTRY(resumectx)
+ /* Switch to KPML4phys. */
+ movq KPML4phys,%rax
+ movq %rax,%cr3
+
+ /* Force kernel segment registers. */
+ movl $KDSEL,%eax
+ movw %ax,%ds
+ movw %ax,%es
+ movw %ax,%ss
+ movl $KUF32SEL,%eax
+ movw %ax,%fs
+ movl $KUG32SEL,%eax
+ movw %ax,%gs
+
+ movl $MSR_FSBASE,%ecx
+ movl PCB_FSBASE(%rdi),%eax
+ movl 4 + PCB_FSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_GSBASE,%ecx
+ movl PCB_GSBASE(%rdi),%eax
+ movl 4 + PCB_GSBASE(%rdi),%edx
+ wrmsr
+ movl $MSR_KGSBASE,%ecx
+ movl PCB_KGSBASE(%rdi),%eax
+ movl 4 + PCB_KGSBASE(%rdi),%edx
+ wrmsr
+
+ /* Restore EFER. */
+ movl $MSR_EFER,%ecx
+ movl PCB_EFER(%rdi),%eax
+ wrmsr
+
+ /* Restore fast syscall stuff. */
+ movl $MSR_STAR,%ecx
+ movl PCB_STAR(%rdi),%eax
+ movl 4 + PCB_STAR(%rdi),%edx
+ wrmsr
+ movl $MSR_LSTAR,%ecx
+ movl PCB_LSTAR(%rdi),%eax
+ movl 4 + PCB_LSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_CSTAR,%ecx
+ movl PCB_CSTAR(%rdi),%eax
+ movl 4 + PCB_CSTAR(%rdi),%edx
+ wrmsr
+ movl $MSR_SF_MASK,%ecx
+ movl PCB_SFMASK(%rdi),%eax
+ wrmsr
+
+ /* Restore CR0, CR2, CR4 and CR3. */
+ movq PCB_CR0(%rdi),%rax
+ movq %rax,%cr0
+ movq PCB_CR2(%rdi),%rax
+ movq %rax,%cr2
+ movq PCB_CR4(%rdi),%rax
+ movq %rax,%cr4
+ movq PCB_CR3(%rdi),%rax
+ movq %rax,%cr3
+
+ /* Restore descriptor tables. */
+ lidt PCB_IDT(%rdi)
+ lldt PCB_LDT(%rdi)
+
+#define SDT_SYSTSS 9
+#define SDT_SYSBSY 11
+
+ /* Clear "task busy" bit and reload TR. */
+ movq PCPU(TSS),%rax
+ andb $(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
+ movw PCB_TR(%rdi),%ax
+ ltr %ax
+
+#undef SDT_SYSTSS
+#undef SDT_SYSBSY
+
+ /* Restore debug registers. */
+ movq PCB_DR0(%rdi),%rax
+ movq %rax,%dr0
+ movq PCB_DR1(%rdi),%rax
+ movq %rax,%dr1
+ movq PCB_DR2(%rdi),%rax
+ movq %rax,%dr2
+ movq PCB_DR3(%rdi),%rax
+ movq %rax,%dr3
+ movq PCB_DR6(%rdi),%rax
+ movq %rax,%dr6
+ movq PCB_DR7(%rdi),%rax
+ movq %rax,%dr7
+
+ /* Restore other callee saved registers. */
+ movq PCB_R15(%rdi),%r15
+ movq PCB_R14(%rdi),%r14
+ movq PCB_R13(%rdi),%r13
+ movq PCB_R12(%rdi),%r12
+ movq PCB_RBP(%rdi),%rbp
+ movq PCB_RSP(%rdi),%rsp
+ movq PCB_RBX(%rdi),%rbx
+
+ /* Restore return address. */
+ movq PCB_RIP(%rdi),%rax
+ movq %rax,(%rsp)
+
+ xorl %eax,%eax
ret
-END(ctx_fpusave)
+END(resumectx)
Property changes on: trunk/sys/amd64/amd64/cpu_switch.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/db_disasm.c
===================================================================
--- trunk/sys/amd64/amd64/db_disasm.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_disasm.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_disasm.c 280272 2015-03-19 23:13:19Z markj $");
/*
* Instruction disassembler.
@@ -127,7 +128,7 @@
static const struct inst db_inst_0f388x[] = {
/*80*/ { "", TRUE, SDEP, op2(E, Rq), "invept" },
/*81*/ { "", TRUE, SDEP, op2(E, Rq), "invvpid" },
-/*82*/ { "", FALSE, NONE, 0, 0 },
+/*82*/ { "", TRUE, SDEP, op2(E, Rq), "invpcid" },
/*83*/ { "", FALSE, NONE, 0, 0 },
/*84*/ { "", FALSE, NONE, 0, 0 },
/*85*/ { "", FALSE, NONE, 0, 0 },
@@ -249,6 +250,26 @@
/*0f*/ { "", FALSE, NONE, 0, 0 },
};
+static const struct inst db_inst_0f1x[] = {
+/*10*/ { "", FALSE, NONE, 0, 0 },
+/*11*/ { "", FALSE, NONE, 0, 0 },
+/*12*/ { "", FALSE, NONE, 0, 0 },
+/*13*/ { "", FALSE, NONE, 0, 0 },
+/*14*/ { "", FALSE, NONE, 0, 0 },
+/*15*/ { "", FALSE, NONE, 0, 0 },
+/*16*/ { "", FALSE, NONE, 0, 0 },
+/*17*/ { "", FALSE, NONE, 0, 0 },
+
+/*18*/ { "", FALSE, NONE, 0, 0 },
+/*19*/ { "", FALSE, NONE, 0, 0 },
+/*1a*/ { "", FALSE, NONE, 0, 0 },
+/*1b*/ { "", FALSE, NONE, 0, 0 },
+/*1c*/ { "", FALSE, NONE, 0, 0 },
+/*1d*/ { "", FALSE, NONE, 0, 0 },
+/*1e*/ { "", FALSE, NONE, 0, 0 },
+/*1f*/ { "nopl", TRUE, SDEP, 0, "nopw" },
+};
+
static const struct inst db_inst_0f2x[] = {
/*20*/ { "mov", TRUE, LONG, op2(CR,El), 0 },
/*21*/ { "mov", TRUE, LONG, op2(DR,El), 0 },
@@ -430,7 +451,7 @@
static const struct inst * const db_inst_0f[] = {
db_inst_0f0x,
- 0,
+ db_inst_0f1x,
db_inst_0f2x,
db_inst_0f3x,
db_inst_0f4x,
Modified: trunk/sys/amd64/amd64/db_interface.c
===================================================================
--- trunk/sys/amd64/amd64/db_interface.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_interface.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_interface.c 208392 2010-05-21 17:17:56Z jhb $");
/*
* Interface to new debugger.
Modified: trunk/sys/amd64/amd64/db_trace.c
===================================================================
--- trunk/sys/amd64/amd64/db_trace.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/db_trace.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Mach Operating System
* Copyright (c) 1991,1990 Carnegie Mellon University
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/db_trace.c 330132 2018-02-28 22:33:07Z jhb $");
#include "opt_compat.h"
@@ -52,17 +53,8 @@
#include <ddb/db_sym.h>
#include <ddb/db_variables.h>
-static db_varfcn_t db_dr0;
-static db_varfcn_t db_dr1;
-static db_varfcn_t db_dr2;
-static db_varfcn_t db_dr3;
-static db_varfcn_t db_dr4;
-static db_varfcn_t db_dr5;
-static db_varfcn_t db_dr6;
-static db_varfcn_t db_dr7;
static db_varfcn_t db_frame;
-static db_varfcn_t db_rsp;
-static db_varfcn_t db_ss;
+static db_varfcn_t db_frame_seg;
CTASSERT(sizeof(struct dbreg) == sizeof(((struct pcpu *)NULL)->pc_dbreg));
@@ -71,17 +63,17 @@
*/
#define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x)
struct db_variable db_regs[] = {
- { "cs", DB_OFFSET(tf_cs), db_frame },
- { "ds", DB_OFFSET(tf_ds), db_frame },
- { "es", DB_OFFSET(tf_es), db_frame },
- { "fs", DB_OFFSET(tf_fs), db_frame },
- { "gs", DB_OFFSET(tf_gs), db_frame },
- { "ss", NULL, db_ss },
+ { "cs", DB_OFFSET(tf_cs), db_frame_seg },
+ { "ds", DB_OFFSET(tf_ds), db_frame_seg },
+ { "es", DB_OFFSET(tf_es), db_frame_seg },
+ { "fs", DB_OFFSET(tf_fs), db_frame_seg },
+ { "gs", DB_OFFSET(tf_gs), db_frame_seg },
+ { "ss", DB_OFFSET(tf_ss), db_frame_seg },
{ "rax", DB_OFFSET(tf_rax), db_frame },
{ "rcx", DB_OFFSET(tf_rcx), db_frame },
{ "rdx", DB_OFFSET(tf_rdx), db_frame },
{ "rbx", DB_OFFSET(tf_rbx), db_frame },
- { "rsp", NULL, db_rsp },
+ { "rsp", DB_OFFSET(tf_rsp), db_frame },
{ "rbp", DB_OFFSET(tf_rbp), db_frame },
{ "rsi", DB_OFFSET(tf_rsi), db_frame },
{ "rdi", DB_OFFSET(tf_rdi), db_frame },
@@ -95,57 +87,18 @@
{ "r15", DB_OFFSET(tf_r15), db_frame },
{ "rip", DB_OFFSET(tf_rip), db_frame },
{ "rflags", DB_OFFSET(tf_rflags), db_frame },
-#define DB_N_SHOW_REGS 24 /* Don't show registers after here. */
- { "dr0", NULL, db_dr0 },
- { "dr1", NULL, db_dr1 },
- { "dr2", NULL, db_dr2 },
- { "dr3", NULL, db_dr3 },
- { "dr4", NULL, db_dr4 },
- { "dr5", NULL, db_dr5 },
- { "dr6", NULL, db_dr6 },
- { "dr7", NULL, db_dr7 },
};
-struct db_variable *db_eregs = db_regs + DB_N_SHOW_REGS;
+struct db_variable *db_eregs = db_regs + nitems(db_regs);
-#define DB_DRX_FUNC(reg) \
-static int \
-db_ ## reg (vp, valuep, op) \
- struct db_variable *vp; \
- db_expr_t * valuep; \
- int op; \
-{ \
- if (op == DB_VAR_GET) \
- *valuep = r ## reg (); \
- else \
- load_ ## reg (*valuep); \
- return (1); \
-}
-
-DB_DRX_FUNC(dr0)
-DB_DRX_FUNC(dr1)
-DB_DRX_FUNC(dr2)
-DB_DRX_FUNC(dr3)
-DB_DRX_FUNC(dr4)
-DB_DRX_FUNC(dr5)
-DB_DRX_FUNC(dr6)
-DB_DRX_FUNC(dr7)
-
-static __inline long
-get_rsp(struct trapframe *tf)
-{
- return ((ISPL(tf->tf_cs)) ? tf->tf_rsp :
- (db_expr_t)tf + offsetof(struct trapframe, tf_rsp));
-}
-
static int
-db_frame(struct db_variable *vp, db_expr_t *valuep, int op)
+db_frame_seg(struct db_variable *vp, db_expr_t *valuep, int op)
{
- long *reg;
+ uint16_t *reg;
if (kdb_frame == NULL)
return (0);
- reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
+ reg = (uint16_t *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
if (op == DB_VAR_GET)
*valuep = *reg;
else
@@ -154,33 +107,21 @@
}
static int
-db_rsp(struct db_variable *vp, db_expr_t *valuep, int op)
+db_frame(struct db_variable *vp, db_expr_t *valuep, int op)
{
+ long *reg;
if (kdb_frame == NULL)
return (0);
+ reg = (long *)((uintptr_t)kdb_frame + (db_expr_t)vp->valuep);
if (op == DB_VAR_GET)
- *valuep = get_rsp(kdb_frame);
- else if (ISPL(kdb_frame->tf_cs))
- kdb_frame->tf_rsp = *valuep;
+ *valuep = *reg;
+ else
+ *reg = *valuep;
return (1);
}
-static int
-db_ss(struct db_variable *vp, db_expr_t *valuep, int op)
-{
-
- if (kdb_frame == NULL)
- return (0);
-
- if (op == DB_VAR_GET)
- *valuep = (ISPL(kdb_frame->tf_cs)) ? kdb_frame->tf_ss : rss();
- else if (ISPL(kdb_frame->tf_cs))
- kdb_frame->tf_ss = *valuep;
- return (1);
-}
-
#define NORMAL 0
#define TRAP 1
#define INTERRUPT 2
@@ -188,9 +129,7 @@
#define TRAP_INTERRUPT 5
static void db_nextframe(struct amd64_frame **, db_addr_t *, struct thread *);
-static int db_numargs(struct amd64_frame *);
-static void db_print_stack_entry(const char *, int, char **, long *, db_addr_t,
- void *);
+static void db_print_stack_entry(const char *, db_addr_t, void *);
static void decode_syscall(int, struct thread *);
static const char * watchtype_str(int type);
@@ -198,62 +137,11 @@
int access, struct dbreg *d);
int amd64_clr_watch(int watchnum, struct dbreg *d);
-/*
- * Figure out how many arguments were passed into the frame at "fp".
- */
-static int
-db_numargs(fp)
- struct amd64_frame *fp;
+static void
+db_print_stack_entry(const char *name, db_addr_t callpc, void *frame)
{
-#if 1
- return (0); /* regparm, needs dwarf2 info */
-#else
- long *argp;
- int inst;
- int args;
- argp = (long *)db_get_value((long)&fp->f_retaddr, 8, FALSE);
- /*
- * XXX etext is wrong for LKMs. We should attempt to interpret
- * the instruction at the return address in all cases. This
- * may require better fault handling.
- */
- if (argp < (long *)btext || argp >= (long *)etext) {
- args = 5;
- } else {
- inst = db_get_value((long)argp, 4, FALSE);
- if ((inst & 0xff) == 0x59) /* popl %ecx */
- args = 1;
- else if ((inst & 0xffff) == 0xc483) /* addl $Ibs, %esp */
- args = ((inst >> 16) & 0xff) / 4;
- else
- args = 5;
- }
- return (args);
-#endif
-}
-
-static void
-db_print_stack_entry(name, narg, argnp, argp, callpc, frame)
- const char *name;
- int narg;
- char **argnp;
- long *argp;
- db_addr_t callpc;
- void *frame;
-{
- db_printf("%s(", name);
-#if 0
- while (narg) {
- if (argnp)
- db_printf("%s=", *argnp++);
- db_printf("%lr", (long)db_get_value((long)argp, 8, FALSE));
- argp++;
- if (--narg != 0)
- db_printf(",");
- }
-#endif
- db_printf(") at ");
+ db_printf("%s() at ", name != NULL ? name : "??");
db_printsym(callpc, DB_STGY_PROC);
if (frame != NULL)
db_printf("/frame 0x%lx", (register_t)frame);
@@ -348,7 +236,7 @@
return;
}
- db_print_stack_entry(name, 0, 0, 0, rip, &(*fp)->f_frame);
+ db_print_stack_entry(name, rip, &(*fp)->f_frame);
/*
* Point to base of trapframe which is just above the
@@ -357,7 +245,7 @@
tf = (struct trapframe *)((long)*fp + 16);
if (INKERNEL((long) tf)) {
- rsp = get_rsp(tf);
+ rsp = tf->tf_rsp;
rip = tf->tf_rip;
rbp = tf->tf_rbp;
switch (frame_type) {
@@ -384,17 +272,13 @@
}
static int
-db_backtrace(struct thread *td, struct trapframe *tf,
- struct amd64_frame *frame, db_addr_t pc, int count)
+db_backtrace(struct thread *td, struct trapframe *tf, struct amd64_frame *frame,
+ db_addr_t pc, register_t sp, int count)
{
struct amd64_frame *actframe;
-#define MAXNARG 16
- char *argnames[MAXNARG], **argnp = NULL;
const char *name;
- long *argp;
db_expr_t offset;
c_db_sym_t sym;
- int narg;
boolean_t first;
if (count == -1)
@@ -418,16 +302,29 @@
*/
actframe = frame;
if (first) {
- if (tf != NULL) {
+ first = FALSE;
+ if (sym == C_DB_SYM_NULL && sp != 0) {
+ /*
+ * If a symbol couldn't be found, we've probably
+ * jumped to a bogus location, so try and use
+ * the return address to find our caller.
+ */
+ db_print_stack_entry(name, pc, NULL);
+ pc = db_get_value(sp, 8, FALSE);
+ if (db_search_symbol(pc, DB_STGY_PROC,
+ &offset) == C_DB_SYM_NULL)
+ break;
+ continue;
+ } else if (tf != NULL) {
int instr;
instr = db_get_value(pc, 4, FALSE);
if ((instr & 0xffffffff) == 0xe5894855) {
/* pushq %rbp; movq %rsp, %rbp */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
} else if ((instr & 0xffffff) == 0xe58948) {
/* movq %rsp, %rbp */
- actframe = (void *)get_rsp(tf);
+ actframe = (void *)tf->tf_rsp;
if (tf->tf_rbp == 0) {
/* Fake frame better. */
frame = actframe;
@@ -434,33 +331,24 @@
}
} else if ((instr & 0xff) == 0xc3) {
/* ret */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
} else if (offset == 0) {
/* Probably an assembler symbol. */
- actframe = (void *)(get_rsp(tf) - 8);
+ actframe = (void *)(tf->tf_rsp - 8);
}
- } else if (strcmp(name, "fork_trampoline") == 0) {
+ } else if (name != NULL &&
+ strcmp(name, "fork_trampoline") == 0) {
/*
* Don't try to walk back on a stack for a
* process that hasn't actually been run yet.
*/
- db_print_stack_entry(name, 0, 0, 0, pc,
- actframe);
+ db_print_stack_entry(name, pc, actframe);
break;
}
- first = FALSE;
}
- argp = &actframe->f_arg0;
- narg = MAXNARG;
- if (sym != NULL && db_sym_numargs(sym, &narg, argnames)) {
- argnp = argnames;
- } else {
- narg = db_numargs(frame);
- }
+ db_print_stack_entry(name, pc, actframe);
- db_print_stack_entry(name, narg, argnp, argp, pc, actframe);
-
if (actframe != frame) {
/* `frame' belongs to caller. */
pc = (db_addr_t)
@@ -473,7 +361,7 @@
if (INKERNEL((long)pc) && !INKERNEL((long)frame)) {
sym = db_search_symbol(pc, DB_STGY_ANY, &offset);
db_symbol_values(sym, &name, NULL);
- db_print_stack_entry(name, 0, 0, 0, pc, frame);
+ db_print_stack_entry(name, pc, frame);
break;
}
if (!INKERNEL((long) frame)) {
@@ -495,7 +383,7 @@
frame = (struct amd64_frame *)rbp;
callpc = (db_addr_t)db_get_value((long)&frame->f_retaddr, 8, FALSE);
frame = frame->f_frame;
- db_backtrace(curthread, NULL, frame, callpc, -1);
+ db_backtrace(curthread, NULL, frame, callpc, 0, -1);
}
int
@@ -502,10 +390,12 @@
db_trace_thread(struct thread *thr, int count)
{
struct pcb *ctx;
+ struct trapframe *tf;
ctx = kdb_thr_ctx(thr);
- return (db_backtrace(thr, NULL, (struct amd64_frame *)ctx->pcb_rbp,
- ctx->pcb_rip, count));
+ tf = thr == kdb_thread ? kdb_frame : NULL;
+ return (db_backtrace(thr, tf, (struct amd64_frame *)ctx->pcb_rbp,
+ ctx->pcb_rip, ctx->pcb_rsp, count));
}
int
Modified: trunk/sys/amd64/amd64/elf_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/elf_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/elf_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright 1996-1998 John D. Polstra.
* All rights reserved.
@@ -24,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/elf_machdep.c 294136 2016-01-16 07:56:49Z dchagin $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -36,7 +37,6 @@
#include <sys/sysent.h>
#include <sys/imgact_elf.h>
#include <sys/syscall.h>
-#include <sys/sysent.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
@@ -45,6 +45,7 @@
#include <vm/vm_param.h>
#include <machine/elf.h>
+#include <machine/fpu.h>
#include <machine/md_var.h>
struct sysentvec elf64_freebsd_sysvec = {
@@ -82,28 +83,11 @@
.sv_shared_page_base = SHAREDPAGE,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
+ .sv_thread_detach = NULL,
+ .sv_trap = NULL,
};
INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
-void
-amd64_lower_shared_page(struct sysentvec *sv)
-{
- if (hw_lower_amd64_sharedpage != 0) {
- sv->sv_maxuser -= PAGE_SIZE;
- sv->sv_shared_page_base -= PAGE_SIZE;
- sv->sv_usrstack -= PAGE_SIZE;
- sv->sv_psstrings -= PAGE_SIZE;
- }
-}
-
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) amd64_lower_shared_page,
- &elf64_freebsd_sysvec);
-
static Elf64_Brandinfo freebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
@@ -136,13 +120,44 @@
(sysinit_cfunc_t) elf64_insert_brand_entry,
&freebsd_brand_oinfo);
+static Elf64_Brandinfo kfreebsd_brand_info = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
+ .sysvec = &elf64_freebsd_sysvec,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_kfreebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
+};
+
+SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
+ (sysinit_cfunc_t) elf64_insert_brand_entry,
+ &kfreebsd_brand_info);
+
void
-elf64_dump_thread(struct thread *td __unused, void *dst __unused,
- size_t *off __unused)
+elf64_dump_thread(struct thread *td, void *dst, size_t *off)
{
+ void *buf;
+ size_t len;
+
+ len = 0;
+ if (use_xsave) {
+ if (dst != NULL) {
+ fpugetregs(td);
+ len += elf64_populate_note(NT_X86_XSTATE,
+ get_pcb_user_save_td(td), dst,
+ cpu_max_ext_state_size, &buf);
+ *(uint64_t *)((char *)buf + X86_XSTATE_XCR0_OFFSET) =
+ xsave_mask;
+ } else
+ len += elf64_populate_note(NT_X86_XSTATE, NULL, NULL,
+ cpu_max_ext_state_size, NULL);
+ }
+ *off = len;
}
-
/* Process one elf relocation with addend. */
static int
elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data,
@@ -155,6 +170,7 @@
Elf_Size rtype, symidx;
const Elf_Rel *rel;
const Elf_Rela *rela;
+ int error;
switch (type) {
case ELF_RELOC_REL:
@@ -190,9 +206,9 @@
break;
case R_X86_64_64: /* S + A */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
val = addr + addend;
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where != val)
*where = val;
@@ -199,10 +215,10 @@
break;
case R_X86_64_PC32: /* S + A - P */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
where32 = (Elf32_Addr *)where;
val32 = (Elf32_Addr)(addr + addend - (Elf_Addr)where);
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where32 != val32)
*where32 = val32;
@@ -209,10 +225,10 @@
break;
case R_X86_64_32S: /* S + A sign extend */
- addr = lookup(lf, symidx, 1);
+ error = lookup(lf, symidx, 1, &addr);
val32 = (Elf32_Addr)(addr + addend);
where32 = (Elf32_Addr *)where;
- if (addr == 0)
+ if (error != 0)
return -1;
if (*where32 != val32)
*where32 = val32;
@@ -229,8 +245,8 @@
case R_X86_64_GLOB_DAT: /* S */
case R_X86_64_JMP_SLOT: /* XXX need addend + offset */
- addr = lookup(lf, symidx, 1);
- if (addr == 0)
+ error = lookup(lf, symidx, 1, &addr);
+ if (error != 0)
return -1;
if (*where != addr)
*where = addr;
Modified: trunk/sys/amd64/amd64/exception.S
===================================================================
--- trunk/sys/amd64/amd64/exception.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/exception.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1990 William F. Jolitz.
* Copyright (c) 1990 The Regents of the University of California.
@@ -31,7 +32,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/exception.S 333370 2018-05-08 17:05:39Z emaste $
*/
#include "opt_atpic.h"
@@ -80,18 +81,22 @@
* This is equivalent to the i386 port's use of SDT_SYS386TGT.
*
* The cpu will push a certain amount of state onto the kernel stack for
- * the current process. See amd64/include/frame.h.
- * This includes the current RFLAGS (status register, which includes
+ * the current process. See amd64/include/frame.h.
+ * This includes the current RFLAGS (status register, which includes
* the interrupt disable state prior to the trap), the code segment register,
- * and the return instruction pointer are pushed by the cpu. The cpu
- * will also push an 'error' code for certain traps. We push a dummy
- * error code for those traps where the cpu doesn't in order to maintain
+ * and the return instruction pointer are pushed by the cpu. The cpu
+ * will also push an 'error' code for certain traps. We push a dummy
+ * error code for those traps where the cpu doesn't in order to maintain
* a consistent frame. We also push a contrived 'trap number'.
*
- * The cpu does not push the general registers, we must do that, and we
- * must restore them prior to calling 'iret'. The cpu adjusts the %cs and
- * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we
- * must load them with appropriate values for supervisor mode operation.
+ * The CPU does not push the general registers, so we must do that, and we
+ * must restore them prior to calling 'iret'. The CPU adjusts %cs and %ss
+ * but does not mess with %ds, %es, %gs or %fs. We swap the %gs base for
+ * for the kernel mode operation shortly, without changes to the selector
+ * loaded. Since superuser long mode works with any selectors loaded into
+ * segment registers other then %cs, which makes them mostly unused in long
+ * mode, and kernel does not reference %fs, leave them alone. The segment
+ * registers are reloaded on return to the usermode.
*/
MCOUNT_LABEL(user)
@@ -104,8 +109,6 @@
movq $0,TF_ADDR(%rsp) ; \
movq $0,TF_ERR(%rsp) ; \
jmp alltraps_noen
-IDTVEC(dbg)
- TRAP_NOEN(T_TRCTRAP)
IDTVEC(bpt)
TRAP_NOEN(T_BPTFLT)
#ifdef KDTRACE_HOOKS
@@ -208,6 +211,8 @@
* interrupt. For all other trap types, just handle them in
* the usual way.
*/
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
+ jnz calltrap /* ignore userland traps */
cmpl $T_BPTFLT,TF_TRAPNO(%rsp)
jne calltrap
@@ -430,6 +435,101 @@
sysret
/*
+ * DB# handler is very similar to NM#, because 'mov/pop %ss' delay
+ * generation of exception until the next instruction is executed,
+ * which might be a kernel entry. So we must execute the handler
+ * on IST stack and be ready for non-kernel GSBASE.
+ */
+IDTVEC(dbg)
+ subq $TF_RIP,%rsp
+ movl $(T_TRCTRAP),TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
+ jnz dbg_fromuserspace
+ /*
+ * We've interrupted the kernel. Preserve GS.base in %r12.
+ */
+ movl $MSR_GSBASE,%ecx
+ rdmsr
+ movq %rax,%r12
+ shlq $32,%rdx
+ orq %rdx,%r12
+ /* Retrieve and load the canonical value for GS.base. */
+ movq TF_SIZE(%rsp),%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp,%rdi
+ call trap
+ MEXITCOUNT
+ /*
+ * Put back the preserved MSR_GSBASE value.
+ */
+ movl $MSR_GSBASE,%ecx
+ movq %r12,%rdx
+ movl %edx,%eax
+ shrq $32,%rdx
+ wrmsr
+ movq TF_RDI(%rsp),%rdi
+ movq TF_RSI(%rsp),%rsi
+ movq TF_RDX(%rsp),%rdx
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R8(%rsp),%r8
+ movq TF_R9(%rsp),%r9
+ movq TF_RAX(%rsp),%rax
+ movq TF_RBX(%rsp),%rbx
+ movq TF_RBP(%rsp),%rbp
+ movq TF_R10(%rsp),%r10
+ movq TF_R11(%rsp),%r11
+ movq TF_R12(%rsp),%r12
+ movq TF_R13(%rsp),%r13
+ movq TF_R14(%rsp),%r14
+ movq TF_R15(%rsp),%r15
+ addq $TF_RIP,%rsp
+ jmp doreti_iret
+dbg_fromuserspace:
+ /*
+ * Switch to kernel GSBASE and kernel page table, and copy frame
+ * from the IST stack to the normal kernel stack, since trap()
+ * re-enables interrupts, and since we might trap on DB# while
+ * in trap().
+ */
+ swapgs
+ movq PCPU(RSP0),%rax
+ movl $TF_SIZE,%ecx
+ subq %rcx,%rax
+ movq %rax,%rdi
+ movq %rsp,%rsi
+ rep;movsb
+ movq %rax,%rsp
+ movq PCPU(CURPCB),%rdi
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
+ jmp calltrap
+
+/*
* NMI handling is special.
*
* First, NMIs do not respect the state of the processor's RFLAGS.IF
@@ -508,7 +608,7 @@
#ifdef HWPMC_HOOKS
/*
* Capture a userspace callchain if needed.
- *
+ *
* - Check if the current trap was from user mode.
* - Check if the current thread is valid.
* - Check if the thread requires a user call chain to be
@@ -554,7 +654,7 @@
* At this point the processor has exited NMI mode and is running
* with interrupts turned off on the normal kernel stack.
*
- * If a pending NMI gets recognized at or after this point, it
+ * If a pending NMI gets recognized at or after this point, it
* will cause a kernel callchain to be traced.
*
* We turn interrupts back on, and call the user callchain capture hook.
@@ -572,7 +672,7 @@
#endif
testl %ebx,%ebx
jnz doreti_exit
-nmi_kernelexit:
+nmi_kernelexit:
/*
* Put back the preserved MSR_GSBASE value.
*/
@@ -655,6 +755,7 @@
.text
SUPERALIGN_TEXT
	.type	doreti,@function
+ .globl doreti
doreti:
FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */
/*
@@ -723,21 +824,38 @@
pushfq
cli
movl $MSR_GSBASE,%ecx
+ /* Save current kernel %gs base into %r12d:%r13d */
rdmsr
+ movl %eax,%r12d
+ movl %edx,%r13d
.globl ld_gs
ld_gs:
movw %si,%gs
+ /* Save user %gs base into %r14d:%r15d */
+ rdmsr
+ movl %eax,%r14d
+ movl %edx,%r15d
+ /* Restore kernel %gs base */
+ movl %r12d,%eax
+ movl %r13d,%edx
wrmsr
popfq
+ /*
+ * Restore user %gs base, either from PCB if used for TLS, or
+ * from the previously saved msr read.
+ */
+ movl $MSR_KGSBASE,%ecx
cmpw $KUG32SEL,%si
jne 1f
- movl $MSR_KGSBASE,%ecx
movl PCB_GSBASE(%r8),%eax
movl PCB_GSBASE+4(%r8),%edx
+ jmp ld_gsbase
+1:
+ movl %r14d,%eax
+ movl %r15d,%edx
.globl ld_gsbase
ld_gsbase:
- wrmsr
-1:
+ wrmsr /* May trap if non-canonical, but only for TLS. */
.globl ld_es
ld_es:
movw TF_ES(%rsp),%es
Property changes on: trunk/sys/amd64/amd64/exception.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/fpu.c
===================================================================
--- trunk/sys/amd64/amd64/fpu.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/fpu.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1990 William Jolitz.
* Copyright (c) 1991 The Regents of the University of California.
@@ -31,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/fpu.c 325543 2017-11-08 11:39:42Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -127,10 +128,17 @@
*/
CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
+/*
+ * Ensure the copy of XCR0 saved in a core is contained in the padding
+ * area.
+ */
+CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savefpu, sv_pad) &&
+ X86_XSTATE_XCR0_OFFSET + sizeof(uint64_t) <= sizeof(struct savefpu));
+
static void fpu_clean_state(void);
SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
- NULL, 1, "Floating point instructions executed in hardware");
+ SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
int use_xsave; /* non-static for cpu_switch.S */
uint64_t xsave_mask; /* the same */
@@ -162,6 +170,31 @@
fxrstor((char *)addr);
}
+void
+fpususpend(void *addr)
+{
+ u_long cr0;
+
+ cr0 = rcr0();
+ stop_emulating();
+ fpusave(addr);
+ load_cr0(cr0);
+}
+
+void
+fpuresume(void *addr)
+{
+ u_long cr0;
+
+ cr0 = rcr0();
+ stop_emulating();
+ fninit();
+ if (use_xsave)
+ load_xcr(XCR0, xsave_mask);
+ fpurestore(addr);
+ load_cr0(cr0);
+}
+
/*
* Enable XSAVE if supported and allowed by user.
* Calculate the xsave_mask.
@@ -188,6 +221,10 @@
TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
xsave_mask &= xsave_mask_user;
+ if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512)
+ xsave_mask &= ~XFEATURE_AVX512;
+ if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX)
+ xsave_mask &= ~XFEATURE_MPX;
cpuid_count(0xd, 0x1, cp);
if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
@@ -282,13 +319,15 @@
cpu_mxcsr_mask = 0xFFBF;
/*
- * The fninit instruction does not modify XMM registers. The
- * fpusave call dumped the garbage contained in the registers
- * after reset to the initial state saved. Clear XMM
- * registers file image to make the startup program state and
- * signal handler XMM register content predictable.
+ * The fninit instruction does not modify XMM registers or x87
+ * registers (MM/ST). The fpusave call dumped the garbage
+ * contained in the registers after reset to the initial state
+ * saved. Clear XMM and x87 registers file image to make the
+ * startup program state and signal handler XMM/x87 register
+ * content predictable.
*/
- bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
+ bzero(fpu_initialstate->sv_fp, sizeof(fpu_initialstate->sv_fp));
+ bzero(fpu_initialstate->sv_xmm, sizeof(fpu_initialstate->sv_xmm));
/*
* Create a table describing the layout of the CPU Extended
@@ -333,13 +372,13 @@
stop_emulating();
fpusave(curpcb->pcb_save);
start_emulating();
- PCPU_SET(fpcurthread, 0);
+ PCPU_SET(fpcurthread, NULL);
}
critical_exit();
}
int
-fpuformat()
+fpuformat(void)
{
return (_MC_FPFMT_XMM);
@@ -574,33 +613,37 @@
}
/*
- * Implement device not available (DNA) exception
+ * Device Not Available (DNA, #NM) exception handler.
*
- * It would be better to switch FP context here (if curthread != fpcurthread)
- * and not necessarily for every context switch, but it is too hard to
- * access foreign pcb's.
+ * It would be better to switch FP context here (if curthread !=
+ * fpcurthread) and not necessarily for every context switch, but it
+ * is too hard to access foreign pcb's.
*/
-
-static int err_count = 0;
-
void
fpudna(void)
{
+ /*
+ * This handler is entered with interrupts enabled, so context
+ * switches may occur before critical_enter() is executed. If
+ * a context switch occurs, then when we regain control, our
+ * state will have been completely restored. The CPU may
+ * change underneath us, but the only part of our context that
+ * lives in the CPU is CR0.TS and that will be "restored" by
+ * setting it on the new CPU.
+ */
critical_enter();
+
if (PCPU_GET(fpcurthread) == curthread) {
- printf("fpudna: fpcurthread == curthread %d times\n",
- ++err_count);
+ printf("fpudna: fpcurthread == curthread\n");
stop_emulating();
critical_exit();
return;
}
if (PCPU_GET(fpcurthread) != NULL) {
- printf("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
- PCPU_GET(fpcurthread),
- PCPU_GET(fpcurthread)->td_proc->p_pid,
- curthread, curthread->td_proc->p_pid);
- panic("fpudna");
+ panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
+ PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid,
+ curthread, curthread->td_tid);
}
stop_emulating();
/*
@@ -621,7 +664,8 @@
* fpu_initialstate, to ignite the XSAVEOPT
* tracking engine.
*/
- bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
+ bcopy(fpu_initialstate, curpcb->pcb_save,
+ cpu_max_ext_state_size);
fpurestore(curpcb->pcb_save);
if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
fldcw(curpcb->pcb_initial_fpucw);
@@ -636,7 +680,7 @@
}
void
-fpudrop()
+fpudrop(void)
{
struct thread *td;
@@ -761,6 +805,7 @@
struct pcb *pcb;
int error;
+ addr->sv_env.en_mxcsr &= cpu_mxcsr_mask;
pcb = td->td_pcb;
critical_enter();
if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
@@ -875,6 +920,7 @@
"Kernel contexts for FPU state");
#define FPU_KERN_CTX_FPUINITDONE 0x01
+#define FPU_KERN_CTX_DUMMY 0x02 /* avoided save for the kern thread */
struct fpu_kern_ctx {
struct savefpu *prev;
@@ -918,6 +964,10 @@
{
struct pcb *pcb;
+ if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) {
+ ctx->flags = FPU_KERN_CTX_DUMMY;
+ return (0);
+ }
pcb = td->td_pcb;
KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save ==
get_pcb_user_save_pcb(pcb), ("mangled pcb_save"));
@@ -937,6 +987,9 @@
{
struct pcb *pcb;
+ if (is_fpu_kern_thread(0) && (ctx->flags & FPU_KERN_CTX_DUMMY) != 0)
+ return (0);
+ KASSERT((ctx->flags & FPU_KERN_CTX_DUMMY) == 0, ("dummy ctx"));
pcb = td->td_pcb;
critical_enter();
if (curthread == PCPU_GET(fpcurthread))
Modified: trunk/sys/amd64/amd64/gdb_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/gdb_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/gdb_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Marcel Moolenaar
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/gdb_machdep.c 290734 2015-11-13 00:50:34Z jhb $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,6 +49,8 @@
void *
gdb_cpu_getreg(int regnum, size_t *regsz)
{
+ static uint32_t _kcodesel = GSEL(GCODE_SEL, SEL_KPL);
+ static uint32_t _kdatasel = GSEL(GDATA_SEL, SEL_KPL);
*regsz = gdb_cpu_regsz(regnum);
@@ -76,6 +79,8 @@
case 14: return (&kdb_thrctx->pcb_r14);
case 15: return (&kdb_thrctx->pcb_r15);
case 16: return (&kdb_thrctx->pcb_rip);
+ case 18: return (&_kcodesel);
+ case 19: return (&_kdatasel);
}
return (NULL);
}
Modified: trunk/sys/amd64/amd64/genassym.c
===================================================================
--- trunk/sys/amd64/amd64/genassym.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/genassym.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1990 The Regents of the University of California.
* All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/genassym.c 286308 2015-08-05 07:35:34Z kib $");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
@@ -76,6 +77,8 @@
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_SAVE, offsetof(struct pmap, pm_save));
+ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid));
ASSYM(P_MD, offsetof(struct proc, p_md));
ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
@@ -109,7 +112,6 @@
ASSYM(addr_PML4pml4e, addr_PML4pml4e);
ASSYM(PDESIZE, sizeof(pd_entry_t));
ASSYM(PTESIZE, sizeof(pt_entry_t));
-ASSYM(PTESHIFT, PTESHIFT);
ASSYM(PAGE_SHIFT, PAGE_SHIFT);
ASSYM(PAGE_MASK, PAGE_MASK);
ASSYM(PDRSHIFT, PDRSHIFT);
@@ -152,11 +154,13 @@
ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
-ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd));
ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
-ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu));
-ASSYM(PCB_USERFPU, sizeof(struct pcb));
+ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
+ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
+ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
+ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
+ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
ASSYM(PCB_SIZE, sizeof(struct pcb));
ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
ASSYM(PCB_DBREGS, PCB_DBREGS);
@@ -219,6 +223,7 @@
ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
ASSYM(LA_VER, offsetof(struct LAPIC, version));
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
Modified: trunk/sys/amd64/amd64/in_cksum.c
===================================================================
--- trunk/sys/amd64/amd64/in_cksum.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/in_cksum.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/* $NetBSD: in_cksum.c,v 1.7 1997/09/02 13:18:15 thorpej Exp $ */
/*-
@@ -38,7 +39,7 @@
*/
#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/in_cksum.c 139731 2005-01-05 20:17:21Z imp $");
#include <sys/param.h>
#include <sys/mbuf.h>
Modified: trunk/sys/amd64/amd64/initcpu.c
===================================================================
--- trunk/sys/amd64/amd64/initcpu.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/initcpu.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) KATO Takenori, 1997, 1998.
*
@@ -28,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/initcpu.c 313150 2017-02-03 12:20:44Z kib $");
#include "opt_cpu.h"
@@ -48,11 +49,6 @@
static int hw_instruction_sse;
SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
&hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
-static int lower_sharedpage_init;
-int hw_lower_amd64_sharedpage;
-SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN,
- &hw_lower_amd64_sharedpage, 0,
- "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory");
/*
* -1: automatic (default)
* 0: keep enable CLFLUSH
@@ -79,7 +75,12 @@
u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
u_int cpu_clflush_line_size = 32;
u_int cpu_stdext_feature;
+u_int cpu_stdext_feature2;
u_int cpu_max_ext_state_size;
+u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */
+u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */
+u_int cpu_mon_max_size; /* MONITOR minimum range size, bytes */
+u_int cpu_maxphyaddr; /* Max phys addr width in bits */
SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
&via_feature_rng, 0, "VIA RNG feature available in CPU");
@@ -89,6 +90,7 @@
static void
init_amd(void)
{
+ uint64_t msr;
/*
* Work around Erratum 721 for Family 10h and 12h processors.
@@ -113,26 +115,45 @@
}
/*
- * Work around a problem on Ryzen that is triggered by executing
- * code near the top of user memory, in our case the signal
- * trampoline code in the shared page on amd64.
- *
- * This function is executed once for the BSP before tunables take
- * effect so the value determined here can be overridden by the
- * tunable. This function is then executed again for each AP and
- * also on resume. Set a flag the first time so that value set by
- * the tunable is not overwritten.
- *
- * The stepping and/or microcode versions should be checked after
- * this issue is fixed by AMD so that we don't use this mode if not
- * needed.
+ * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG.
+ * So, do it here or otherwise some tools could be confused by
+ * Initial Local APIC ID reported with CPUID Function 1 in EBX.
*/
- if (lower_sharedpage_init == 0) {
- lower_sharedpage_init = 1;
- if (CPUID_TO_FAMILY(cpu_id) == 0x17) {
- hw_lower_amd64_sharedpage = 1;
+ if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(MSR_NB_CFG1);
+ msr |= (uint64_t)1 << 54;
+ wrmsr(MSR_NB_CFG1, msr);
}
}
+
+ /*
+ * BIOS may configure Family 10h processors to convert WC+ cache type
+ * to CD. That can hurt performance of guest VMs using nested paging.
+ * The relevant MSR bit is not documented in the BKDG,
+ * the fix is borrowed from Linux.
+ */
+ if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(0xc001102a);
+ msr &= ~((uint64_t)1 << 24);
+ wrmsr(0xc001102a, msr);
+ }
+ }
+
+ /*
+ * Work around Erratum 793: Specific Combination of Writes to Write
+ * Combined Memory Types and Locked Instructions May Cause Core Hang.
+ * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors,
+ * revision 3.04 or later, publication 51810.
+ */
+ if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) {
+ if ((cpu_feature2 & CPUID2_HV) == 0) {
+ msr = rdmsr(0xc0011020);
+ msr |= (uint64_t)1 << 15;
+ wrmsr(0xc0011020, msr);
+ }
+ }
}
/*
@@ -216,7 +237,7 @@
}
void
-initializecpucache()
+initializecpucache(void)
{
/*
@@ -233,12 +254,17 @@
* CPUID_SS feature even though the native CPU supports it.
*/
TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
- if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1)
+ if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) {
cpu_feature &= ~CPUID_CLFSH;
+ cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
+ }
+
/*
- * Allow to disable CLFLUSH feature manually by
- * hw.clflush_disable tunable.
+ * The kernel's use of CLFLUSH{,OPT} can be disabled manually
+ * by setting the hw.clflush_disable tunable.
*/
- if (hw_clflush_disable == 1)
+ if (hw_clflush_disable == 1) {
cpu_feature &= ~CPUID_CLFSH;
+ cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
+ }
}
Modified: trunk/sys/amd64/amd64/io.c
===================================================================
--- trunk/sys/amd64/amd64/io.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/io.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Mark R V Murray
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/io.c 207329 2010-04-28 15:38:01Z attilio $");
#include <sys/param.h>
#include <sys/proc.h>
Modified: trunk/sys/amd64/amd64/locore.S
===================================================================
--- trunk/sys/amd64/amd64/locore.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/locore.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm <peter at FreeBSD.org>
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/locore.S 115431 2003-05-31 06:54:29Z peter $
*/
#include <machine/asmacros.h>
Property changes on: trunk/sys/amd64/amd64/locore.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/machdep.c
===================================================================
--- trunk/sys/amd64/amd64/machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
@@ -39,7 +40,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/machdep.c 333370 2018-05-08 17:05:39Z emaste $");
#include "opt_atalk.h"
#include "opt_atpic.h"
@@ -53,6 +54,7 @@
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
+#include "opt_platform.h"
#include "opt_sched.h"
#include "opt_kdtrace.h"
@@ -65,6 +67,7 @@
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
+#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
@@ -80,6 +83,7 @@
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
@@ -132,6 +136,9 @@
#ifdef SMP
#include <machine/smp.h>
#endif
+#ifdef FDT
+#include <x86/fdt.h>
+#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
@@ -147,10 +154,6 @@
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
-extern void printcpuinfo(void); /* XXX header file */
-extern void identify_cpu(void);
-extern void panicifcpuunsupported(void);
-
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
@@ -157,7 +160,7 @@
static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
char *xfpusave, size_t xfpusave_len);
-static int set_fpcontext(struct thread *td, const mcontext_t *mcp,
+static int set_fpcontext(struct thread *td, mcontext_t *mcp,
char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
@@ -211,6 +214,8 @@
struct mtx dt_lock; /* lock for GDT and LDT */
+void (*vmm_resume_p)(void);
+
static void
cpu_startup(dummy)
void *dummy;
@@ -230,9 +235,11 @@
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
+ strncmp(sysenv, "MacBook4,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
+ strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
@@ -251,7 +258,6 @@
#ifdef PERFMON
perfmon_init();
#endif
- realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports reasonable amount.
@@ -265,6 +271,7 @@
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
+ realmem = atop(memsize);
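Note: realmem is now derived from the SMBIOS/physmap-based memsize (a byte count) via atop() instead of being copied from Maxmem; for example, with 4 KiB pages a memsize of 8 GiB gives realmem = 8589934592 / 4096 = 2097152 pages.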
/*
* Display any holes after the first chunk of extended memory.
@@ -298,11 +305,6 @@
vm_pager_bufferinit();
cpu_setregs();
-
- /*
- * Add BSP as an interrupt target.
- */
- intr_add_cpu(0);
}
/*
@@ -383,10 +385,6 @@
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
- /* Translate the signal if appropriate. */
- if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
- sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
-
/* Build the argument list for the signal handler. */
regs->tf_rdi = sig; /* arg 1 in %rdi */
regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
@@ -487,17 +485,7 @@
/*
* Don't allow users to change privileged or reserved flags.
*/
- /*
- * XXX do allow users to change the privileged flag PSL_RF.
- * The cpu sets PSL_RF in tf_rflags for faults. Debuggers
- * should sometimes set it there too. tf_rflags is kept in
- * the signal context during signal handling and there is no
- * other place to remember it, so the PSL_RF bit may be
- * corrupted by the signal handler without us knowing.
- * Corruption of the PSL_RF bit at worst causes one more or
- * one less debugger trap, so allowing it is fairly harmless.
- */
- if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
+ if (!EFL_SECURE(rflags, regs->tf_rflags)) {
uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
td->td_name, rflags);
return (EINVAL);
@@ -661,10 +649,10 @@
cpu_halt(void)
{
for (;;)
- __asm__ ("hlt");
+ halt();
}
-void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
+void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
@@ -676,17 +664,19 @@
#define STATE_SLEEPING 0x2
static void
-cpu_idle_acpi(int busy)
+cpu_idle_acpi(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
+
+ /* See comments in cpu_idle_hlt(). */
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
- cpu_idle_hook();
+ cpu_idle_hook(sbt);
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
@@ -693,15 +683,28 @@
}
static void
-cpu_idle_hlt(int busy)
+cpu_idle_hlt(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
+
/*
- * We must absolutely guarentee that hlt is the next instruction
- * after sti or we introduce a timing window.
+ * Since we may be in a critical section from cpu_idle(), if
+ * an interrupt fires during that critical section we may have
+ * a pending preemption. If the CPU halts, then that thread
+ * may not execute until a later interrupt awakens the CPU.
+ * To handle this race, check for a runnable thread after
+ * disabling interrupts and immediately return if one is
+ * found. Also, we must absolutely guarantee that hlt is
+ * the next instruction after sti. This ensures that any
+ * interrupt that fires after the call to disable_intr() will
+ * immediately awaken the CPU from hlt. Finally, please note
+ * that on x86 this works fine because interrupts are enabled
+ * only after the instruction following sti executes, while IF
+ * is set to 1 immediately, allowing the hlt instruction to
+ * acknowledge the interrupt.
*/
disable_intr();
if (sched_runnable())
@@ -711,32 +714,31 @@
*state = STATE_RUNNING;
}
-/*
- * MWAIT cpu power states. Lower 4 bits are sub-states.
- */
-#define MWAIT_C0 0xf0
-#define MWAIT_C1 0x00
-#define MWAIT_C2 0x10
-#define MWAIT_C3 0x20
-#define MWAIT_C4 0x30
-
static void
-cpu_idle_mwait(int busy)
+cpu_idle_mwait(sbintime_t sbt)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
- if (!sched_runnable()) {
- cpu_monitor(state, 0, 0);
- if (*state == STATE_MWAIT)
- cpu_mwait(0, MWAIT_C1);
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable()) {
+ enable_intr();
+ *state = STATE_RUNNING;
+ return;
}
+ cpu_monitor(state, 0, 0);
+ if (*state == STATE_MWAIT)
+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+ else
+ enable_intr();
*state = STATE_RUNNING;
}
static void
-cpu_idle_spin(int busy)
+cpu_idle_spin(sbintime_t sbt)
{
int *state;
int i;
@@ -743,6 +745,12 @@
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
+
+ /*
+ * The sched_runnable() call is racy, but since it sits in a
+ * loop, missing it once has little impact, if any (and it is
+ * much better than not checking at all).
+ */
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
@@ -779,12 +787,13 @@
}
}
-void (*cpu_idle_fn)(int) = cpu_idle_acpi;
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
uint64_t msr;
+ sbintime_t sbt = -1;
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
@@ -802,11 +811,11 @@
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
- cpu_idleclock();
+ sbt = cpu_idleclock();
}
/* Apply AMD APIC timer C1E workaround. */
- if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
+ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
@@ -813,7 +822,7 @@
}
/* Call main idle method. */
- cpu_idle_fn(busy);
+ cpu_idle_fn(sbt);
/* Switch timers back into active mode. */
if (!busy) {
@@ -1015,6 +1024,7 @@
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
+static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
@@ -1146,12 +1156,7 @@
};
void
-setidt(idx, func, typ, dpl, ist)
- int idx;
- inthand_t *func;
- int typ;
- int dpl;
- int ist;
+setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
struct gate_descriptor *ip;
@@ -1175,6 +1180,9 @@
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
+#ifdef XENHVM
+ IDTVEC(xen_intr_upcall),
+#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
#ifdef DDB
@@ -1199,6 +1207,51 @@
ip++;
}
}
+
+/* Show privileged registers. */
+DB_SHOW_COMMAND(sysregs, db_show_sysregs)
+{
+ struct {
+ uint16_t limit;
+ uint64_t base;
+ } __packed idtr, gdtr;
+ uint16_t ldt, tr;
+
+ __asm __volatile("sidt %0" : "=m" (idtr));
+ db_printf("idtr\t0x%016lx/%04x\n",
+ (u_long)idtr.base, (u_int)idtr.limit);
+ __asm __volatile("sgdt %0" : "=m" (gdtr));
+ db_printf("gdtr\t0x%016lx/%04x\n",
+ (u_long)gdtr.base, (u_int)gdtr.limit);
+ __asm __volatile("sldt %0" : "=r" (ldt));
+ db_printf("ldtr\t0x%04x\n", ldt);
+ __asm __volatile("str %0" : "=r" (tr));
+ db_printf("tr\t0x%04x\n", tr);
+ db_printf("cr0\t0x%016lx\n", rcr0());
+ db_printf("cr2\t0x%016lx\n", rcr2());
+ db_printf("cr3\t0x%016lx\n", rcr3());
+ db_printf("cr4\t0x%016lx\n", rcr4());
+ if (rcr4() & CR4_XSAVE)
+ db_printf("xcr0\t0x%016lx\n", rxcr(0));
+ db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
+ if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
+ db_printf("FEATURES_CTL\t%016lx\n",
+ rdmsr(MSR_IA32_FEATURE_CONTROL));
+ db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
+ db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
+ db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
+}
+
+DB_SHOW_COMMAND(dbregs, db_show_dbregs)
+{
+
+ db_printf("dr0\t0x%016lx\n", rdr0());
+ db_printf("dr1\t0x%016lx\n", rdr1());
+ db_printf("dr2\t0x%016lx\n", rdr2());
+ db_printf("dr3\t0x%016lx\n", rdr3());
+ db_printf("dr6\t0x%016lx\n", rdr6());
+ db_printf("dr7\t0x%016lx\n", rdr7());
+}
#endif
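Note: commands declared with DB_SHOW_COMMAND() are reached from the ddb prompt as subcommands of "show", so the two additions above are invoked as "show sysregs" and "show dbregs". The register values in the following transcript are illustrative only:

db> show sysregs
idtr	0xffffffff81ff3590/0fff
cr0	0x0000000080050033
cr4	0x00000000000406e0
EFER	0x0000000000000d01
db> show dbregs
dr6	0x00000000ffff0ff0
dr7	0x0000000000000400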
void
@@ -1276,30 +1329,26 @@
u_int basemem;
static int
-add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
+add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
+ int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
- if (boothowto & RB_VERBOSE)
- printf("SMAP type=%02x base=%016lx len=%016lx\n",
- smap->type, smap->base, smap->length);
-
- if (smap->type != SMAP_TYPE_MEMORY)
+ if (length == 0)
return (1);
- if (smap->length == 0)
- return (0);
-
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
+ *
+ * NB: physmap_idx points to the next free slot.
*/
- insert_idx = physmap_idx + 2;
+ insert_idx = physmap_idx;
for (i = 0; i <= physmap_idx; i += 2) {
- if (smap->base < physmap[i + 1]) {
- if (smap->base + smap->length <= physmap[i]) {
+ if (base < physmap[i + 1]) {
+ if (base + length <= physmap[i]) {
insert_idx = i;
break;
}
@@ -1311,15 +1360,14 @@
}
/* See if we can prepend to the next entry. */
- if (insert_idx <= physmap_idx &&
- smap->base + smap->length == physmap[insert_idx]) {
- physmap[insert_idx] = smap->base;
+ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
+ physmap[insert_idx] = base;
return (1);
}
/* See if we can append to the previous entry. */
- if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
- physmap[insert_idx - 1] += smap->length;
+ if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
+ physmap[insert_idx - 1] += length;
return (1);
}
@@ -1335,17 +1383,148 @@
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
- for (i = physmap_idx; i > insert_idx; i -= 2) {
+ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
- physmap[insert_idx] = smap->base;
- physmap[insert_idx + 1] = smap->base + smap->length;
+ physmap[insert_idx] = base;
+ physmap[insert_idx + 1] = base + length;
return (1);
}
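Note on the merge logic above: a new range is coalesced with a neighbour whenever they abut. For example, if physmap already holds the pair [0x100000, 0x9f000000) and add_physmap_entry() is called with base = 0x9f000000 and length = 0x1000000, the "append to the previous entry" test matches and the existing pair simply grows to [0x100000, 0xa0000000) without consuming another physmap slot.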
+static void
+add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
+ int *physmap_idx)
+{
+ struct bios_smap *smap, *smapend;
+ u_int32_t smapsize;
+
+ /*
+ * Memory map from INT 15:E820.
+ *
+ * subr_module.c says:
+ * "Consumer may safely assume that size value precedes data."
+ * ie: an int32_t immediately precedes smap.
+ */
+ smapsize = *((u_int32_t *)smapbase - 1);
+ smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+
+ for (smap = smapbase; smap < smapend; smap++) {
+ if (boothowto & RB_VERBOSE)
+ printf("SMAP type=%02x base=%016lx len=%016lx\n",
+ smap->type, smap->base, smap->length);
+
+ if (smap->type != SMAP_TYPE_MEMORY)
+ continue;
+
+ if (!add_physmap_entry(smap->base, smap->length, physmap,
+ physmap_idx))
+ break;
+ }
+}
+
+#define efi_next_descriptor(ptr, size) \
+ ((struct efi_md *)(((uint8_t *) ptr) + size))
+
+static void
+add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
+ int *physmap_idx)
+{
+ struct efi_md *map, *p;
+ const char *type;
+ size_t efisz;
+ int ndesc, i;
+
+ static const char *types[] = {
+ "Reserved",
+ "LoaderCode",
+ "LoaderData",
+ "BootServicesCode",
+ "BootServicesData",
+ "RuntimeServicesCode",
+ "RuntimeServicesData",
+ "ConventionalMemory",
+ "UnusableMemory",
+ "ACPIReclaimMemory",
+ "ACPIMemoryNVS",
+ "MemoryMappedIO",
+ "MemoryMappedIOPortSpace",
+ "PalCode"
+ };
+
+ /*
+ * Memory map data provided by UEFI via the GetMemoryMap
+ * Boot Services API.
+ */
+ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
+ map = (struct efi_md *)((uint8_t *)efihdr + efisz);
+
+ if (efihdr->descriptor_size == 0)
+ return;
+ ndesc = efihdr->memory_size / efihdr->descriptor_size;
+
+ if (boothowto & RB_VERBOSE)
+ printf("%23s %12s %12s %8s %4s\n",
+ "Type", "Physical", "Virtual", "#Pages", "Attr");
+
+ for (i = 0, p = map; i < ndesc; i++,
+ p = efi_next_descriptor(p, efihdr->descriptor_size)) {
+ if (boothowto & RB_VERBOSE) {
+ if (p->md_type <= EFI_MD_TYPE_PALCODE)
+ type = types[p->md_type];
+ else
+ type = "<INVALID>";
+ printf("%23s %012lx %12p %08lx ", type, p->md_phys,
+ p->md_virt, p->md_pages);
+ if (p->md_attr & EFI_MD_ATTR_UC)
+ printf("UC ");
+ if (p->md_attr & EFI_MD_ATTR_WC)
+ printf("WC ");
+ if (p->md_attr & EFI_MD_ATTR_WT)
+ printf("WT ");
+ if (p->md_attr & EFI_MD_ATTR_WB)
+ printf("WB ");
+ if (p->md_attr & EFI_MD_ATTR_UCE)
+ printf("UCE ");
+ if (p->md_attr & EFI_MD_ATTR_WP)
+ printf("WP ");
+ if (p->md_attr & EFI_MD_ATTR_RP)
+ printf("RP ");
+ if (p->md_attr & EFI_MD_ATTR_XP)
+ printf("XP ");
+ if (p->md_attr & EFI_MD_ATTR_RT)
+ printf("RUNTIME");
+ printf("\n");
+ }
+
+ switch (p->md_type) {
+ case EFI_MD_TYPE_CODE:
+ case EFI_MD_TYPE_DATA:
+ case EFI_MD_TYPE_BS_CODE:
+ case EFI_MD_TYPE_BS_DATA:
+ case EFI_MD_TYPE_FREE:
+ /*
+ * We're allowed to use any entry with these types.
+ */
+ break;
+ default:
+ continue;
+ }
+
+ if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
+ physmap, physmap_idx))
+ break;
+ }
+}
+
+static char bootmethod[16] = "";
+SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
+ "System firmware boot method");
+
+#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
+
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
@@ -1363,32 +1542,30 @@
vm_paddr_t pa, physmap[PHYSMAP_SIZE];
u_long physmem_start, physmem_tunable, memtest;
pt_entry_t *pte;
- struct bios_smap *smapbase, *smap, *smapend;
- u_int32_t smapsize;
+ struct bios_smap *smapbase;
+ struct efi_map_header *efihdr;
quad_t dcons_addr, dcons_size;
+ int page_counter;
bzero(physmap, sizeof(physmap));
- basemem = 0;
physmap_idx = 0;
- /*
- * get memory map from INT 15:E820, kindly supplied by the loader.
- *
- * subr_module.c says:
- * "Consumer may safely assume that size value precedes data."
- * ie: an int32_t immediately precedes smap.
- */
+ efihdr = (struct efi_map_header *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP);
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
- if (smapbase == NULL)
- panic("No BIOS smap info from loader!");
- smapsize = *((u_int32_t *)smapbase - 1);
- smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+ if (efihdr != NULL) {
+ add_efi_map_entries(efihdr, physmap, &physmap_idx);
+ strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
+ } else if (smapbase != NULL) {
+ add_smap_entries(smapbase, physmap, &physmap_idx);
+ strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
+ } else {
+ panic("No BIOS smap or EFI map info from loader!");
+ }
- for (smap = smapbase; smap < smapend; smap++)
- if (!add_smap_entry(smap, physmap, &physmap_idx))
- break;
+ physmap_idx -= 2;
/*
* Find the 'base memory' segment for SMP
@@ -1395,16 +1572,22 @@
*/
basemem = 0;
for (i = 0; i <= physmap_idx; i += 2) {
- if (physmap[i] == 0x00000000) {
+ if (physmap[i] <= 0xA0000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
- if (basemem == 0)
- panic("BIOS smap did not include a basemem segment!");
+ if (basemem == 0 || basemem > 640) {
+ if (bootverbose)
+ printf(
+ "Memory map doesn't contain a basemem segment, faking it");
+ basemem = 640;
+ }
#ifdef SMP
/* make hole for AP bootstrap code */
+ if (physmap[1] >= 0x100000000)
+ panic("Basemem segment is not suitable for AP bootstrap code!");
physmap[1] = mp_bootaddress(physmap[1] / 1024);
#endif
@@ -1424,13 +1607,15 @@
Maxmem = atop(physmem_tunable);
/*
- * By default enable the memory test on real hardware, and disable
- * it if we appear to be running in a VM. This avoids touching all
- * pages unnecessarily, which doesn't matter on real hardware but is
- * bad for shared VM hosts. Use a general name so that
- * one could eventually do more with the code than just disable it.
+ * The boot memory test is disabled by default, as it takes a
+ * significant amount of time on large-memory systems, and is
+ * unfriendly to virtual machines as it unnecessarily touches all
+ * pages.
+ *
+ * A general name is used as the code may be extended to support
+ * additional tests beyond the current "page present" test.
*/
- memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
+ memtest = 0;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
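Note: with the new default the exhaustive test only runs when requested; setting hw.memtest.tests=1 (for example in /boot/loader.conf) restores the old behaviour. When the test does run, the loop below prints one '.' per PAGES_PER_GB pages, i.e. every 1 GiB / 4 KiB = 262144 pages tested.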
/*
@@ -1456,12 +1641,14 @@
*/
physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
- if (physmem_start < PAGE_SIZE)
- physmap[0] = PAGE_SIZE;
- else if (physmem_start >= physmap[1])
- physmap[0] = round_page(physmap[1] - PAGE_SIZE);
- else
- physmap[0] = round_page(physmem_start);
+ if (physmap[0] < physmem_start) {
+ if (physmem_start < PAGE_SIZE)
+ physmap[0] = PAGE_SIZE;
+ else if (physmem_start >= physmap[1])
+ physmap[0] = round_page(physmap[1] - PAGE_SIZE);
+ else
+ physmap[0] = round_page(physmem_start);
+ }
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
@@ -1480,6 +1667,9 @@
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
+ page_counter = 0;
+ if (memtest != 0)
+ printf("Testing system memory");
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
@@ -1510,9 +1700,17 @@
goto skip_memtest;
/*
+ * Print a "." every GB to show we're making
+ * progress.
+ */
+ page_counter++;
+ if ((page_counter % PAGES_PER_GB) == 0)
+ printf(".");
+
+ /*
* map page into kernel: valid, read/write,non-cacheable
*/
- *pte = pa | PG_V | PG_RW | PG_N;
+ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();
tmp = *(int *)ptr;
@@ -1596,6 +1794,8 @@
}
*pte = 0;
invltlb();
+ if (memtest != 0)
+ printf("\n");
/*
* XXX
@@ -1649,12 +1849,15 @@
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
- kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
+ init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE, 0);
#ifdef DDB
ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
+ identify_cpu();
+ identify_hypervisor();
+
/* Init basic tunables, hz etc */
init_param1();
@@ -1707,7 +1910,7 @@
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
@@ -1728,6 +1931,9 @@
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
+#ifdef XENHVM
+ setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
+#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
@@ -1740,38 +1946,14 @@
i8254_init();
/*
- * Initialize the console before we print anything out.
+ * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
+ * transition).
*/
- cninit();
+ if (kmdp != NULL && preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL)
+ vty_set_preferred(VTY_VT);
-#ifdef DEV_ISA
-#ifdef DEV_ATPIC
- elcr_probe();
- atpic_startup();
-#else
- /* Reset and mask the atpics and leave them shut down. */
- atpic_reset();
-
- /*
- * Point the ICU spurious interrupt vectors at the APIC spurious
- * interrupt handler.
- */
- setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
-#endif
-#else
-#error "have you forgotten the isa device?";
-#endif
-
- kdb_init();
-
-#ifdef KDB
- if (boothowto & RB_KDB)
- kdb_enter(KDB_WHY_BOOTFLAGS,
- "Boot flags requested debugger");
-#endif
-
- identify_cpu(); /* Final stage of CPU initialization */
+ finishidentcpu(); /* Final stage of CPU initialization */
initializecpu(); /* Initialize CPU registers */
initializecpucache();
@@ -1786,6 +1968,13 @@
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist2 = (long) np;
+ /*
+ * DB# stack, runs on ist4.
+ */
+ np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
+ np->np_pcpu = (register_t) pc;
+ common_tss[0].tss_ist4 = (long) np;
+
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) +
IOPAGES * PAGE_SIZE;
@@ -1808,6 +1997,35 @@
/* now running on new page tables, configured, and u/iom is accessible */
+ cninit();
+
+#ifdef DEV_ISA
+#ifdef DEV_ATPIC
+ elcr_probe();
+ atpic_startup();
+#else
+ /* Reset and mask the atpics and leave them shut down. */
+ atpic_reset();
+
+ /*
+ * Point the ICU spurious interrupt vectors at the APIC spurious
+ * interrupt handler.
+ */
+ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
+#endif
+#else
+#error "have you forgotten the isa device?";
+#endif
+
+ kdb_init();
+
+#ifdef KDB
+ if (boothowto & RB_KDB)
+ kdb_enter(KDB_WHY_BOOTFLAGS,
+ "Boot flags requested debugger");
+#endif
+
msgbufinit(msgbufp, msgbufsize);
fpuinit();
@@ -1817,6 +2035,7 @@
* area.
*/
thread0.td_pcb = get_pcb_td(&thread0);
+ thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
if (use_xsave) {
xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
@@ -1844,7 +2063,7 @@
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
- thread0.td_pcb->pcb_cr3 = KPML4phys;
+ thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
thread0.td_frame = &proc0_tf;
env = getenv("kernelname");
@@ -1851,16 +2070,12 @@
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
-#ifdef XENHVM
- if (inw(0x10) == 0x49d2) {
- if (bootverbose)
- printf("Xen detected: disabling emulated block and network devices\n");
- outw(0x10, 3);
- }
+ cpu_probe_amdc1e();
+
+#ifdef FDT
+ x86_init_fdt();
#endif
- cpu_probe_amdc1e();
-
/* Location of kernel stack for locore */
return ((u_int64_t)thread0.td_pcb);
}
@@ -1872,6 +2087,62 @@
pcpu->pc_acpi_id = 0xffffffff;
}
+static int
+smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct bios_smap *smapbase;
+ struct bios_smap_xattr smap;
+ caddr_t kmdp;
+ uint32_t *smapattr;
+ int count, error, i;
+
+ /* Retrieve the system memory map from the loader. */
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ smapbase = (struct bios_smap *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP);
+ if (smapbase == NULL)
+ return (0);
+ smapattr = (uint32_t *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
+ count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
+ error = 0;
+ for (i = 0; i < count; i++) {
+ smap.base = smapbase[i].base;
+ smap.length = smapbase[i].length;
+ smap.type = smapbase[i].type;
+ if (smapattr != NULL)
+ smap.xattr = smapattr[i];
+ else
+ smap.xattr = 0;
+ error = SYSCTL_OUT(req, &smap, sizeof(smap));
+ }
+ return (error);
+}
+SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
+ smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
+
+static int
+efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ struct efi_map_header *efihdr;
+ caddr_t kmdp;
+ uint32_t efisize;
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ efihdr = (struct efi_map_header *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_MAP);
+ if (efihdr == NULL)
+ return (0);
+ efisize = *((uint32_t *)efihdr - 1);
+ return (SYSCTL_OUT(req, efihdr, efisize));
+}
+SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
+ efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
+
void
spinlock_enter(void)
{
@@ -1926,7 +2197,9 @@
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
+
td->td_frame->tf_rip = addr;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
return (0);
}
@@ -2026,8 +2299,8 @@
tp->tf_fs = regs->r_fs;
tp->tf_gs = regs->r_gs;
tp->tf_flags = TF_HASSEGS;
- set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
return (0);
}
@@ -2175,7 +2448,7 @@
* touch the cs selector.
*/
int
-set_mcontext(struct thread *td, const mcontext_t *mcp)
+set_mcontext(struct thread *td, mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tp;
@@ -2262,10 +2535,9 @@
}
static int
-set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate,
+set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
size_t xfpustate_len)
{
- struct savefpu *fpstate;
int error;
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
@@ -2278,9 +2550,8 @@
error = 0;
} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
- fpstate = (struct savefpu *)&mcp->mc_fpstate;
- fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
- error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
+ error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
+ xfpustate, xfpustate_len);
} else
return (EINVAL);
return (error);
Modified: trunk/sys/amd64/amd64/mem.c
===================================================================
--- trunk/sys/amd64/amd64/mem.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mem.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1982, 1986, 1990 The Regents of the University of California.
@@ -37,7 +38,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/mem.c 309426 2016-12-02 19:02:12Z jhb $");
/*
* Memory special file
@@ -58,6 +59,7 @@
#include <sys/systm.h>
#include <sys/uio.h>
+#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>
@@ -76,14 +78,15 @@
int
memrw(struct cdev *dev, struct uio *uio, int flags)
{
- int o;
- u_long c = 0, v;
struct iovec *iov;
- int error = 0;
- vm_offset_t addr, eaddr;
+ void *p;
+ ssize_t orig_resid;
+ u_long v, vd;
+ u_int c;
+ int error;
- GIANT_REQUIRED;
-
+ error = 0;
+ orig_resid = uio->uio_resid;
while (uio->uio_resid > 0 && error == 0) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
@@ -93,48 +96,67 @@
panic("memrw");
continue;
}
- if (dev2unit(dev) == CDEV_MINOR_MEM) {
- v = uio->uio_offset;
-kmemphys:
- o = v & PAGE_MASK;
- c = min(uio->uio_resid, (u_int)(PAGE_SIZE - o));
- error = uiomove((void *)PHYS_TO_DMAP(v), (int)c, uio);
- continue;
- }
- else if (dev2unit(dev) == CDEV_MINOR_KMEM) {
- v = uio->uio_offset;
+ v = uio->uio_offset;
+ c = ulmin(iov->iov_len, PAGE_SIZE - (u_int)(v & PAGE_MASK));
- if (v >= DMAP_MIN_ADDRESS && v < DMAP_MAX_ADDRESS) {
- v = DMAP_TO_PHYS(v);
- goto kmemphys;
+ switch (dev2unit(dev)) {
+ case CDEV_MINOR_KMEM:
+ /*
+ * Since c is clamped to be less than or equal to
+ * PAGE_SIZE, the uiomove() call does not
+ * access past the end of the direct map.
+ */
+ if (v >= DMAP_MIN_ADDRESS &&
+ v < DMAP_MIN_ADDRESS + dmaplimit) {
+ error = uiomove((void *)v, c, uio);
+ break;
}
- c = iov->iov_len;
+ if (!kernacc((void *)v, c, uio->uio_rw == UIO_READ ?
+ VM_PROT_READ : VM_PROT_WRITE)) {
+ error = EFAULT;
+ break;
+ }
/*
- * Make sure that all of the pages are currently
- * resident so that we don't create any zero-fill
- * pages.
+ * If the extracted address is not accessible
+ * through the direct map, then we make a
+ * private (uncached) mapping because we can't
+ * depend on the existing kernel mapping
+ * remaining valid until the completion of
+ * uiomove().
+ *
+ * XXX We cannot provide access to the
+ * physical page 0 mapped into KVA.
*/
- addr = trunc_page(v);
- eaddr = round_page(v + c);
-
- if (addr < VM_MIN_KERNEL_ADDRESS)
- return (EFAULT);
- for (; addr < eaddr; addr += PAGE_SIZE)
- if (pmap_extract(kernel_pmap, addr) == 0)
- return (EFAULT);
-
- if (!kernacc((caddr_t)(long)v, c,
- uio->uio_rw == UIO_READ ?
- VM_PROT_READ : VM_PROT_WRITE))
- return (EFAULT);
-
- error = uiomove((caddr_t)(long)v, (int)c, uio);
- continue;
+ v = pmap_extract(kernel_pmap, v);
+ if (v == 0) {
+ error = EFAULT;
+ break;
+ }
+ /* FALLTHROUGH */
+ case CDEV_MINOR_MEM:
+ if (v < dmaplimit) {
+ vd = PHYS_TO_DMAP(v);
+ error = uiomove((void *)vd, c, uio);
+ break;
+ }
+ if (v > cpu_getmaxphyaddr()) {
+ error = EFAULT;
+ break;
+ }
+ p = pmap_mapdev(v, PAGE_SIZE);
+ error = uiomove(p, c, uio);
+ pmap_unmapdev((vm_offset_t)p, PAGE_SIZE);
+ break;
}
- /* else panic! */
}
+ /*
+ * Don't return an error if any byte was written; read and write
+ * can return an error only if no I/O was performed.
+ */
+ if (uio->uio_resid != orig_resid)
+ error = 0;
return (error);
}
@@ -147,9 +169,11 @@
memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
int prot __unused, vm_memattr_t *memattr __unused)
{
- if (dev2unit(dev) == CDEV_MINOR_MEM)
+ if (dev2unit(dev) == CDEV_MINOR_MEM) {
+ if (offset > cpu_getmaxphyaddr())
+ return (-1);
*paddr = offset;
- else if (dev2unit(dev) == CDEV_MINOR_KMEM)
+ } else if (dev2unit(dev) == CDEV_MINOR_KMEM)
*paddr = vtophys(offset);
/* else panic! */
return (0);
Modified: trunk/sys/amd64/amd64/minidump_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/minidump_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/minidump_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2006 Peter Wemm
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/minidump_machdep.c 257575 2013-11-03 16:03:19Z kib $");
#include "opt_pmap.h"
#include "opt_watchdog.h"
@@ -39,7 +40,9 @@
#include <sys/msgbuf.h>
#include <sys/watchdog.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
#include <vm/pmap.h>
#include <machine/atomic.h>
#include <machine/elf.h>
@@ -125,8 +128,9 @@
int sofar, i;
sofar = 100 - ((progress * 100) / dumpsize);
- for (i = 0; i < 10; i++) {
- if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per)
+ for (i = 0; i < nitems(progress_track); i++) {
+ if (sofar < progress_track[i].min_per ||
+ sofar > progress_track[i].max_per)
continue;
if (progress_track[i].visited)
return;
@@ -155,8 +159,8 @@
printf("cant have both va and pa!\n");
return (EINVAL);
}
- if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) {
- printf("address not page aligned\n");
+ if ((((uintptr_t)pa) % PAGE_SIZE) != 0) {
+ printf("address not page aligned %p\n", ptr);
return (EINVAL);
}
if (ptr != NULL) {
@@ -219,8 +223,8 @@
vm_offset_t va;
int error;
uint64_t bits;
- uint64_t *pdp, *pd, *pt, pa;
- int i, j, k, n, bit;
+ uint64_t *pml4, *pdp, *pd, *pt, pa;
+ int i, ii, j, k, n, bit;
int retry_count;
struct minidumphdr mdhdr;
@@ -228,10 +232,11 @@
retry:
retry_count++;
counter = 0;
+ for (i = 0; i < nitems(progress_track); i++)
+ progress_track[i].visited = 0;
/* Walk page table pages, set bits in vm_page_dump */
pmapsize = 0;
- pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
- for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
kernel_vm_end); ) {
/*
* We always write a page, even if it is zero. Each
@@ -238,6 +243,9 @@
* page written corresponds to 1GB of space
*/
pmapsize += PAGE_SIZE;
+ ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
+ pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
if ((pdp[i] & PG_V) == 0) {
va += NBPDP;
@@ -362,9 +370,11 @@
/* Dump kernel page directory pages */
bzero(fakepd, sizeof(fakepd));
- pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
- for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+ for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
kernel_vm_end); va += NBPDP) {
+ ii = (va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1);
+ pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
+ pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
/* We always write a page, even if it is zero */
Modified: trunk/sys/amd64/amd64/mp_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/mp_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mp_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1996, by Steve Passe
* Copyright (c) 2003, by Peter Wemm
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/mp_machdep.c 333370 2018-05-08 17:05:39Z emaste $");
#include "opt_cpu.h"
#include "opt_ddb.h"
@@ -69,6 +70,7 @@
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>
+#include <machine/cpu.h>
#define WARMBOOT_TARGET 0
#define WARMBOOT_OFF (KERNBASE + 0x0467)
@@ -97,16 +99,19 @@
/* Temporary variables for init_secondary() */
char *doublefault_stack;
char *nmi_stack;
+char *dbg_stack;
void *dpcpu;
struct pcb stoppcbs[MAXCPU];
-struct pcb **susppcbs;
-void **suspfpusave;
+struct susppcb **susppcbs;
/* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
+struct invpcid_descr smp_tlb_invpcid;
volatile int smp_tlb_wait;
+uint64_t pcid_cr3;
+pmap_t smp_tlb_pmap;
+extern int invpcid_works;
#ifdef COUNT_IPIS
/* Interrupt counts. */
@@ -120,8 +125,15 @@
static u_long *ipi_hardclock_counts[MAXCPU];
#endif
+/* Default cpu_ops implementation. */
+struct cpu_ops cpu_ops = {
+ .ipi_vectored = lapic_ipi_vectored
+};
+
extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+extern int pmap_pcid_enabled;
+
/*
* Local data and functions.
*/
@@ -128,6 +140,9 @@
static volatile cpuset_t ipi_nmi_pending;
+volatile cpuset_t resuming_cpus;
+volatile cpuset_t toresume_cpus;
+
/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;
@@ -148,7 +163,7 @@
int apic_cpuids[MAX_APIC_ID + 1];
/* Holds pending bitmap based IPIs per CPU */
-static volatile u_int cpu_ipi_pending[MAXCPU];
+volatile u_int cpu_ipi_pending[MAXCPU];
static u_int boot_address;
static int cpu_logical; /* logical cpus per core */
@@ -519,8 +534,15 @@
}
/* Install an inter-CPU IPI for TLB invalidation */
- setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
+ if (pmap_pcid_enabled) {
+ setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IPI_INVLPG, IDTVEC(invlpg_pcid), SDT_SYSIGT,
+ SEL_KPL, 0);
+ } else {
+ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
+ }
setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for cache invalidation. */
@@ -568,7 +590,7 @@
const char *hyperthread;
int i;
- printf("MidnightBSD/SMP: %d package(s) x %d core(s)",
+ printf("FreeBSD/SMP: %d package(s) x %d core(s)",
mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
if (hyperthreading_cpus > 1)
printf(" x %d HTT threads", cpu_logical);
@@ -627,6 +649,10 @@
np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
common_tss[cpu].tss_ist2 = (long) np;
+ /* The DB# stack runs on IST4. */
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+ common_tss[cpu].tss_ist4 = (long) np;
+
/* Prepare private GDT */
gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
for (x = 0; x < NGDT; x++) {
@@ -662,9 +688,14 @@
/* Save the per-cpu pointer for use by the NMI handler. */
np->np_pcpu = (register_t) pc;
+ /* Save the per-cpu pointer for use by the DB# handler. */
+ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
+ np->np_pcpu = (register_t) pc;
+
wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */
+ fix_cpuid();
lidt(&r_idt);
@@ -706,12 +737,15 @@
/* set up CPU registers and state */
cpu_setregs();
- /* set up SSE/NX registers */
+ /* set up SSE/NX */
initializecpu();
/* set up FPU state on the AP */
fpuinit();
+ if (cpu_ops.cpu_init)
+ cpu_ops.cpu_init();
+
/* A quick check from sanity claus */
cpuid = PCPU_GET(cpuid);
if (PCPU_GET(apic_id) != lapic_id()) {
@@ -751,7 +785,6 @@
if (smp_cpus == mp_ncpus) {
/* enable IPI's, tlb shootdown, freezes etc */
atomic_store_rel_int(&smp_started, 1);
- smp_active = 1; /* historic */
}
/*
@@ -760,6 +793,8 @@
*/
load_cr4(rcr4() | CR4_PGE);
+ if (pmap_pcid_enabled)
+ load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
@@ -786,6 +821,8 @@
* We tell the I/O APIC code about all the CPUs we want to receive
* interrupts. If we don't want certain CPUs to receive IRQs we
* can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
*/
static void
set_interrupt_apic_ids(void)
@@ -796,6 +833,8 @@
apic_id = cpu_apic_ids[i];
if (apic_id == -1)
continue;
+ if (cpu_info[apic_id].cpu_bsp)
+ continue;
if (cpu_info[apic_id].cpu_disabled)
continue;
@@ -935,10 +974,16 @@
apic_id = cpu_apic_ids[cpu];
/* allocate and set up an idle stack data page */
- bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
- doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
- nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
- dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
+ bootstacks[cpu] = (void *)kmem_malloc(kernel_arena,
+ KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO);
+ doublefault_stack = (char *)kmem_malloc(kernel_arena,
+ PAGE_SIZE, M_WAITOK | M_ZERO);
+ nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+ M_WAITOK | M_ZERO);
+ dbg_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+ M_WAITOK | M_ZERO);
+ dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
+ M_WAITOK | M_ZERO);
bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
bootAP = cpu;
@@ -983,57 +1028,8 @@
/* used as a watchpoint to signal AP startup */
cpus = mp_naps;
- /*
- * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
- * and running the target CPU. OR this INIT IPI might be latched (P5
- * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
- * ignored.
- */
+ ipi_startup(apic_id, vector);
- /* do an INIT IPI: assert RESET */
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
-
- /* wait for pending status end */
- lapic_ipi_wait(-1);
-
- /* do an INIT IPI: deassert RESET */
- lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
-
- /* wait for pending status end */
- DELAY(10000); /* wait ~10mS */
- lapic_ipi_wait(-1);
-
- /*
- * next we do a STARTUP IPI: the previous INIT IPI might still be
- * latched, (P5 bug) this 1st STARTUP would then terminate
- * immediately, and the previously started INIT IPI would continue. OR
- * the previous INIT IPI has already run. and this STARTUP IPI will
- * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
- * will run.
- */
-
- /* do a STARTUP IPI */
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
- vector, apic_id);
- lapic_ipi_wait(-1);
- DELAY(200); /* wait ~200uS */
-
- /*
- * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
- * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
- * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
- * recognized after hardware RESET or INIT IPI.
- */
-
- lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
- APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
- vector, apic_id);
- lapic_ipi_wait(-1);
- DELAY(200); /* wait ~200uS */
-
/* Wait up to 5 seconds for it to start. */
for (ms = 0; ms < 5000; ms++) {
if (mp_naps > cpus)
@@ -1080,6 +1076,69 @@
#endif /* COUNT_XINVLTLB_HITS */
/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
+ /*
+ * This attempts to follow the algorithm described in the
+ * Intel Multiprocessor Specification v1.4 in section B.4.
+ * For each IPI, we allow the local APIC ~20us to deliver the
+ * IPI. If that times out, we panic.
+ */
+
+ /*
+ * first we do an INIT IPI: this INIT IPI might be run, resetting
+ * and running the target CPU. OR this INIT IPI might be latched (P5
+ * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
+ * ignored.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+ lapic_ipi_wait(100);
+
+ /* Explicitly deassert the INIT IPI. */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
+ apic_id);
+
+ DELAY(10000); /* wait ~10mS */
+
+ /*
+ * next we do a STARTUP IPI: the previous INIT IPI might still be
+ * latched, (P5 bug) this 1st STARTUP would then terminate
+ * immediately, and the previously started INIT IPI would continue. OR
+ * the previous INIT IPI has already run. and this STARTUP IPI will
+ * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
+ * will run.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver first STARTUP IPI to APIC %d",
+ apic_id);
+ DELAY(200); /* wait ~200uS */
+
+ /*
+ * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+ * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+ * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+ * recognized after hardware RESET or INIT IPI.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver second STARTUP IPI to APIC %d",
+ apic_id);
+
+ DELAY(200); /* wait ~200uS */
+}
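Note: ignoring the bounded lapic_ipi_wait() spins, the fixed cost of this sequence is DELAY(10000) + 2 * DELAY(200), roughly 10.4 ms per AP; the caller then polls for up to a further five seconds for the AP to increment mp_naps before giving up.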
+
+/*
* Send an IPI to specified CPU handling the bitmap logic.
*/
static void
@@ -1100,7 +1159,7 @@
if (old_pending)
return;
}
- lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+ cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]);
}
/*
@@ -1107,7 +1166,8 @@
* Flush the TLB on all other CPU's
*/
static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
u_int ncpu;
@@ -1117,8 +1177,15 @@
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = pmap->pm_cr3;
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
ipi_all_but_self(vector);
while (smp_tlb_wait < ncpu)
@@ -1127,7 +1194,8 @@
}
static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+ vm_offset_t addr1, vm_offset_t addr2)
{
int cpu, ncpu, othercpus;
@@ -1143,8 +1211,15 @@
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = pmap->pm_cr3;
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
if (CPU_ISFULLSET(&mask)) {
ncpu = othercpus;
@@ -1151,7 +1226,7 @@
ipi_all_but_self(vector);
} else {
ncpu = 0;
- while ((cpu = cpusetobj_ffs(&mask)) != 0) {
+ while ((cpu = CPU_FFS(&mask)) != 0) {
cpu--;
CPU_CLR(cpu, &mask);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
@@ -1170,15 +1245,15 @@
{
if (smp_started)
- smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+ smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
}
void
-smp_invltlb(void)
+smp_invltlb(pmap_t pmap)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+ smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_global++;
#endif
@@ -1186,11 +1261,11 @@
}
void
-smp_invlpg(vm_offset_t addr)
+smp_invlpg(pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+ smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_page++;
#endif
@@ -1198,11 +1273,11 @@
}
void
-smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+ smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_range++;
ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1211,11 +1286,11 @@
}
void
-smp_masked_invltlb(cpuset_t mask)
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_global++;
#endif
@@ -1223,11 +1298,11 @@
}
void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_page++;
#endif
@@ -1235,11 +1310,13 @@
}
void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1,
+ addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_range++;
ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1300,7 +1377,7 @@
if (ipi == IPI_STOP_HARD)
CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);
- while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
+ while ((cpu = CPU_FFS(&cpus)) != 0) {
cpu--;
CPU_CLR(cpu, &cpus);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
@@ -1352,7 +1429,7 @@
CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);
CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
- lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+ cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}
int
@@ -1417,33 +1494,223 @@
{
u_int cpu;
+ mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
cpu = PCPU_GET(cpuid);
-
- if (savectx(susppcbs[cpu])) {
- ctx_fpusave(suspfpusave[cpu]);
+ if (savectx(&susppcbs[cpu]->sp_pcb)) {
+ fpususpend(susppcbs[cpu]->sp_fpususpend);
wbinvd();
- CPU_SET_ATOMIC(cpu, &stopped_cpus);
+ CPU_SET_ATOMIC(cpu, &suspended_cpus);
+ /*
+ * Hack for xen, which does not use resumectx() so never
+ * uses the next clause: set resuming_cpus early so that
+ * resume_cpus() can wait on the same bitmap for acpi and
+ * xen. resuming_cpus now means eventually_resumable_cpus.
+ */
+ CPU_SET_ATOMIC(cpu, &resuming_cpus);
} else {
+ fpuresume(susppcbs[cpu]->sp_fpususpend);
pmap_init_pat();
- load_cr3(susppcbs[cpu]->pcb_cr3);
initializecpu();
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
+
+ /* Indicate that we are resuming */
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
}
- /* Wait for resume */
- while (!CPU_ISSET(cpu, &started_cpus))
+ /* Wait for resume directive */
+ while (!CPU_ISSET(cpu, &toresume_cpus))
ia32_pause();
+ if (cpu_ops.cpu_resume)
+ cpu_ops.cpu_resume();
+ if (vmm_resume_p)
+ vmm_resume_p();
+
/* Resume MCA and local APIC */
mca_resume();
lapic_setup(0);
- CPU_CLR_ATOMIC(cpu, &started_cpus);
- CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+ /* Indicate that we are resumed */
+ CPU_CLR_ATOMIC(cpu, &resuming_cpus);
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+ CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}
/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ invltlb();
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invltlb_pcid_handler(void)
+{
+ uint64_t cr3;
+ u_int cpuid;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ if (smp_tlb_invpcid.pcid != (uint64_t)-1 &&
+ smp_tlb_invpcid.pcid != 0) {
+ if (invpcid_works) {
+ invpcid(&smp_tlb_invpcid, INVPCID_CTX);
+ } else {
+ /* Otherwise reload %cr3 twice. */
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3) {
+ load_cr3(pcid_cr3);
+ cr3 |= CR3_PCID_SAVE;
+ }
+ load_cr3(cr3);
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ if (smp_tlb_pmap != NULL) {
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &smp_tlb_pmap->pm_save);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlpg_handler(void)
+{
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ invlpg(smp_tlb_invpcid.addr);
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlpg_pcid_handler(void)
+{
+ uint64_t cr3;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ if (smp_tlb_invpcid.pcid == (uint64_t)-1) {
+ invltlb_globpcid();
+ } else if (smp_tlb_invpcid.pcid == 0) {
+ invlpg(smp_tlb_invpcid.addr);
+ } else if (invpcid_works) {
+ invpcid(&smp_tlb_invpcid, INVPCID_ADDR);
+ } else {
+ /*
+ * PCID supported, but INVPCID is not.
+ * Temporarily switch to the target address
+ * space and do INVLPG.
+ */
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3)
+ load_cr3(pcid_cr3 | CR3_PCID_SAVE);
+ invlpg(smp_tlb_invpcid.addr);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+static inline void
+invlpg_range(vm_offset_t start, vm_offset_t end)
+{
+
+ do {
+ invlpg(start);
+ start += PAGE_SIZE;
+ } while (start < end);
+}
+
+void
+invlrng_handler(void)
+{
+ struct invpcid_descr d;
+ vm_offset_t addr;
+ uint64_t cr3;
+ u_int cpuid;
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ addr = smp_tlb_invpcid.addr;
+ if (pmap_pcid_enabled) {
+ if (smp_tlb_invpcid.pcid == 0) {
+ /*
+ * kernel pmap - use invlpg to invalidate
+ * global mapping.
+ */
+ invlpg_range(addr, smp_tlb_addr2);
+ } else if (smp_tlb_invpcid.pcid == (uint64_t)-1) {
+ invltlb_globpcid();
+ if (smp_tlb_pmap != NULL) {
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid,
+ &smp_tlb_pmap->pm_save);
+ }
+ } else if (invpcid_works) {
+ d = smp_tlb_invpcid;
+ do {
+ invpcid(&d, INVPCID_ADDR);
+ d.addr += PAGE_SIZE;
+ } while (d.addr <= smp_tlb_addr2);
+ } else {
+ cr3 = rcr3();
+ if (cr3 != pcid_cr3)
+ load_cr3(pcid_cr3 | CR3_PCID_SAVE);
+ invlpg_range(addr, smp_tlb_addr2);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ }
+ } else {
+ invlpg_range(addr, smp_tlb_addr2);
+ }
+
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+void
+invlcache_handler(void)
+{
+#ifdef COUNT_IPIS
+ (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ wbinvd();
+ atomic_add_int(&smp_tlb_wait, 1);
+}
+
+/*
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
Modified: trunk/sys/amd64/amd64/mp_watchdog.c
===================================================================
--- trunk/sys/amd64/amd64/mp_watchdog.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mp_watchdog.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Robert N. M. Watson
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/mp_watchdog.c 314667 2017-03-04 13:03:31Z avg $
*/
#include "opt_mp_watchdog.h"
@@ -86,7 +87,7 @@
watchdog_init(void *arg)
{
- callout_init(&watchdog_callout, CALLOUT_MPSAFE);
+ callout_init(&watchdog_callout, 1);
if (watchdog_cpu != -1)
watchdog_change(watchdog_cpu);
}
Modified: trunk/sys/amd64/amd64/mpboot.S
===================================================================
--- trunk/sys/amd64/amd64/mpboot.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/mpboot.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/mpboot.S 130224 2004-06-08 01:02:52Z peter $
*/
#include <machine/asmacros.h> /* miscellaneous asm macros */
Property changes on: trunk/sys/amd64/amd64/mpboot.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/pmap.c
===================================================================
--- trunk/sys/amd64/amd64/pmap.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/pmap.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
@@ -76,19 +77,14 @@
* SUCH DAMAGE.
*/
+#define AMD64_NPT_AWARE
+
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 322063 2017-08-04 21:38:34Z marius $");
/*
* Manages physical address maps.
*
- * In addition to hardware address maps, this
- * module is called upon to provide software-use-only
- * maps which may or may not be stored in the same
- * form as hardware maps. These pseudo-maps are
- * used to store intermediate results from copy
- * operations to and from address spaces.
- *
* Since the information managed by this module is
* also stored by the logical address mapping module,
* this module may throw away valid virtual-to-physical
@@ -123,11 +119,8 @@
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
-#ifdef SMP
+#include <sys/_unrhdr.h>
#include <sys/smp.h>
-#else
-#include <sys/cpuset.h>
-#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -138,6 +131,8 @@
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>
@@ -152,6 +147,132 @@
#include <machine/smp.h>
#endif
+static __inline boolean_t
+pmap_type_guest(pmap_t pmap)
+{
+
+ return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
+}
+
+static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+ return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline pt_entry_t
+pmap_valid_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_V;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_V;
+ else
+ mask = EPT_PG_READ;
+ break;
+ default:
+ panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_rw_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_RW;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_EMUL_RW;
+ else
+ mask = EPT_PG_WRITE;
+ break;
+ default:
+ panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_global_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ mask = X86_PG_G;
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ mask = 0;
+ break;
+ default:
+ panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_accessed_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_A;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_READ;
+ else
+ mask = EPT_PG_A;
+ break;
+ default:
+ panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline pt_entry_t
+pmap_modified_bit(pmap_t pmap)
+{
+ pt_entry_t mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = X86_PG_M;
+ break;
+ case PT_EPT:
+ if (pmap_emulate_ad_bits(pmap))
+ mask = EPT_PG_WRITE;
+ else
+ mask = EPT_PG_M;
+ break;
+ default:
+ panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
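The helpers added above replace the old compile-time PG_V/PG_RW/PG_G/PG_A/PG_M constants with per-pmap lookups, so the same page-table walking code works for plain x86 tables as well as the RVI and EPT nested formats, which encode the valid/accessed/dirty bits differently or not at all. The calling pattern used throughout the rest of this diff looks like the following sketch (pte_is_writable is a hypothetical helper, not part of the change):

	static boolean_t
	pte_is_writable(pmap_t pmap, pt_entry_t pte)
	{
		pt_entry_t PG_RW, PG_V;

		/* Resolve the bit encodings for this pmap's table format. */
		PG_V = pmap_valid_bit(pmap);
		PG_RW = pmap_rw_bit(pmap);
		return ((pte & (PG_V | PG_RW)) == (PG_V | PG_RW));
	}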
@@ -209,8 +330,12 @@
vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
+int nkpt;
+SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
+ "Number of kernel page table pages allocated on bootup");
+
static int ndmpdp;
-static vm_paddr_t dmaplimit;
+vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;
@@ -234,17 +359,21 @@
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
+static int ndmpdpphys; /* number of DMPDPphys pages */
/*
- * Isolate the global pv list lock from data and other locks to prevent false
- * sharing within the cache.
+ * pmap_mapdev support pre initialization (i.e. console)
*/
-static struct {
- struct rwlock lock;
- char padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
-} pvh_global __aligned(CACHE_LINE_SIZE);
+#define PMAP_PREINIT_MAPPING_COUNT 8
+static struct pmap_preinit_mapping {
+ vm_paddr_t pa;
+ vm_offset_t va;
+ vm_size_t sz;
+ int mode;
+} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
+static int pmap_initialized;
-#define pvh_global_lock pvh_global.lock
+static struct rwlock_padalign pvh_global_lock;
/*
* Data for the pv entry allocation mechanism
@@ -253,6 +382,7 @@
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
+static struct md_page pv_dummy;
/*
* All those kernel PT submaps that BSD is so fond of
@@ -260,6 +390,38 @@
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
+
+static struct unrhdr pcid_unr;
+static struct mtx pcid_mtx;
+int pmap_pcid_enabled = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
+ 0, "Is TLB Context ID enabled ?");
+int invpcid_works = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
+ "Is the invpcid instruction available ?");
+
+static int
+pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
+{
+ int i;
+ uint64_t res;
+
+ res = 0;
+ CPU_FOREACH(i) {
+ res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
+ }
+ return (sysctl_handle_64(oidp, &res, 0, req));
+}
+SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
+ "Count of saved TLB context on switch");
+
+/* pmap_copy_pages() over non-DMAP */
+static struct mtx cpage_lock;
+static vm_offset_t cpage_a;
+static vm_offset_t cpage_b;
+
/*
* Crashdump maps.
*/
@@ -281,7 +443,6 @@
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
vm_offset_t va);
-static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
@@ -294,30 +455,29 @@
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
-static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
-static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
+ pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
-static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
+static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
struct rwlock **lockp);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
vm_prot_t prot);
-static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
+static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- vm_page_t *free, struct rwlock **lockp);
-static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
- vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
- struct rwlock **lockp);
+ struct spglist *free, struct rwlock **lockp);
+static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
+ pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
- vm_page_t *free);
+ struct spglist *free);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
-static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
struct rwlock **lockp);
@@ -327,13 +487,10 @@
struct rwlock **lockp);
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
- vm_page_t *free);
-static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
+ struct spglist *free);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
-CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
-CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
-
/*
* Move the kernel virtual free pointer to the next
* 2MB. This is used to help improve performance
@@ -413,7 +570,9 @@
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
if ((*pml4e & PG_V) == 0)
return (NULL);
@@ -435,7 +594,9 @@
pmap_pde(pmap_t pmap, vm_offset_t va)
{
pdp_entry_t *pdpe;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe == NULL || (*pdpe & PG_V) == 0)
return (NULL);
@@ -457,7 +618,9 @@
pmap_pte(pmap_t pmap, vm_offset_t va)
{
pd_entry_t *pde;
+ pt_entry_t PG_V;
+ PG_V = pmap_valid_bit(pmap);
pde = pmap_pde(pmap, va);
if (pde == NULL || (*pde & PG_V) == 0)
return (NULL);
@@ -479,6 +642,9 @@
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(pmap->pm_stats.resident_count >= count,
+ ("pmap %p resident count underflow %ld %d", pmap,
+ pmap->pm_stats.resident_count, count));
pmap->pm_stats.resident_count -= count;
}
@@ -487,6 +653,8 @@
{
u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
+
return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
@@ -495,6 +663,8 @@
{
u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
+
return (PDmap + ((va >> PDRSHIFT) & mask));
}
@@ -511,21 +681,67 @@
CTASSERT(powerof2(NDMPML4E));
+/* number of kernel PDP slots */
+#define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
+
static void
+nkpt_init(vm_paddr_t addr)
+{
+ int pt_pages;
+
+#ifdef NKPT
+ pt_pages = NKPT;
+#else
+ pt_pages = howmany(addr, 1 << PDRSHIFT);
+ pt_pages += NKPDPE(pt_pages);
+
+ /*
+ * Add some slop beyond the bare minimum required for bootstrapping
+ * the kernel.
+ *
+ * This is quite important when allocating KVA for kernel modules.
+ * The modules are required to be linked in the negative 2GB of
+ * the address space. If we run out of KVA in this region then
+ * pmap_growkernel() will need to allocate page table pages to map
+ * the entire 512GB of KVA space which is an unnecessary tax on
+ * physical memory.
+ *
+ * Secondly, device memory mapped as part of setting up the low-
+ * level console(s) is taken from KVA, starting at virtual_avail.
+ * This is because cninit() is called after pmap_bootstrap() but
+ * before vm_init() and pmap_init(). 20MB for a frame buffer is
+ * not uncommon.
+ */
+ pt_pages += 32; /* 64MB additional slop. */
+#endif
+ nkpt = pt_pages;
+}
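A rough worked example of the sizing done by nkpt_init() above, assuming the usual amd64 constants (a PT page covers 2MB, PDRSHIFT == 21, NPDEPG == 512) and no NKPT override; the 128MB starting figure is only illustrative:

	/*
	 * addr ~= 128MB of early allocations:
	 *   pt_pages  = howmany(addr, 1 << PDRSHIFT)  ->  64 PT pages
	 *   pt_pages += NKPDPE(64) = howmany(64, 512) ->  65
	 *   pt_pages += 32 (64MB slop)                ->  nkpt = 97
	 * i.e. enough page table pages to map roughly 194MB of kernel VA.
	 */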
+
+static void
create_pagetables(vm_paddr_t *firstaddr)
{
- int i, j, ndm1g;
+ int i, j, ndm1g, nkpdpe;
+ pt_entry_t *pt_p;
+ pd_entry_t *pd_p;
+ pdp_entry_t *pdp_p;
+ pml4_entry_t *p4_p;
- /* Allocate pages */
- KPTphys = allocpages(firstaddr, NKPT);
- KPML4phys = allocpages(firstaddr, 1);
- KPDPphys = allocpages(firstaddr, NKPML4E);
- KPDphys = allocpages(firstaddr, NKPDPE);
-
+ /* Allocate page table pages for the direct map */
ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
if (ndmpdp < 4) /* Minimum 4GB of dirmap */
ndmpdp = 4;
- DMPDPphys = allocpages(firstaddr, NDMPML4E);
+ ndmpdpphys = howmany(ndmpdp, NPDPEPG);
+ if (ndmpdpphys > NDMPML4E) {
+ /*
+ * Each NDMPML4E allows 512 GB, so limit to that,
+ * and then readjust ndmpdp and ndmpdpphys.
+ */
+ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
+ Maxmem = atop(NDMPML4E * NBPML4);
+ ndmpdpphys = NDMPML4E;
+ ndmpdp = NDMPML4E * NPDEPG;
+ }
+ DMPDPphys = allocpages(firstaddr, ndmpdpphys);
ndm1g = 0;
if ((amd_feature & AMDID_PAGE1GB) != 0)
ndm1g = ptoa(Maxmem) >> PDPSHIFT;
@@ -533,33 +749,49 @@
DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
+ /* Allocate pages */
+ KPML4phys = allocpages(firstaddr, 1);
+ KPDPphys = allocpages(firstaddr, NKPML4E);
+
+ /*
+ * Allocate the initial number of kernel page table pages required to
+ * bootstrap. We defer this until after all memory-size dependent
+ * allocations are done (e.g. direct map), so that we don't have to
+ * build in too much slop in our estimate.
+ *
+ * Note that when NKPML4E > 1, we have an empty page underneath
+ * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+ * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
+ */
+ nkpt_init(*firstaddr);
+ nkpdpe = NKPDPE(nkpt);
+
+ KPTphys = allocpages(firstaddr, nkpt);
+ KPDphys = allocpages(firstaddr, nkpdpe);
+
/* Fill in the underlying page table pages */
- /* Read-only from zero to physfree */
+ /* Nominally read-only (but really R/W) from zero to physfree */
/* XXX not fully used, underneath 2M pages */
- for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
- ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
- ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
- }
+ pt_p = (pt_entry_t *)KPTphys;
+ for (i = 0; ptoa(i) < *firstaddr; i++)
+ pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
/* Now map the page tables at their location within PTmap */
- for (i = 0; i < NKPT; i++) {
- ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
- }
+ pd_p = (pd_entry_t *)KPDphys;
+ for (i = 0; i < nkpt; i++)
+ pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
/* Map from zero to end of allocations under 2M pages */
/* This replaces some of the KPTphys entries above */
- for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
- ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
- }
+ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+ pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
+ X86_PG_G;
- /* And connect up the PD to the PDP */
- for (i = 0; i < NKPDPE; i++) {
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
- }
+ /* And connect up the PD to the PDP (leaving room for L4 pages) */
+ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+ for (i = 0; i < nkpdpe; i++)
+ pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+ PG_U;
/*
* Now, set up the direct map region using 2MB and/or 1GB pages. If
@@ -569,37 +801,41 @@
* memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
* that are partially used.
*/
+ pd_p = (pd_entry_t *)DMPDphys;
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
- ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+ pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
+ pdp_p = (pdp_entry_t *)DMPDPphys;
for (i = 0; i < ndm1g; i++) {
- ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+ pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
- PG_M | PG_A;
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ X86_PG_M | X86_PG_A;
}
for (j = 0; i < ndmpdp; i++, j++) {
- ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+ pdp_p[i] = DMPDphys + ptoa(j);
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
}
/* And recursively map PML4 to itself in order to get PTmap */
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+ p4_p = (pml4_entry_t *)KPML4phys;
+ p4_p[PML4PML4I] = KPML4phys;
+ p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
/* Connect the Direct Map slot(s) up to the PML4. */
- for (i = 0; i < NDMPML4E; i++) {
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+ for (i = 0; i < ndmpdpphys; i++) {
+ p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+ p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
}
- /* Connect the KVA slot up to the PML4 */
- ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
- ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+ /* Connect the KVA slots up to the PML4 */
+ for (i = 0; i < NKPML4E; i++) {
+ p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+ p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
+ }
}
/*
@@ -616,7 +852,7 @@
pmap_bootstrap(vm_paddr_t *firstaddr)
{
vm_offset_t va;
- pt_entry_t *pte, *unused;
+ pt_entry_t *pte;
/*
* Create an initial set of page tables to run the kernel in.
@@ -623,6 +859,15 @@
*/
create_pagetables(firstaddr);
+ /*
+ * Add a physical memory segment (vm_phys_seg) corresponding to the
+ * preallocated kernel page table pages so that vm_page structures
+ * representing these pages will be created. The vm_page structures
+ * are required for promotion of the corresponding kernel virtual
+ * addresses to superpage mappings.
+ */
+ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
+
virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
virtual_avail = pmap_kmem_choose(virtual_avail);
@@ -640,9 +885,11 @@
*/
PMAP_LOCK_INIT(kernel_pmap);
kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
- kernel_pmap->pm_root = NULL;
+ kernel_pmap->pm_cr3 = KPML4phys;
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
+ CPU_FILL(&kernel_pmap->pm_save); /* always superset of pm_active */
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+ kernel_pmap->pm_flags = pmap_flags;
/*
* Initialize the global pv list lock.
@@ -660,19 +907,37 @@
pte = vtopte(va);
/*
- * CMAP1 is only used for the memory test.
+ * Crashdump maps. The first page is reused as CMAP1 for the
+ * memory test.
*/
- SYSMAP(caddr_t, CMAP1, CADDR1, 1)
+ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
+ CADDR1 = crashdumpmap;
+ virtual_avail = va;
+
/*
- * Crashdump maps.
+ * Initialize the PAT MSR.
+ * pmap_init_pat() clears and sets CR4_PGE, which, as a
+ * side-effect, invalidates stale PG_G TLB entries that might
+ * have been created in our pre-boot environment.
*/
- SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
+ pmap_init_pat();
- virtual_avail = va;
-
- /* Initialize the PAT MSR. */
- pmap_init_pat();
+ /* Initialize TLB Context Id. */
+ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
+ if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
+ load_cr4(rcr4() | CR4_PCIDE);
+ mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
+ init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
+ /* Check for INVPCID support */
+ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
+ != 0;
+ kernel_pmap->pm_pcid = 0;
+#ifndef SMP
+ pmap_pcid_enabled = 0;
+#endif
+ } else
+ pmap_pcid_enabled = 0;
}
/*
@@ -776,6 +1041,7 @@
void
pmap_init(void)
{
+ struct pmap_preinit_mapping *ppim;
vm_page_t mpte;
vm_size_t s;
int i, pv_npg;
@@ -784,7 +1050,7 @@
* Initialize the vm page array entries for the kernel pmap's
* page table pages.
*/
- for (i = 0; i < NKPT; i++) {
+ for (i = 0; i < nkpt; i++) {
mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
KASSERT(mpte >= vm_page_array &&
mpte < &vm_page_array[vm_page_array_size],
@@ -794,12 +1060,18 @@
}
/*
- * If the kernel is running in a virtual machine on an AMD Family 10h
- * processor, then it must assume that MCA is enabled by the virtual
- * machine monitor.
+ * If the kernel is running on a virtual machine, then it must assume
+ * that MCA is enabled by the hypervisor. Moreover, the kernel must
+ * be prepared for the hypervisor changing the vendor and family that
+ * are reported by CPUID. Consequently, the workaround for AMD Family
+ * 10h Erratum 383 is enabled if the processor's feature set does not
+ * include at least one feature that is only supported by older Intel
+ * or newer AMD processors.
*/
- if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD &&
- CPUID_TO_FAMILY(cpu_id) == 0x10)
+ if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
+ (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
+ CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
+ AMDID2_FMA4)) == 0)
workaround_erratum383 = 1;
/*
@@ -826,8 +1098,7 @@
/*
* Calculate the size of the pv head table for superpages.
*/
- for (i = 0; phys_avail[i + 1]; i += 2);
- pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;
+ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
/*
* Allocate memory for the pv head table for superpages.
@@ -834,9 +1105,31 @@
*/
s = (vm_size_t)(pv_npg * sizeof(struct md_page));
s = round_page(s);
- pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
+ pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
+ M_WAITOK | M_ZERO);
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
+ TAILQ_INIT(&pv_dummy.pv_list);
+
+ mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
+ cpage_a = kva_alloc(PAGE_SIZE);
+ cpage_b = kva_alloc(PAGE_SIZE);
+
+ pmap_initialized = 1;
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == 0)
+ continue;
+ /* Make the direct map consistent */
+ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
+ (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
+ ppim->sz, ppim->mode);
+ }
+ if (!bootverbose)
+ continue;
+ printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
+ ppim->pa, ppim->va, ppim->sz, ppim->mode);
+ }
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -869,12 +1162,41 @@
* Low level helper routines.....
***************************************************/
+static pt_entry_t
+pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
+{
+ int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ /* Verify that both PAT bits are not set at the same time */
+ KASSERT((entry & x86_pat_bits) != x86_pat_bits,
+ ("Invalid PAT bits in entry %#lx", entry));
+
+ /* Swap the PAT bits if one of them is set */
+ if ((entry & x86_pat_bits) != 0)
+ entry ^= x86_pat_bits;
+ break;
+ case PT_EPT:
+ /*
+ * Nothing to do - the memory attributes are represented
+ * the same way for regular pages and superpages.
+ */
+ break;
+ default:
+ panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
+ }
+
+ return (entry);
+}
+
/*
* Determine the appropriate bits to set in a PTE or PDE for a specified
* caching mode.
*/
static int
-pmap_cache_bits(int mode, boolean_t is_pde)
+pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
{
int cache_bits, pat_flag, pat_idx;
@@ -881,23 +1203,94 @@
if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
panic("Unknown caching mode %d\n", mode);
- /* The PAT bit is different for PTE's and PDE's. */
- pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ /* The PAT bit is different for PTE's and PDE's. */
+ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
- /* Map the caching mode to a PAT index. */
- pat_idx = pat_index[mode];
+ /* Map the caching mode to a PAT index. */
+ pat_idx = pat_index[mode];
- /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
- cache_bits = 0;
- if (pat_idx & 0x4)
- cache_bits |= pat_flag;
- if (pat_idx & 0x2)
- cache_bits |= PG_NC_PCD;
- if (pat_idx & 0x1)
- cache_bits |= PG_NC_PWT;
+ /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+ cache_bits = 0;
+ if (pat_idx & 0x4)
+ cache_bits |= pat_flag;
+ if (pat_idx & 0x2)
+ cache_bits |= PG_NC_PCD;
+ if (pat_idx & 0x1)
+ cache_bits |= PG_NC_PWT;
+ break;
+
+ case PT_EPT:
+ cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
+ break;
+
+ default:
+ panic("unsupported pmap type %d", pmap->pm_type);
+ }
+
return (cache_bits);
}
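As a worked example of the PT_X86/PT_RVI branch above (the concrete pat_index[] contents are established at boot by pmap_init_pat(), so the index values here are only illustrative):

	/*
	 *   pat_idx = 3 (0b011) -> cache_bits = PG_NC_PCD | PG_NC_PWT
	 *   pat_idx = 4 (0b100) -> cache_bits = X86_PG_PTE_PAT, or
	 *                          X86_PG_PDE_PAT when is_pde is TRUE
	 */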
+static int
+pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
+{
+ int mask;
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ case PT_RVI:
+ mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
+ break;
+ case PT_EPT:
+ mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
+ break;
+ default:
+ panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
+ }
+
+ return (mask);
+}
+
+static __inline boolean_t
+pmap_ps_enabled(pmap_t pmap)
+{
+
+ return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
+}
+
+static void
+pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
+{
+
+ switch (pmap->pm_type) {
+ case PT_X86:
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ /*
+ * XXX
+ * This is a little bogus since the generation number is
+ * supposed to be bumped up when a region of the address
+ * space is invalidated in the page tables.
+ *
+ * In this case the old PDE entry is valid but yet we want
+ * to make sure that any mappings using the old entry are
+ * invalidated in the TLB.
+ *
+ * The reason this works as expected is because we rendezvous
+ * "all" host cpus and force any vcpu context to exit as a
+ * side-effect.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+ break;
+ default:
+ panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
+ }
+ pde_store(pde, newpde);
+}
+
/*
* After changing the page size for the specified virtual address in the page
* table, flush the corresponding entries from the processor's TLB. Only the
@@ -906,10 +1299,18 @@
* The calling thread must be pinned to a processor.
*/
static void
-pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
+pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
{
- u_long cr4;
+ pt_entry_t PG_G;
+ if (pmap_type_guest(pmap))
+ return;
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
+
+ PG_G = pmap_global_bit(pmap);
+
if ((newpde & PG_PS) == 0)
/* Demotion: flush a specific 2MB page mapping. */
invlpg(va);
@@ -924,19 +1325,33 @@
* Promotion: flush every 4KB page mapping from the TLB,
* including any global (PG_G) mappings.
*/
- cr4 = rcr4();
- load_cr4(cr4 & ~CR4_PGE);
- /*
- * Although preemption at this point could be detrimental to
- * performance, it would not lead to an error. PG_G is simply
- * ignored if CR4.PGE is clear. Moreover, in case this block
- * is re-entered, the load_cr4() either above or below will
- * modify CR4.PGE flushing the TLB.
- */
- load_cr4(cr4 | CR4_PGE);
+ invltlb_globpcid();
}
}
#ifdef SMP
+
+static void
+pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = va;
+ invpcid(&d, INVPCID_ADDR);
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
+ invlpg(va);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
/*
* For SMP, these functions have to use the IPI mechanism for coherence.
*
@@ -956,6 +1371,46 @@
* immutable. The kernel page table is always active on every
* processor.
*/
+
+/*
+ * Interrupt the cpus that are executing in the guest context.
+ * This will force the vcpu to exit and the cached EPT mappings
+ * will be invalidated by the host before the next vmresume.
+ */
+static __inline void
+pmap_invalidate_ept(pmap_t pmap)
+{
+ int ipinum;
+
+ sched_pin();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("pmap_invalidate_ept: absurd pm_active"));
+
+ /*
+ * The TLB mappings associated with a vcpu context are not
+ * flushed each time a different vcpu is chosen to execute.
+ *
+ * This is in contrast with a process's vtop mappings that
+ * are flushed from the TLB on each context switch.
+ *
+ * Therefore we need to do more than just a TLB shootdown on
+ * the active cpus in 'pmap->pm_active'. To do this we keep
+ * track of the number of invalidations performed on this pmap.
+ *
+ * Each vcpu keeps a cache of this counter and compares it
+ * just before a vmresume. If the counter is out-of-date an
+ * invept will be done to flush stale mappings from the TLB.
+ */
+ atomic_add_acq_long(&pmap->pm_eptgen, 1);
+
+ /*
+ * Force the vcpu to exit and trap back into the hypervisor.
+ */
+ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
+ ipi_selected(pmap->pm_active, ipinum);
+ sched_unpin();
+}
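The comment above describes the consumer side of pm_eptgen without showing it; a hedged sketch of that check, as a vcpu might perform it just before vmresume (vcpu_eptgen and ept_flush_guest_mappings() are hypothetical names, and the real hypervisor code is not part of this diff):

	/* Hypothetical vcpu-side check; names are illustrative only. */
	long gen;

	gen = atomic_load_acq_long(&pmap->pm_eptgen);
	if (vcpu_eptgen != gen) {
		ept_flush_guest_mappings();	/* e.g. an INVEPT of this context */
		vcpu_eptgen = gen;
	}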
+
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
@@ -962,10 +1417,29 @@
cpuset_t other_cpus;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invlpg(va);
- smp_invlpg(va);
+ if (!pmap_pcid_enabled) {
+ invlpg(va);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap))
+ invlpg(va);
+ else
+ pmap_invalidate_page_pcid(pmap, va);
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg(pmap, va);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
@@ -972,13 +1446,48 @@
CPU_CLR(cpuid, &other_cpus);
if (CPU_ISSET(cpuid, &pmap->pm_active))
invlpg(va);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_page_pcid(pmap, va);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg(other_cpus, va);
+ smp_masked_invlpg(other_cpus, pmap, va);
}
sched_unpin();
}
+static void
+pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+ vm_offset_t addr;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ for (addr = sva; addr < eva; addr += PAGE_SIZE) {
+ d.addr = addr;
+ invpcid(&d, INVPCID_ADDR);
+ }
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE);
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
@@ -986,21 +1495,53 @@
vm_offset_t addr;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- smp_invlpg_range(sva, eva);
+ if (!pmap_pcid_enabled) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap)) {
+ for (addr = sva; addr < eva;
+ addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ pmap_invalidate_range_pcid(pmap,
+ sva, eva);
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg_range(pmap, sva, eva);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+ if (CPU_ISSET(cpuid, &pmap->pm_active)) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ } else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_range_pcid(pmap, sva, eva);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg_range(other_cpus, sva, eva);
+ smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
}
sched_unpin();
}
@@ -1009,21 +1550,72 @@
pmap_invalidate_all(pmap_t pmap)
{
cpuset_t other_cpus;
+ struct invpcid_descr d;
+ uint64_t cr3;
u_int cpuid;
+ if (pmap_type_guest(pmap)) {
+ pmap_invalidate_ept(pmap);
+ return;
+ }
+
+ KASSERT(pmap->pm_type == PT_X86,
+ ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+
sched_pin();
- if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invltlb();
- smp_invltlb();
+ cpuid = PCPU_GET(cpuid);
+ if (pmap == kernel_pmap ||
+ (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
+ !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ if (invpcid_works) {
+ bzero(&d, sizeof(d));
+ invpcid(&d, INVPCID_CTXGLOB);
+ } else {
+ invltlb_globpcid();
+ }
+ if (!CPU_ISSET(cpuid, &pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ smp_invltlb(pmap);
} else {
- cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+
+ /*
+ * This logic is duplicated in the Xinvltlb shootdown
+ * IPI handler.
+ */
+ if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = 0;
+ invpcid(&d, INVPCID_CTX);
+ } else {
+ cr3 = rcr3();
+ critical_enter();
+
+ /*
+ * Bit 63 is clear, pcid TLB
+ * entries are invalidated.
+ */
+ load_cr3(pmap->pm_cr3);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ } else if (CPU_ISSET(cpuid, &pmap->pm_active))
invltlb();
- CPU_AND(&other_cpus, &pmap->pm_active);
+ if (!CPU_ISSET(cpuid, &pmap->pm_active))
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invltlb(other_cpus);
+ smp_masked_invltlb(other_cpus, pmap);
}
sched_unpin();
}
@@ -1040,6 +1632,7 @@
struct pde_action {
cpuset_t invalidate; /* processors that invalidate their TLB */
+ pmap_t pmap;
vm_offset_t va;
pd_entry_t *pde;
pd_entry_t newpde;
@@ -1052,7 +1645,7 @@
struct pde_action *act = arg;
if (act->store == PCPU_GET(cpuid))
- pde_store(act->pde, act->newpde);
+ pmap_update_pde_store(act->pmap, act->pde, act->newpde);
}
static void
@@ -1061,7 +1654,7 @@
struct pde_action *act = arg;
if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
- pmap_update_pde_invalidate(act->va, act->newpde);
+ pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
}
/*
@@ -1083,14 +1676,17 @@
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap || pmap_type_guest(pmap))
active = all_cpus;
- else
+ else {
active = pmap->pm_active;
+ CPU_AND_ATOMIC(&pmap->pm_save, &active);
+ }
if (CPU_OVERLAP(&active, &other_cpus)) {
act.store = cpuid;
act.invalidate = active;
act.va = va;
+ act.pmap = pmap;
act.pde = pde;
act.newpde = newpde;
CPU_SET(cpuid, &active);
@@ -1098,9 +1694,9 @@
smp_no_rendevous_barrier, pmap_update_pde_action,
pmap_update_pde_teardown, &act);
} else {
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (CPU_ISSET(cpuid, &active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
}
sched_unpin();
}
@@ -1113,8 +1709,18 @@
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invlpg(va);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invlpg(va);
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1122,9 +1728,19 @@
{
vm_offset_t addr;
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1131,8 +1747,18 @@
pmap_invalidate_all(pmap_t pmap)
{
- if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- invltlb();
+ switch (pmap->pm_type) {
+ case PT_X86:
+ if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+ invltlb();
+ break;
+ case PT_RVI:
+ case PT_EPT:
+ pmap->pm_eptgen++;
+ break;
+ default:
+ panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
+ }
}
PMAP_INLINE void
@@ -1146,28 +1772,54 @@
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
- pde_store(pde, newpde);
+ pmap_update_pde_store(pmap, pde, newpde);
if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
- pmap_update_pde_invalidate(va, newpde);
+ pmap_update_pde_invalidate(pmap, va, newpde);
+ else
+ CPU_ZERO(&pmap->pm_save);
}
#endif /* !SMP */
+static void
+pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
+{
+
+ /*
+ * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
+ * by a promotion that did not invalidate the 512 4KB page mappings
+ * that might exist in the TLB. Consequently, at this point, the TLB
+ * may hold both 4KB and 2MB page mappings for the address range [va,
+ * va + NBPDR). Therefore, the entire range must be invalidated here.
+ * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
+ * 4KB page mappings for the address range [va, va + NBPDR), and so a
+ * single INVLPG suffices to invalidate the 2MB page mapping from the
+ * TLB.
+ */
+ if ((pde & PG_PROMOTED) != 0)
+ pmap_invalidate_range(pmap, va, va + NBPDR - 1);
+ else
+ pmap_invalidate_page(pmap, va);
+}
+
#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
void
-pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
{
- KASSERT((sva & PAGE_MASK) == 0,
- ("pmap_invalidate_cache_range: sva not page-aligned"));
- KASSERT((eva & PAGE_MASK) == 0,
- ("pmap_invalidate_cache_range: eva not page-aligned"));
+ if (force) {
+ sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
+ } else {
+ KASSERT((sva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: sva not page-aligned"));
+ KASSERT((eva & PAGE_MASK) == 0,
+ ("pmap_invalidate_cache_range: eva not page-aligned"));
+ }
- if (cpu_feature & CPUID_SS)
- ; /* If "Self Snoop" is supported, do nothing. */
- else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+ if ((cpu_feature & CPUID_SS) != 0 && !force)
+ ; /* If "Self Snoop" is supported and allowed, do nothing. */
+ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
eva - sva < PMAP_CLFLUSH_THRESHOLD) {
-
/*
* XXX: Some CPUs fault, hang, or trash the local APIC
* registers if we use CLFLUSH on the local APIC
@@ -1178,16 +1830,29 @@
return;
/*
- * Otherwise, do per-cache line flush. Use the mfence
+ * Otherwise, do per-cache line flush. Use the sfence
* instruction to insure that previous stores are
* included in the write-back. The processor
* propagates flush to other processors in the cache
* coherence domain.
*/
- mfence();
+ sfence();
for (; sva < eva; sva += cpu_clflush_line_size)
+ clflushopt(sva);
+ sfence();
+ } else if ((cpu_feature & CPUID_CLFSH) != 0 &&
+ eva - sva < PMAP_CLFLUSH_THRESHOLD) {
+ if (pmap_kextract(sva) == lapic_paddr)
+ return;
+ /*
+ * Writes are ordered by CLFLUSH on Intel CPUs.
+ */
+ if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
+ for (; sva < eva; sva += cpu_clflush_line_size)
clflush(sva);
- mfence();
+ if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
} else {
/*
@@ -1211,33 +1876,35 @@
{
vm_offset_t daddr, eva;
int i;
+ bool useclflushopt;
+ useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
- (cpu_feature & CPUID_CLFSH) == 0)
+ ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
pmap_invalidate_cache();
else {
- mfence();
+ if (useclflushopt)
+ sfence();
+ else if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
for (i = 0; i < count; i++) {
daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
eva = daddr + PAGE_SIZE;
- for (; daddr < eva; daddr += cpu_clflush_line_size)
- clflush(daddr);
+ for (; daddr < eva; daddr += cpu_clflush_line_size) {
+ if (useclflushopt)
+ clflushopt(daddr);
+ else
+ clflush(daddr);
+ }
}
- mfence();
+ if (useclflushopt)
+ sfence();
+ else if (cpu_vendor_id != CPU_VENDOR_INTEL)
+ mfence();
}
}
/*
- * Are we current address space or kernel?
- */
-static __inline int
-pmap_is_current(pmap_t pmap)
-{
- return (pmap == kernel_pmap ||
- (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
-}
-
-/*
* Routine: pmap_extract
* Function:
* Extract the physical page address associated
@@ -1248,10 +1915,11 @@
{
pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
pa = 0;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1286,12 +1954,14 @@
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
pd_entry_t pde, *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_RW, PG_V;
vm_paddr_t pa;
vm_page_t m;
pa = 0;
m = NULL;
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, va);
@@ -1364,7 +2034,7 @@
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
}
static __inline void
@@ -1371,9 +2041,11 @@
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
pt_entry_t *pte;
+ int cache_bits;
pte = vtopte(va);
- pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
+ cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
}
/*
@@ -1422,6 +2094,7 @@
{
pt_entry_t *endpte, oldpte, pa, *pte;
vm_page_t m;
+ int cache_bits;
oldpte = 0;
pte = vtopte(sva);
@@ -1428,14 +2101,15 @@
endpte = pte + count;
while (pte < endpte) {
m = *ma++;
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
- if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
+ cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | cache_bits;
+ if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
oldpte |= *pte;
- pte_store(pte, pa | PG_G | PG_RW | PG_V);
+ pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
}
pte++;
}
- if (__predict_false((oldpte & PG_V) != 0))
+ if (__predict_false((oldpte & X86_PG_V) != 0))
pmap_invalidate_range(kernel_pmap, sva, sva + count *
PAGE_SIZE);
}
@@ -1463,13 +2137,12 @@
* Page table page management routines.....
***************************************************/
static __inline void
-pmap_free_zero_pages(vm_page_t free)
+pmap_free_zero_pages(struct spglist *free)
{
vm_page_t m;
- while (free != NULL) {
- m = free;
- free = m->right;
+ while ((m = SLIST_FIRST(free)) != NULL) {
+ SLIST_REMOVE_HEAD(free, plinks.s.ss);
/* Preserve the page's PG_ZERO setting. */
vm_page_free_toq(m);
}
@@ -1481,7 +2154,8 @@
* physical memory manager after the TLB has been updated.
*/
static __inline void
-pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
+pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
+ boolean_t set_PG_ZERO)
{
if (set_PG_ZERO)
@@ -1488,8 +2162,7 @@
m->flags |= PG_ZERO;
else
m->flags &= ~PG_ZERO;
- m->right = *free;
- *free = m;
+ SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}
/*
@@ -1498,31 +2171,12 @@
* for mapping a distinct range of virtual addresses. The pmap's collection is
* ordered by this virtual address range.
*/
-static void
+static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
- vm_page_t root;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- root = pmap->pm_root;
- if (root == NULL) {
- mpte->left = NULL;
- mpte->right = NULL;
- } else {
- root = vm_page_splay(mpte->pindex, root);
- if (mpte->pindex < root->pindex) {
- mpte->left = root->left;
- mpte->right = root;
- root->left = NULL;
- } else if (mpte->pindex == root->pindex)
- panic("pmap_insert_pt_page: pindex already inserted");
- else {
- mpte->right = root->right;
- mpte->left = root;
- root->right = NULL;
- }
- }
- pmap->pm_root = mpte;
+ return (vm_radix_insert(&pmap->pm_root, mpte));
}
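Note that pmap_insert_pt_page() now returns the vm_radix_insert() status instead of being void (matching the prototype change earlier in this diff), so a failed insert into the radix trie is reported to the caller rather than hidden inside the helper. A hypothetical caller-side sketch, not code from this diff:

	if (pmap_insert_pt_page(pmap, mpte) != 0) {
		/* Radix insert failed: undo the preceding set-up and bail. */
	}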
/*
@@ -1530,19 +2184,12 @@
* specified pmap's collection of idle page table pages. Returns NULL if there
* is no page table page corresponding to the specified virtual address.
*/
-static vm_page_t
+static __inline vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{
- vm_page_t mpte;
- vm_pindex_t pindex = pmap_pde_pindex(va);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
- mpte = vm_page_splay(pindex, mpte);
- if ((pmap->pm_root = mpte)->pindex != pindex)
- mpte = NULL;
- }
- return (mpte);
+ return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va)));
}
/*
@@ -1550,25 +2197,12 @@
* of idle page table pages. The specified page table page must be a member of
* the pmap's collection.
*/
-static void
+static __inline void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{
- vm_page_t root;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- if (mpte != pmap->pm_root) {
- root = vm_page_splay(mpte->pindex, pmap->pm_root);
- KASSERT(mpte == root,
- ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
- mpte, pmap));
- }
- if (mpte->left == NULL)
- root = mpte->right;
- else {
- root = vm_page_splay(mpte->pindex, mpte->left);
- root->right = mpte->right;
- }
- pmap->pm_root = root;
+ vm_radix_remove(&pmap->pm_root, mpte->pindex);
}
/*
@@ -1578,7 +2212,7 @@
* page table page was unmapped and FALSE otherwise.
*/
static inline boolean_t
-pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
--m->wire_count;
@@ -1590,7 +2224,7 @@
}
static void
-_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -1648,7 +2282,8 @@
* conditionally free the page, and manage the hold/wire counts.
*/
static int
-pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+ struct spglist *free)
{
vm_page_t mpte;
@@ -1665,11 +2300,15 @@
PMAP_LOCK_INIT(pmap);
pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
- pmap->pm_root = NULL;
+ pmap->pm_cr3 = KPML4phys;
+ pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
+ CPU_ZERO(&pmap->pm_save);
PCPU_SET(curpmap, pmap);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+ pmap->pm_flags = pmap_flags;
}
/*
@@ -1677,13 +2316,12 @@
* such as one in a vmspace structure.
*/
int
-pmap_pinit(pmap_t pmap)
+pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
vm_page_t pml4pg;
+ vm_paddr_t pml4phys;
int i;
- PMAP_LOCK_INIT(pmap);
-
/*
* allocate the page directory page
*/
@@ -1691,29 +2329,61 @@
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
VM_WAIT;
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ pml4phys = VM_PAGE_TO_PHYS(pml4pg);
+ pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmap->pm_pcid = -1;
+ pmap->pm_cr3 = ~0; /* initialize to an invalid value */
if ((pml4pg->flags & PG_ZERO) == 0)
pagezero(pmap->pm_pml4);
- /* Wire in kernel global address entries. */
- pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
- for (i = 0; i < NDMPML4E; i++) {
- pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
- PG_RW | PG_V | PG_U;
+ /*
+ * Do not install the host kernel mappings in the nested page
+ * tables. These mappings are meaningless in the guest physical
+ * address space.
+ */
+ if ((pmap->pm_type = pm_type) == PT_X86) {
+ pmap->pm_cr3 = pml4phys;
+
+ /* Wire in kernel global address entries. */
+ for (i = 0; i < NKPML4E; i++) {
+ pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+ for (i = 0; i < ndmpdpphys; i++) {
+ pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | PG_U;
+ }
+
+ /* install self-referential address mapping entry(s) */
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
+
+ if (pmap_pcid_enabled) {
+ pmap->pm_pcid = alloc_unr(&pcid_unr);
+ if (pmap->pm_pcid != -1)
+ pmap->pm_cr3 |= pmap->pm_pcid;
+ }
}
- /* install self-referential address mapping entry(s) */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
-
- pmap->pm_root = NULL;
+ pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_flags = flags;
+ pmap->pm_eptgen = 0;
+ CPU_ZERO(&pmap->pm_save);
return (1);
}
+int
+pmap_pinit(pmap_t pmap)
+{
+
+ return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
+}
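The wrapper above preserves the historical pmap_pinit() behaviour (ordinary PT_X86 tables), while the new pmap_pinit_type() lets a hypervisor request a nested format. A hedged sketch of such a request (the flag combination a real consumer passes is not shown in this diff; PMAP_EMULATE_AD_BITS is the flag tested by pmap_emulate_ad_bits() earlier):

	/* Hypothetical: create a pmap backed by EPT-format page tables. */
	if (pmap_pinit_type(pmap, PT_EPT, PMAP_EMULATE_AD_BITS) == 0)
		return (ENOMEM);	/* pmap_pinit_type() returns 1 on success */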
+
/*
* This routine is called if the desired page table page does not exist.
*
@@ -1729,9 +2399,15 @@
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
vm_page_t m, pdppg, pdpg;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
/*
* Allocate a page table page.
*/
@@ -1859,9 +2535,11 @@
pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t pdpindex, ptepindex;
- pdp_entry_t *pdpe;
+ pdp_entry_t *pdpe, PG_V;
vm_page_t pdpg;
+ PG_V = pmap_valid_bit(pmap);
+
retry:
pdpe = pmap_pdpe(pmap, va);
if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1883,9 +2561,11 @@
pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
vm_pindex_t ptepindex;
- pd_entry_t *pd;
+ pd_entry_t *pd, PG_V;
vm_page_t m;
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Calculate pagetable page index
*/
@@ -1948,13 +2628,22 @@
KASSERT(pmap->pm_stats.resident_count == 0,
("pmap_release: pmap resident count %ld != 0",
pmap->pm_stats.resident_count));
- KASSERT(pmap->pm_root == NULL,
+ KASSERT(vm_radix_is_empty(&pmap->pm_root),
("pmap_release: pmap has reserved page table page(s)"));
- m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
+ if (pmap_pcid_enabled) {
+ /*
+ * Invalidate any left TLB entries, to allow the reuse
+ * of the pcid.
+ */
+ pmap_invalidate_all(pmap);
+ }
- pmap->pm_pml4[KPML4I] = 0; /* KVA */
- for (i = 0; i < NDMPML4E; i++) /* Direct Map */
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
+
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pml4[KPML4BASE + i] = 0;
+ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
pmap->pm_pml4[DMPML4I + i] = 0;
pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
@@ -1961,7 +2650,8 @@
m->wire_count--;
atomic_subtract_int(&cnt.v_wire_count, 1);
vm_page_free_zero(m);
- PMAP_LOCK_DESTROY(pmap);
+ if (pmap->pm_pcid != -1)
+ free_unr(&pcid_unr, pmap->pm_pcid);
}
static int
@@ -2003,7 +2693,7 @@
* "kernel_vm_end" and the kernel page table as they were.
*
* The correctness of this action is based on the following
- * argument: vm_map_findspace() allocates contiguous ranges of the
+ * argument: vm_map_insert() allocates contiguous ranges of the
* kernel virtual address space. It calls this function if a range
* ends after "kernel_vm_end". If the kernel is mapped between
* "kernel_vm_end" and "addr", then the range cannot begin at
@@ -2012,7 +2702,7 @@
* any new kernel page table pages between "kernel_vm_end" and
* "KERNBASE".
*/
- if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
+ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
return;
addr = roundup2(addr, NBPDR);
@@ -2020,7 +2710,7 @@
addr = kernel_map->max_offset;
while (kernel_vm_end < addr) {
pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
- if ((*pdpe & PG_V) == 0) {
+ if ((*pdpe & X86_PG_V) == 0) {
/* We need a new PDP entry */
nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
@@ -2030,12 +2720,12 @@
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- *pdpe = (pdp_entry_t)
- (paddr | PG_V | PG_RW | PG_A | PG_M);
+ *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M);
continue; /* try again */
}
pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
- if ((*pde & PG_V) != 0) {
+ if ((*pde & X86_PG_V) != 0) {
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
if (kernel_vm_end - 1 >= kernel_map->max_offset) {
kernel_vm_end = kernel_map->max_offset;
@@ -2052,7 +2742,7 @@
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(nkpg);
paddr = VM_PAGE_TO_PHYS(nkpg);
- newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
+ newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
pde_store(pde, newpdir);
kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
@@ -2132,20 +2822,24 @@
pd_entry_t *pde;
pmap_t pmap;
pt_entry_t *pte, tpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
pv_entry_t pv;
vm_offset_t va;
- vm_page_t free, m, m_pc;
+ vm_page_t m, m_pc;
+ struct spglist free;
uint64_t inuse;
int bit, field, freed;
-
+
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
pmap = NULL;
- free = m_pc = NULL;
+ m_pc = NULL;
+ PG_G = PG_A = PG_M = PG_RW = 0;
+ SLIST_INIT(&free);
TAILQ_INIT(&new_tail);
mtx_lock(&pv_chunks_mutex);
- while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
+ while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
mtx_unlock(&pv_chunks_mutex);
if (pmap != pc->pc_pmap) {
@@ -2166,6 +2860,10 @@
mtx_lock(&pv_chunks_mutex);
continue;
}
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
}
/*
@@ -2193,7 +2891,8 @@
if ((tpte & PG_A) != 0)
vm_page_aflag_set(m, PGA_REFERENCED);
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if (TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2203,7 +2902,7 @@
}
}
pc->pc_map[field] |= 1UL << bit;
- pmap_unuse_pt(pmap, va, *pde, &free);
+ pmap_unuse_pt(pmap, va, *pde, &free);
freed++;
}
}
@@ -2243,14 +2942,14 @@
if (pmap != locked_pmap)
PMAP_UNLOCK(pmap);
}
- if (m_pc == NULL && free != NULL) {
- m_pc = free;
- free = m_pc->right;
+ if (m_pc == NULL && !SLIST_EMPTY(&free)) {
+ m_pc = SLIST_FIRST(&free);
+ SLIST_REMOVE_HEAD(&free, plinks.s.ss);
/* Recycle a freed page table page. */
m_pc->wire_count = 1;
atomic_add_int(&cnt.v_wire_count, 1);
}
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
return (m_pc);
}
@@ -2477,9 +3176,10 @@
pv_entry_t pv;
rw_assert(&pvh_global_lock, RA_LOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
break;
}
}
@@ -2518,7 +3218,8 @@
pv = pmap_pvh_remove(pvh, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
m = PHYS_TO_VM_PAGE(pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
/* Instantiate the remaining NPTEPG - 1 pv entries. */
PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
va_last = va + NBPDR - PAGE_SIZE;
@@ -2536,7 +3237,8 @@
m++;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_pv_demote_pde: page %p is not managed", m));
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if (va == va_last)
goto out;
}
@@ -2584,7 +3286,8 @@
pv = pmap_pvh_remove(&m->md, pmap, va);
KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
/* Free the remaining NPTEPG - 1 pv entries. */
va_last = va + NBPDR - PAGE_SIZE;
do {
@@ -2625,7 +3328,8 @@
if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
return (TRUE);
} else
return (FALSE);
@@ -2649,7 +3353,8 @@
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
pvh = pa_to_pvh(pa);
- TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
return (TRUE);
} else
return (FALSE);
@@ -2692,15 +3397,26 @@
{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
+ pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
vm_paddr_t mptepa;
- vm_page_t free, mpte;
+ vm_page_t mpte;
+ struct spglist free;
+ vm_offset_t sva;
+ int PG_PTE_CACHE;
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpde = *pde;
KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
- mpte = pmap_lookup_pt_page(pmap, va);
- if (mpte != NULL)
+ if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) !=
+ NULL)
pmap_remove_pt_page(pmap, mpte);
else {
KASSERT((oldpde & PG_W) == 0,
@@ -2723,11 +3439,12 @@
pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
- free = NULL;
- pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
- lockp);
- pmap_invalidate_page(pmap, trunc_2mpage(va));
- pmap_free_zero_pages(free);
+ SLIST_INIT(&free);
+ sva = trunc_2mpage(va);
+ pmap_remove_pde(pmap, pde, sva, &free, lockp);
+ if ((oldpde & PG_G) == 0)
+ pmap_invalidate_pde_page(pmap, sva, oldpde);
+ pmap_free_zero_pages(&free);
CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
" in pmap %p", va, pmap);
return (FALSE);
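Another pattern visible throughout these hunks (illustrative aside, not from the patch): PG_A, PG_M, PG_RW, PG_V and PG_G become local variables filled in from per-pmap accessors such as pmap_accessed_bit() and pmap_valid_bit(), because the EPT page-table format used for virtual-machine pmaps (pm_type PT_EPT, mentioned later in this diff) keeps those attributes in different bit positions than native x86 tables. The dispatch idea, modelled stand-alone with invented bit values:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pt_entry_t;

enum demo_pm_type { PT_X86, PT_EPT };	/* PT_EPT mirrors the bhyve case */

struct demo_pmap {
	enum demo_pm_type pm_type;
};

/* Invented bit positions for the demo; only the dispatch pattern matters. */
static pt_entry_t
demo_valid_bit(struct demo_pmap *pmap)
{
	return (pmap->pm_type == PT_X86 ? 1ULL << 0 : 1ULL << 3);
}

static pt_entry_t
demo_accessed_bit(struct demo_pmap *pmap)
{
	return (pmap->pm_type == PT_X86 ? 1ULL << 5 : 1ULL << 8);
}

int
main(void)
{
	struct demo_pmap x86 = { PT_X86 }, ept = { PT_EPT };
	pt_entry_t PG_V, PG_A;

	/* Same local-variable idiom as the patched functions. */
	PG_V = demo_valid_bit(&ept);
	PG_A = demo_accessed_bit(&ept);
	printf("EPT: PG_V=%#llx PG_A=%#llx\n",
	    (unsigned long long)PG_V, (unsigned long long)PG_A);
	PG_V = demo_valid_bit(&x86);
	PG_A = demo_accessed_bit(&x86);
	printf("x86: PG_V=%#llx PG_A=%#llx\n",
	    (unsigned long long)PG_V, (unsigned long long)PG_A);
	return (0);
}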
@@ -2743,8 +3460,7 @@
KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
("pmap_demote_pde: oldpde is missing PG_M"));
newpte = oldpde & ~PG_PS;
- if ((newpte & PG_PDE_PAT) != 0)
- newpte ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpte = pmap_swap_pat(pmap, newpte);
/*
* If the page table page is new, initialize it.
@@ -2806,17 +3522,62 @@
}
/*
+ * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
+ */
+static void
+pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+{
+ pd_entry_t newpde;
+ vm_paddr_t mptepa;
+ vm_page_t mpte;
+
+ KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ mpte = pmap_lookup_pt_page(pmap, va);
+ if (mpte == NULL)
+ panic("pmap_remove_kernel_pde: Missing pt page.");
+
+ pmap_remove_pt_page(pmap, mpte);
+ mptepa = VM_PAGE_TO_PHYS(mpte);
+ newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
+
+ /*
+ * Initialize the page table page.
+ */
+ pagezero((void *)PHYS_TO_DMAP(mptepa));
+
+ /*
+ * Demote the mapping.
+ */
+ if (workaround_erratum383)
+ pmap_update_pde(pmap, va, pde, newpde);
+ else
+ pde_store(pde, newpde);
+
+ /*
+ * Invalidate a stale recursive mapping of the page table page.
+ */
+ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
+}
+
+/*
* pmap_remove_pde: do the things to unmap a superpage in a process
*/
static int
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- vm_page_t *free, struct rwlock **lockp)
+ struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
pd_entry_t oldpde;
vm_offset_t eva, va;
vm_page_t m, mpte;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW;
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
("pmap_remove_pde: sva is not 2mpage aligned"));
@@ -2823,13 +3584,8 @@
oldpde = pte_load_clear(pdq);
if (oldpde & PG_W)
pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
-
- /*
- * Machines that don't support invlpg, also don't support
- * PG_G.
- */
- if (oldpde & PG_G)
- pmap_invalidate_page(kernel_pmap, sva);
+ if ((oldpde & PG_G) != 0)
+ pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
if (oldpde & PG_MANAGED) {
CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
@@ -2848,8 +3604,7 @@
}
}
if (pmap == kernel_pmap) {
- if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
- panic("pmap_remove_pde: failed demotion");
+ pmap_remove_kernel_pde(pmap, pdq, sva);
} else {
mpte = pmap_lookup_pt_page(pmap, sva);
if (mpte != NULL) {
@@ -2870,12 +3625,16 @@
*/
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
- pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
+ pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
{
struct md_page *pvh;
- pt_entry_t oldpte;
+ pt_entry_t oldpte, PG_A, PG_M, PG_RW;
vm_page_t m;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpte = pte_load_clear(ptq);
if (oldpte & PG_W)
@@ -2903,11 +3662,13 @@
* Remove a single page from a process address space
*/
static void
-pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
+pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
+ struct spglist *free)
{
struct rwlock *lock;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
+ PG_V = pmap_valid_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
if ((*pde & PG_V) == 0)
return;
@@ -2935,10 +3696,13 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
- vm_page_t free = NULL;
+ pt_entry_t *pte, PG_G, PG_V;
+ struct spglist free;
int anyvalid;
+ PG_G = pmap_global_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+
/*
* Perform an unsynchronized read. This is, however, safe.
*/
@@ -2946,6 +3710,7 @@
return;
anyvalid = 0;
+ SLIST_INIT(&free);
rw_rlock(&pvh_global_lock);
PMAP_LOCK(pmap);
@@ -3064,7 +3829,7 @@
pmap_invalidate_all(pmap);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
/*
@@ -3086,18 +3851,17 @@
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
- pt_entry_t *pte, tpte;
+ pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
pd_entry_t *pde;
vm_offset_t va;
- vm_page_t free;
+ struct spglist free;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_all: page %p is not managed", m));
- free = NULL;
+ SLIST_INIT(&free);
rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
@@ -3106,10 +3870,12 @@
(void)pmap_demote_pde(pmap, pde, va);
PMAP_UNLOCK(pmap);
}
-small_mappings:
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
PMAP_LOCK(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pmap_resident_count_dec(pmap, 1);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
@@ -3128,13 +3894,14 @@
vm_page_dirty(m);
pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
pmap_invalidate_page(pmap, pv->pv_va);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
free_pv_entry(pmap, pv);
PMAP_UNLOCK(pmap);
}
vm_page_aflag_clear(m, PGA_WRITEABLE);
rw_wunlock(&pvh_global_lock);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
/*
@@ -3147,7 +3914,12 @@
vm_offset_t eva, va;
vm_page_t m;
boolean_t anychanged;
+ pt_entry_t PG_G, PG_M, PG_RW;
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((sva & PDRMASK) == 0,
("pmap_protect_pde: sva is not 2mpage aligned"));
@@ -3166,10 +3938,15 @@
if ((prot & VM_PROT_EXECUTE) == 0)
newpde |= pg_nx;
if (newpde != oldpde) {
- if (!atomic_cmpset_long(pde, oldpde, newpde))
+ /*
+ * As an optimization to future operations on this PDE, clear
+ * PG_PROMOTED. The impending invalidation will remove any
+ * lingering 4KB page mappings from the TLB.
+ */
+ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
goto retry;
- if (oldpde & PG_G)
- pmap_invalidate_page(pmap, sva);
+ if ((oldpde & PG_G) != 0)
+ pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
else
anychanged = TRUE;
}
@@ -3187,10 +3964,11 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
boolean_t anychanged, pv_lists_locked;
- if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
+ KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
+ if (prot == VM_PROT_NONE) {
pmap_remove(pmap, sva, eva);
return;
}
@@ -3199,6 +3977,10 @@
(VM_PROT_WRITE|VM_PROT_EXECUTE))
return;
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pv_lists_locked = FALSE;
resume:
anychanged = FALSE;
@@ -3327,9 +4109,17 @@
{
pd_entry_t newpde;
pt_entry_t *firstpte, oldpte, pa, *pte;
- vm_offset_t oldpteva;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
vm_page_t mpte;
+ int PG_PTE_CACHE;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
/*
@@ -3379,10 +4169,9 @@
if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
goto setpte;
oldpte &= ~PG_RW;
- oldpteva = (oldpte & PG_FRAME & PDRMASK) |
- (va & ~PDRMASK);
CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
- " in pmap %p", oldpteva, pmap);
+ " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
+ (va & ~PDRMASK), pmap);
}
if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
atomic_add_long(&pmap_pde_p_failures, 1);
@@ -3404,7 +4193,13 @@
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == pmap_pde_pindex(va),
("pmap_promote_pde: page table page's pindex is wrong"));
- pmap_insert_pt_page(pmap, mpte);
+ if (pmap_insert_pt_page(pmap, mpte)) {
+ atomic_add_long(&pmap_pde_p_failures, 1);
+ CTR2(KTR_PMAP,
+ "pmap_promote_pde: failure for va %#lx in pmap %p", va,
+ pmap);
+ return;
+ }
/*
* Promote the pv entries.
@@ -3415,8 +4210,7 @@
/*
* Propagate the PAT index to its proper position.
*/
- if ((newpde & PG_PTE_PAT) != 0)
- newpde ^= PG_PDE_PAT | PG_PTE_PAT;
+ newpde = pmap_swap_pat(pmap, newpde);
/*
* Map the superpage.
@@ -3424,7 +4218,7 @@
if (workaround_erratum383)
pmap_update_pde(pmap, va, pde, PG_PS | newpde);
else
- pde_store(pde, PG_PS | newpde);
+ pde_store(pde, PG_PROMOTED | PG_PS | newpde);
atomic_add_long(&pmap_pde_promotions, 1);
CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
@@ -3443,18 +4237,25 @@
* or lose information. That is, this routine must actually
* insert this page into the given map NOW.
*/
-void
-pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
- vm_prot_t prot, boolean_t wired)
+int
+pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+ u_int flags, int8_t psind __unused)
{
struct rwlock *lock;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
pt_entry_t newpte, origpte;
pv_entry_t pv;
vm_paddr_t opa, pa;
vm_page_t mpte, om;
+ boolean_t nosleep;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
va = trunc_page(va);
KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
@@ -3463,27 +4264,36 @@
KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
va >= kmi.clean_eva,
("pmap_enter: managed mapping within the clean submap"));
- KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
- VM_OBJECT_LOCKED(m->object),
- ("pmap_enter: page %p is not busy", m));
+ if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
+ VM_OBJECT_ASSERT_LOCKED(m->object);
pa = VM_PAGE_TO_PHYS(m);
newpte = (pt_entry_t)(pa | PG_A | PG_V);
- if ((access & VM_PROT_WRITE) != 0)
+ if ((flags & VM_PROT_WRITE) != 0)
newpte |= PG_M;
if ((prot & VM_PROT_WRITE) != 0)
newpte |= PG_RW;
KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
- ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't"));
+ ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
if ((prot & VM_PROT_EXECUTE) == 0)
newpte |= pg_nx;
- if (wired)
+ if ((flags & PMAP_ENTER_WIRED) != 0)
newpte |= PG_W;
if (va < VM_MAXUSER_ADDRESS)
newpte |= PG_U;
if (pmap == kernel_pmap)
newpte |= PG_G;
- newpte |= pmap_cache_bits(m->md.pat_mode, 0);
+ newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
+ /*
+ * Set modified bit gratuitously for writeable mappings if
+ * the page is unmanaged. We do not want to take a fault
+ * to do the dirty bit accounting for these mappings.
+ */
+ if ((m->oflags & VPO_UNMANAGED) != 0) {
+ if ((newpte & PG_RW) != 0)
+ newpte |= PG_M;
+ }
+
mpte = NULL;
lock = NULL;
@@ -3508,7 +4318,16 @@
* Here if the pte page isn't mapped, or if it has been
* deallocated.
*/
- mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
+ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
+ mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
+ nosleep ? NULL : &lock);
+ if (mpte == NULL && nosleep) {
+ if (lock != NULL)
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
goto retry;
} else
panic("pmap_enter: invalid page directory va=%#lx", va);
@@ -3574,7 +4393,8 @@
pv = get_pv_entry(pmap, &lock);
pv->pv_va = va;
CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if ((newpte & PG_RW) != 0)
vm_page_aflag_set(m, PGA_WRITEABLE);
}
@@ -3630,7 +4450,8 @@
* populated, then attempt promotion.
*/
if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
- pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
vm_reserv_level_iffullpop(m) == 0)
pmap_promote_pde(pmap, pde, va, &lock);
@@ -3638,6 +4459,7 @@
rw_wunlock(lock);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
+ return (KERN_SUCCESS);
}
/*
@@ -3651,10 +4473,14 @@
struct rwlock **lockp)
{
pd_entry_t *pde, newpde;
- vm_page_t free, mpde;
+ pt_entry_t PG_V;
+ vm_page_t mpde;
+ struct spglist free;
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
" in pmap %p", va, pmap);
@@ -3670,7 +4496,7 @@
" in pmap %p", va, pmap);
return (FALSE);
}
- newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
+ newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
PG_PS | PG_V;
if ((m->oflags & VPO_UNMANAGED) == 0) {
newpde |= PG_MANAGED;
@@ -3680,10 +4506,10 @@
*/
if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
lockp)) {
- free = NULL;
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(pmap, va, mpde, &free)) {
pmap_invalidate_page(pmap, va);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
" in pmap %p", va, pmap);
@@ -3701,7 +4527,8 @@
pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
/*
- * Map the superpage.
+ * Map the superpage. (This is not a promoted mapping; there will not
+ * be any lingering 4KB page mappings in the TLB.)
*/
pde_store(pde, newpde);
@@ -3732,7 +4559,8 @@
vm_page_t m, mpte;
vm_pindex_t diff, psize;
- VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(m_start->object);
+
psize = atop(end - start);
mpte = NULL;
m = m_start;
@@ -3742,8 +4570,7 @@
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
va = start + ptoa(diff);
if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
- (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
- pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
+ m->psind == 1 && pmap_ps_enabled(pmap) &&
pmap_enter_pde(pmap, va, m, prot, &lock))
m = &m[NBPDR / PAGE_SIZE - 1];
else
@@ -3785,13 +4612,14 @@
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
- vm_page_t free;
- pt_entry_t *pte;
+ struct spglist free;
+ pt_entry_t *pte, PG_V;
vm_paddr_t pa;
KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
(m->oflags & VPO_UNMANAGED) != 0,
("pmap_enter_quick_locked: managed mapping within the clean submap"));
+ PG_V = pmap_valid_bit(pmap);
rw_assert(&pvh_global_lock, RA_LOCKED);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -3856,10 +4684,10 @@
if ((m->oflags & VPO_UNMANAGED) == 0 &&
!pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
if (mpte != NULL) {
- free = NULL;
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
pmap_invalidate_page(pmap, va);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
mpte = NULL;
}
@@ -3871,7 +4699,7 @@
*/
pmap_resident_count_inc(pmap, 1);
- pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
+ pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
if ((prot & VM_PROT_EXECUTE) == 0)
pa |= pg_nx;
@@ -3910,14 +4738,22 @@
vm_pindex_t pindex, vm_size_t size)
{
pd_entry_t *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa, ptepa;
vm_page_t p, pdpg;
int pat_mode;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
("pmap_object_init_pt: non-device object"));
if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
+ if (!pmap_ps_enabled(pmap))
+ return;
if (!vm_object_populate(object, pindex, pindex + atop(size)))
return;
p = vm_page_lookup(object, pindex);
@@ -3955,8 +4791,8 @@
* will not affect the termination of this loop.
*/
PMAP_LOCK(pmap);
- for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
- size; pa += NBPDR) {
+ for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
+ pa < ptepa + size; pa += NBPDR) {
pdpg = pmap_allocpde(pmap, addr, NULL);
if (pdpg == NULL) {
/*
@@ -3990,52 +4826,96 @@
}
/*
- * Routine: pmap_change_wiring
- * Function: Change the wiring attribute for a map/virtual-address
- * pair.
- * In/out conditions:
- * The mapping must already exist in the pmap.
+ * Clear the wired attribute from the mappings for the specified range of
+ * addresses in the given pmap. Every valid mapping within that range
+ * must have the wired attribute set. In contrast, invalid mappings
+ * cannot have the wired attribute set, so they are ignored.
+ *
+ * The wired attribute of the page table entry is not a hardware feature,
+ * so there is no need to invalidate any TLB entries.
*/
void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
+ vm_offset_t va_next;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
boolean_t pv_lists_locked;
+ PG_V = pmap_valid_bit(pmap);
pv_lists_locked = FALSE;
+resume:
+ PMAP_LOCK(pmap);
+ for (; sva < eva; sva = va_next) {
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ if ((*pde & PG_V) == 0)
+ continue;
+ if ((*pde & PG_PS) != 0) {
+ if ((*pde & PG_W) == 0)
+ panic("pmap_unwire: pde %#jx is missing PG_W",
+ (uintmax_t)*pde);
- /*
- * Wiring is not a hardware characteristic so there is no need to
- * invalidate TLB.
- */
-retry:
- PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, va);
- if ((*pde & PG_PS) != 0) {
- if (!wired != ((*pde & PG_W) == 0)) {
- if (!pv_lists_locked) {
- pv_lists_locked = TRUE;
- if (!rw_try_rlock(&pvh_global_lock)) {
- PMAP_UNLOCK(pmap);
- rw_rlock(&pvh_global_lock);
- goto retry;
+ /*
+ * Are we unwiring the entire large page? If not,
+ * demote the mapping and fall through.
+ */
+ if (sva + NBPDR == va_next && eva >= va_next) {
+ atomic_clear_long(pde, PG_W);
+ pmap->pm_stats.wired_count -= NBPDR /
+ PAGE_SIZE;
+ continue;
+ } else {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ /* Repeat sva. */
+ goto resume;
+ }
}
+ if (!pmap_demote_pde(pmap, pde, sva))
+ panic("pmap_unwire: demotion failed");
}
- if (!pmap_demote_pde(pmap, pde, va))
- panic("pmap_change_wiring: demotion failed");
- } else
- goto out;
+ }
+ if (va_next > eva)
+ va_next = eva;
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ if ((*pte & PG_V) == 0)
+ continue;
+ if ((*pte & PG_W) == 0)
+ panic("pmap_unwire: pte %#jx is missing PG_W",
+ (uintmax_t)*pte);
+
+ /*
+ * PG_W must be cleared atomically. Although the pmap
+ * lock synchronizes access to PG_W, another processor
+ * could be setting PG_M and/or PG_A concurrently.
+ */
+ atomic_clear_long(pte, PG_W);
+ pmap->pm_stats.wired_count--;
+ }
}
- pte = pmap_pde_to_pte(pde, va);
- if (wired && (*pte & PG_W) == 0) {
- pmap->pm_stats.wired_count++;
- atomic_set_long(pte, PG_W);
- } else if (!wired && (*pte & PG_W) != 0) {
- pmap->pm_stats.wired_count--;
- atomic_clear_long(pte, PG_W);
- }
-out:
if (pv_lists_locked)
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
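Aside on the loop structure above (illustrative, not from the patch): pmap_unwire() now walks [sva, eva) one paging level at a time, computing va_next as the next boundary of the current level, e.g. (sva + NBPDR) & ~PDRMASK for a 2MB page-directory entry, clamping it to eva and guarding against wrap-around. The rounding can be checked in a few lines of stand-alone C (constants restated here, not taken from a header):

#include <stdint.h>
#include <stdio.h>

#define	NBPDR		(2UL * 1024 * 1024)	/* bytes mapped by a PDE */
#define	PDRMASK		(NBPDR - 1)

int
main(void)
{
	uint64_t sva = 0x00007f00002ab000ULL;	/* arbitrary, unaligned start */
	uint64_t eva = sva + 5 * NBPDR;
	uint64_t va_next;

	while (sva < eva) {
		va_next = (sva + NBPDR) & ~PDRMASK;	/* next 2MB boundary */
		if (va_next < sva)			/* overflow clamp */
			va_next = eva;
		if (va_next > eva)
			va_next = eva;
		printf("chunk [%#llx, %#llx)\n",
		    (unsigned long long)sva, (unsigned long long)va_next);
		sva = va_next;
	}
	return (0);
}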
@@ -4054,14 +4934,29 @@
vm_offset_t src_addr)
{
struct rwlock *lock;
- vm_page_t free;
+ struct spglist free;
vm_offset_t addr;
vm_offset_t end_addr = src_addr + len;
vm_offset_t va_next;
+ pt_entry_t PG_A, PG_M, PG_V;
if (dst_addr != src_addr)
return;
+ if (dst_pmap->pm_type != src_pmap->pm_type)
+ return;
+
+ /*
+ * EPT page table entries that require emulation of A/D bits are
+ * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
+ * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
+ * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
+ * implementations flag an EPT misconfiguration for exec-only
+ * mappings we skip this function entirely for emulated pmaps.
+ */
+ if (pmap_emulate_ad_bits(dst_pmap))
+ return;
+
lock = NULL;
rw_rlock(&pvh_global_lock);
if (dst_pmap < src_pmap) {
@@ -4071,6 +4966,11 @@
PMAP_LOCK(src_pmap);
PMAP_LOCK(dst_pmap);
}
+
+ PG_A = pmap_accessed_bit(dst_pmap);
+ PG_M = pmap_modified_bit(dst_pmap);
+ PG_V = pmap_valid_bit(dst_pmap);
+
for (addr = src_addr; addr < end_addr; addr = va_next) {
pt_entry_t *src_pte, *dst_pte;
vm_page_t dstmpde, dstmpte, srcmpte;
@@ -4107,6 +5007,8 @@
continue;
if (srcptepaddr & PG_PS) {
+ if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
+ continue;
dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
if (dstmpde == NULL)
break;
@@ -4162,13 +5064,13 @@
*dst_pte = ptetemp & ~(PG_W | PG_M |
PG_A);
pmap_resident_count_inc(dst_pmap, 1);
- } else {
- free = NULL;
+ } else {
+ SLIST_INIT(&free);
if (pmap_unwire_ptp(dst_pmap, addr,
dstmpte, &free)) {
- pmap_invalidate_page(dst_pmap,
- addr);
- pmap_free_zero_pages(free);
+ pmap_invalidate_page(dst_pmap,
+ addr);
+ pmap_free_zero_pages(&free);
}
goto out;
}
@@ -4185,10 +5087,10 @@
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(src_pmap);
PMAP_UNLOCK(dst_pmap);
-}
+}
/*
- * pmap_zero_page zeros the specified hardware page by mapping
+ * pmap_zero_page zeros the specified hardware page by mapping
* the page into KVM and using bzero to clear its contents.
*/
void
@@ -4245,24 +5147,73 @@
pagecopy((void *)src, (void *)dst);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
{
void *a_cp, *b_cp;
+ vm_page_t m_a, m_b;
+ vm_paddr_t p_a, p_b;
+ pt_entry_t *pte;
vm_offset_t a_pg_offset, b_pg_offset;
int cnt;
+ boolean_t pinned;
+ /*
+ * NB: The sequence of updating a page table followed by accesses
+ * to the corresponding pages used in the !DMAP case is subject to
+ * the situation described in the "AMD64 Architecture Programmer's
+ * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
+ * Coherency Considerations". Therefore, issuing the INVLPG right
+ * after modifying the PTE bits is crucial.
+ */
+ pinned = FALSE;
while (xfersize > 0) {
a_pg_offset = a_offset & PAGE_MASK;
+ m_a = ma[a_offset >> PAGE_SHIFT];
+ p_a = m_a->phys_addr;
+ b_pg_offset = b_offset & PAGE_MASK;
+ m_b = mb[b_offset >> PAGE_SHIFT];
+ p_b = m_b->phys_addr;
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
- a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]->
- phys_addr) + a_pg_offset;
- b_pg_offset = b_offset & PAGE_MASK;
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
- b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]->
- phys_addr) + b_pg_offset;
+ if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
+ p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
+ mtx_lock(&cpage_lock);
+ sched_pin();
+ pinned = TRUE;
+ pte = vtopte(cpage_a);
+ *pte = p_a | X86_PG_A | X86_PG_V |
+ pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
+ invlpg(cpage_a);
+ a_cp = (char *)cpage_a + a_pg_offset;
+ } else {
+ a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
+ }
+ if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
+ p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
+ if (!pinned) {
+ mtx_lock(&cpage_lock);
+ sched_pin();
+ pinned = TRUE;
+ }
+ pte = vtopte(cpage_b);
+ *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
+ X86_PG_V | pmap_cache_bits(kernel_pmap,
+ m_b->md.pat_mode, 0);
+ invlpg(cpage_b);
+ b_cp = (char *)cpage_b + b_pg_offset;
+ } else {
+ b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
+ }
bcopy(a_cp, b_cp, cnt);
+ if (__predict_false(pinned)) {
+ sched_unpin();
+ mtx_unlock(&cpage_lock);
+ pinned = FALSE;
+ }
a_offset += cnt;
b_offset += cnt;
xfersize -= cnt;
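Aside on the copy loop above (illustrative, not from the patch): whether a page comes from the direct map or from the new pinned fallback mapping, each iteration still copies at most the remainder of the current source page and the current destination page, so misaligned offsets are handled by naturally shrinking cnt. A stand-alone rehearsal of that chunking over plain buffers:

#include <stdio.h>
#include <string.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	PAGE_SHIFT	12
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	static char a[4 * PAGE_SIZE], b[4 * PAGE_SIZE];
	size_t a_offset = 100, b_offset = 3000;	/* deliberately misaligned */
	size_t xfersize = 2 * PAGE_SIZE;
	size_t a_pg_offset, b_pg_offset, cnt;

	memset(a, 'x', sizeof(a));
	while (xfersize > 0) {
		a_pg_offset = a_offset & PAGE_MASK;
		b_pg_offset = b_offset & PAGE_MASK;
		cnt = MIN(xfersize, PAGE_SIZE - a_pg_offset);
		cnt = MIN(cnt, PAGE_SIZE - b_pg_offset);
		/* In the kernel this is a bcopy() between mapped pages. */
		memcpy(b + b_offset, a + a_offset, cnt);
		printf("page %zu+%zu -> page %zu+%zu, %zu bytes\n",
		    a_offset >> PAGE_SHIFT, a_pg_offset,
		    b_offset >> PAGE_SHIFT, b_pg_offset, cnt);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
	return (0);
}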
@@ -4291,7 +5242,7 @@
rw_rlock(&pvh_global_lock);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
rw_rlock(lock);
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4302,7 +5253,7 @@
}
if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
break;
@@ -4326,42 +5277,61 @@
int
pmap_page_wired_mappings(vm_page_t m)
{
- int count;
-
- count = 0;
- if ((m->oflags & VPO_UNMANAGED) != 0)
- return (count);
- rw_wlock(&pvh_global_lock);
- count = pmap_pvh_wired_mappings(&m->md, count);
- if ((m->flags & PG_FICTITIOUS) == 0) {
- count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
- count);
- }
- rw_wunlock(&pvh_global_lock);
- return (count);
-}
-
-/*
- * pmap_pvh_wired_mappings:
- *
- * Return the updated number "count" of managed mappings that are wired.
- */
-static int
-pmap_pvh_wired_mappings(struct md_page *pvh, int count)
-{
+ struct rwlock *lock;
+ struct md_page *pvh;
pmap_t pmap;
pt_entry_t *pte;
pv_entry_t pv;
+ int count, md_gen, pvh_gen;
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ return (0);
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+restart:
+ count = 0;
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
pte = pmap_pte(pmap, pv->pv_va);
if ((*pte & PG_W) != 0)
count++;
PMAP_UNLOCK(pmap);
}
+ if ((m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen ||
+ pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pde(pmap, pv->pv_va);
+ if ((*pte & PG_W) != 0)
+ count++;
+ PMAP_UNLOCK(pmap);
+ }
+ }
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
return (count);
}
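Aside on the locking above (illustrative, not from the patch): pmap_page_wired_mappings() now only trylocks each pmap while holding the pv list lock; on failure it records the list's generation counter (pv_gen), drops the list lock, blocks on the pmap lock, retakes the list lock, and restarts the scan if the generation moved. The same revalidation pattern in stand-alone form (lock and counter names invented; the first trylock failure is simulated):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pmap_lock = PTHREAD_MUTEX_INITIALIZER;
static int list_gen;		/* bumped whenever the list is modified */
static int force_fail = 1;	/* make the first trylock "fail" for the demo */

static int
demo_trylock(pthread_mutex_t *m)
{
	if (force_fail) {
		force_fail = 0;
		return (-1);	/* pretend another thread holds the lock */
	}
	return (pthread_mutex_trylock(m));
}

int
main(void)
{
	int gen;

	pthread_rwlock_rdlock(&list_lock);
restart:
	printf("scanning list at generation %d\n", list_gen);
	if (demo_trylock(&pmap_lock) != 0) {
		gen = list_gen;			/* remember the generation */
		pthread_rwlock_unlock(&list_lock);
		pthread_mutex_lock(&pmap_lock);	/* block, in the safe order */
		pthread_rwlock_rdlock(&list_lock);
		if (gen != list_gen) {		/* list changed while unlocked */
			pthread_mutex_unlock(&pmap_lock);
			goto restart;
		}
	}
	/* ... examine the entry while holding both locks ... */
	pthread_mutex_unlock(&pmap_lock);
	pthread_rwlock_unlock(&list_lock);
	return (0);
}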
@@ -4389,12 +5359,20 @@
}
/*
- * Remove all pages from specified address space
- * this aids process exit speeds. Also, this code
- * is special cased for current process only, but
- * can have the more generic (and slightly slower)
- * mode enabled. This is much faster than pmap_remove
- * in the case of running down an entire address space.
+ * Destroy all managed, non-wired mappings in the given user-space
+ * pmap. This pmap cannot be active on any processor besides the
+ * caller.
+ *
+ * This function cannot be applied to the kernel pmap. Moreover, it
+ * is not intended for general use. It is only to be used during
+ * process termination. Consequently, it can be implemented in ways
+ * that make it faster than pmap_remove(). First, it can more quickly
+ * destroy mappings by iterating over the pmap's collection of PV
+ * entries, rather than searching the page table. Second, it doesn't
+ * have to test and clear the page table entries atomically, because
+ * no processor is currently accessing the user address space. In
+ * particular, a page table entry's dirty bit won't change state once
+ * this function starts.
*/
void
pmap_remove_pages(pmap_t pmap)
@@ -4401,7 +5379,8 @@
{
pd_entry_t ptepde;
pt_entry_t *pte, tpte;
- vm_page_t free = NULL;
+ pt_entry_t PG_M, PG_RW, PG_V;
+ struct spglist free;
vm_page_t m, mpte, mt;
pv_entry_t pv;
struct md_page *pvh;
@@ -4410,12 +5389,34 @@
int64_t bit;
uint64_t inuse, bitmask;
int allfree, field, freed, idx;
+ boolean_t superpage;
+ vm_paddr_t pa;
- if (pmap != PCPU_GET(curpmap)) {
- printf("warning: pmap_remove_pages called with non-current pmap\n");
- return;
+ /*
+ * Assert that the given pmap is only active on the current
+ * CPU. Unfortunately, we cannot block another CPU from
+ * activating the pmap while this function is executing.
+ */
+ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
+#ifdef INVARIANTS
+ {
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ critical_enter();
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ CPU_AND(&other_cpus, &pmap->pm_active);
+ critical_exit();
+ KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
}
+#endif
+
lock = NULL;
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ SLIST_INIT(&free);
rw_rlock(&pvh_global_lock);
PMAP_LOCK(pmap);
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
@@ -4435,12 +5436,26 @@
pte = pmap_pdpe_to_pde(pte, pv->pv_va);
tpte = *pte;
if ((tpte & (PG_PS | PG_V)) == PG_V) {
+ superpage = FALSE;
ptepde = tpte;
pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
PG_FRAME);
pte = &pte[pmap_pte_index(pv->pv_va)];
- tpte = *pte & ~PG_PTE_PAT;
+ tpte = *pte;
+ } else {
+ /*
+ * Keep track whether 'tpte' is a
+ * superpage explicitly instead of
+ * relying on PG_PS being set.
+ *
+ * This is because PG_PS is numerically
+ * identical to PG_PTE_PAT and thus a
+ * regular page could be mistaken for
+ * a superpage.
+ */
+ superpage = TRUE;
}
+
if ((tpte & PG_V) == 0) {
panic("bad pte va %lx pte %lx",
pv->pv_va, tpte);
@@ -4454,8 +5469,13 @@
continue;
}
- m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
- KASSERT(m->phys_addr == (tpte & PG_FRAME),
+ if (superpage)
+ pa = tpte & PG_PS_FRAME;
+ else
+ pa = tpte & PG_FRAME;
+
+ m = PHYS_TO_VM_PAGE(pa);
+ KASSERT(m->phys_addr == pa,
("vm_page_t %p phys_addr mismatch %016jx %016jx",
m, (uintmax_t)m->phys_addr,
(uintmax_t)tpte));
@@ -4471,7 +5491,7 @@
* Update the vm_page_t clean/reference bits.
*/
if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
- if ((tpte & PG_PS) != 0) {
+ if (superpage) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
vm_page_dirty(mt);
} else
@@ -4482,10 +5502,11 @@
/* Mark free */
pc->pc_map[field] |= bitmask;
- if ((tpte & PG_PS) != 0) {
+ if (superpage) {
pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
- TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
if (TAILQ_EMPTY(&pvh->pv_list)) {
for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
if ((mt->aflags & PGA_WRITEABLE) != 0 &&
@@ -4504,7 +5525,8 @@
}
} else {
pmap_resident_count_dec(pmap, 1);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
if ((m->aflags & PGA_WRITEABLE) != 0 &&
TAILQ_EMPTY(&m->md.pv_list) &&
(m->flags & PG_FICTITIOUS) == 0) {
@@ -4530,9 +5552,95 @@
pmap_invalidate_all(pmap);
rw_runlock(&pvh_global_lock);
PMAP_UNLOCK(pmap);
- pmap_free_zero_pages(free);
+ pmap_free_zero_pages(&free);
}
+static boolean_t
+pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
+{
+ struct rwlock *lock;
+ pv_entry_t pv;
+ struct md_page *pvh;
+ pt_entry_t *pte, mask;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
+ pmap_t pmap;
+ int md_gen, pvh_gen;
+ boolean_t rv;
+
+ rv = FALSE;
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_rlock(lock);
+restart:
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pte(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
+ rv = (*pte & mask) == mask;
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ goto out;
+ }
+ if ((m->flags & PG_FICTITIOUS) == 0) {
+ pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_runlock(lock);
+ PMAP_LOCK(pmap);
+ rw_rlock(lock);
+ if (md_gen != m->md.pv_gen ||
+ pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ pte = pmap_pde(pmap, pv->pv_va);
+ mask = 0;
+ if (modified) {
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ mask |= PG_RW | PG_M;
+ }
+ if (accessed) {
+ PG_A = pmap_accessed_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ mask |= PG_V | PG_A;
+ }
+ rv = (*pte & mask) == mask;
+ PMAP_UNLOCK(pmap);
+ if (rv)
+ goto out;
+ }
+ }
+out:
+ rw_runlock(lock);
+ rw_runlock(&pvh_global_lock);
+ return (rv);
+}
+
/*
* pmap_is_modified:
*
@@ -4542,59 +5650,25 @@
boolean_t
pmap_is_modified(vm_page_t m)
{
- boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_modified: page %p is not managed", m));
/*
- * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be
+ * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
* concurrently set while the object is locked. Thus, if PGA_WRITEABLE
* is clear, no PTEs can have PG_M set.
*/
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->oflags & VPO_BUSY) == 0 &&
- (m->aflags & PGA_WRITEABLE) == 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return (FALSE);
- rw_wlock(&pvh_global_lock);
- rv = pmap_is_modified_pvh(&m->md) ||
- ((m->flags & PG_FICTITIOUS) == 0 &&
- pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
- rw_wunlock(&pvh_global_lock);
- return (rv);
+ return (pmap_page_test_mappings(m, FALSE, TRUE));
}
/*
- * Returns TRUE if any of the given mappings were used to modify
- * physical memory. Otherwise, returns FALSE. Both page and 2mpage
- * mappings are supported.
- */
-static boolean_t
-pmap_is_modified_pvh(struct md_page *pvh)
-{
- pv_entry_t pv;
- pt_entry_t *pte;
- pmap_t pmap;
- boolean_t rv;
-
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pte = pmap_pte(pmap, pv->pv_va);
- rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
- PMAP_UNLOCK(pmap);
- if (rv)
- break;
- }
- return (rv);
-}
-
-/*
* pmap_is_prefaultable:
*
- * Return whether or not the specified virtual address is elgible
+ * Return whether or not the specified virtual address is eligible
* for prefault.
*/
boolean_t
@@ -4601,9 +5675,10 @@
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
boolean_t rv;
+ PG_V = pmap_valid_bit(pmap);
rv = FALSE;
PMAP_LOCK(pmap);
pde = pmap_pde(pmap, addr);
@@ -4624,45 +5699,13 @@
boolean_t
pmap_is_referenced(vm_page_t m)
{
- boolean_t rv;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_is_referenced: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- rv = pmap_is_referenced_pvh(&m->md) ||
- ((m->flags & PG_FICTITIOUS) == 0 &&
- pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
- rw_wunlock(&pvh_global_lock);
- return (rv);
+ return (pmap_page_test_mappings(m, TRUE, FALSE));
}
/*
- * Returns TRUE if any of the given mappings were referenced and FALSE
- * otherwise. Both page and 2mpage mappings are supported.
- */
-static boolean_t
-pmap_is_referenced_pvh(struct md_page *pvh)
-{
- pv_entry_t pv;
- pt_entry_t *pte;
- pmap_t pmap;
- boolean_t rv;
-
- rw_assert(&pvh_global_lock, RA_WLOCKED);
- rv = FALSE;
- TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pte = pmap_pte(pmap, pv->pv_va);
- rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
- PMAP_UNLOCK(pmap);
- if (rv)
- break;
- }
- return (rv);
-}
-
-/*
* Clear the write and modified bits in each of the given page's mappings.
*/
void
@@ -4670,40 +5713,70 @@
{
struct md_page *pvh;
pmap_t pmap;
+ struct rwlock *lock;
pv_entry_t next_pv, pv;
pd_entry_t *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW;
vm_offset_t va;
+ int pvh_gen, md_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_remove_write: page %p is not managed", m));
/*
- * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by
- * another thread while the object is locked. Thus, if PGA_WRITEABLE
- * is clear, no page table entries need updating.
+ * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+ * set by another thread while the object is locked. Thus,
+ * if PGA_WRITEABLE is clear, no page table entries need updating.
*/
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- if ((m->oflags & VPO_BUSY) == 0 &&
- (m->aflags & PGA_WRITEABLE) == 0)
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
return;
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
+retry_pv_loop:
+ rw_wlock(lock);
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ rw_wunlock(lock);
+ goto retry_pv_loop;
+ }
+ }
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
if ((*pde & PG_RW) != 0)
- (void)pmap_demote_pde(pmap, pde, va);
+ (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
PMAP_UNLOCK(pmap);
}
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ md_gen = m->md.pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen ||
+ md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ rw_wunlock(lock);
+ goto retry_pv_loop;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0,
("pmap_remove_write: found a 2mpage in page %p's pv list",
@@ -4721,10 +5794,40 @@
}
PMAP_UNLOCK(pmap);
}
+ rw_wunlock(lock);
vm_page_aflag_clear(m, PGA_WRITEABLE);
- rw_wunlock(&pvh_global_lock);
+ rw_runlock(&pvh_global_lock);
}
+static __inline boolean_t
+safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
+{
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (TRUE);
+
+ KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
+
+ /*
+ * RWX = 010 or 110 will cause an unconditional EPT misconfiguration
+ * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
+ * if the EPT_PG_WRITE bit is set.
+ */
+ if ((pte & EPT_PG_WRITE) != 0)
+ return (FALSE);
+
+ /*
+ * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
+ */
+ if ((pte & EPT_PG_EXECUTE) == 0 ||
+ ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+#define PMAP_TS_REFERENCED_MAX 5
+
/*
* pmap_ts_referenced:
*
@@ -4741,28 +5844,77 @@
pmap_ts_referenced(vm_page_t m)
{
struct md_page *pvh;
- pv_entry_t pv, pvf, pvn;
+ pv_entry_t pv, pvf;
pmap_t pmap;
+ struct rwlock *lock;
pd_entry_t oldpde, *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_A;
vm_offset_t va;
- int rtval = 0;
+ vm_paddr_t pa;
+ int cleared, md_gen, not_cleared, pvh_gen;
+ struct spglist free;
+ boolean_t demoted;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_ts_referenced: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
+ SLIST_INIT(&free);
+ cleared = 0;
+ pa = VM_PAGE_TO_PHYS(m);
+ lock = PHYS_TO_PV_LIST_LOCK(pa);
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
+ rw_rlock(&pvh_global_lock);
+ rw_wlock(lock);
+retry:
+ not_cleared = 0;
+ if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
+ pv = pvf;
+ do {
+ if (pvf == NULL)
+ pvf = pv;
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto retry;
+ }
+ }
+ PG_A = pmap_accessed_bit(pmap);
va = pv->pv_va;
- pde = pmap_pde(pmap, va);
+ pde = pmap_pde(pmap, pv->pv_va);
oldpde = *pde;
- if ((oldpde & PG_A) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
- if ((oldpde & PG_W) == 0) {
+ if ((*pde & PG_A) != 0) {
+ /*
+ * Since this reference bit is shared by 512 4KB
+ * pages, it should not be cleared every time it is
+ * tested. Apply a simple "hash" function on the
+ * physical page number, the virtual superpage number,
+ * and the pmap address to select one 4KB page out of
+ * the 512 on which testing the reference bit will
+ * result in clearing that reference bit. This
+ * function is designed to avoid the selection of the
+ * same 4KB page for every 2MB page mapping.
+ *
+ * On demotion, a mapping that hasn't been referenced
+ * is simply destroyed. To avoid the possibility of a
+ * subsequent page fault on a demoted wired mapping,
+ * always leave its reference bit set. Moreover,
+ * since the superpage is wired, the current state of
+ * its reference bit won't affect page replacement.
+ */
+ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
+ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
+ (*pde & PG_W) == 0) {
+ if (safe_to_clear_referenced(pmap, oldpde)) {
+ atomic_clear_long(pde, PG_A);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ demoted = FALSE;
+ } else if (pmap_demote_pde_locked(pmap, pde,
+ pv->pv_va, &lock)) {
/*
* Remove the mapping to a single page
* so that a subsequent access may
@@ -4771,45 +5923,245 @@
* this removal never frees a page
* table page.
*/
+ demoted = TRUE;
va += VM_PAGE_TO_PHYS(m) - (oldpde &
PG_PS_FRAME);
- pmap_remove_page(pmap, va, pde, NULL);
- rtval++;
- if (rtval > 4) {
- PMAP_UNLOCK(pmap);
- goto out;
- }
+ pte = pmap_pde_to_pte(pde, va);
+ pmap_remove_pte(pmap, pte, va, *pde,
+ NULL, &lock);
+ pmap_invalidate_page(pmap, va);
+ } else
+ demoted = TRUE;
+
+ if (demoted) {
+ /*
+ * The superpage mapping was removed
+ * entirely and therefore 'pv' is no
+ * longer valid.
+ */
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
}
- }
+ cleared++;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ } else
+ not_cleared++;
}
PMAP_UNLOCK(pmap);
- }
+ /* Rotate the PV list if it has more than one entry. */
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
+ TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+ pvh->pv_gen++;
+ }
+ if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
+ goto out;
+ } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
- if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
- pvf = pv;
- do {
- pvn = TAILQ_NEXT(pv, pv_list);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
- pmap = PV_PMAP(pv);
+ if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
+ goto out;
+ pv = pvf;
+ do {
+ if (pvf == NULL)
+ pvf = pv;
+ pmap = PV_PMAP(pv);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ md_gen = m->md.pv_gen;
+ rw_wunlock(lock);
PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, pv->pv_va);
- KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
- " found a 2mpage in page %p's pv list", m));
- pte = pmap_pde_to_pte(pde, pv->pv_va);
- if ((*pte & PG_A) != 0) {
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto retry;
+ }
+ }
+ PG_A = pmap_accessed_bit(pmap);
+ pde = pmap_pde(pmap, pv->pv_va);
+ KASSERT((*pde & PG_PS) == 0,
+ ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
+ m));
+ pte = pmap_pde_to_pte(pde, pv->pv_va);
+ if ((*pte & PG_A) != 0) {
+ if (safe_to_clear_referenced(pmap, *pte)) {
atomic_clear_long(pte, PG_A);
pmap_invalidate_page(pmap, pv->pv_va);
- rtval++;
- if (rtval > 4)
- pvn = NULL;
+ cleared++;
+ } else if ((*pte & PG_W) == 0) {
+ /*
+ * Wired pages cannot be paged out so
+ * doing accessed bit emulation for
+ * them is wasted effort. We do the
+ * hard work for unwired pages only.
+ */
+ pmap_remove_pte(pmap, pte, pv->pv_va,
+ *pde, &free, &lock);
+ pmap_invalidate_page(pmap, pv->pv_va);
+ cleared++;
+ if (pvf == pv)
+ pvf = NULL;
+ pv = NULL;
+ KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+ ("inconsistent pv lock %p %p for page %p",
+ lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ } else
+ not_cleared++;
+ }
+ PMAP_UNLOCK(pmap);
+ /* Rotate the PV list if it has more than one entry. */
+ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+ m->md.pv_gen++;
+ }
+ } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
+ not_cleared < PMAP_TS_REFERENCED_MAX);
+out:
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
+ pmap_free_zero_pages(&free);
+ return (cleared + not_cleared);
+}
+
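Aside on the comment in pmap_ts_referenced() above (illustrative, not from the patch): for a 2MB mapping the accessed bit is shared by 512 4KB pages, so it is cleared only when a simple hash of the physical page number, the virtual superpage number, and the pmap address selects the page being queried (and only for unwired mappings). The selection expression can be evaluated on its own; constants are restated and the pmap address is invented:

#include <stdint.h>
#include <stdio.h>

#define	PAGE_SHIFT	12
#define	PDRSHIFT	21		/* log2 of the 2MB superpage size */
#define	NPTEPG		512		/* 4KB PTEs per 2MB superpage */

int
main(void)
{
	uint64_t pa = 0x12345000ULL;		/* base physical page of interest */
	uint64_t va = 0x00007f0000200000ULL;	/* base of the 2MB mapping */
	uint64_t pmap_addr = 0xfffffe0011fe4000ULL;	/* invented pmap address */
	unsigned selected;
	int i;

	/* Walk the 512 pages of the superpage; exactly one should hash to 0. */
	selected = 0;
	for (i = 0; i < NPTEPG; i++) {
		uint64_t p = pa + (uint64_t)i * 4096;
		if ((((p >> PAGE_SHIFT) ^ (va >> PDRSHIFT) ^ pmap_addr) &
		    (NPTEPG - 1)) == 0)
			selected++;
	}
	printf("pages whose reference bit would be cleared: %u\n", selected);
	return (0);
}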
+/*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap. Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+ struct rwlock *lock;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t oldpde, *pde;
+ pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
+ vm_offset_t va_next;
+ vm_page_t m;
+ boolean_t anychanged, pv_lists_locked;
+
+ if (advice != MADV_DONTNEED && advice != MADV_FREE)
+ return;
+
+ /*
+ * A/D bit emulation requires an alternate code path when clearing
+ * the modified and accessed bits below. Since this function is
+ * advisory in nature we skip it entirely for pmaps that require
+ * A/D bit emulation.
+ */
+ if (pmap_emulate_ad_bits(pmap))
+ return;
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ pv_lists_locked = FALSE;
+resume:
+ anychanged = FALSE;
+ PMAP_LOCK(pmap);
+ for (; sva < eva; sva = va_next) {
+ pml4e = pmap_pml4e(pmap, sva);
+ if ((*pml4e & PG_V) == 0) {
+ va_next = (sva + NBPML4) & ~PML4MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ if ((*pdpe & PG_V) == 0) {
+ va_next = (sva + NBPDP) & ~PDPMASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
+ pde = pmap_pdpe_to_pde(pdpe, sva);
+ oldpde = *pde;
+ if ((oldpde & PG_V) == 0)
+ continue;
+ else if ((oldpde & PG_PS) != 0) {
+ if ((oldpde & PG_MANAGED) == 0)
+ continue;
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ if (anychanged)
+ pmap_invalidate_all(pmap);
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto resume;
+ }
}
- PMAP_UNLOCK(pmap);
- } while ((pv = pvn) != NULL && pv != pvf);
+ lock = NULL;
+ if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
+ if (lock != NULL)
+ rw_wunlock(lock);
+
+ /*
+ * The large page mapping was destroyed.
+ */
+ continue;
+ }
+
+ /*
+ * Unless the page mappings are wired, remove the
+ * mapping to a single page so that a subsequent
+ * access may repromote. Since the underlying page
+ * table page is fully populated, this removal never
+ * frees a page table page.
+ */
+ if ((oldpde & PG_W) == 0) {
+ pte = pmap_pde_to_pte(pde, sva);
+ KASSERT((*pte & PG_V) != 0,
+ ("pmap_advise: invalid PTE"));
+ pmap_remove_pte(pmap, pte, sva, *pde, NULL,
+ &lock);
+ anychanged = TRUE;
+ }
+ if (lock != NULL)
+ rw_wunlock(lock);
+ }
+ if (va_next > eva)
+ va_next = eva;
+ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+ sva += PAGE_SIZE) {
+ if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED |
+ PG_V))
+ continue;
+ else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+ if (advice == MADV_DONTNEED) {
+ /*
+ * Future calls to pmap_is_modified()
+ * can be avoided by making the page
+ * dirty now.
+ */
+ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+ vm_page_dirty(m);
+ }
+ atomic_clear_long(pte, PG_M | PG_A);
+ } else if ((*pte & PG_A) != 0)
+ atomic_clear_long(pte, PG_A);
+ else
+ continue;
+ if ((*pte & PG_G) != 0)
+ pmap_invalidate_page(pmap, sva);
+ else
+ anychanged = TRUE;
+ }
}
-out:
- rw_wunlock(&pvh_global_lock);
- return (rtval);
+ if (anychanged)
+ pmap_invalidate_all(pmap);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
}
/*
@@ -4822,34 +6174,50 @@
pmap_t pmap;
pv_entry_t next_pv, pv;
pd_entry_t oldpde, *pde;
- pt_entry_t oldpte, *pte;
+ pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
+ struct rwlock *lock;
vm_offset_t va;
+ int md_gen, pvh_gen;
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
("pmap_clear_modify: page %p is not managed", m));
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
- KASSERT((m->oflags & VPO_BUSY) == 0,
- ("pmap_clear_modify: page %p is busy", m));
+ VM_OBJECT_ASSERT_WLOCKED(m->object);
+ KASSERT(!vm_page_xbusied(m),
+ ("pmap_clear_modify: page %p is exclusive busied", m));
/*
* If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
* If the object containing the page is locked and the page is not
- * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set.
+ * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
*/
if ((m->aflags & PGA_WRITEABLE) == 0)
return;
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
+ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+ pa_to_pvh(VM_PAGE_TO_PHYS(m));
+ rw_rlock(&pvh_global_lock);
+ lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+ rw_wlock(lock);
+restart:
+ TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
va = pv->pv_va;
pde = pmap_pde(pmap, va);
oldpde = *pde;
if ((oldpde & PG_RW) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
+ if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
if ((oldpde & PG_W) == 0) {
/*
* Write protect the mapping to a
@@ -4873,10 +6241,21 @@
}
PMAP_UNLOCK(pmap);
}
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
+ if (!PMAP_TRYLOCK(pmap)) {
+ md_gen = m->md.pv_gen;
+ pvh_gen = pvh->pv_gen;
+ rw_wunlock(lock);
+ PMAP_LOCK(pmap);
+ rw_wlock(lock);
+ if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
+ PMAP_UNLOCK(pmap);
+ goto restart;
+ }
+ }
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
pde = pmap_pde(pmap, pv->pv_va);
KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
" a 2mpage in page %p's pv list", m));
@@ -4887,76 +6266,17 @@
}
PMAP_UNLOCK(pmap);
}
- rw_wunlock(&pvh_global_lock);
+ rw_wunlock(lock);
+ rw_runlock(&pvh_global_lock);
}
/*
- * pmap_clear_reference:
- *
- * Clear the reference bit on the specified physical page.
- */
-void
-pmap_clear_reference(vm_page_t m)
-{
- struct md_page *pvh;
- pmap_t pmap;
- pv_entry_t next_pv, pv;
- pd_entry_t oldpde, *pde;
- pt_entry_t *pte;
- vm_offset_t va;
-
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("pmap_clear_reference: page %p is not managed", m));
- rw_wlock(&pvh_global_lock);
- if ((m->flags & PG_FICTITIOUS) != 0)
- goto small_mappings;
- pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
- TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- va = pv->pv_va;
- pde = pmap_pde(pmap, va);
- oldpde = *pde;
- if ((oldpde & PG_A) != 0) {
- if (pmap_demote_pde(pmap, pde, va)) {
- /*
- * Remove the mapping to a single page so
- * that a subsequent access may repromote.
- * Since the underlying page table page is
- * fully populated, this removal never frees
- * a page table page.
- */
- va += VM_PAGE_TO_PHYS(m) - (oldpde &
- PG_PS_FRAME);
- pmap_remove_page(pmap, va, pde, NULL);
- }
- }
- PMAP_UNLOCK(pmap);
- }
-small_mappings:
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
- pmap = PV_PMAP(pv);
- PMAP_LOCK(pmap);
- pde = pmap_pde(pmap, pv->pv_va);
- KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
- " a 2mpage in page %p's pv list", m));
- pte = pmap_pde_to_pte(pde, pv->pv_va);
- if (*pte & PG_A) {
- atomic_clear_long(pte, PG_A);
- pmap_invalidate_page(pmap, pv->pv_va);
- }
- PMAP_UNLOCK(pmap);
- }
- rw_wunlock(&pvh_global_lock);
-}
-
-/*
* Miscellaneous support routines follow
*/
/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
-pmap_pte_attr(pt_entry_t *pte, int cache_bits)
+pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
{
u_int opte, npte;
@@ -4966,7 +6286,7 @@
*/
do {
opte = *(u_int *)pte;
- npte = opte & ~PG_PTE_CACHE;
+ npte = opte & ~mask;
npte |= cache_bits;
} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}
@@ -4973,7 +6293,7 @@
/* Adjust the cache mode for a 2MB page mapped via a PDE. */
static __inline void
-pmap_pde_attr(pd_entry_t *pde, int cache_bits)
+pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
{
u_int opde, npde;
@@ -4983,7 +6303,7 @@
*/
do {
opde = *(u_int *)pde;
- npde = opde & ~PG_PDE_CACHE;
+ npde = opde & ~mask;
npde |= cache_bits;
} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}
@@ -4997,28 +6317,58 @@
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
+ struct pmap_preinit_mapping *ppim;
vm_offset_t va, offset;
vm_size_t tmpsize;
+ int i;
- /*
- * If the specified range of physical addresses fits within the direct
- * map window, use the direct map.
- */
- if (pa < dmaplimit && pa + size < dmaplimit) {
- va = PHYS_TO_DMAP(pa);
- if (!pmap_change_attr(va, size, mode))
- return ((void *)va);
- }
offset = pa & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
- va = kmem_alloc_nofault(kernel_map, size);
- if (!va)
- panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
+ size = round_page(offset + size);
pa = trunc_page(pa);
+
+ if (!pmap_initialized) {
+ va = 0;
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == 0) {
+ ppim->pa = pa;
+ ppim->sz = size;
+ ppim->mode = mode;
+ ppim->va = virtual_avail;
+ virtual_avail += size;
+ va = ppim->va;
+ break;
+ }
+ }
+ if (va == 0)
+ panic("%s: too many preinit mappings", __func__);
+ } else {
+ /*
+ * If we have a preinit mapping, re-use it.
+ */
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->pa == pa && ppim->sz == size &&
+ ppim->mode == mode)
+ return ((void *)(ppim->va + offset));
+ }
+ /*
+ * If the specified range of physical addresses fits within
+ * the direct map window, use the direct map.
+ */
+ if (pa < dmaplimit && pa + size < dmaplimit) {
+ va = PHYS_TO_DMAP(pa);
+ if (!pmap_change_attr(va, size, mode))
+ return ((void *)(va + offset));
+ }
+ va = kva_alloc(size);
+ if (va == 0)
+ panic("%s: Couldn't allocate KVA", __func__);
+ }
for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
- pmap_invalidate_cache_range(va, va + tmpsize);
+ pmap_invalidate_cache_range(va, va + tmpsize, FALSE);
return ((void *)(va + offset));
}
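Aside on the argument normalization above (illustrative, not from the patch): before picking a preinit slot, the direct map, or kva_alloc(), pmap_mapdev_attr() splits off the sub-page offset, rounds the size up to whole pages including that offset, and truncates pa to a page boundary; the caller gets back va + offset. The arithmetic spelled out stand-alone:

#include <stdint.h>
#include <stdio.h>

#define	PAGE_SIZE	4096UL
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)
#define	trunc_page(x)	((x) & ~PAGE_MASK)

int
main(void)
{
	uint64_t pa = 0xfed00321ULL;	/* device register, not page aligned */
	uint64_t size = 0x1000ULL;	/* caller asks for 4KB */
	uint64_t offset;

	offset = pa & PAGE_MASK;		/* 0x321 */
	size = round_page(offset + size);	/* two pages: 0x2000 */
	pa = trunc_page(pa);			/* 0xfed00000 */
	printf("map %#llx bytes at %#llx, hand back va + %#llx\n",
	    (unsigned long long)size, (unsigned long long)pa,
	    (unsigned long long)offset);
	return (0);
}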
@@ -5039,15 +6389,32 @@
void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
- vm_offset_t base, offset;
+ struct pmap_preinit_mapping *ppim;
+ vm_offset_t offset;
+ int i;
/* If we gave a direct map region in pmap_mapdev, do nothing */
if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
return;
- base = trunc_page(va);
offset = va & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
- kmem_free(kernel_map, base, size);
+ size = round_page(offset + size);
+ va = trunc_page(va);
+ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
+ ppim = pmap_preinit_mapping + i;
+ if (ppim->va == va && ppim->sz == size) {
+ if (pmap_initialized)
+ return;
+ ppim->pa = 0;
+ ppim->va = 0;
+ ppim->sz = 0;
+ ppim->mode = 0;
+ if (va + size == virtual_avail)
+ virtual_avail = va;
+ return;
+ }
+ }
+ if (pmap_initialized)
+ kva_free(va, size);
}
/*
@@ -5058,9 +6425,15 @@
{
pdp_entry_t newpdpe, oldpdpe;
pd_entry_t *firstpde, newpde, *pde;
+ pt_entry_t PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t mpdepa;
vm_page_t mpde;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpdpe = *pdpe;
KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
@@ -5158,7 +6531,7 @@
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
vm_offset_t base, offset, tmpva;
- vm_paddr_t pa_start, pa_end;
+ vm_paddr_t pa_start, pa_end, pa_end1;
pdp_entry_t *pdpe;
pd_entry_t *pde;
pt_entry_t *pte;
@@ -5168,7 +6541,7 @@
PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
base = trunc_page(va);
offset = va & PAGE_MASK;
- size = roundup(offset + size, PAGE_SIZE);
+ size = round_page(offset + size);
/*
* Only supported on kernel virtual addresses, including the direct
@@ -5177,8 +6550,8 @@
if (base < DMAP_MIN_ADDRESS)
return (EINVAL);
- cache_bits_pde = pmap_cache_bits(mode, 1);
- cache_bits_pte = pmap_cache_bits(mode, 0);
+ cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
+ cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
changed = FALSE;
/*
@@ -5187,7 +6560,7 @@
*/
for (tmpva = base; tmpva < base + size; ) {
pdpe = pmap_pdpe(kernel_pmap, tmpva);
- if (*pdpe == 0)
+ if (pdpe == NULL || *pdpe == 0)
return (EINVAL);
if (*pdpe & PG_PS) {
/*
@@ -5195,7 +6568,7 @@
* memory type, then we need not demote this page. Just
* increment tmpva to the next 1GB page frame.
*/
- if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_1gpage(tmpva) + NBPDP;
continue;
}
@@ -5222,7 +6595,7 @@
* memory type, then we need not demote this page. Just
* increment tmpva to the next 2MB page frame.
*/
- if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
+ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
tmpva = trunc_2mpage(tmpva) + NBPDR;
continue;
}
@@ -5255,11 +6628,13 @@
for (tmpva = base; tmpva < base + size; ) {
pdpe = pmap_pdpe(kernel_pmap, tmpva);
if (*pdpe & PG_PS) {
- if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pdpe, cache_bits_pde);
+ if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pdpe, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pdpe & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pdpe & PG_PS_FRAME;
@@ -5283,11 +6658,13 @@
}
pde = pmap_pdpe_to_pde(pdpe, tmpva);
if (*pde & PG_PS) {
- if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
- pmap_pde_attr(pde, cache_bits_pde);
+ if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
+ pmap_pde_attr(pde, cache_bits_pde,
+ X86_PG_PDE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pde & PG_PS_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pde & PG_PS_FRAME;
@@ -5309,11 +6686,13 @@
tmpva = trunc_2mpage(tmpva) + NBPDR;
} else {
pte = pmap_pde_to_pte(pde, tmpva);
- if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
- pmap_pte_attr(pte, cache_bits_pte);
+ if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
+ pmap_pte_attr(pte, cache_bits_pte,
+ X86_PG_PTE_CACHE);
changed = TRUE;
}
- if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
+ if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
+ (*pte & PG_FRAME) < dmaplimit) {
if (pa_start == pa_end) {
/* Start physical address run. */
pa_start = *pte & PG_FRAME;
@@ -5335,9 +6714,12 @@
tmpva += PAGE_SIZE;
}
}
- if (error == 0 && pa_start != pa_end)
- error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
- pa_end - pa_start, mode);
+ if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
+ pa_end1 = MIN(pa_end, dmaplimit);
+ if (pa_start != pa_end1)
+ error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
+ pa_end1 - pa_start, mode);
+ }
/*
* Flush CPU caches if required to make sure any data isn't cached that
@@ -5345,7 +6727,7 @@
*/
if (changed) {
pmap_invalidate_range(kernel_pmap, base, tmpva);
- pmap_invalidate_cache_range(base, tmpva);
+ pmap_invalidate_cache_range(base, tmpva, FALSE);
}
return (error);
}
@@ -5375,7 +6757,7 @@
changed = FALSE;
PMAP_LOCK(kernel_pmap);
pdpe = pmap_pdpe(kernel_pmap, va);
- if ((*pdpe & PG_V) == 0)
+ if ((*pdpe & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDPE");
if ((*pdpe & PG_PS) != 0) {
if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
@@ -5384,7 +6766,7 @@
}
if (len < NBPDR) {
pde = pmap_pdpe_to_pde(pdpe, va);
- if ((*pde & PG_V) == 0)
+ if ((*pde & X86_PG_V) == 0)
panic("pmap_demote_DMAP: invalid PDE");
if ((*pde & PG_PS) != 0) {
if (!pmap_demote_pde(kernel_pmap, pde, va))
@@ -5405,10 +6787,15 @@
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
pd_entry_t *pdep;
- pt_entry_t pte;
+ pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa;
int val;
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
PMAP_LOCK(pmap);
retry:
pdep = pmap_pde(pmap, addr);
@@ -5453,7 +6840,6 @@
{
pmap_t pmap, oldpmap;
u_int cpuid;
- u_int64_t cr3;
critical_enter();
pmap = vmspace_pmap(td->td_proc->p_vmspace);
@@ -5462,13 +6848,14 @@
#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+ CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
#else
CPU_CLR(cpuid, &oldpmap->pm_active);
CPU_SET(cpuid, &pmap->pm_active);
+ CPU_SET(cpuid, &pmap->pm_save);
#endif
- cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
- td->td_pcb->pcb_cr3 = cr3;
- load_cr3(cr3);
+ td->td_pcb->pcb_cr3 = pmap->pm_cr3;
+ load_cr3(pmap->pm_cr3);
PCPU_SET(curpmap, pmap);
critical_exit();
}
@@ -5502,6 +6889,164 @@
*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}
+#ifdef INVARIANTS
+static unsigned long num_dirty_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
+ &num_dirty_emulations, 0, NULL);
+
+static unsigned long num_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
+ &num_accessed_emulations, 0, NULL);
+
+static unsigned long num_superpage_accessed_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
+ &num_superpage_accessed_emulations, 0, NULL);
+
+static unsigned long ad_emulation_superpage_promotions;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
+ &ad_emulation_superpage_promotions, 0, NULL);
+#endif /* INVARIANTS */
+
+int
+pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
+{
+ int rv;
+ struct rwlock *lock;
+ vm_page_t m, mpte;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
+ boolean_t pv_lists_locked;
+
+ KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
+ ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
+
+ if (!pmap_emulate_ad_bits(pmap))
+ return (-1);
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+ rv = -1;
+ lock = NULL;
+ pv_lists_locked = FALSE;
+retry:
+ PMAP_LOCK(pmap);
+
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL || (*pde & PG_V) == 0)
+ goto done;
+
+ if ((*pde & PG_PS) != 0) {
+ if (ftype == VM_PROT_READ) {
+#ifdef INVARIANTS
+ atomic_add_long(&num_superpage_accessed_emulations, 1);
+#endif
+ *pde |= PG_A;
+ rv = 0;
+ }
+ goto done;
+ }
+
+ pte = pmap_pde_to_pte(pde, va);
+ if ((*pte & PG_V) == 0)
+ goto done;
+
+ if (ftype == VM_PROT_WRITE) {
+ if ((*pte & PG_RW) == 0)
+ goto done;
+ /*
+ * Set the modified and accessed bits simultaneously.
+ *
+ * Intel EPT PTEs that do software emulation of A/D bits map
+ * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
+ * An EPT misconfiguration is triggered if the PTE is writable
+ * but not readable (WR=10). This is avoided by setting PG_A
+ * and PG_M simultaneously.
+ */
+ *pte |= PG_M | PG_A;
+ } else {
+ *pte |= PG_A;
+ }
+
+ /* try to promote the mapping */
+ if (va < VM_MAXUSER_ADDRESS)
+ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+ else
+ mpte = NULL;
+
+ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+
+ if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+ pmap_ps_enabled(pmap) &&
+ (m->flags & PG_FICTITIOUS) == 0 &&
+ vm_reserv_level_iffullpop(m) == 0) {
+ if (!pv_lists_locked) {
+ pv_lists_locked = TRUE;
+ if (!rw_try_rlock(&pvh_global_lock)) {
+ PMAP_UNLOCK(pmap);
+ rw_rlock(&pvh_global_lock);
+ goto retry;
+ }
+ }
+ pmap_promote_pde(pmap, pde, va, &lock);
+#ifdef INVARIANTS
+ atomic_add_long(&ad_emulation_superpage_promotions, 1);
+#endif
+ }
+#ifdef INVARIANTS
+ if (ftype == VM_PROT_WRITE)
+ atomic_add_long(&num_dirty_emulations, 1);
+ else
+ atomic_add_long(&num_accessed_emulations, 1);
+#endif
+ rv = 0; /* success */
+done:
+ if (lock != NULL)
+ rw_wunlock(lock);
+ if (pv_lists_locked)
+ rw_runlock(&pvh_global_lock);
+ PMAP_UNLOCK(pmap);
+ return (rv);
+}
+
+void
+pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
+{
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pde;
+ pt_entry_t *pte, PG_V;
+ int idx;
+
+ idx = 0;
+ PG_V = pmap_valid_bit(pmap);
+ PMAP_LOCK(pmap);
+
+ pml4 = pmap_pml4e(pmap, va);
+ ptr[idx++] = *pml4;
+ if ((*pml4 & PG_V) == 0)
+ goto done;
+
+ pdp = pmap_pml4e_to_pdpe(pml4, va);
+ ptr[idx++] = *pdp;
+ if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
+ goto done;
+
+ pde = pmap_pdpe_to_pde(pdp, va);
+ ptr[idx++] = *pde;
+ if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
+ goto done;
+
+ pte = pmap_pde_to_pte(pde, va);
+ ptr[idx++] = *pte;
+
+done:
+ PMAP_UNLOCK(pmap);
+ *num = idx;
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
@@ -5512,7 +7057,7 @@
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
- pt_entry_t *pte;
+ pt_entry_t *pte, PG_V;
vm_offset_t va;
if (have_addr) {
@@ -5522,6 +7067,7 @@
db_printf("show pte addr\n");
return;
}
+ PG_V = pmap_valid_bit(pmap);
pml4 = pmap_pml4e(pmap, va);
db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
if ((*pml4 & PG_V) == 0) {
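For reference, the new pmap_get_mapping() helper above returns the PML4, PDP, PD and PT entries selected by the 4-level walk for a given virtual address. The index arithmetic behind such a walk is architectural (9 index bits per level over a 12-bit page offset); the standalone sketch below only illustrates which slot each level uses and is not taken from the kernel sources.

    #include <stdio.h>
    #include <stdint.h>

    /* x86-64 long mode: 9 index bits per paging level, 12-bit page offset. */
    #define PT_IDX(va)   (((va) >> 12) & 0x1ff)   /* page table       */
    #define PD_IDX(va)   (((va) >> 21) & 0x1ff)   /* page directory   */
    #define PDP_IDX(va)  (((va) >> 30) & 0x1ff)   /* page dir pointer */
    #define PML4_IDX(va) (((va) >> 39) & 0x1ff)   /* PML4             */

    int
    main(void)
    {
            uint64_t va = 0x00007fffdeadb000ULL;    /* arbitrary user VA */

            printf("PML4 %ju PDP %ju PD %ju PT %ju offset %#jx\n",
                (uintmax_t)PML4_IDX(va), (uintmax_t)PDP_IDX(va),
                (uintmax_t)PD_IDX(va), (uintmax_t)PT_IDX(va),
                (uintmax_t)(va & 0xfff));
            return (0);
    }
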
Modified: trunk/sys/amd64/amd64/prof_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/prof_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/prof_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1996 Bruce D. Evans.
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/prof_machdep.c 220433 2011-04-07 23:28:28Z jkim $");
#ifdef GUPROF
#if 0
Modified: trunk/sys/amd64/amd64/ptrace_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/ptrace_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/ptrace_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2011 Konstantin Belousov <kib at FreeBSD.org>
* All rights reserved.
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/ptrace_machdep.c 332069 2018-04-05 13:39:53Z kib $");
#include "opt_compat.h"
@@ -36,12 +37,27 @@
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/sysent.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
+#include <machine/frame.h>
+#include <machine/vmparam.h>
+#ifdef COMPAT_FREEBSD32
+struct ptrace_xstate_info32 {
+ uint32_t xsave_mask1, xsave_mask2;
+ uint32_t xsave_len;
+};
+#endif
+
static int
cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data)
{
+ struct ptrace_xstate_info info;
+#ifdef COMPAT_FREEBSD32
+ struct ptrace_xstate_info32 info32;
+#endif
char *savefpu;
int error;
@@ -49,7 +65,7 @@
return (EOPNOTSUPP);
switch (req) {
- case PT_GETXSTATE:
+ case PT_GETXSTATE_OLD:
fpugetregs(td);
savefpu = (char *)(get_pcb_user_save_td(td) + 1);
error = copyout(savefpu, addr,
@@ -56,7 +72,7 @@
cpu_max_ext_state_size - sizeof(struct savefpu));
break;
- case PT_SETXSTATE:
+ case PT_SETXSTATE_OLD:
if (data > cpu_max_ext_state_size - sizeof(struct savefpu)) {
error = EINVAL;
break;
@@ -70,6 +86,52 @@
free(savefpu, M_TEMP);
break;
+ case PT_GETXSTATE_INFO:
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ if (data != sizeof(info32)) {
+ error = EINVAL;
+ } else {
+ info32.xsave_len = cpu_max_ext_state_size;
+ info32.xsave_mask1 = xsave_mask;
+ info32.xsave_mask2 = xsave_mask >> 32;
+ error = copyout(&info32, addr, data);
+ }
+ } else
+#endif
+ {
+ if (data != sizeof(info)) {
+ error = EINVAL;
+ } else {
+ bzero(&info, sizeof(info));
+ info.xsave_len = cpu_max_ext_state_size;
+ info.xsave_mask = xsave_mask;
+ error = copyout(&info, addr, data);
+ }
+ }
+ break;
+
+ case PT_GETXSTATE:
+ fpugetregs(td);
+ savefpu = (char *)(get_pcb_user_save_td(td));
+ error = copyout(savefpu, addr, cpu_max_ext_state_size);
+ break;
+
+ case PT_SETXSTATE:
+ if (data < sizeof(struct savefpu) ||
+ data > cpu_max_ext_state_size) {
+ error = EINVAL;
+ break;
+ }
+ savefpu = malloc(data, M_TEMP, M_WAITOK);
+ error = copyin(addr, savefpu, data);
+ if (error == 0)
+ error = fpusetregs(td, (struct savefpu *)savefpu,
+ savefpu + sizeof(struct savefpu), data -
+ sizeof(struct savefpu));
+ free(savefpu, M_TEMP);
+ break;
+
default:
error = EINVAL;
break;
@@ -78,16 +140,29 @@
return (error);
}
+static void
+cpu_ptrace_setbase(struct thread *td, int req, register_t r)
+{
+
+ if (req == PT_SETFSBASE) {
+ td->td_pcb->pcb_fsbase = r;
+ td->td_frame->tf_fs = _ufssel;
+ } else {
+ td->td_pcb->pcb_gsbase = r;
+ td->td_frame->tf_gs = _ugssel;
+ }
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
+}
+
#ifdef COMPAT_FREEBSD32
#define PT_I386_GETXMMREGS (PT_FIRSTMACH + 0)
#define PT_I386_SETXMMREGS (PT_FIRSTMACH + 1)
-#define PT_I386_GETXSTATE (PT_FIRSTMACH + 2)
-#define PT_I386_SETXSTATE (PT_FIRSTMACH + 3)
static int
cpu32_ptrace(struct thread *td, int req, void *addr, int data)
{
struct savefpu *fpstate;
+ uint32_t r;
int error;
switch (req) {
@@ -104,14 +179,37 @@
fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
break;
- case PT_I386_GETXSTATE:
- error = cpu_ptrace_xstate(td, PT_GETXSTATE, addr, data);
+ case PT_GETXSTATE_OLD:
+ case PT_SETXSTATE_OLD:
+ case PT_GETXSTATE_INFO:
+ case PT_GETXSTATE:
+ case PT_SETXSTATE:
+ error = cpu_ptrace_xstate(td, req, addr, data);
break;
- case PT_I386_SETXSTATE:
- error = cpu_ptrace_xstate(td, PT_SETXSTATE, addr, data);
+ case PT_GETFSBASE:
+ case PT_GETGSBASE:
+ if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ error = EINVAL;
+ break;
+ }
+ r = req == PT_GETFSBASE ? td->td_pcb->pcb_fsbase :
+ td->td_pcb->pcb_gsbase;
+ error = copyout(&r, addr, sizeof(r));
break;
+ case PT_SETFSBASE:
+ case PT_SETGSBASE:
+ if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ error = EINVAL;
+ break;
+ }
+ error = copyin(addr, &r, sizeof(r));
+ if (error != 0)
+ break;
+ cpu_ptrace_setbase(td, req, r);
+ break;
+
default:
error = EINVAL;
break;
@@ -124,6 +222,7 @@
int
cpu_ptrace(struct thread *td, int req, void *addr, int data)
{
+ register_t *r, rv;
int error;
#ifdef COMPAT_FREEBSD32
@@ -131,18 +230,40 @@
return (cpu32_ptrace(td, req, addr, data));
#endif
- /* Support old values of PT_GETXSTATE and PT_SETXSTATE. */
+ /* Support old values of PT_GETXSTATE_OLD and PT_SETXSTATE_OLD. */
if (req == PT_FIRSTMACH + 0)
- req = PT_GETXSTATE;
+ req = PT_GETXSTATE_OLD;
if (req == PT_FIRSTMACH + 1)
- req = PT_SETXSTATE;
+ req = PT_SETXSTATE_OLD;
switch (req) {
+ case PT_GETXSTATE_OLD:
+ case PT_SETXSTATE_OLD:
+ case PT_GETXSTATE_INFO:
case PT_GETXSTATE:
case PT_SETXSTATE:
error = cpu_ptrace_xstate(td, req, addr, data);
break;
+ case PT_GETFSBASE:
+ case PT_GETGSBASE:
+ r = req == PT_GETFSBASE ? &td->td_pcb->pcb_fsbase :
+ &td->td_pcb->pcb_gsbase;
+ error = copyout(r, addr, sizeof(*r));
+ break;
+
+ case PT_SETFSBASE:
+ case PT_SETGSBASE:
+ error = copyin(addr, &rv, sizeof(rv));
+ if (error != 0)
+ break;
+ if (rv >= td->td_proc->p_sysent->sv_maxuser) {
+ error = EINVAL;
+ break;
+ }
+ cpu_ptrace_setbase(td, req, rv);
+ break;
+
default:
error = EINVAL;
break;
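The new PT_GETFSBASE/PT_SETFSBASE (and GS equivalents) requests make the segment bases reachable from a debugger once the target is stopped. Below is a minimal userland sketch, assuming the request codes are exposed by <sys/ptrace.h> on the running system and that addr points at a register_t as in the kernel code above; consult ptrace(2) before relying on the exact calling convention.

    #include <sys/types.h>
    #include <sys/ptrace.h>

    #include <err.h>
    #include <stdio.h>

    /* Print the fs base of an already-attached, stopped process. */
    static void
    show_fsbase(pid_t pid)
    {
            register_t base;

            if (ptrace(PT_GETFSBASE, pid, (caddr_t)&base, 0) == -1)
                    err(1, "PT_GETFSBASE");
            printf("pid %d fsbase %#lx\n", (int)pid, (unsigned long)base);
    }
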
Modified: trunk/sys/amd64/amd64/sigtramp.S
===================================================================
--- trunk/sys/amd64/amd64/sigtramp.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/sigtramp.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm <peter at freeBSD.org>
* All rights reserved.
@@ -23,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/sigtramp.S 114349 2003-05-01 01:05:25Z peter $
*/
#include <sys/syscall.h>
Property changes on: trunk/sys/amd64/amd64/sigtramp.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/amd64/amd64/stack_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/stack_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/stack_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2005 Antoine Brodin
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/stack_machdep.c 286396 2015-08-07 04:31:02Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -40,7 +41,7 @@
#include <vm/pmap.h>
static void
-stack_capture(struct stack *st, register_t rbp)
+stack_capture(struct thread *td, struct stack *st, register_t rbp)
{
struct amd64_frame *frame;
vm_offset_t callpc;
@@ -56,8 +57,8 @@
if (stack_put(st, callpc) == -1)
break;
if (frame->f_frame <= frame ||
- (vm_offset_t)frame->f_frame >=
- (vm_offset_t)rbp + KSTACK_PAGES * PAGE_SIZE)
+ (vm_offset_t)frame->f_frame >= td->td_kstack +
+ td->td_kstack_pages * PAGE_SIZE)
break;
frame = frame->f_frame;
}
@@ -74,7 +75,7 @@
panic("stack_save_td: running");
rbp = td->td_pcb->pcb_rbp;
- stack_capture(st, rbp);
+ stack_capture(td, st, rbp);
}
void
@@ -83,5 +84,5 @@
register_t rbp;
__asm __volatile("movq %%rbp,%0" : "=r" (rbp));
- stack_capture(st, rbp);
+ stack_capture(curthread, st, rbp);
}
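stack_capture() now takes the owning thread and bounds the frame-pointer walk by that thread's kernel stack (td_kstack .. td_kstack + td_kstack_pages * PAGE_SIZE) instead of deriving the limit from the starting %rbp. The underlying technique is the usual saved-frame-pointer chain walk with an explicit stack-range check; the sketch below is an illustrative userland rendering, not the kernel's struct layout.

    #include <stddef.h>
    #include <stdint.h>

    struct frame {
            struct frame    *f_frame;       /* saved frame pointer */
            void            *f_retaddr;     /* saved return address */
    };

    /* Collect return addresses while the chain stays inside [base, base+size). */
    static int
    walk_frames(struct frame *fp, uintptr_t base, size_t size,
        void **pcs, int npcs)
    {
            int n = 0;

            while (n < npcs) {
                    if ((uintptr_t)fp < base || (uintptr_t)fp >= base + size)
                            break;                  /* left the stack */
                    pcs[n++] = fp->f_retaddr;
                    if (fp->f_frame <= fp)          /* chain must move upward */
                            break;
                    fp = fp->f_frame;
            }
            return (n);
    }
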
Modified: trunk/sys/amd64/amd64/support.S
===================================================================
--- trunk/sys/amd64/amd64/support.S 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/support.S 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1993 The Regents of the University of California.
@@ -27,7 +28,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/amd64/amd64/support.S 310975 2016-12-31 13:23:28Z mjg $
*/
#include "opt_ddb.h"
@@ -47,6 +48,7 @@
/* done */
ENTRY(bzero)
+ PUSH_FRAME_POINTER
movq %rsi,%rcx
xorl %eax,%eax
shrq $3,%rcx
@@ -57,26 +59,23 @@
andq $7,%rcx
rep
stosb
+ POP_FRAME_POINTER
ret
END(bzero)
-
+
/* Address: %rdi */
ENTRY(pagezero)
- movq $-PAGE_SIZE,%rdx
- subq %rdx,%rdi
+ PUSH_FRAME_POINTER
+ movq $PAGE_SIZE/8,%rcx
xorl %eax,%eax
-1:
- movnti %rax,(%rdi,%rdx)
- movnti %rax,8(%rdi,%rdx)
- movnti %rax,16(%rdi,%rdx)
- movnti %rax,24(%rdi,%rdx)
- addq $32,%rdx
- jne 1b
- sfence
+ rep
+ stosq
+ POP_FRAME_POINTER
ret
END(pagezero)
ENTRY(bcmp)
+ PUSH_FRAME_POINTER
movq %rdx,%rcx
shrq $3,%rcx
cld /* compare forwards */
@@ -91,6 +90,7 @@
1:
setne %al
movsbl %al,%eax
+ POP_FRAME_POINTER
ret
END(bcmp)
@@ -100,6 +100,7 @@
* ws at tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
ENTRY(bcopy)
+ PUSH_FRAME_POINTER
xchgq %rsi,%rdi
movq %rdx,%rcx
@@ -116,6 +117,7 @@
andq $7,%rcx /* any bytes left? */
rep
movsb
+ POP_FRAME_POINTER
ret
/* ALIGN_TEXT */
@@ -135,13 +137,16 @@
rep
movsq
cld
+ POP_FRAME_POINTER
ret
END(bcopy)
-
+
/*
* Note: memcpy does not support overlapping copies
*/
ENTRY(memcpy)
+ PUSH_FRAME_POINTER
+ movq %rdi,%rax
movq %rdx,%rcx
shrq $3,%rcx /* copy by 64-bit words */
cld /* copy forwards */
@@ -151,6 +156,7 @@
andq $7,%rcx /* any bytes left? */
rep
movsb
+ POP_FRAME_POINTER
ret
END(memcpy)
@@ -158,6 +164,7 @@
* pagecopy(%rdi=from, %rsi=to)
*/
ENTRY(pagecopy)
+ PUSH_FRAME_POINTER
movq $-PAGE_SIZE,%rax
movq %rax,%rdx
subq %rax,%rdi
@@ -178,18 +185,21 @@
addq $32,%rdx
jne 2b
sfence
+ POP_FRAME_POINTER
ret
END(pagecopy)
-/* fillw(pat, base, cnt) */
+/* fillw(pat, base, cnt) */
/* %rdi,%rsi, %rdx */
ENTRY(fillw)
- movq %rdi,%rax
+ PUSH_FRAME_POINTER
+ movq %rdi,%rax
movq %rsi,%rdi
movq %rdx,%rcx
cld
rep
stosw
+ POP_FRAME_POINTER
ret
END(fillw)
@@ -210,6 +220,7 @@
* %rdi, %rsi, %rdx
*/
ENTRY(copyout)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rax
movq $copyout_fault,PCB_ONFAULT(%rax)
testq %rdx,%rdx /* anything to do? */
@@ -255,6 +266,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rdx
movq %rax,PCB_ONFAULT(%rdx)
+ POP_FRAME_POINTER
ret
ALIGN_TEXT
@@ -262,6 +274,7 @@
movq PCPU(CURPCB),%rdx
movq $0,PCB_ONFAULT(%rdx)
movq $EFAULT,%rax
+ POP_FRAME_POINTER
ret
END(copyout)
@@ -270,6 +283,7 @@
* %rdi, %rsi, %rdx
*/
ENTRY(copyin)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rax
movq $copyin_fault,PCB_ONFAULT(%rax)
testq %rdx,%rdx /* anything to do? */
@@ -301,6 +315,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rdx
movq %rax,PCB_ONFAULT(%rdx)
+ POP_FRAME_POINTER
ret
ALIGN_TEXT
@@ -308,16 +323,19 @@
movq PCPU(CURPCB),%rdx
movq $0,PCB_ONFAULT(%rdx)
movq $EFAULT,%rax
+ POP_FRAME_POINTER
ret
END(copyin)
/*
- * casuword32. Compare and set user integer. Returns -1 or the current value.
- * dst = %rdi, old = %rsi, new = %rdx
+ * casueword32. Compare and set user integer. Returns -1 on fault,
+ * 0 if access was successful. Old value is written to *oldp.
+ * dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
*/
-ENTRY(casuword32)
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
+ENTRY(casueword32)
+ PUSH_FRAME_POINTER
+ movq PCPU(CURPCB),%r8
+ movq $fusufault,PCB_ONFAULT(%r8)
movq $VM_MAXUSER_ADDRESS-4,%rax
cmpq %rax,%rdi /* verify address is valid */
@@ -327,26 +345,36 @@
#ifdef SMP
lock
#endif
- cmpxchgl %edx,(%rdi) /* new = %edx */
+ cmpxchgl %ecx,(%rdi) /* new = %ecx */
/*
* The old value is in %eax. If the store succeeded it will be the
* value we expected (old) from before the store, otherwise it will
- * be the current value.
+ * be the current value. Save %eax into %esi to prepare the return
+ * value.
*/
+ movl %eax,%esi
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
- movq PCPU(CURPCB),%rcx
- movq $0,PCB_ONFAULT(%rcx)
+ /*
+ * Access the oldp after the pcb_onfault is cleared, to correctly
+ * catch corrupted pointer.
+ */
+ movl %esi,(%rdx) /* oldp = %rdx */
+ POP_FRAME_POINTER
ret
-END(casuword32)
+END(casueword32)
/*
- * casuword. Compare and set user word. Returns -1 or the current value.
- * dst = %rdi, old = %rsi, new = %rdx
+ * casueword. Compare and set user long. Returns -1 on fault,
+ * 0 if access was successful. Old value is written to *oldp.
+ * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
*/
-ENTRY(casuword)
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
+ENTRY(casueword)
+ PUSH_FRAME_POINTER
+ movq PCPU(CURPCB),%r8
+ movq $fusufault,PCB_ONFAULT(%r8)
movq $VM_MAXUSER_ADDRESS-4,%rax
cmpq %rax,%rdi /* verify address is valid */
@@ -356,28 +384,30 @@
#ifdef SMP
lock
#endif
- cmpxchgq %rdx,(%rdi) /* new = %rdx */
+ cmpxchgq %rcx,(%rdi) /* new = %rcx */
/*
- * The old value is in %eax. If the store succeeded it will be the
+ * The old value is in %rax. If the store succeeded it will be the
* value we expected (old) from before the store, otherwise it will
* be the current value.
*/
-
- movq PCPU(CURPCB),%rcx
- movq $fusufault,PCB_ONFAULT(%rcx)
- movq $0,PCB_ONFAULT(%rcx)
+ movq %rax,%rsi
+ xorl %eax,%eax
+ movq %rax,PCB_ONFAULT(%r8)
+ movq %rsi,(%rdx)
+ POP_FRAME_POINTER
ret
-END(casuword)
+END(casueword)
/*
* Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
- * byte from user memory. All these functions are MPSAFE.
- * addr = %rdi
+ * byte from user memory.
+ * addr = %rdi, valp = %rsi
*/
-ALTENTRY(fuword64)
-ENTRY(fuword)
+ALTENTRY(fueword64)
+ENTRY(fueword)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -385,13 +415,17 @@
cmpq %rax,%rdi /* verify address is valid */
ja fusufault
- movq (%rdi),%rax
- movq $0,PCB_ONFAULT(%rcx)
+ xorl %eax,%eax
+ movq (%rdi),%r11
+ movq %rax,PCB_ONFAULT(%rcx)
+ movq %r11,(%rsi)
+ POP_FRAME_POINTER
ret
-END(fuword64)
-END(fuword)
+END(fueword64)
+END(fueword)
-ENTRY(fuword32)
+ENTRY(fueword32)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -399,10 +433,13 @@
cmpq %rax,%rdi /* verify address is valid */
ja fusufault
- movl (%rdi),%eax
- movq $0,PCB_ONFAULT(%rcx)
+ xorl %eax,%eax
+ movl (%rdi),%r11d
+ movq %rax,PCB_ONFAULT(%rcx)
+ movl %r11d,(%rsi)
+ POP_FRAME_POINTER
ret
-END(fuword32)
+END(fueword32)
/*
* fuswintr() and suswintr() are specialized variants of fuword16() and
@@ -419,6 +456,7 @@
END(fuswintr)
ENTRY(fuword16)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -428,10 +466,12 @@
movzwl (%rdi),%eax
movq $0,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(fuword16)
ENTRY(fubyte)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -441,6 +481,7 @@
movzbl (%rdi),%eax
movq $0,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(fubyte)
@@ -450,6 +491,7 @@
xorl %eax,%eax
movq %rax,PCB_ONFAULT(%rcx)
decq %rax
+ POP_FRAME_POINTER
ret
/*
@@ -459,6 +501,7 @@
*/
ALTENTRY(suword64)
ENTRY(suword)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -470,11 +513,13 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword64)
END(suword)
ENTRY(suword32)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -486,10 +531,12 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword32)
ENTRY(suword16)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -501,10 +548,12 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx /* restore trashed register */
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(suword16)
ENTRY(subyte)
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%rcx
movq $fusufault,PCB_ONFAULT(%rcx)
@@ -517,6 +566,7 @@
xorl %eax,%eax
movq PCPU(CURPCB),%rcx /* restore trashed register */
movq %rax,PCB_ONFAULT(%rcx)
+ POP_FRAME_POINTER
ret
END(subyte)
@@ -530,6 +580,7 @@
* return the actual length in *lencopied.
*/
ENTRY(copyinstr)
+ PUSH_FRAME_POINTER
movq %rdx,%r8 /* %r8 = maxlen */
movq %rcx,%r9 /* %r9 = *len */
xchgq %rdi,%rsi /* %rdi = from, %rsi = to */
@@ -586,6 +637,7 @@
subq %rdx,%r8
movq %r8,(%r9)
1:
+ POP_FRAME_POINTER
ret
END(copyinstr)
@@ -594,6 +646,7 @@
* %rdi, %rsi, %rdx, %rcx
*/
ENTRY(copystr)
+ PUSH_FRAME_POINTER
movq %rdx,%r8 /* %r8 = maxlen */
xchgq %rdi,%rsi
@@ -623,6 +676,7 @@
subq %rdx,%r8
movq %r8,(%rcx)
7:
+ POP_FRAME_POINTER
ret
END(copystr)
@@ -692,6 +746,7 @@
*/
ENTRY(rdmsr_safe)
/* int rdmsr_safe(u_int msr, uint64_t *data) */
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%r8
movq $msr_onfault,PCB_ONFAULT(%r8)
movl %edi,%ecx
@@ -703,6 +758,7 @@
movq %rax,(%rsi)
xorq %rax,%rax
movq %rax,PCB_ONFAULT(%r8)
+ POP_FRAME_POINTER
ret
/*
@@ -710,6 +766,7 @@
*/
ENTRY(wrmsr_safe)
/* int wrmsr_safe(u_int msr, uint64_t data) */
+ PUSH_FRAME_POINTER
movq PCPU(CURPCB),%r8
movq $msr_onfault,PCB_ONFAULT(%r8)
movl %edi,%ecx
@@ -720,6 +777,7 @@
hi byte in edx, lo in %eax. */
xorq %rax,%rax
movq %rax,PCB_ONFAULT(%r8)
+ POP_FRAME_POINTER
ret
/*
@@ -729,4 +787,5 @@
msr_onfault:
movq $0,PCB_ONFAULT(%r8)
movl $EFAULT,%eax
+ POP_FRAME_POINTER
ret
Property changes on: trunk/sys/amd64/amd64/support.S
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
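The fuword()/casuword() to fueword()/casueword() rename above also changes the calling convention: the new routines return 0 on success or -1 on fault and hand the fetched or observed value back through a pointer, so -1 is no longer ambiguous between a fault and a legitimate value. A hedged caller-side sketch of the new convention follows; the function names match the diff, while the surrounding variables and error handling are illustrative only.

    long val;
    uint32_t expected, observed;

    /* Fetch a user long; -1 now unambiguously means a fault. */
    if (fueword(uaddr, &val) == -1)
            return (EFAULT);

    /* Compare-and-set a user 32-bit word. */
    expected = 0;
    if (casueword32(uaddr32, expected, &observed, newval) == -1)
            return (EFAULT);
    if (observed != expected) {
            /* CAS did not take effect; 'observed' is what was actually there. */
    }
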
Modified: trunk/sys/amd64/amd64/sys_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/sys_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/sys_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1990 The Regents of the University of California.
@@ -31,13 +32,13 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/sys_machdep.c 307940 2016-10-25 17:16:08Z glebius $");
#include "opt_capsicum.h"
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -207,6 +208,10 @@
case I386_SET_IOPERM:
default:
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_SYSCALL, NULL, NULL);
+#endif
return (ECAPMODE);
}
}
@@ -316,7 +321,7 @@
fpugetregs(td);
error = copyout((char *)(get_pcb_user_save_td(td) + 1),
a64xfpu.addr, a64xfpu.len);
- return (error);
+ break;
default:
error = EINVAL;
@@ -330,18 +335,20 @@
struct thread *td;
struct i386_ioperm_args *uap;
{
- int i, error;
char *iomap;
struct amd64tss *tssp;
struct system_segment_descriptor *tss_sd;
u_long *addr;
struct pcb *pcb;
+ u_int i;
+ int error;
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
- if (uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
+ if (uap->start > uap->start + uap->length ||
+ uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
return (EINVAL);
/*
@@ -352,8 +359,8 @@
*/
pcb = td->td_pcb;
if (pcb->pcb_tssp == NULL) {
- tssp = (struct amd64tss *)kmem_alloc(kernel_map,
- ctob(IOPAGES+1));
+ tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
+ ctob(IOPAGES+1), M_WAITOK);
if (tssp == NULL)
return (ENOMEM);
iomap = (char *)&tssp[1];
@@ -459,8 +466,9 @@
return (mdp->md_ldt);
mtx_unlock(&dt_lock);
new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
- new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map,
- max_ldt_segment * sizeof(struct user_segment_descriptor));
+ new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
+ max_ldt_segment * sizeof(struct user_segment_descriptor),
+ M_WAITOK | M_ZERO);
if (new_ldt->ldt_base == NULL) {
FREE(new_ldt, M_SUBPROC);
mtx_lock(&dt_lock);
@@ -479,7 +487,7 @@
mtx_lock(&dt_lock);
pldt = mdp->md_ldt;
if (pldt != NULL && !force) {
- kmem_free(kernel_map, (vm_offset_t)new_ldt->ldt_base,
+ kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
max_ldt_segment * sizeof(struct user_segment_descriptor));
free(new_ldt, M_SUBPROC);
return (pldt);
@@ -524,7 +532,7 @@
{
if (--pldt->ldt_refcnt == 0) {
- kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base,
+ kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
max_ldt_segment * sizeof(struct user_segment_descriptor));
free(pldt, M_SUBPROC);
}
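The TSS/io-permission bitmap and LDT backing store now come from kmem_malloc(kernel_arena, ...) with an explicit M_WAITOK and are released through kmem_free(kernel_arena, ...), replacing the older kmem_alloc(kernel_map, ...) calls. A short sketch of the updated allocate/free pairing, with sizes as in the diff and error handling trimmed:

    struct amd64tss *tssp;

    tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
        ctob(IOPAGES + 1), M_WAITOK);
    if (tssp == NULL)
            return (ENOMEM);
    /* ... install the TSS and io bitmap ... */
    kmem_free(kernel_arena, (vm_offset_t)tssp, ctob(IOPAGES + 1));
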
Modified: trunk/sys/amd64/amd64/trap.c
===================================================================
--- trunk/sys/amd64/amd64/trap.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/trap.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
@@ -38,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/trap.c 333370 2018-05-08 17:05:39Z emaste $");
/*
* AMD64 Trap and System call handling
@@ -45,6 +46,7 @@
*/
#include "opt_clock.h"
+#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
@@ -96,31 +98,11 @@
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
+#endif
-/*
- * This is a hook which is initialised by the dtrace module
- * to handle traps which might occur during DTrace probe
- * execution.
- */
-dtrace_trap_func_t dtrace_trap_func;
+extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(fast_syscall),
+ IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall);
-dtrace_doubletrap_func_t dtrace_doubletrap_func;
-
-/*
- * This is a hook which is initialised by the systrace module
- * when it is loaded. This keeps the DTrace syscall provider
- * implementation opaque.
- */
-systrace_probe_func_t systrace_probe_func;
-
-/*
- * These hooks are necessary for the pid, usdt and fasttrap providers.
- */
-dtrace_fasttrap_probe_ptr_t dtrace_fasttrap_probe_ptr;
-dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr;
-dtrace_return_probe_ptr_t dtrace_return_probe_ptr;
-#endif
-
extern void trap(struct trapframe *frame);
extern void syscall(struct trapframe *frame);
void dblfault_handler(struct trapframe *frame);
@@ -128,7 +110,7 @@
static int trap_pfault(struct trapframe *, int);
static void trap_fatal(struct trapframe *, vm_offset_t);
-#define MAX_TRAP_MSG 33
+#define MAX_TRAP_MSG 32
static char *trap_msg[] = {
"", /* 0 unused */
"privileged instruction fault", /* 1 T_PRIVINFLT */
@@ -163,7 +145,6 @@
"reserved (unknown) fault", /* 30 T_RESERVED */
"", /* 31 unused (reserved) */
"DTrace pid return trap", /* 32 T_DTRACE_RET */
- "DTrace fasttrap probe trap", /* 33 T_DTRACE_PROBE */
};
#ifdef KDB
@@ -195,6 +176,9 @@
void
trap(struct trapframe *frame)
{
+#ifdef KDTRACE_HOOKS
+ struct reg regs;
+#endif
struct thread *td = curthread;
struct proc *p = td->td_proc;
int i = 0, ucode = 0, code;
@@ -246,33 +230,10 @@
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
- * a flag in it's per-cpu flags to indicate that it doesn't
+ * a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
- *
- * If the DTrace kernel module has registered a trap handler,
- * call it and if it returns non-zero, assume that it has
- * handled the trap and modified the trap frame so that this
- * function can return normally.
*/
- if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
- type == T_BPTFLT) {
- struct reg regs;
-
- fill_frame_regs(frame, &regs);
- if (type == T_DTRACE_PROBE &&
- dtrace_fasttrap_probe_ptr != NULL &&
- dtrace_fasttrap_probe_ptr(&regs) == 0)
- goto out;
- else if (type == T_BPTFLT &&
- dtrace_pid_probe_ptr != NULL &&
- dtrace_pid_probe_ptr(&regs) == 0)
- goto out;
- else if (type == T_DTRACE_RET &&
- dtrace_return_probe_ptr != NULL &&
- dtrace_return_probe_ptr(&regs) == 0)
- goto out;
- }
if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
goto out;
#endif
@@ -327,6 +288,14 @@
case T_BPTFLT: /* bpt instruction fault */
case T_TRCTRAP: /* trace trap */
enable_intr();
+#ifdef KDTRACE_HOOKS
+ if (type == T_BPTFLT) {
+ fill_frame_regs(frame, &regs);
+ if (dtrace_pid_probe_ptr != NULL &&
+ dtrace_pid_probe_ptr(&regs) == 0)
+ goto out;
+ }
+#endif
frame->tf_rflags &= ~PSL_T;
i = SIGTRAP;
ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
@@ -352,6 +321,10 @@
i = SIGBUS;
ucode = BUS_OBJERR;
break;
+ case T_ALIGNFLT:
+ i = SIGBUS;
+ ucode = BUS_ADRALN;
+ break;
case T_DOUBLEFLT: /* double fault */
default:
i = SIGBUS;
@@ -359,6 +332,13 @@
break;
case T_PAGEFLT: /* page fault */
+ /*
+ * Emulator can take care about this trap?
+ */
+ if (*p->p_sysent->sv_trap != NULL &&
+ (*p->p_sysent->sv_trap)(td) == 0)
+ goto userout;
+
addr = frame->tf_addr;
i = trap_pfault(frame, TRUE);
if (i == -1)
@@ -421,7 +401,7 @@
goto userout;
} else if (panic_on_nmi)
panic("NMI indicates hardware failure");
- break;
+ goto out;
#endif /* DEV_ISA */
case T_OFLOW: /* integer overflow fault */
@@ -452,6 +432,15 @@
goto userout;
i = SIGFPE;
break;
+#ifdef KDTRACE_HOOKS
+ case T_DTRACE_RET:
+ enable_intr();
+ fill_frame_regs(frame, &regs);
+ if (dtrace_return_probe_ptr != NULL &&
+ dtrace_return_probe_ptr(&regs) == 0)
+ goto out;
+ goto userout;
+#endif
}
} else {
/* kernel trap */
@@ -464,8 +453,8 @@
goto out;
case T_DNA:
- KASSERT(!PCB_USER_FPU(td->td_pcb),
- ("Unregistered use of FPU in kernel"));
+ if (PCB_USER_FPU(td->td_pcb))
+ panic("Unregistered use of FPU in kernel");
fpudna();
goto out;
@@ -473,8 +462,8 @@
case T_XMMFLT: /* SIMD floating-point exception */
case T_FPOPFLT: /* FPU operand fetch fault */
/*
- * XXXKIB for now disable any FPU traps in kernel
- * handler registration seems to be overkill
+ * For now, supporting kernel handler
+ * registration for FPU traps is overkill.
*/
trap_fatal(frame, 0);
goto out;
@@ -565,7 +554,40 @@
load_dr6(rdr6() & 0xfffffff0);
goto out;
}
+
/*
+ * Malicious user code can configure a debug
+ * register watchpoint to trap on data access
+ * to the top of stack and then execute 'pop
+ * %ss; int 3'. Due to exception deferral for
+ * 'pop %ss', the CPU will not interrupt 'int
+ * 3' to raise the DB# exception for the debug
+ * register but will postpone the DB# until
+ * execution of the first instruction of the
+ * BP# handler (in kernel mode). Normally the
+ * previous check would ignore DB# exceptions
+ * for watchpoints on user addresses raised in
+ * kernel mode. However, some CPU errata
+ * include cases where DB# exceptions do not
+ * properly set bits in %dr6, e.g. Haswell
+ * HSD23 and Skylake-X SKZ24.
+ *
+ * A deferred DB# can also be raised on the
+ * first instructions of system call entry
+ * points or single-step traps via similar use
+ * of 'pop %ss' or 'mov xxx, %ss'.
+ */
+ if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) ||
+#ifdef COMPAT_FREEBSD32
+ frame->tf_rip ==
+ (uintptr_t)IDTVEC(int0x80_syscall) ||
+#endif
+ frame->tf_rip == (uintptr_t)IDTVEC(bpt) ||
+ frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
+ /* Needed for AMD. */
+ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
+ return;
+ /*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/
case T_BPTFLT:
@@ -633,7 +655,6 @@
user:
userret(td, frame);
- mtx_assert(&Giant, MA_NOTOWNED);
KASSERT(PCB_USER_FPU(td->td_pcb),
("Return from trap with kernel FPU ctx leaked"));
userout:
@@ -647,7 +668,7 @@
int usermode;
{
vm_offset_t va;
- struct vmspace *vm = NULL;
+ struct vmspace *vm;
vm_map_t map;
int rv = 0;
vm_prot_t ftype;
@@ -710,14 +731,10 @@
map = kernel_map;
} else {
/*
- * This is a fault on non-kernel virtual memory.
- * vm is initialized above to NULL. If curproc is NULL
- * or curproc->p_vmspace is NULL the fault is fatal.
+ * This is a fault on non-kernel virtual memory. If either
+ * p or p->p_vmspace is NULL, then the fault is fatal.
*/
- if (p != NULL)
- vm = p->p_vmspace;
-
- if (vm == NULL)
+ if (p == NULL || (vm = p->p_vmspace) == NULL)
goto nogo;
map = &vm->vm_map;
@@ -737,6 +754,14 @@
}
/*
+ * If the trap was caused by errant bits in the PTE then panic.
+ */
+ if (frame->tf_err & PGEX_RSV) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+
+ /*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
*/
@@ -793,8 +818,7 @@
trap_fatal(frame, eva);
return (-1);
}
-
- return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+ return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
static void
@@ -807,6 +831,9 @@
long esp;
struct soft_segment_descriptor softseg;
char *msg;
+#ifdef KDB
+ bool handled;
+#endif
code = frame->tf_err;
type = frame->tf_trapno;
@@ -830,6 +857,7 @@
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
+ code & PGEX_RSV ? "reserved bits in PTE" :
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%lx:0x%lx\n",
@@ -858,19 +886,17 @@
if (frame->tf_rflags & PSL_RF)
printf("resume, ");
printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
- printf("current process = ");
- if (curproc) {
- printf("%lu (%s)\n",
- (u_long)curproc->p_pid, curthread->td_name ?
- curthread->td_name : "");
- } else {
- printf("Idle\n");
- }
+ printf("current process = %d (%s)\n",
+ curproc->p_pid, curthread->td_name);
#ifdef KDB
- if (debugger_on_panic || kdb_active)
- if (kdb_trap(type, 0, frame))
+ if (debugger_on_panic) {
+ kdb_why = KDB_WHY_TRAP;
+ handled = kdb_trap(type, 0, frame);
+ kdb_why = KDB_WHY_UNSET;
+ if (handled)
return;
+ }
#endif
printf("trap number = %d\n", type);
if (type <= MAX_TRAP_MSG)
@@ -990,7 +1016,7 @@
}
KASSERT(PCB_USER_FPU(td->td_pcb),
- ("System call %s returing with kernel FPU ctx leaked",
+ ("System call %s returning with kernel FPU ctx leaked",
syscallname(td->td_proc, sa.code)));
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
("System call %s returning with mangled pcb_save",
Modified: trunk/sys/amd64/amd64/uio_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/uio_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/uio_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Alan L. Cox <alc at cs.rice.edu>
* Copyright (c) 1982, 1986, 1991, 1993
@@ -16,7 +17,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -36,7 +37,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/uio_machdep.c 266312 2014-05-17 13:59:11Z ian $");
#include <sys/param.h>
#include <sys/kernel.h>
Modified: trunk/sys/amd64/amd64/uma_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/uma_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/uma_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2003 Alan L. Cox <alc at cs.rice.edu>
* All rights reserved.
@@ -25,10 +26,11 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/uma_machdep.c 287945 2015-09-17 23:31:44Z rstone $");
#include <sys/param.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <vm/vm.h>
@@ -40,7 +42,7 @@
#include <machine/vmparam.h>
void *
-uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
{
vm_page_t m;
vm_paddr_t pa;
@@ -48,12 +50,7 @@
int pflags;
*flags = UMA_SLAB_PRIV;
- if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
- pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
- else
- pflags = VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
- if (wait & M_ZERO)
- pflags |= VM_ALLOC_ZERO;
+ pflags = malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;
for (;;) {
m = vm_page_alloc(NULL, 0, pflags);
if (m == NULL) {
@@ -74,7 +71,7 @@
}
void
-uma_small_free(void *mem, int size, u_int8_t flags)
+uma_small_free(void *mem, vm_size_t size, u_int8_t flags)
{
vm_page_t m;
vm_paddr_t pa;
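uma_small_alloc() now derives its vm_page_alloc() flags with malloc2vm_flags() rather than open-coding the translation. Going by the lines removed above, the effect in this caller is roughly the following mapping; this is a sketch reconstructed from the old code, not the canonical definition of the helper.

    int pflags;

    if ((wait & (M_NOWAIT | M_USE_RESERVE)) == M_NOWAIT)
            pflags = VM_ALLOC_INTERRUPT;
    else
            pflags = VM_ALLOC_SYSTEM;
    if (wait & M_ZERO)
            pflags |= VM_ALLOC_ZERO;
    pflags |= VM_ALLOC_NOOBJ | VM_ALLOC_WIRED;      /* still added by the caller */
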
Modified: trunk/sys/amd64/amd64/vm_machdep.c
===================================================================
--- trunk/sys/amd64/amd64/vm_machdep.c 2018-06-01 22:59:34 UTC (rev 10176)
+++ trunk/sys/amd64/amd64/vm_machdep.c 2018-06-01 23:00:12 UTC (rev 10177)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
* Copyright (c) 1989, 1990 William Jolitz
@@ -41,7 +42,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/vm_machdep.c 332759 2018-04-19 06:20:53Z avg $");
#include "opt_isa.h"
#include "opt_cpu.h"
@@ -59,7 +60,6 @@
#include <sys/mutex.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
-#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -90,9 +90,10 @@
static volatile u_int cpu_reset_proxy_active;
#endif
-CTASSERT((struct thread **)OFFSETOF_CURTHREAD ==
- &((struct pcpu *)NULL)->pc_curthread);
-CTASSERT((struct pcb **)OFFSETOF_CURPCB == &((struct pcpu *)NULL)->pc_curpcb);
+_Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread),
+ "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
+_Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
+ "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
struct savefpu *
get_pcb_user_save_td(struct thread *td)
@@ -100,8 +101,8 @@
vm_offset_t p;
p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
- cpu_max_ext_state_size;
- KASSERT((p % 64) == 0, ("Unaligned pcb_user_save area"));
+ roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
+ KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area"));
return ((struct savefpu *)p);
}
@@ -120,7 +121,8 @@
vm_offset_t p;
p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
- cpu_max_ext_state_size - sizeof(struct pcb);
+ roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) -
+ sizeof(struct pcb);
return ((struct pcb *)p);
}
@@ -127,7 +129,7 @@
void *
alloc_fpusave(int flags)
{
- struct pcb *res;
+ void *res;
struct savefpu_ymm *sf;
res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
@@ -219,7 +221,7 @@
* return address on stack. These are the kernel mode register values.
*/
pmap2 = vmspace_pmap(p2->p_vmspace);
- pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+ pcb2->pcb_cr3 = pmap2->pm_cr3;
pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */
pcb2->pcb_rbp = 0;
pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
@@ -341,7 +343,7 @@
* Clean TSS/iomap
*/
if (pcb->pcb_tssp != NULL) {
- kmem_free(kernel_map, (vm_offset_t)pcb->pcb_tssp,
+ kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
ctob(IOPAGES + 1));
pcb->pcb_tssp = NULL;
}
@@ -400,9 +402,13 @@
* for the next iteration.
* %r10 restore is only required for freebsd/amd64 processes,
* but shall be innocent for any ia32 ABI.
+ *
+ * Require full context restore to get the arguments
+ * in the registers reloaded at return to usermode.
*/
td->td_frame->tf_rip -= td->td_frame->tf_err;
td->td_frame->tf_r10 = td->td_frame->tf_rcx;
+ set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
break;
case EJUSTRETURN:
@@ -409,13 +415,7 @@
break;
default:
- if (td->td_proc->p_sysent->sv_errsize) {
- if (error >= td->td_proc->p_sysent->sv_errsize)
- error = -1; /* XXX */
- else
- error = td->td_proc->p_sysent->sv_errtbl[error];
- }
- td->td_frame->tf_rax = error;
+ td->td_frame->tf_rax = SV_ABI_ERRNO(td->td_proc, error);
td->td_frame->tf_rflags |= PSL_C;
break;
}
@@ -442,7 +442,8 @@
* values here.
*/
bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
- clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE);
+ clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
+ PCB_KERNFPU);
pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
cpu_max_ext_state_size);
@@ -571,13 +572,11 @@
static void
cpu_reset_proxy()
{
- cpuset_t tcrp;
cpu_reset_proxy_active = 1;
while (cpu_reset_proxy_active == 1)
- ; /* Wait for other cpu to see that we've started */
- CPU_SETOF(cpu_reset_proxyid, &tcrp);
- stop_cpus(tcrp);
+ ia32_pause(); /* Wait for other cpu to see that we've started */
+
printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
DELAY(1000000);
cpu_reset_real();
@@ -591,7 +590,7 @@
cpuset_t map;
u_int cnt;
- if (smp_active) {
+ if (smp_started) {
map = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &map);
CPU_NAND(&map, &stopped_cpus);
@@ -611,15 +610,18 @@
wmb();
cnt = 0;
- while (cpu_reset_proxy_active == 0 && cnt < 10000000)
+ while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+ ia32_pause();
cnt++; /* Wait for BSP to announce restart */
- if (cpu_reset_proxy_active == 0)
+ }
+ if (cpu_reset_proxy_active == 0) {
printf("cpu_reset: Failed to restart BSP\n");
- enable_intr();
- cpu_reset_proxy_active = 2;
-
- while (1);
- /* NOTREACHED */
+ } else {
+ cpu_reset_proxy_active = 2;
+ while (1)
+ ia32_pause();
+ /* NOTREACHED */
+ }
}
DELAY(1000000);
@@ -690,27 +692,6 @@
}
/*
- * Allocate an sf_buf for the given vm_page. On this machine, however, there
- * is no sf_buf object. Instead, an opaque pointer to the given vm_page is
- * returned.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int pri)
-{
-
- return ((struct sf_buf *)m);
-}
-
-/*
- * Free the sf_buf. In fact, do nothing because there are no resources
- * associated with the sf_buf.
- */
-void
-sf_buf_free(struct sf_buf *sf)
-{
-}
-
-/*
* Software interrupt handler for queued VM system processing.
*/
void
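get_pcb_user_save_td() and get_pcb_td() now round the extended-state save area up to XSAVE_AREA_ALIGN before carving it off the top of the kernel stack, so the save area stays aligned and the PCB sits immediately below it. A standalone sketch of the arithmetic follows; the stack top, state size and PCB size are illustrative values, not the kernel's.

    #include <stdio.h>
    #include <stdint.h>

    #define ROUNDUP2(x, y)  (((x) + ((y) - 1)) & ~((uintptr_t)(y) - 1))

    int
    main(void)
    {
            uintptr_t kstack_top = 0xfffffe0000400000UL;    /* illustrative */
            size_t ext_state_size = 832;                    /* illustrative */
            size_t align = 64;                              /* XSAVE_AREA_ALIGN */
            uintptr_t user_save, pcb;

            user_save = kstack_top - ROUNDUP2(ext_state_size, align);
            pcb = user_save - 256;          /* sizeof(struct pcb), illustrative */

            printf("user save area %#lx (64-byte aligned: %s)\n",
                (unsigned long)user_save,
                (user_save % align) == 0 ? "yes" : "no");
            printf("pcb            %#lx\n", (unsigned long)pcb);
            return (0);
    }
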