[Midnightbsd-cvs] src [12310] trunk/sys/x86: sync with FreeBSD 11-stable

laffer1 at midnightbsd.org
Sat Feb 8 14:32:42 EST 2020


Revision: 12310
          http://svnweb.midnightbsd.org/src/?rev=12310
Author:   laffer1
Date:     2020-02-08 14:32:41 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/x86/iommu/busdma_dmar.c
    trunk/sys/x86/iommu/busdma_dmar.h
    trunk/sys/x86/iommu/intel_ctx.c
    trunk/sys/x86/iommu/intel_dmar.h
    trunk/sys/x86/iommu/intel_drv.c
    trunk/sys/x86/iommu/intel_fault.c
    trunk/sys/x86/iommu/intel_gas.c
    trunk/sys/x86/iommu/intel_idpgtbl.c
    trunk/sys/x86/iommu/intel_qi.c
    trunk/sys/x86/iommu/intel_quirks.c
    trunk/sys/x86/iommu/intel_reg.h
    trunk/sys/x86/iommu/intel_utils.c
    trunk/sys/x86/isa/atpic.c
    trunk/sys/x86/isa/atrtc.c
    trunk/sys/x86/isa/clock.c
    trunk/sys/x86/isa/elcr.c
    trunk/sys/x86/isa/icu.h
    trunk/sys/x86/isa/isa.c
    trunk/sys/x86/isa/isa_dma.c
    trunk/sys/x86/isa/nmi.c
    trunk/sys/x86/isa/orm.c
    trunk/sys/x86/pci/pci_bus.c
    trunk/sys/x86/pci/qpi.c
    trunk/sys/x86/x86/bus_machdep.c
    trunk/sys/x86/x86/busdma_bounce.c
    trunk/sys/x86/x86/busdma_machdep.c
    trunk/sys/x86/x86/dump_machdep.c
    trunk/sys/x86/x86/fdt_machdep.c
    trunk/sys/x86/x86/identcpu.c
    trunk/sys/x86/x86/intr_machdep.c
    trunk/sys/x86/x86/io_apic.c
    trunk/sys/x86/x86/legacy.c
    trunk/sys/x86/x86/local_apic.c
    trunk/sys/x86/x86/mca.c
    trunk/sys/x86/x86/mptable.c
    trunk/sys/x86/x86/mptable_pci.c
    trunk/sys/x86/x86/msi.c
    trunk/sys/x86/x86/nexus.c
    trunk/sys/x86/x86/tsc.c
    trunk/sys/x86/xen/hvm.c
    trunk/sys/x86/xen/xen_intr.c

Added Paths:
-----------
    trunk/sys/x86/iommu/intel_intrmap.c
    trunk/sys/x86/iommu/iommu_intrmap.h
    trunk/sys/x86/x86/autoconf.c
    trunk/sys/x86/x86/cpu_machdep.c
    trunk/sys/x86/x86/delay.c
    trunk/sys/x86/x86/mp_watchdog.c
    trunk/sys/x86/x86/mp_x86.c
    trunk/sys/x86/x86/pvclock.c
    trunk/sys/x86/x86/stack_machdep.c
    trunk/sys/x86/x86/ucode.c
    trunk/sys/x86/x86/x86_mem.c
    trunk/sys/x86/xen/pv.c
    trunk/sys/x86/xen/pvcpu_enum.c
    trunk/sys/x86/xen/xen_apic.c
    trunk/sys/x86/xen/xen_msi.c
    trunk/sys/x86/xen/xen_nexus.c
    trunk/sys/x86/xen/xen_pci_bus.c
    trunk/sys/x86/xen/xenpv.c

Modified: trunk/sys/x86/iommu/busdma_dmar.c
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.c 284021 2015-06-05 08:36:25Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.c 316392 2017-04-02 07:11:15Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -48,6 +48,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
@@ -74,14 +75,34 @@
 dmar_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
 {
 	char str[128], *env;
+	int default_bounce;
+	bool ret;
+	static const char bounce_str[] = "bounce";
+	static const char dmar_str[] = "dmar";
 
-	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d.bounce",
+	default_bounce = 0;
+	env = kern_getenv("hw.busdma.default");
+	if (env != NULL) {
+		if (strcmp(env, bounce_str) == 0)
+			default_bounce = 1;
+		else if (strcmp(env, dmar_str) == 0)
+			default_bounce = 0;
+		freeenv(env);
+	}
+
+	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
 	    domain, bus, slot, func);
-	env = getenv(str);
+	env = kern_getenv(str);
 	if (env == NULL)
-		return (false);
+		return (default_bounce != 0);
+	if (strcmp(env, bounce_str) == 0)
+		ret = true;
+	else if (strcmp(env, dmar_str) == 0)
+		ret = false;
+	else
+		ret = default_bounce != 0;
 	freeenv(env);
-	return (true);
+	return (ret);
 }
 
 /*
@@ -93,7 +114,7 @@
  * domain, and must collectively be assigned to use either DMAR or
  * bounce mapping.
  */
-static device_t
+device_t
 dmar_get_requester(device_t dev, uint16_t *rid)
 {
 	devclass_t pci_class;
@@ -225,7 +246,7 @@
 	disabled = dmar_bus_dma_is_dev_disabled(pci_get_domain(requester), 
 	    pci_get_bus(requester), pci_get_slot(requester), 
 	    pci_get_function(requester));
-	ctx = dmar_get_ctx(dmar, requester, rid, disabled, rmrr);
+	ctx = dmar_get_ctx_for_dev(dmar, requester, rid, disabled, rmrr);
 	if (ctx == NULL)
 		return (NULL);
 	if (disabled) {
@@ -256,6 +277,8 @@
 	/* Not in scope of any DMAR ? */
 	if (dmar == NULL)
 		return (NULL);
+	if (!dmar->dma_enabled)
+		return (NULL);
 	dmar_quirks_pre_use(dmar);
 	dmar_instantiate_rmrr_ctxs(dmar);
 
@@ -369,16 +392,18 @@
 {
 	struct bus_dma_tag_dmar *tag;
 	struct bus_dmamap_dmar *map;
+	struct dmar_domain *domain;
 
 	tag = (struct bus_dma_tag_dmar *)dmat;
 	map = (struct bus_dmamap_dmar *)map1;
 	if (map != NULL) {
-		DMAR_CTX_LOCK(tag->ctx);
+		domain = tag->ctx->domain;
+		DMAR_DOMAIN_LOCK(domain);
 		if (!TAILQ_EMPTY(&map->map_entries)) {
-			DMAR_CTX_UNLOCK(tag->ctx);
+			DMAR_DOMAIN_UNLOCK(domain);
 			return (EBUSY);
 		}
-		DMAR_CTX_UNLOCK(tag->ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		free(map, M_DMAR_DMAMAP);
 	}
 	tag->map_count--;
@@ -455,6 +480,7 @@
     struct dmar_map_entries_tailq *unroll_list)
 {
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entry *entry;
 	dmar_gaddr_t size;
 	bus_size_t buflen1;
@@ -464,6 +490,7 @@
 	if (segs == NULL)
 		segs = tag->segments;
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	seg = *segp;
 	error = 0;
 	idx = 0;
@@ -485,7 +512,7 @@
 		if (seg + 1 < tag->common.nsegments)
 			gas_flags |= DMAR_GM_CANSPLIT;
 
-		error = dmar_gas_map(ctx, &tag->common, size, offset,
+		error = dmar_gas_map(domain, &tag->common, size, offset,
 		    DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
 		    gas_flags, ma + idx, &entry);
 		if (error != 0)
@@ -532,10 +559,10 @@
 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
 		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));
 
-		DMAR_CTX_LOCK(ctx);
+		DMAR_DOMAIN_LOCK(domain);
 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
 		entry->flags |= DMAR_MAP_ENTRY_MAP;
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link);
 
 		segs[seg].ds_addr = entry->start + offset;
@@ -557,11 +584,13 @@
     int flags, bus_dma_segment_t *segs, int *segp)
 {
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entry *entry, *entry1;
 	struct dmar_map_entries_tailq unroll_list;
 	int error;
 
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	atomic_add_long(&ctx->loads, 1);
 
 	TAILQ_INIT(&unroll_list);
@@ -573,7 +602,7 @@
 		 * partial buffer load, so unfortunately we have to
 		 * revert all work done.
 		 */
-		DMAR_CTX_LOCK(ctx);
+		DMAR_DOMAIN_LOCK(domain);
 		TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link,
 		    entry1) {
 			/*
@@ -584,12 +613,12 @@
 			 */
 			TAILQ_REMOVE(&map->map_entries, entry, dmamap_link);
 			TAILQ_REMOVE(&unroll_list, entry, unroll_link);
-			TAILQ_INSERT_TAIL(&ctx->unload_entries, entry,
+			TAILQ_INSERT_TAIL(&domain->unload_entries, entry,
 			    dmamap_link);
 		}
-		DMAR_CTX_UNLOCK(ctx);
-		taskqueue_enqueue(ctx->dmar->delayed_taskqueue,
-		    &ctx->unload_task);
+		DMAR_DOMAIN_UNLOCK(domain);
+		taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+		    &domain->unload_task);
 	}
 
 	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
@@ -596,7 +625,7 @@
 	    !map->cansleep)
 		error = EINPROGRESS;
 	if (error == EINPROGRESS)
-		dmar_bus_schedule_dmamap(ctx->dmar, map);
+		dmar_bus_schedule_dmamap(domain->dmar, map);
 	return (error);
 }
 
@@ -762,6 +791,7 @@
 	struct bus_dma_tag_dmar *tag;
 	struct bus_dmamap_dmar *map;
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 #if defined(__amd64__)
 	struct dmar_map_entries_tailq entries;
 #endif
@@ -769,20 +799,22 @@
 	tag = (struct bus_dma_tag_dmar *)dmat;
 	map = (struct bus_dmamap_dmar *)map1;
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	atomic_add_long(&ctx->unloads, 1);
 
 #if defined(__i386__)
-	DMAR_CTX_LOCK(ctx);
-	TAILQ_CONCAT(&ctx->unload_entries, &map->map_entries, dmamap_link);
-	DMAR_CTX_UNLOCK(ctx);
-	taskqueue_enqueue(ctx->dmar->delayed_taskqueue, &ctx->unload_task);
+	DMAR_DOMAIN_LOCK(domain);
+	TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link);
+	DMAR_DOMAIN_UNLOCK(domain);
+	taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+	    &domain->unload_task);
 #else /* defined(__amd64__) */
 	TAILQ_INIT(&entries);
-	DMAR_CTX_LOCK(ctx);
+	DMAR_DOMAIN_LOCK(domain);
 	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	THREAD_NO_SLEEPING();
-	dmar_ctx_unload(ctx, &entries, false);
+	dmar_domain_unload(domain, &entries, false);
 	THREAD_SLEEPING_OK();
 	KASSERT(TAILQ_EMPTY(&entries), ("lazy dmar_ctx_unload %p", ctx));
 #endif
@@ -855,6 +887,8 @@
 dmar_init_busdma(struct dmar_unit *unit)
 {
 
+	unit->dma_enabled = 1;
+	TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
 	TAILQ_INIT(&unit->delayed_maps);
 	TASK_INIT(&unit->dmamap_load_task, 0, dmar_bus_task_dmamap, unit);
 	unit->delayed_taskqueue = taskqueue_create("dmar", M_WAITOK,

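The busdma_dmar.c rework above replaces the old boolean
hw.busdma.pci%d.%d.%d.%d.bounce knob with string-valued per-device tunables
("bounce" or "dmar"), adds a global hw.busdma.default fallback, and fetches
hw.dmar.dma into unit->dma_enabled so DMAR-backed busdma can be switched off
altogether (dmar_instantiate_ctx() then returns NULL and the regular bounce
path is used).  A minimal loader.conf sketch of how the knobs combine; the
PCI selector below is only an example device:

    hw.dmar.dma="1"                  # keep DMAR busdma enabled
    hw.busdma.default="dmar"         # default policy: use DMAR remapping
    hw.busdma.pci0.0.31.2="bounce"   # override: bounce buffers for pci0:0:31:2
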
Modified: trunk/sys/x86/iommu/busdma_dmar.h
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
  */
 
 #ifndef __X86_IOMMU_BUSDMA_DMAR_H

Modified: trunk/sys/x86/iommu/intel_ctx.c
===================================================================
--- trunk/sys/x86/iommu/intel_ctx.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_ctx.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_ctx.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_ctx.c 320357 2017-06-26 12:30:39Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +49,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -68,8 +69,12 @@
 #include <dev/pci/pcivar.h>
 
 static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context");
+static MALLOC_DEFINE(M_DMAR_DOMAIN, "dmar_dom", "Intel DMAR Domain");
 
-static void dmar_ctx_unload_task(void *arg, int pending);
+static void dmar_domain_unload_task(void *arg, int pending);
+static void dmar_unref_domain_locked(struct dmar_unit *dmar,
+    struct dmar_domain *domain);
+static void dmar_domain_destroy(struct dmar_domain *domain);
 
 static void
 dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus)
@@ -108,8 +113,8 @@
 {
 	dmar_ctx_entry_t *ctxp;
 
-	ctxp = dmar_map_pgtbl(ctx->dmar->ctx_obj, 1 + PCI_RID2BUS(ctx->rid),
-	    DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
+	ctxp = dmar_map_pgtbl(ctx->domain->dmar->ctx_obj, 1 +
+	    PCI_RID2BUS(ctx->rid), DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
 	ctxp += ctx->rid & 0xff;
 	return (ctxp);
 }
@@ -119,7 +124,7 @@
 {
 	bus_addr_t maxaddr;
 
-	maxaddr = MIN(ctx->end, BUS_SPACE_MAXADDR);
+	maxaddr = MIN(ctx->domain->end, BUS_SPACE_MAXADDR);
 	ctx->ctx_tag.common.ref_count = 1; /* Prevent free */
 	ctx->ctx_tag.common.impl = &bus_dma_dmar_impl;
 	ctx->ctx_tag.common.boundary = PCI_DMA_BOUNDARY;
@@ -130,33 +135,42 @@
 	ctx->ctx_tag.common.maxsegsz = maxaddr;
 	ctx->ctx_tag.ctx = ctx;
 	ctx->ctx_tag.owner = dev;
-	/* XXXKIB initialize tag further */
 }
 
 static void
-ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp)
+ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp, bool move)
 {
 	struct dmar_unit *unit;
+	struct dmar_domain *domain;
 	vm_page_t ctx_root;
 
-	unit = ctx->dmar;
-	KASSERT(ctxp->ctx1 == 0 && ctxp->ctx2 == 0,
+	domain = ctx->domain;
+	unit = domain->dmar;
+	KASSERT(move || (ctxp->ctx1 == 0 && ctxp->ctx2 == 0),
 	    ("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx",
 	    unit->unit, pci_get_bus(ctx->ctx_tag.owner),
 	    pci_get_slot(ctx->ctx_tag.owner),
 	    pci_get_function(ctx->ctx_tag.owner),
-	    ctxp->ctx1,
-	    ctxp->ctx2));
-	ctxp->ctx2 = DMAR_CTX2_DID(ctx->domain);
-	ctxp->ctx2 |= ctx->awlvl;
-	if ((ctx->flags & DMAR_CTX_IDMAP) != 0 &&
+	    ctxp->ctx1, ctxp->ctx2));
+	/*
+	 * For update due to move, the store is not atomic.  It is
+	 * possible that DMAR read upper doubleword, while low
+	 * doubleword is not yet updated.  The domain id is stored in
+	 * the upper doubleword, while the table pointer in the lower.
+	 *
+	 * There is no good solution, for the same reason it is wrong
+	 * to clear P bit in the ctx entry for update.
+	 */
+	dmar_pte_store1(&ctxp->ctx2, DMAR_CTX2_DID(domain->domain) |
+	    domain->awlvl);
+	if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0 &&
 	    (unit->hw_ecap & DMAR_ECAP_PT) != 0) {
-		KASSERT(ctx->pgtbl_obj == NULL,
+		KASSERT(domain->pgtbl_obj == NULL,
 		    ("ctx %p non-null pgtbl_obj", ctx));
-		dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
+		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
 	} else {
-		ctx_root = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
-		dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
+		ctx_root = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
+		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
 		    (DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) |
 		    DMAR_CTX1_P);
 	}
@@ -164,8 +178,32 @@
 }
 
 static int
-ctx_init_rmrr(struct dmar_ctx *ctx, device_t dev)
+dmar_flush_for_ctx_entry(struct dmar_unit *dmar, bool force)
 {
+	int error;
+
+	/*
+	 * If dmar declares Caching Mode as Set, follow 11.5 "Caching
+	 * Mode Consideration" and do the (global) invalidation of the
+	 * negative TLB entries.
+	 */
+	if ((dmar->hw_cap & DMAR_CAP_CM) == 0 && !force)
+		return (0);
+	if (dmar->qi_enabled) {
+		dmar_qi_invalidate_ctx_glob_locked(dmar);
+		if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force)
+			dmar_qi_invalidate_iotlb_glob_locked(dmar);
+		return (0);
+	}
+	error = dmar_inv_ctx_glob(dmar);
+	if (error == 0 && ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force))
+		error = dmar_inv_iotlb_glob(dmar);
+	return (error);
+}
+
+static int
+domain_init_rmrr(struct dmar_domain *domain, device_t dev)
+{
 	struct dmar_map_entries_tailq rmrr_entries;
 	struct dmar_map_entry *entry, *entry1;
 	vm_page_t *ma;
@@ -175,7 +213,7 @@
 
 	error = 0;
 	TAILQ_INIT(&rmrr_entries);
-	dmar_ctx_parse_rmrr(ctx, dev, &rmrr_entries);
+	dmar_dev_parse_rmrr(domain, dev, &rmrr_entries);
 	TAILQ_FOREACH_SAFE(entry, &rmrr_entries, unroll_link, entry1) {
 		/*
 		 * VT-d specification requires that the start of an
@@ -195,7 +233,7 @@
 			if (bootverbose) {
 				device_printf(dev, "BIOS bug: dmar%d RMRR "
 				    "region (%jx, %jx) corrected\n",
-				    ctx->dmar->unit, start, end);
+				    domain->dmar->unit, start, end);
 			}
 			entry->end += DMAR_PAGE_SIZE * 0x20;
 		}
@@ -205,8 +243,9 @@
 			ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
 			    VM_MEMATTR_DEFAULT);
 		}
-		error1 = dmar_gas_map_region(ctx, entry, DMAR_MAP_ENTRY_READ |
-		    DMAR_MAP_ENTRY_WRITE, DMAR_GM_CANWAIT, ma);
+		error1 = dmar_gas_map_region(domain, entry,
+		    DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
+		    DMAR_GM_CANWAIT, ma);
 		/*
 		 * Non-failed RMRR entries are owned by context rb
 		 * tree.  Get rid of the failed entry, but do not stop
@@ -214,18 +253,19 @@
 		 * loaded and removed on the context destruction.
 		 */
 		if (error1 == 0 && entry->end != entry->start) {
-			DMAR_LOCK(ctx->dmar);
-			ctx->flags |= DMAR_CTX_RMRR;
-			DMAR_UNLOCK(ctx->dmar);
+			DMAR_LOCK(domain->dmar);
+			domain->refs++; /* XXXKIB prevent free */
+			domain->flags |= DMAR_DOMAIN_RMRR;
+			DMAR_UNLOCK(domain->dmar);
 		} else {
 			if (error1 != 0) {
 				device_printf(dev,
 			    "dmar%d failed to map RMRR region (%jx, %jx) %d\n",
-				    ctx->dmar->unit, start, end, error1);
+				    domain->dmar->unit, start, end, error1);
 				error = error1;
 			}
 			TAILQ_REMOVE(&rmrr_entries, entry, unroll_link);
-			dmar_gas_free_entry(ctx, entry);
+			dmar_gas_free_entry(domain, entry);
 		}
 		for (i = 0; i < size; i++)
 			vm_page_putfake(ma[i]);
@@ -234,47 +274,144 @@
 	return (error);
 }
 
+static struct dmar_domain *
+dmar_domain_alloc(struct dmar_unit *dmar, bool id_mapped)
+{
+	struct dmar_domain *domain;
+	int error, id, mgaw;
+
+	id = alloc_unr(dmar->domids);
+	if (id == -1)
+		return (NULL);
+	domain = malloc(sizeof(*domain), M_DMAR_DOMAIN, M_WAITOK | M_ZERO);
+	domain->domain = id;
+	LIST_INIT(&domain->contexts);
+	RB_INIT(&domain->rb_root);
+	TAILQ_INIT(&domain->unload_entries);
+	TASK_INIT(&domain->unload_task, 0, dmar_domain_unload_task, domain);
+	mtx_init(&domain->lock, "dmardom", NULL, MTX_DEF);
+	domain->dmar = dmar;
+
+	/*
+	 * For now, use the maximal usable physical address of the
+	 * installed memory to calculate the mgaw on id_mapped domain.
+	 * It is useful for the identity mapping, and less so for the
+	 * virtualized bus address space.
+	 */
+	domain->end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR;
+	mgaw = dmar_maxaddr2mgaw(dmar, domain->end, !id_mapped);
+	error = domain_set_agaw(domain, mgaw);
+	if (error != 0)
+		goto fail;
+	if (!id_mapped)
+		/* Use all supported address space for remapping. */
+		domain->end = 1ULL << (domain->agaw - 1);
+
+	dmar_gas_init_domain(domain);
+
+	if (id_mapped) {
+		if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
+			domain->pgtbl_obj = domain_get_idmap_pgtbl(domain,
+			    domain->end);
+		}
+		domain->flags |= DMAR_DOMAIN_IDMAP;
+	} else {
+		error = domain_alloc_pgtbl(domain);
+		if (error != 0)
+			goto fail;
+		/* Disable local apic region access */
+		error = dmar_gas_reserve_region(domain, 0xfee00000,
+		    0xfeefffff + 1);
+		if (error != 0)
+			goto fail;
+	}
+	return (domain);
+
+fail:
+	dmar_domain_destroy(domain);
+	return (NULL);
+}
+
 static struct dmar_ctx *
-dmar_get_ctx_alloc(struct dmar_unit *dmar, uint16_t rid)
+dmar_ctx_alloc(struct dmar_domain *domain, uint16_t rid)
 {
 	struct dmar_ctx *ctx;
 
 	ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO);
-	RB_INIT(&ctx->rb_root);
-	TAILQ_INIT(&ctx->unload_entries);
-	TASK_INIT(&ctx->unload_task, 0, dmar_ctx_unload_task, ctx);
-	mtx_init(&ctx->lock, "dmarctx", NULL, MTX_DEF);
-	ctx->dmar = dmar;
+	ctx->domain = domain;
 	ctx->rid = rid;
+	ctx->refs = 1;
 	return (ctx);
 }
 
 static void
-dmar_ctx_dtr(struct dmar_ctx *ctx, bool gas_inited, bool pgtbl_inited)
+dmar_ctx_link(struct dmar_ctx *ctx)
 {
+	struct dmar_domain *domain;
 
-	if (gas_inited) {
-		DMAR_CTX_LOCK(ctx);
-		dmar_gas_fini_ctx(ctx);
-		DMAR_CTX_UNLOCK(ctx);
+	domain = ctx->domain;
+	DMAR_ASSERT_LOCKED(domain->dmar);
+	KASSERT(domain->refs >= domain->ctx_cnt,
+	    ("dom %p ref underflow %d %d", domain, domain->refs,
+	    domain->ctx_cnt));
+	domain->refs++;
+	domain->ctx_cnt++;
+	LIST_INSERT_HEAD(&domain->contexts, ctx, link);
+}
+
+static void
+dmar_ctx_unlink(struct dmar_ctx *ctx)
+{
+	struct dmar_domain *domain;
+
+	domain = ctx->domain;
+	DMAR_ASSERT_LOCKED(domain->dmar);
+	KASSERT(domain->refs > 0,
+	    ("domain %p ctx dtr refs %d", domain, domain->refs));
+	KASSERT(domain->ctx_cnt >= domain->refs,
+	    ("domain %p ctx dtr refs %d ctx_cnt %d", domain,
+	    domain->refs, domain->ctx_cnt));
+	domain->refs--;
+	domain->ctx_cnt--;
+	LIST_REMOVE(ctx, link);
+}
+
+static void
+dmar_domain_destroy(struct dmar_domain *domain)
+{
+
+	KASSERT(TAILQ_EMPTY(&domain->unload_entries),
+	    ("unfinished unloads %p", domain));
+	KASSERT(LIST_EMPTY(&domain->contexts),
+	    ("destroying dom %p with contexts", domain));
+	KASSERT(domain->ctx_cnt == 0,
+	    ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt));
+	KASSERT(domain->refs == 0,
+	    ("destroying dom %p with refs %d", domain, domain->refs));
+	if ((domain->flags & DMAR_DOMAIN_GAS_INITED) != 0) {
+		DMAR_DOMAIN_LOCK(domain);
+		dmar_gas_fini_domain(domain);
+		DMAR_DOMAIN_UNLOCK(domain);
 	}
-	if (pgtbl_inited) {
-		if (ctx->pgtbl_obj != NULL)
-			DMAR_CTX_PGLOCK(ctx);
-		ctx_free_pgtbl(ctx);
+	if ((domain->flags & DMAR_DOMAIN_PGTBL_INITED) != 0) {
+		if (domain->pgtbl_obj != NULL)
+			DMAR_DOMAIN_PGLOCK(domain);
+		domain_free_pgtbl(domain);
 	}
-	mtx_destroy(&ctx->lock);
-	free(ctx, M_DMAR_CTX);
+	mtx_destroy(&domain->lock);
+	free_unr(domain->dmar->domids, domain->domain);
+	free(domain, M_DMAR_DOMAIN);
 }
 
 struct dmar_ctx *
-dmar_get_ctx(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped,
-    bool rmrr_init)
+dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid,
+    bool id_mapped, bool rmrr_init)
 {
+	struct dmar_domain *domain, *domain1;
 	struct dmar_ctx *ctx, *ctx1;
 	dmar_ctx_entry_t *ctxp;
 	struct sf_buf *sf;
-	int bus, slot, func, error, mgaw;
+	int bus, slot, func, error;
 	bool enable;
 
 	bus = pci_get_bus(dev);
@@ -292,67 +429,20 @@
 		 */
 		DMAR_UNLOCK(dmar);
 		dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid));
-		ctx1 = dmar_get_ctx_alloc(dmar, rid);
-
-		if (id_mapped) {
-			/*
-			 * For now, use the maximal usable physical
-			 * address of the installed memory to
-			 * calculate the mgaw.  It is useful for the
-			 * identity mapping, and less so for the
-			 * virtualized bus address space.
-			 */
-			ctx1->end = ptoa(Maxmem);
-			mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, false);
-			error = ctx_set_agaw(ctx1, mgaw);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, false, false);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-		} else {
-			ctx1->end = BUS_SPACE_MAXADDR;
-			mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, true);
-			error = ctx_set_agaw(ctx1, mgaw);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, false, false);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-			/* Use all supported address space for remapping. */
-			ctx1->end = 1ULL << (ctx1->agaw - 1);
+		domain1 = dmar_domain_alloc(dmar, id_mapped);
+		if (domain1 == NULL) {
+			TD_PINNED_ASSERT;
+			return (NULL);
 		}
-
-
-		dmar_gas_init_ctx(ctx1);
-		if (id_mapped) {
-			if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
-				ctx1->pgtbl_obj = ctx_get_idmap_pgtbl(ctx1,
-				    ctx1->end);
-			}
-			ctx1->flags |= DMAR_CTX_IDMAP;
-		} else {
-			error = ctx_alloc_pgtbl(ctx1);
+		if (!id_mapped) {
+			error = domain_init_rmrr(domain1, dev);
 			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, false);
+				dmar_domain_destroy(domain1);
 				TD_PINNED_ASSERT;
 				return (NULL);
 			}
-			/* Disable local apic region access */
-			error = dmar_gas_reserve_region(ctx1, 0xfee00000,
-			    0xfeefffff + 1);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-			error = ctx_init_rmrr(ctx1, dev);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
 		}
+		ctx1 = dmar_ctx_alloc(domain1, rid);
 		ctxp = dmar_map_ctx_entry(ctx1, &sf);
 		DMAR_LOCK(dmar);
 
@@ -362,16 +452,10 @@
 		 */
 		ctx = dmar_find_ctx_locked(dmar, rid);
 		if (ctx == NULL) {
+			domain = domain1;
 			ctx = ctx1;
+			dmar_ctx_link(ctx);
 			ctx->ctx_tag.owner = dev;
-			ctx->domain = alloc_unrl(dmar->domids);
-			if (ctx->domain == -1) {
-				DMAR_UNLOCK(dmar);
-				dmar_unmap_pgtbl(sf);
-				dmar_ctx_dtr(ctx, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
 			ctx_tag_init(ctx, dev);
 
 			/*
@@ -379,46 +463,35 @@
 			 * DMAR unit.  Enable the translation after
 			 * everything is set up.
 			 */
-			if (LIST_EMPTY(&dmar->contexts))
+			if (LIST_EMPTY(&dmar->domains))
 				enable = true;
-			LIST_INSERT_HEAD(&dmar->contexts, ctx, link);
-			ctx_id_entry_init(ctx, ctxp);
+			LIST_INSERT_HEAD(&dmar->domains, domain, link);
+			ctx_id_entry_init(ctx, ctxp, false);
 			device_printf(dev,
 			    "dmar%d pci%d:%d:%d:%d rid %x domain %d mgaw %d "
 			    "agaw %d %s-mapped\n",
 			    dmar->unit, dmar->segment, bus, slot,
-			    func, rid, ctx->domain, ctx->mgaw, ctx->agaw,
-			    id_mapped ? "id" : "re");
+			    func, rid, domain->domain, domain->mgaw,
+			    domain->agaw, id_mapped ? "id" : "re");
+			dmar_unmap_pgtbl(sf);
 		} else {
-			dmar_ctx_dtr(ctx1, true, true);
+			dmar_unmap_pgtbl(sf);
+			dmar_domain_destroy(domain1);
+			/* Nothing needs to be done to destroy ctx1. */
+			free(ctx1, M_DMAR_CTX);
+			domain = ctx->domain;
+			ctx->refs++; /* tag referenced us */
 		}
-		dmar_unmap_pgtbl(sf);
+	} else {
+		domain = ctx->domain;
+		ctx->refs++; /* tag referenced us */
 	}
-	ctx->refs++;
-	if ((ctx->flags & DMAR_CTX_RMRR) != 0)
-		ctx->refs++; /* XXXKIB */
 
-	/*
-	 * If dmar declares Caching Mode as Set, follow 11.5 "Caching
-	 * Mode Consideration" and do the (global) invalidation of the
-	 * negative TLB entries.
-	 */
-	if ((dmar->hw_cap & DMAR_CAP_CM) != 0 || enable) {
-		if (dmar->qi_enabled) {
-			dmar_qi_invalidate_ctx_glob_locked(dmar);
-			if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0)
-				dmar_qi_invalidate_iotlb_glob_locked(dmar);
-		} else {
-			error = dmar_inv_ctx_glob(dmar);
-			if (error == 0 &&
-			    (dmar->hw_ecap & DMAR_ECAP_DI) != 0)
-				error = dmar_inv_iotlb_glob(dmar);
-			if (error != 0) {
-				dmar_free_ctx_locked(dmar, ctx);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-		}
+	error = dmar_flush_for_ctx_entry(dmar, enable);
+	if (error != 0) {
+		dmar_free_ctx_locked(dmar, ctx);
+		TD_PINNED_ASSERT;
+		return (NULL);
 	}
 
 	/*
@@ -439,11 +512,74 @@
 	return (ctx);
 }
 
+int
+dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx)
+{
+	struct dmar_unit *dmar;
+	struct dmar_domain *old_domain;
+	dmar_ctx_entry_t *ctxp;
+	struct sf_buf *sf;
+	int error;
+
+	dmar = domain->dmar;
+	old_domain = ctx->domain;
+	if (domain == old_domain)
+		return (0);
+	KASSERT(old_domain->dmar == dmar,
+	    ("domain %p %u moving between dmars %u %u", domain,
+	    domain->domain, old_domain->dmar->unit, domain->dmar->unit));
+	TD_PREP_PINNED_ASSERT;
+
+	ctxp = dmar_map_ctx_entry(ctx, &sf);
+	DMAR_LOCK(dmar);
+	dmar_ctx_unlink(ctx);
+	ctx->domain = domain;
+	dmar_ctx_link(ctx);
+	ctx_id_entry_init(ctx, ctxp, true);
+	dmar_unmap_pgtbl(sf);
+	error = dmar_flush_for_ctx_entry(dmar, true);
+	/* If flush failed, rolling back would not work as well. */
+	printf("dmar%d rid %x domain %d->%d %s-mapped\n",
+	    dmar->unit, ctx->rid, old_domain->domain, domain->domain,
+	    (domain->flags & DMAR_DOMAIN_IDMAP) != 0 ? "id" : "re");
+	dmar_unref_domain_locked(dmar, old_domain);
+	TD_PINNED_ASSERT;
+	return (error);
+}
+
+static void
+dmar_unref_domain_locked(struct dmar_unit *dmar, struct dmar_domain *domain)
+{
+
+	DMAR_ASSERT_LOCKED(dmar);
+	KASSERT(domain->refs >= 1,
+	    ("dmar %d domain %p refs %u", dmar->unit, domain, domain->refs));
+	KASSERT(domain->refs > domain->ctx_cnt,
+	    ("dmar %d domain %p refs %d ctx_cnt %d", dmar->unit, domain,
+	    domain->refs, domain->ctx_cnt));
+
+	if (domain->refs > 1) {
+		domain->refs--;
+		DMAR_UNLOCK(dmar);
+		return;
+	}
+
+	KASSERT((domain->flags & DMAR_DOMAIN_RMRR) == 0,
+	    ("lost ref on RMRR domain %p", domain));
+
+	LIST_REMOVE(domain, link);
+	DMAR_UNLOCK(dmar);
+
+	taskqueue_drain(dmar->delayed_taskqueue, &domain->unload_task);
+	dmar_domain_destroy(domain);
+}
+
 void
 dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx)
 {
 	struct sf_buf *sf;
 	dmar_ctx_entry_t *ctxp;
+	struct dmar_domain *domain;
 
 	DMAR_ASSERT_LOCKED(dmar);
 	KASSERT(ctx->refs >= 1,
@@ -459,8 +595,6 @@
 		return;
 	}
 
-	KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
-	    ("lost ref on RMRR ctx %p", ctx));
 	KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
 	    ("lost ref on disabled ctx %p", ctx));
 
@@ -488,8 +622,6 @@
 		return;
 	}
 
-	KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
-	    ("lost ref on RMRR ctx %p", ctx));
 	KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
 	    ("lost ref on disabled ctx %p", ctx));
 
@@ -507,19 +639,11 @@
 		else
 			dmar_inv_iotlb_glob(dmar);
 	}
-	LIST_REMOVE(ctx, link);
-	DMAR_UNLOCK(dmar);
-
-	/*
-	 * The rest of the destruction is invisible for other users of
-	 * the dmar unit.
-	 */
-	taskqueue_drain(dmar->delayed_taskqueue, &ctx->unload_task);
-	KASSERT(TAILQ_EMPTY(&ctx->unload_entries),
-	    ("unfinished unloads %p", ctx));
 	dmar_unmap_pgtbl(sf);
-	free_unr(dmar->domids, ctx->domain);
-	dmar_ctx_dtr(ctx, true, true);
+	domain = ctx->domain;
+	dmar_ctx_unlink(ctx);
+	free(ctx, M_DMAR_CTX);
+	dmar_unref_domain_locked(dmar, domain);
 	TD_PINNED_ASSERT;
 }
 
@@ -528,86 +652,101 @@
 {
 	struct dmar_unit *dmar;
 
-	dmar = ctx->dmar;
+	dmar = ctx->domain->dmar;
 	DMAR_LOCK(dmar);
 	dmar_free_ctx_locked(dmar, ctx);
 }
 
+/*
+ * Returns with the domain locked.
+ */
 struct dmar_ctx *
 dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid)
 {
+	struct dmar_domain *domain;
 	struct dmar_ctx *ctx;
 
 	DMAR_ASSERT_LOCKED(dmar);
 
-	LIST_FOREACH(ctx, &dmar->contexts, link) {
-		if (ctx->rid == rid)
-			return (ctx);
+	LIST_FOREACH(domain, &dmar->domains, link) {
+		LIST_FOREACH(ctx, &domain->contexts, link) {
+			if (ctx->rid == rid)
+				return (ctx);
+		}
 	}
 	return (NULL);
 }
 
 void
-dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_free_entry(struct dmar_map_entry *entry, bool free)
 {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 
-	ctx = entry->ctx;
-	DMAR_CTX_LOCK(ctx);
+	domain = entry->domain;
+	DMAR_DOMAIN_LOCK(domain);
 	if ((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0)
-		dmar_gas_free_region(ctx, entry);
+		dmar_gas_free_region(domain, entry);
 	else
-		dmar_gas_free_space(ctx, entry);
-	DMAR_CTX_UNLOCK(ctx);
+		dmar_gas_free_space(domain, entry);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (free)
-		dmar_gas_free_entry(ctx, entry);
+		dmar_gas_free_entry(domain, entry);
 	else
 		entry->flags = 0;
 }
 
 void
-dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free)
 {
 	struct dmar_unit *unit;
 
-	unit = entry->ctx->dmar;
+	unit = entry->domain->dmar;
 	if (unit->qi_enabled) {
 		DMAR_LOCK(unit);
-		dmar_qi_invalidate_locked(entry->ctx, entry->start,
-		    entry->end - entry->start, &entry->gseq);
+		dmar_qi_invalidate_locked(entry->domain, entry->start,
+		    entry->end - entry->start, &entry->gseq, true);
 		if (!free)
 			entry->flags |= DMAR_MAP_ENTRY_QI_NF;
 		TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
 		DMAR_UNLOCK(unit);
 	} else {
-		ctx_flush_iotlb_sync(entry->ctx, entry->start, entry->end -
-		    entry->start);
-		dmar_ctx_free_entry(entry, free);
+		domain_flush_iotlb_sync(entry->domain, entry->start,
+		    entry->end - entry->start);
+		dmar_domain_free_entry(entry, free);
 	}
 }
 
+static bool
+dmar_domain_unload_emit_wait(struct dmar_domain *domain,
+    struct dmar_map_entry *entry)
+{
+
+	if (TAILQ_NEXT(entry, dmamap_link) == NULL)
+		return (true);
+	return (domain->batch_no++ % dmar_batch_coalesce == 0);
+}
+
 void
-dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
-    bool cansleep)
+dmar_domain_unload(struct dmar_domain *domain,
+    struct dmar_map_entries_tailq *entries, bool cansleep)
 {
 	struct dmar_unit *unit;
 	struct dmar_map_entry *entry, *entry1;
-	struct dmar_qi_genseq gseq;
 	int error;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 
 	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
 		KASSERT((entry->flags & DMAR_MAP_ENTRY_MAP) != 0,
-		    ("not mapped entry %p %p", ctx, entry));
-		error = ctx_unmap_buf(ctx, entry->start, entry->end -
+		    ("not mapped entry %p %p", domain, entry));
+		error = domain_unmap_buf(domain, entry->start, entry->end -
 		    entry->start, cansleep ? DMAR_PGF_WAITOK : 0);
-		KASSERT(error == 0, ("unmap %p error %d", ctx, error));
+		KASSERT(error == 0, ("unmap %p error %d", domain, error));
 		if (!unit->qi_enabled) {
-			ctx_flush_iotlb_sync(ctx, entry->start,
+			domain_flush_iotlb_sync(domain, entry->start,
 			    entry->end - entry->start);
 			TAILQ_REMOVE(entries, entry, dmamap_link);
-			dmar_ctx_free_entry(entry, true);
+			dmar_domain_free_entry(entry, true);
 		}
 	}
 	if (TAILQ_EMPTY(entries))
@@ -616,36 +755,30 @@
 	KASSERT(unit->qi_enabled, ("loaded entry left"));
 	DMAR_LOCK(unit);
 	TAILQ_FOREACH(entry, entries, dmamap_link) {
-		entry->gseq.gen = 0;
-		entry->gseq.seq = 0;
-		dmar_qi_invalidate_locked(ctx, entry->start, entry->end -
-		    entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ?
-		    &gseq : NULL);
+		dmar_qi_invalidate_locked(domain, entry->start, entry->end -
+		    entry->start, &entry->gseq,
+		    dmar_domain_unload_emit_wait(domain, entry));
 	}
-	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
-		entry->gseq = gseq;
-		TAILQ_REMOVE(entries, entry, dmamap_link);
-		TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
-	}
+	TAILQ_CONCAT(&unit->tlb_flush_entries, entries, dmamap_link);
 	DMAR_UNLOCK(unit);
 }	
 
 static void
-dmar_ctx_unload_task(void *arg, int pending)
+dmar_domain_unload_task(void *arg, int pending)
 {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entries_tailq entries;
 
-	ctx = arg;
+	domain = arg;
 	TAILQ_INIT(&entries);
 
 	for (;;) {
-		DMAR_CTX_LOCK(ctx);
-		TAILQ_SWAP(&ctx->unload_entries, &entries, dmar_map_entry,
+		DMAR_DOMAIN_LOCK(domain);
+		TAILQ_SWAP(&domain->unload_entries, &entries, dmar_map_entry,
 		    dmamap_link);
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		if (TAILQ_EMPTY(&entries))
 			break;
-		dmar_ctx_unload(ctx, &entries, true);
+		dmar_domain_unload(domain, &entries, true);
 	}
 }

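The intel_ctx.c changes split the old struct dmar_ctx into a per-device
context plus a shared, reference-counted struct dmar_domain, and add
dmar_move_ctx_to_domain() for re-associating a context with another domain
on the same DMAR unit.  A hedged sketch of the intended calling pattern
follows; the wrapper function and its error reporting are illustrative and
not part of the commit:

static int
example_retarget_ctx(struct dmar_domain *new_domain, struct dmar_ctx *ctx)
{
	int error;

	/*
	 * dmar_move_ctx_to_domain() unlinks the context from its old
	 * domain, rewrites its context-table entry and flushes the
	 * context/IOTLB caches.  Both domains must belong to the same
	 * DMAR unit.
	 */
	error = dmar_move_ctx_to_domain(new_domain, ctx);
	if (error != 0) {
		/*
		 * The context entry has already been rewritten; a failed
		 * flush cannot be rolled back, so only report the error.
		 */
		printf("ctx move: cache flush failed, error %d\n", error);
	}
	return (error);
}
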
Modified: trunk/sys/x86/iommu/intel_dmar.h
===================================================================
--- trunk/sys/x86/iommu/intel_dmar.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_dmar.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/intel_dmar.h 281545 2015-04-15 06:56:51Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_dmar.h 320357 2017-06-26 12:30:39Z kib $
  */
 
 #ifndef __X86_IOMMU_INTEL_DMAR_H
@@ -51,10 +51,10 @@
 					   current R/B tree node */
 	u_int flags;
 	TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */
-	RB_ENTRY(dmar_map_entry) rb_entry;	 /* Links for ctx entries */
+	RB_ENTRY(dmar_map_entry) rb_entry;	 /* Links for domain entries */
 	TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after
 						    dmamap_load failure */
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_qi_genseq gseq;
 };
 
@@ -74,51 +74,85 @@
 #define	DMAR_MAP_ENTRY_SNOOP	0x4000	/* Snoop */
 #define	DMAR_MAP_ENTRY_TM	0x8000	/* Transient */
 
+/*
+ * Locking annotations:
+ * (u) - Protected by dmar unit lock
+ * (d) - Protected by domain lock
+ * (c) - Immutable after initialization
+ */
+
+/*
+ * The domain abstraction.  Most non-constant members of the domain
+ * are protected by owning dmar unit lock, not by the domain lock.
+ * Most important, the dmar lock protects the contexts list.
+ *
+ * The domain lock protects the address map for the domain, and list
+ * of unload entries delayed.
+ *
+ * Page tables pages and pages content is protected by the vm object
+ * lock pgtbl_obj, which contains the page tables pages.
+ */
+struct dmar_domain {
+	int domain;			/* (c) DID, written in context entry */
+	int mgaw;			/* (c) Real max address width */
+	int agaw;			/* (c) Adjusted guest address width */
+	int pglvl;			/* (c) The pagelevel */
+	int awlvl;			/* (c) The pagelevel as the bitmask,
+					   to set in context entry */
+	dmar_gaddr_t end;		/* (c) Highest address + 1 in
+					   the guest AS */
+	u_int ctx_cnt;			/* (u) Number of contexts owned */
+	u_int refs;			/* (u) Refs, including ctx */
+	struct dmar_unit *dmar;		/* (c) */
+	struct mtx lock;		/* (c) */
+	LIST_ENTRY(dmar_domain) link;	/* (u) Member in the dmar list */
+	LIST_HEAD(, dmar_ctx) contexts;	/* (u) */
+	vm_object_t pgtbl_obj;		/* (c) Page table pages */
+	u_int flags;			/* (u) */
+	u_int entries_cnt;		/* (d) */
+	struct dmar_gas_entries_tree rb_root; /* (d) */
+	struct dmar_map_entries_tailq unload_entries; /* (d) Entries to
+							 unload */
+	struct dmar_map_entry *first_place, *last_place; /* (d) */
+	struct task unload_task;	/* (c) */
+	u_int batch_no;
+};
+
 struct dmar_ctx {
-	uint16_t rid;	/* pci RID */
-	int domain;	/* DID */
-	int mgaw;	/* Real max address width */
-	int agaw;	/* Adjusted guest address width */
-	int pglvl;	/* The pagelevel */
-	int awlvl;	/* The pagelevel as the bitmask, to set in
-			   context entry */
-	dmar_gaddr_t end;/* Highest address + 1 in the guest AS */
-	u_int refs;	/* References to the context, from tags */
-	struct dmar_unit *dmar;
-	struct bus_dma_tag_dmar ctx_tag; /* Root tag */
-	struct mtx lock;
-	LIST_ENTRY(dmar_ctx) link;	/* Member in the dmar list */
-	vm_object_t pgtbl_obj;		/* Page table pages */
-	u_int flags;			/* Protected by dmar lock */
+	struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */
+	uint16_t rid;			/* (c) pci RID */
 	uint64_t last_fault_rec[2];	/* Last fault reported */
-	u_int entries_cnt;
-	u_long loads;
-	u_long unloads;
-	struct dmar_gas_entries_tree rb_root;
-	struct dmar_map_entries_tailq unload_entries; /* Entries to unload */
-	struct dmar_map_entry *first_place, *last_place;
-	struct task unload_task;
+	struct dmar_domain *domain;	/* (c) */
+	LIST_ENTRY(dmar_ctx) link;	/* (u) Member in the domain list */
+	u_int refs;			/* (u) References from tags */
+	u_int flags;			/* (u) */
+	u_long loads;			/* atomic updates, for stat only */
+	u_long unloads;			/* same */
 };
 
+#define	DMAR_DOMAIN_GAS_INITED		0x0001
+#define	DMAR_DOMAIN_PGTBL_INITED	0x0002
+#define	DMAR_DOMAIN_IDMAP		0x0010	/* Domain uses identity
+						   page table */
+#define	DMAR_DOMAIN_RMRR		0x0020	/* Domain contains RMRR entry,
+						   cannot be turned off */
+
 /* struct dmar_ctx flags */
 #define	DMAR_CTX_FAULTED	0x0001	/* Fault was reported,
 					   last_fault_rec is valid */
-#define	DMAR_CTX_IDMAP		0x0002	/* Context uses identity page table */
-#define	DMAR_CTX_RMRR		0x0004	/* Context contains RMRR entry,
-					   cannot be turned off */
-#define	DMAR_CTX_DISABLED	0x0008	/* Device is disabled, the
+#define	DMAR_CTX_DISABLED	0x0002	/* Device is disabled, the
 					   ephemeral reference is kept
 					   to prevent context destruction */
 
-#define	DMAR_CTX_PGLOCK(ctx)	VM_OBJECT_WLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_PGTRYLOCK(ctx)	VM_OBJECT_TRYWLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_PGUNLOCK(ctx)	VM_OBJECT_WUNLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_ASSERT_PGLOCKED(ctx) \
-	VM_OBJECT_ASSERT_WLOCKED((ctx)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGLOCK(dom)		VM_OBJECT_WLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGTRYLOCK(dom)	VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGUNLOCK(dom)	VM_OBJECT_WUNLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \
+	VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj)
 
-#define	DMAR_CTX_LOCK(ctx)	mtx_lock(&(ctx)->lock)
-#define	DMAR_CTX_UNLOCK(ctx)	mtx_unlock(&(ctx)->lock)
-#define	DMAR_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->lock, MA_OWNED)
+#define	DMAR_DOMAIN_LOCK(dom)	mtx_lock(&(dom)->lock)
+#define	DMAR_DOMAIN_UNLOCK(dom)	mtx_unlock(&(dom)->lock)
+#define	DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED)
 
 struct dmar_msi_data {
 	int irq;
@@ -158,7 +192,7 @@
 
 	/* Data for being a dmar */
 	struct mtx lock;
-	LIST_HEAD(, dmar_ctx) contexts;
+	LIST_HEAD(, dmar_domain) domains;
 	struct unrhdr *domids;
 	vm_object_t ctx_obj;
 	u_int barrier_flags;
@@ -186,6 +220,13 @@
 	u_int inv_seq_waiters;	/* count of waiters for seq */
 	u_int inv_queue_full;	/* informational counter */
 
+	/* IR */
+	int ir_enabled;
+	vm_paddr_t irt_phys;
+	dmar_irte_t *irt;
+	u_int irte_cnt;
+	vmem_t *irtids;
+
 	/* Delayed freeing of map entries queue processing */
 	struct dmar_map_entries_tailq tlb_flush_entries;
 	struct task qi_task;
@@ -195,6 +236,8 @@
 	struct task dmamap_load_task;
 	TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps;
 	struct taskqueue *delayed_taskqueue;
+
+	int dma_enabled;
 };
 
 #define	DMAR_LOCK(dmar)		mtx_lock(&(dmar)->lock)
@@ -207,6 +250,8 @@
 
 #define	DMAR_IS_COHERENT(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_C) != 0)
 #define	DMAR_HAS_QI(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_QI) != 0)
+#define	DMAR_X2APIC(dmar) \
+	(x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0)
 
 /* Barrier ids */
 #define	DMAR_BARRIER_RMRR	0
@@ -213,16 +258,18 @@
 #define	DMAR_BARRIER_USEQ	1
 
 struct dmar_unit *dmar_find(device_t dev);
+struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid);
+struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid);
 
 u_int dmar_nd2mask(u_int nd);
 bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl);
-int ctx_set_agaw(struct dmar_ctx *ctx, int mgaw);
-int dmar_maxaddr2mgaw(struct dmar_unit* unit, dmar_gaddr_t maxaddr,
+int domain_set_agaw(struct dmar_domain *domain, int mgaw);
+int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr,
     bool allow_less);
 vm_pindex_t pglvl_max_pages(int pglvl);
-int ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl);
+int domain_is_sp_lvl(struct dmar_domain *domain, int lvl);
 dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl);
-dmar_gaddr_t ctx_page_size(struct dmar_ctx *ctx, int lvl);
+dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl);
 int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
     dmar_gaddr_t *isizep);
 struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags);
@@ -239,8 +286,13 @@
 void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst);
 int dmar_enable_translation(struct dmar_unit *unit);
 int dmar_disable_translation(struct dmar_unit *unit);
+int dmar_load_irt_ptr(struct dmar_unit *unit);
+int dmar_enable_ir(struct dmar_unit *unit);
+int dmar_disable_ir(struct dmar_unit *unit);
 bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id);
 void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id);
+uint64_t dmar_get_timeout(void);
+void dmar_update_timeout(uint64_t newval);
 
 int dmar_fault_intr(void *arg);
 void dmar_enable_fault_intr(struct dmar_unit *unit);
@@ -253,52 +305,61 @@
 void dmar_disable_qi_intr(struct dmar_unit *unit);
 int dmar_init_qi(struct dmar_unit *unit);
 void dmar_fini_qi(struct dmar_unit *unit);
-void dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t start,
-    dmar_gaddr_t size, struct dmar_qi_genseq *pseq);
+void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start,
+    dmar_gaddr_t size, struct dmar_qi_genseq *psec, bool emit_wait);
 void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit);
 void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt);
 
-vm_object_t ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr);
+vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain,
+    dmar_gaddr_t maxaddr);
 void put_idmap_pgtbl(vm_object_t obj);
-int ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    vm_page_t *ma, uint64_t pflags, int flags);
-int ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    int flags);
-void ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base,
+int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags);
+int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, int flags);
+void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size);
-int ctx_alloc_pgtbl(struct dmar_ctx *ctx);
-void ctx_free_pgtbl(struct dmar_ctx *ctx);
+int domain_alloc_pgtbl(struct dmar_domain *domain);
+void domain_free_pgtbl(struct dmar_domain *domain);
 
 struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev,
     bool rmrr);
-struct dmar_ctx *dmar_get_ctx(struct dmar_unit *dmar, device_t dev, 
+struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev,
     uint16_t rid, bool id_mapped, bool rmrr_init);
+int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx);
 void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx);
 void dmar_free_ctx(struct dmar_ctx *ctx);
 struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid);
-void dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free);
-void dmar_ctx_unload(struct dmar_ctx *ctx,
+void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_unload(struct dmar_domain *domain,
     struct dmar_map_entries_tailq *entries, bool cansleep);
-void dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free);
 
 int dmar_init_busdma(struct dmar_unit *unit);
 void dmar_fini_busdma(struct dmar_unit *unit);
+device_t dmar_get_requester(device_t dev, uint16_t *rid);
 
-void dmar_gas_init_ctx(struct dmar_ctx *ctx);
-void dmar_gas_fini_ctx(struct dmar_ctx *ctx);
-struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags);
-void dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-void dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
-    dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
-    struct dmar_map_entry **res);
-void dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
-    u_int eflags, u_int flags, vm_page_t *ma);
-int dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+void dmar_gas_init_domain(struct dmar_domain *domain);
+void dmar_gas_fini_domain(struct dmar_domain *domain);
+struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain,
+    u_int flags);
+void dmar_gas_free_entry(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+void dmar_gas_free_space(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+int dmar_gas_map(struct dmar_domain *domain,
+    const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+    u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res);
+void dmar_gas_free_region(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+int dmar_gas_map_region(struct dmar_domain *domain,
+    struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma);
+int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t end);
 
-void dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
     struct dmar_map_entries_tailq *rmrr_entries);
 int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar);
 
@@ -305,6 +366,9 @@
 void dmar_quirks_post_ident(struct dmar_unit *dmar);
 void dmar_quirks_pre_use(struct dmar_unit *dmar);
 
+int dmar_init_irt(struct dmar_unit *unit);
+void dmar_fini_irt(struct dmar_unit *unit);
+
 #define	DMAR_GM_CANWAIT	0x0001
 #define	DMAR_GM_CANSPLIT 0x0002
 
@@ -318,6 +382,7 @@
 extern int haw;
 extern int dmar_tbl_pagecnt;
 extern int dmar_match_verbose;
+extern int dmar_batch_coalesce;
 extern int dmar_check_free;
 
 static inline uint32_t
@@ -375,13 +440,16 @@
  * containing the P or R and W bits, is set only after the high word
  * is written.  For clear, the P bit is cleared first, then the high
  * word is cleared.
+ *
+ * dmar_pte_update updates the pte.  For amd64, the update is atomic.
+ * For i386, it first disables the entry by clearing the word
+ * containing the P bit, and then defer to dmar_pte_store.  The locked
+ * cmpxchg8b is probably available on any machine having DMAR support,
+ * but interrupt translation table may be mapped uncached.
  */
 static inline void
-dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+dmar_pte_store1(volatile uint64_t *dst, uint64_t val)
 {
-
-	KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
-	    dst, (uintmax_t)*dst, (uintmax_t)val));
 #ifdef __i386__
 	volatile uint32_t *p;
 	uint32_t hi, lo;
@@ -397,6 +465,28 @@
 }
 
 static inline void
+dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+{
+
+	KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
+	    dst, (uintmax_t)*dst, (uintmax_t)val));
+	dmar_pte_store1(dst, val);
+}
+
+static inline void
+dmar_pte_update(volatile uint64_t *dst, uint64_t val)
+{
+
+#ifdef __i386__
+	volatile uint32_t *p;
+
+	p = (volatile uint32_t *)dst;
+	*p = 0;
+#endif
+	dmar_pte_store1(dst, val);
+}
+
+static inline void
 dmar_pte_clear(volatile uint64_t *dst)
 {
 #ifdef __i386__
@@ -420,6 +510,36 @@
 	return (start + size <= ((start + boundary) & ~(boundary - 1)));
 }
 
+extern struct timespec dmar_hw_timeout;
+
+#define	DMAR_WAIT_UNTIL(cond)					\
+{								\
+	struct timespec last, curr;				\
+	bool forever;						\
+								\
+	if (dmar_hw_timeout.tv_sec == 0 &&			\
+	    dmar_hw_timeout.tv_nsec == 0) {			\
+		forever = true;					\
+	} else {						\
+		forever = false;				\
+		nanouptime(&curr);				\
+		last = curr;					\
+		timespecadd(&last, &dmar_hw_timeout);		\
+	}							\
+	for (;;) {						\
+		if (cond) {					\
+			error = 0;				\
+			break;					\
+		}						\
+		nanouptime(&curr);				\
+		if (!forever && timespeccmp(&last, &curr, <)) {	\
+			error = ETIMEDOUT;			\
+			break;					\
+		}						\
+		cpu_spinwait();					\
+	}							\
+}
+
 #ifdef INVARIANTS
 #define	TD_PREP_PINNED_ASSERT						\
 	int old_td_pinned;						\

Modified: trunk/sys/x86/iommu/intel_drv.c
===================================================================
--- trunk/sys/x86/iommu/intel_drv.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_drv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,10 +29,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_drv.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_drv.c 323921 2017-09-22 10:51:32Z kib $");
 
 #include "opt_acpi.h"
-#if defined(__amd64__) /* || defined(__ia64__) */
+#if defined(__amd64__)
 #define	DEV_APIC
 #else
 #include "opt_apic.h"
@@ -51,6 +51,7 @@
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -66,10 +67,14 @@
 #include <x86/iommu/intel_reg.h>
 #include <x86/iommu/busdma_dmar.h>
 #include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #ifdef DEV_APIC
 #include "pcib_if.h"
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
 #endif
 
 #define	DMAR_FAULT_IRQ_RID	0
@@ -108,6 +113,7 @@
 		if (!iter(dmarh, arg))
 			break;
 	}
+	AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
 }
 
 struct find_iter_args {
@@ -183,6 +189,7 @@
 		    (unsigned)dmartbl->Flags,
 		    "\020\001INTR_REMAP\002X2APIC_OPT_OUT");
 	}
+	AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
 
 	dmar_iterate_tbl(dmar_count_iter, NULL);
 	if (dmar_devcnt == 0)
@@ -244,6 +251,7 @@
 	int i;
 
 	dmar_fini_busdma(unit);
+	dmar_fini_irt(unit);
 	dmar_fini_qi(unit);
 	dmar_fini_fault_log(unit);
 	for (i = 0; i < DMAR_INTR_TOTAL; i++)
@@ -304,7 +312,7 @@
 		    dmd->name, error);
 		goto err4;
 	}
-	bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, dmd->name);
+	bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, "%s", dmd->name);
 	error = PCIB_MAP_MSI(pcib, dev, dmd->irq, &msi_addr, &msi_data);
 	if (error != 0) {
 		device_printf(dev, "cannot map %s interrupt, %d\n",
@@ -398,6 +406,7 @@
 {
 	struct dmar_unit *unit;
 	ACPI_DMAR_HARDWARE_UNIT *dmaru;
+	uint64_t timeout;
 	int i, error;
 
 	unit = device_get_softc(dev);
@@ -422,6 +431,10 @@
 		dmar_print_caps(dev, unit, dmaru);
 	dmar_quirks_post_ident(unit);
 
+	timeout = dmar_get_timeout();
+	TUNABLE_UINT64_FETCH("hw.dmar.timeout", &timeout);
+	dmar_update_timeout(timeout);
+
 	for (i = 0; i < DMAR_INTR_TOTAL; i++)
 		unit->intrs[i].irq = -1;
 
@@ -457,6 +470,7 @@
 	mtx_init(&unit->lock, "dmarhw", NULL, MTX_DEF);
 	unit->domids = new_unrhdr(0, dmar_nd2mask(DMAR_CAP_ND(unit->hw_cap)),
 	    &unit->lock);
+	LIST_INIT(&unit->domains);
 
 	/*
 	 * 9.2 "Context Entry":
@@ -510,6 +524,11 @@
 		dmar_release_resources(dev, unit);
 		return (error);
 	}
+	error = dmar_init_irt(unit);
+	if (error != 0) {
+		dmar_release_resources(dev, unit);
+		return (error);
+	}
 	error = dmar_init_busdma(unit);
 	if (error != 0) {
 		dmar_release_resources(dev, unit);
@@ -764,8 +783,87 @@
 	return (device_get_softc(dmar_dev));
 }
 
+static struct dmar_unit *
+dmar_find_nonpci(u_int id, u_int entry_type, uint16_t *rid)
+{
+	device_t dmar_dev;
+	struct dmar_unit *unit;
+	ACPI_DMAR_HARDWARE_UNIT *dmarh;
+	ACPI_DMAR_DEVICE_SCOPE *devscope;
+	ACPI_DMAR_PCI_PATH *path;
+	char *ptr, *ptrend;
+#ifdef DEV_APIC
+	int error;
+#endif
+	int i;
+
+	for (i = 0; i < dmar_devcnt; i++) {
+		dmar_dev = dmar_devs[i];
+		if (dmar_dev == NULL)
+			continue;
+		unit = (struct dmar_unit *)device_get_softc(dmar_dev);
+		dmarh = dmar_find_by_index(i);
+		if (dmarh == NULL)
+			continue;
+		ptr = (char *)dmarh + sizeof(*dmarh);
+		ptrend = (char *)dmarh + dmarh->Header.Length;
+		for (;;) {
+			if (ptr >= ptrend)
+				break;
+			devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr;
+			ptr += devscope->Length;
+			if (devscope->EntryType != entry_type)
+				continue;
+			if (devscope->EnumerationId != id)
+				continue;
+#ifdef DEV_APIC
+			if (entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+				error = ioapic_get_rid(id, rid);
+				/*
+				 * If our IOAPIC has PCI bindings then
+				 * use the PCI device rid.
+				 */
+				if (error == 0)
+					return (unit);
+			}
+#endif
+			if (devscope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE)
+			    == 2) {
+				if (rid != NULL) {
+					path = (ACPI_DMAR_PCI_PATH *)
+					    (devscope + 1);
+					*rid = PCI_RID(devscope->Bus,
+					    path->Device, path->Function);
+				}
+				return (unit);
+			}
+			printf(
+		           "dmar_find_nonpci: id %d type %d path length != 2\n",
+			    id, entry_type);
+			break;
+		}
+	}
+	return (NULL);
+}
+
+
+struct dmar_unit *
+dmar_find_hpet(device_t dev, uint16_t *rid)
+{
+
+	return (dmar_find_nonpci(hpet_get_uid(dev), ACPI_DMAR_SCOPE_TYPE_HPET,
+	    rid));
+}
+
+struct dmar_unit *
+dmar_find_ioapic(u_int apic_id, uint16_t *rid)
+{
+
+	return (dmar_find_nonpci(apic_id, ACPI_DMAR_SCOPE_TYPE_IOAPIC, rid));
+}
+
 struct rmrr_iter_args {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	device_t dev;
 	int dev_domain;
 	int dev_busno;
@@ -810,7 +908,8 @@
 		if (match == 1) {
 			if (dmar_match_verbose)
 				printf("matched\n");
-			entry = dmar_gas_alloc_entry(ria->ctx, DMAR_PGF_WAITOK);
+			entry = dmar_gas_alloc_entry(ria->domain,
+			    DMAR_PGF_WAITOK);
 			entry->start = resmem->BaseAddress;
 			/* The RMRR entry end address is inclusive. */
 			entry->end = resmem->EndAddress;
@@ -825,7 +924,7 @@
 }
 
 void
-dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
     struct dmar_map_entries_tailq *rmrr_entries)
 {
 	struct rmrr_iter_args ria;
@@ -841,7 +940,7 @@
 		    dev_path);
 	}
 
-	ria.ctx = ctx;
+	ria.domain = domain;
 	ria.dev = dev;
 	ria.dev_path = dev_path;
 	ria.rmrr_entries = rmrr_entries;
@@ -961,7 +1060,7 @@
 		printf("dmar%d: instantiating RMRR contexts\n", dmar->unit);
 	dmar_iterate_tbl(dmar_inst_rmrr_iter, &iria);
 	DMAR_LOCK(dmar);
-	if (!LIST_EMPTY(&dmar->contexts)) {
+	if (!LIST_EMPTY(&dmar->domains)) {
 		KASSERT((dmar->hw_gcmd & DMAR_GCMD_TE) == 0,
 	    ("dmar%d: RMRR not handled but translation is already enabled",
 		    dmar->unit));
@@ -976,7 +1075,7 @@
 #include <ddb/db_lex.h>
 
 static void
-dmar_print_ctx_entry(const struct dmar_map_entry *entry)
+dmar_print_domain_entry(const struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *l, *r;
 
@@ -1000,24 +1099,39 @@
 }
 
 static void
-dmar_print_ctx(struct dmar_ctx *ctx, bool show_mappings)
+dmar_print_ctx(struct dmar_ctx *ctx)
 {
-	struct dmar_map_entry *entry;
 
 	db_printf(
-	    "  @%p pci%d:%d:%d dom %d mgaw %d agaw %d pglvl %d end %jx\n"
-	    "    refs %d flags %x pgobj %p map_ents %u loads %lu unloads %lu\n",
+	    "    @%p pci%d:%d:%d refs %d flags %x loads %lu unloads %lu\n",
 	    ctx, pci_get_bus(ctx->ctx_tag.owner),
 	    pci_get_slot(ctx->ctx_tag.owner),
-	    pci_get_function(ctx->ctx_tag.owner), ctx->domain, ctx->mgaw,
-	    ctx->agaw, ctx->pglvl, (uintmax_t)ctx->end, ctx->refs,
-	    ctx->flags, ctx->pgtbl_obj, ctx->entries_cnt, ctx->loads,
-	    ctx->unloads);
+	    pci_get_function(ctx->ctx_tag.owner), ctx->refs, ctx->flags,
+	    ctx->loads, ctx->unloads);
+}
+
+static void
+dmar_print_domain(struct dmar_domain *domain, bool show_mappings)
+{
+	struct dmar_map_entry *entry;
+	struct dmar_ctx *ctx;
+
+	db_printf(
+	    "  @%p dom %d mgaw %d agaw %d pglvl %d end %jx refs %d\n"
+	    "   ctx_cnt %d flags %x pgobj %p map_ents %u\n",
+	    domain, domain->domain, domain->mgaw, domain->agaw, domain->pglvl,
+	    (uintmax_t)domain->end, domain->refs, domain->ctx_cnt,
+	    domain->flags, domain->pgtbl_obj, domain->entries_cnt);
+	if (!LIST_EMPTY(&domain->contexts)) {
+		db_printf("  Contexts:\n");
+		LIST_FOREACH(ctx, &domain->contexts, link)
+			dmar_print_ctx(ctx);
+	}
 	if (!show_mappings)
 		return;
 	db_printf("    mapped:\n");
-	RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
-		dmar_print_ctx_entry(entry);
+	RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+		dmar_print_domain_entry(entry);
 		if (db_pager_quit)
 			break;
 	}
@@ -1024,19 +1138,20 @@
 	if (db_pager_quit)
 		return;
 	db_printf("    unloading:\n");
-	TAILQ_FOREACH(entry, &ctx->unload_entries, dmamap_link) {
-		dmar_print_ctx_entry(entry);
+	TAILQ_FOREACH(entry, &domain->unload_entries, dmamap_link) {
+		dmar_print_domain_entry(entry);
 		if (db_pager_quit)
 			break;
 	}
 }
 
-DB_FUNC(dmar_ctx, db_dmar_print_ctx, db_show_table, CS_OWN, NULL)
+DB_FUNC(dmar_domain, db_dmar_print_domain, db_show_table, CS_OWN, NULL)
 {
 	struct dmar_unit *unit;
+	struct dmar_domain *domain;
 	struct dmar_ctx *ctx;
 	bool show_mappings, valid;
-	int domain, bus, device, function, i, t;
+	int pci_domain, bus, device, function, i, t;
 	db_expr_t radix;
 
 	valid = false;
@@ -1057,7 +1172,7 @@
 		show_mappings = false;
 	}
 	if (t == tNUMBER) {
-		domain = db_tok_number;
+		pci_domain = db_tok_number;
 		t = db_read_token();
 		if (t == tNUMBER) {
 			bus = db_tok_number;
@@ -1075,19 +1190,24 @@
 			db_radix = radix;
 	db_skip_to_eol();
 	if (!valid) {
-		db_printf("usage: show dmar_ctx [/m] "
+		db_printf("usage: show dmar_domain [/m] "
 		    "<domain> <bus> <device> <func>\n");
 		return;
 	}
 	for (i = 0; i < dmar_devcnt; i++) {
 		unit = device_get_softc(dmar_devs[i]);
-		LIST_FOREACH(ctx, &unit->contexts, link) {
-			if (domain == unit->segment && 
-			    bus == pci_get_bus(ctx->ctx_tag.owner) &&
-			    device == pci_get_slot(ctx->ctx_tag.owner) && 
-			    function == pci_get_function(ctx->ctx_tag.owner)) {
-				dmar_print_ctx(ctx, show_mappings);
-				goto out;
+		LIST_FOREACH(domain, &unit->domains, link) {
+			LIST_FOREACH(ctx, &domain->contexts, link) {
+				if (pci_domain == unit->segment && 
+				    bus == pci_get_bus(ctx->ctx_tag.owner) &&
+				    device ==
+				    pci_get_slot(ctx->ctx_tag.owner) &&
+				    function ==
+				    pci_get_function(ctx->ctx_tag.owner)) {
+					dmar_print_domain(domain,
+					    show_mappings);
+					goto out;
+				}
 			}
 		}
 	}
@@ -1095,10 +1215,10 @@
 }
 
 static void
-dmar_print_one(int idx, bool show_ctxs, bool show_mappings)
+dmar_print_one(int idx, bool show_domains, bool show_mappings)
 {
 	struct dmar_unit *unit;
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	int i, frir;
 
 	unit = device_get_softc(dmar_devs[idx]);
@@ -1110,6 +1230,10 @@
 	    dmar_read4(unit, DMAR_GSTS_REG),
 	    dmar_read4(unit, DMAR_FSTS_REG),
 	    dmar_read4(unit, DMAR_FECTL_REG));
+	if (unit->ir_enabled) {
+		db_printf("ir is enabled; IRT @%p phys 0x%jx maxcnt %d\n",
+		    unit->irt, (uintmax_t)unit->irt_phys, unit->irte_cnt);
+	}
 	db_printf("fed 0x%x fea 0x%x feua 0x%x\n",
 	    dmar_read4(unit, DMAR_FEDATA_REG),
 	    dmar_read4(unit, DMAR_FEADDR_REG),
@@ -1148,10 +1272,10 @@
 			db_printf("qi is disabled\n");
 		}
 	}
-	if (show_ctxs) {
-		db_printf("contexts:\n");
-		LIST_FOREACH(ctx, &unit->contexts, link) {
-			dmar_print_ctx(ctx, show_mappings);
+	if (show_domains) {
+		db_printf("domains:\n");
+		LIST_FOREACH(domain, &unit->domains, link) {
+			dmar_print_domain(domain, show_mappings);
 			if (db_pager_quit)
 				break;
 		}
@@ -1160,27 +1284,27 @@
 
 DB_SHOW_COMMAND(dmar, db_dmar_print)
 {
-	bool show_ctxs, show_mappings;
+	bool show_domains, show_mappings;
 
-	show_ctxs = strchr(modif, 'c') != NULL;
+	show_domains = strchr(modif, 'd') != NULL;
 	show_mappings = strchr(modif, 'm') != NULL;
 	if (!have_addr) {
-		db_printf("usage: show dmar [/c] [/m] index\n");
+		db_printf("usage: show dmar [/d] [/m] index\n");
 		return;
 	}
-	dmar_print_one((int)addr, show_ctxs, show_mappings);
+	dmar_print_one((int)addr, show_domains, show_mappings);
 }
 
 DB_SHOW_ALL_COMMAND(dmars, db_show_all_dmars)
 {
 	int i;
-	bool show_ctxs, show_mappings;
+	bool show_domains, show_mappings;
 
-	show_ctxs = strchr(modif, 'c') != NULL;
+	show_domains = strchr(modif, 'd') != NULL;
 	show_mappings = strchr(modif, 'm') != NULL;
 
 	for (i = 0; i < dmar_devcnt; i++) {
-		dmar_print_one(i, show_ctxs, show_mappings);
+		dmar_print_one(i, show_domains, show_mappings);
 		if (db_pager_quit)
 			break;
 	}
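
Note on the dmar_find_nonpci() addition above: it walks the ACPI DMAR
device-scope entries to resolve an HPET or I/O APIC enumeration ID to the
covering DMAR unit, and reports the requester ID (RID) taken from the
scope's bus/path pair.  A rough sketch of the RID packing this relies on,
assuming the conventional bus 15:8 / slot 7:3 / function 2:0 layout (the
helper name below is hypothetical; the driver itself uses the PCI_RID()
macro):

	#include <sys/types.h>

	/* Illustration only: pack bus/slot/function into a 16-bit RID. */
	static inline uint16_t
	example_pci_rid(uint8_t bus, uint8_t slot, uint8_t func)
	{

		return (((uint16_t)bus << 8) | ((uint16_t)(slot & 0x1f) << 3) |
		    (func & 0x7));
	}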

Modified: trunk/sys/x86/iommu/intel_fault.c
===================================================================
--- trunk/sys/x86/iommu/intel_fault.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_fault.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_fault.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_fault.c 309882 2016-12-12 09:43:48Z kib $");
 
 #include "opt_acpi.h"
 
@@ -42,6 +42,7 @@
 #include <sys/rman.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -179,7 +180,7 @@
 	}
 
 	if (enqueue) {
-		taskqueue_enqueue_fast(unit->fault_taskqueue,
+		taskqueue_enqueue(unit->fault_taskqueue,
 		    &unit->fault_task);
 	}
 	return (FILTER_HANDLED);
@@ -271,7 +272,7 @@
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	TASK_INIT(&unit->fault_task, 0, dmar_fault_task, unit);
-	unit->fault_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+	unit->fault_taskqueue = taskqueue_create_fast("dmarff", M_WAITOK,
 	    taskqueue_thread_enqueue, &unit->fault_taskqueue);
 	taskqueue_start_threads(&unit->fault_taskqueue, 1, PI_AV,
 	    "dmar%d fault taskq", unit->unit);

Modified: trunk/sys/x86/iommu/intel_gas.c
===================================================================
--- trunk/sys/x86/iommu/intel_gas.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_gas.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_gas.c 281545 2015-04-15 06:56:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_gas.c 329942 2018-02-25 00:32:42Z markj $");
 
 #define	RB_AUGMENT(entry) dmar_gas_augment_entry(entry)
 
@@ -50,6 +50,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -79,12 +80,12 @@
 
 	dmar_map_entry_zone = uma_zcreate("DMAR_MAP_ENTRY",
 	    sizeof(struct dmar_map_entry), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
 }
 SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
 
 struct dmar_map_entry *
-dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags)
+dmar_gas_alloc_entry(struct dmar_domain *domain, u_int flags)
 {
 	struct dmar_map_entry *res;
 
@@ -94,20 +95,20 @@
 	res = uma_zalloc(dmar_map_entry_zone, ((flags & DMAR_PGF_WAITOK) !=
 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
 	if (res != NULL) {
-		res->ctx = ctx;
-		atomic_add_int(&ctx->entries_cnt, 1);
+		res->domain = domain;
+		atomic_add_int(&domain->entries_cnt, 1);
 	}
 	return (res);
 }
 
 void
-dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_entry(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 
-	KASSERT(ctx == entry->ctx,
-	    ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
-	    entry, entry->ctx));
-	atomic_subtract_int(&ctx->entries_cnt, 1);
+	KASSERT(domain == entry->domain,
+	    ("mismatched free domain %p entry %p entry->domain %p", domain,
+	    entry, entry->domain));
+	atomic_subtract_int(&domain->entries_cnt, 1);
 	uma_zfree(dmar_map_entry_zone, entry);
 }
 
@@ -158,12 +159,12 @@
     dmar_gas_cmp_entries);
 
 static void
-dmar_gas_fix_free(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_fix_free(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *next;
 
-	next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	entry->free_after = (next != NULL ? next->start : ctx->end) -
+	next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	entry->free_after = (next != NULL ? next->start : domain->end) -
 	    entry->end;
 	dmar_gas_augment_entry(entry);
 }
@@ -170,18 +171,18 @@
 
 #ifdef INVARIANTS
 static void
-dmar_gas_check_free(struct dmar_ctx *ctx)
+dmar_gas_check_free(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *entry, *next, *l, *r;
 	dmar_gaddr_t v;
 
-	RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
-		KASSERT(ctx == entry->ctx,
-		    ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
-		    entry, entry->ctx));
-		next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+		KASSERT(domain == entry->domain,
+		    ("mismatched free domain %p entry %p entry->domain %p",
+		    domain, entry, entry->domain));
+		next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
 		if (next == NULL) {
-			MPASS(entry->free_after == ctx->end - entry->end);
+			MPASS(entry->free_after == domain->end - entry->end);
 		} else {
 			MPASS(entry->free_after == next->start - entry->end);
 			MPASS(entry->end <= next->start);
@@ -198,7 +199,7 @@
 			    l->free_down));
 		} else {
 			v = MAX(entry->free_after, l->free_down);
-			v = MAX(entry->free_down, r->free_down);
+			v = MAX(v, r->free_down);
 			MPASS(entry->free_down == v);
 		}
 	}
@@ -206,93 +207,95 @@
 #endif
 
 static bool
-dmar_gas_rb_insert(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_insert(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *prev, *found;
 
-	found = RB_INSERT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_fix_free(ctx, entry);
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	found = RB_INSERT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_fix_free(domain, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
 	if (prev != NULL)
-		dmar_gas_fix_free(ctx, prev);
+		dmar_gas_fix_free(domain, prev);
 	return (found == NULL);
 }
 
 static void
-dmar_gas_rb_remove(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_remove(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *prev;
 
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
 	if (prev != NULL)
-		dmar_gas_fix_free(ctx, prev);
+		dmar_gas_fix_free(domain, prev);
 }
 
 void
-dmar_gas_init_ctx(struct dmar_ctx *ctx)
+dmar_gas_init_domain(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *begin, *end;
 
-	begin = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
-	end = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+	begin = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
+	end = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
 
-	DMAR_CTX_LOCK(ctx);
-	KASSERT(ctx->entries_cnt == 2, ("dirty ctx %p", ctx));
-	KASSERT(RB_EMPTY(&ctx->rb_root), ("non-empty entries %p", ctx));
+	DMAR_DOMAIN_LOCK(domain);
+	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
+	KASSERT(RB_EMPTY(&domain->rb_root), ("non-empty entries %p", domain));
 
 	begin->start = 0;
 	begin->end = DMAR_PAGE_SIZE;
-	begin->free_after = ctx->end - begin->end;
+	begin->free_after = domain->end - begin->end;
 	begin->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
-	dmar_gas_rb_insert(ctx, begin);
+	dmar_gas_rb_insert(domain, begin);
 
-	end->start = ctx->end;
-	end->end = ctx->end;
+	end->start = domain->end;
+	end->end = domain->end;
 	end->free_after = 0;
 	end->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
-	dmar_gas_rb_insert(ctx, end);
+	dmar_gas_rb_insert(domain, end);
 
-	ctx->first_place = begin;
-	ctx->last_place = end;
-	DMAR_CTX_UNLOCK(ctx);
+	domain->first_place = begin;
+	domain->last_place = end;
+	domain->flags |= DMAR_DOMAIN_GAS_INITED;
+	DMAR_DOMAIN_UNLOCK(domain);
 }
 
 void
-dmar_gas_fini_ctx(struct dmar_ctx *ctx)
+dmar_gas_fini_domain(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *entry, *entry1;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
-	KASSERT(ctx->entries_cnt == 2, ("ctx still in use %p", ctx));
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
+	KASSERT(domain->entries_cnt == 2, ("domain still in use %p", domain));
 
-	entry = RB_MIN(dmar_gas_entries_tree, &ctx->rb_root);
-	KASSERT(entry->start == 0, ("start entry start %p", ctx));
-	KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", ctx));
+	entry = RB_MIN(dmar_gas_entries_tree, &domain->rb_root);
+	KASSERT(entry->start == 0, ("start entry start %p", domain));
+	KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", domain));
 	KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
-	    ("start entry flags %p", ctx));
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_free_entry(ctx, entry);
+	    ("start entry flags %p", domain));
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_free_entry(domain, entry);
 
-	entry = RB_MAX(dmar_gas_entries_tree, &ctx->rb_root);
-	KASSERT(entry->start == ctx->end, ("end entry start %p", ctx));
-	KASSERT(entry->end == ctx->end, ("end entry end %p", ctx));
-	KASSERT(entry->free_after == 0, ("end entry free_after%p", ctx));
+	entry = RB_MAX(dmar_gas_entries_tree, &domain->rb_root);
+	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
+	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
+	KASSERT(entry->free_after == 0, ("end entry free_after %p", domain));
 	KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
-	    ("end entry flags %p", ctx));
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_free_entry(ctx, entry);
+	    ("end entry flags %p", domain));
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_free_entry(domain, entry);
 
-	RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &ctx->rb_root, entry1) {
+	RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &domain->rb_root,
+	    entry1) {
 		KASSERT((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0,
-		    ("non-RMRR entry left %p", ctx));
-		RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-		dmar_gas_free_entry(ctx, entry);
+		    ("non-RMRR entry left %p", domain));
+		RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+		dmar_gas_free_entry(domain, entry);
 	}
 }
 
 struct dmar_gas_match_args {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	dmar_gaddr_t size;
 	int offset;
 	const struct bus_dma_tag_common *common;
@@ -325,8 +328,8 @@
 	 * the boundary.  Check if there is enough space after the
 	 * next boundary after the prev->end.
 	 */
-	bs = (a->entry->start + a->offset + a->common->boundary) &
-	    ~(a->common->boundary - 1);
+	bs = rounddown2(a->entry->start + a->offset + a->common->boundary,
+	    a->common->boundary);
 	start = roundup2(bs, a->common->alignment);
 	/* DMAR_PAGE_SIZE to create gap after new entry. */
 	if (start + a->offset + a->size + DMAR_PAGE_SIZE <=
@@ -371,12 +374,12 @@
 	 */
 	a->entry->end = a->entry->start + a->size;
 
-	next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
+	next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, prev);
 	KASSERT(next->start >= a->entry->end &&
 	    next->start - a->entry->start >= a->size &&
 	    prev->end <= a->entry->end,
 	    ("dmar_gas_match_insert hole failed %p prev (%jx, %jx) "
-	    "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->ctx,
+	    "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->domain,
 	    (uintmax_t)prev->start, (uintmax_t)prev->end,
 	    (uintmax_t)prev->free_after,
 	    (uintmax_t)next->start, (uintmax_t)next->end,
@@ -385,19 +388,19 @@
 	prev->free_after = a->entry->start - prev->end;
 	a->entry->free_after = next->start - a->entry->end;
 
-	found = dmar_gas_rb_insert(a->ctx, a->entry);
+	found = dmar_gas_rb_insert(a->domain, a->entry);
 	KASSERT(found, ("found dup %p start %jx size %jx",
-	    a->ctx, (uintmax_t)a->entry->start, (uintmax_t)a->size));
+	    a->domain, (uintmax_t)a->entry->start, (uintmax_t)a->size));
 	a->entry->flags = DMAR_MAP_ENTRY_MAP;
 
-	KASSERT(RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root,
+	KASSERT(RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root,
 	    a->entry) == prev,
 	    ("entry %p prev %p inserted prev %p", a->entry, prev,
-	    RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
-	KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root,
+	    RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
+	KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
 	    a->entry) == next,
 	    ("entry %p next %p inserted next %p", a->entry, next,
-	    RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
+	    RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
 }
 
 static int
@@ -434,11 +437,12 @@
 	struct dmar_map_entry *next, *prev, find_entry;
 
 	find_entry.start = a->common->highaddr;
-	next = RB_NFIND(dmar_gas_entries_tree, &a->ctx->rb_root, &find_entry);
+	next = RB_NFIND(dmar_gas_entries_tree, &a->domain->rb_root,
+	    &find_entry);
 	if (next == NULL)
 		return (ENOMEM);
-	prev = RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, next);
-	KASSERT(prev != NULL, ("no prev %p %jx", a->ctx,
+	prev = RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, next);
+	KASSERT(prev != NULL, ("no prev %p %jx", a->domain,
 	    (uintmax_t)find_entry.start));
 	for (;;) {
 		a->entry->start = prev->start + DMAR_PAGE_SIZE;
@@ -446,7 +450,7 @@
 			a->entry->start = a->common->highaddr;
 		a->entry->start = roundup2(a->entry->start,
 		    a->common->alignment);
-		if (dmar_gas_match_one(a, prev, a->ctx->end)) {
+		if (dmar_gas_match_one(a, prev, a->domain->end)) {
 			dmar_gas_match_insert(a, prev);
 			return (0);
 		}
@@ -459,16 +463,17 @@
 		 * non-optimal way.
 		 */
 		prev = next;
-		next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
-		KASSERT(next != NULL, ("no next %p %jx", a->ctx,
+		next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
+		    prev);
+		KASSERT(next != NULL, ("no next %p %jx", a->domain,
 		    (uintmax_t)find_entry.start));
-		if (next->end >= a->ctx->end)
+		if (next->end >= a->domain->end)
 			return (ENOMEM);
 	}
 }
 
 static int
-dmar_gas_find_space(struct dmar_ctx *ctx,
+dmar_gas_find_space(struct dmar_domain *domain,
     const struct bus_dma_tag_common *common, dmar_gaddr_t size,
     int offset, u_int flags, struct dmar_map_entry *entry)
 {
@@ -475,11 +480,11 @@
 	struct dmar_gas_match_args a;
 	int error;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
-	KASSERT(entry->flags == 0, ("dirty entry %p %p", ctx, entry));
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
+	KASSERT(entry->flags == 0, ("dirty entry %p %p", domain, entry));
 	KASSERT((size & DMAR_PAGE_MASK) == 0, ("size %jx", (uintmax_t)size));
 
-	a.ctx = ctx;
+	a.domain = domain;
 	a.size = size;
 	a.offset = offset;
 	a.common = common;
@@ -488,7 +493,7 @@
 
 	/* Handle lower region. */
 	if (common->lowaddr > 0) {
-		error = dmar_gas_lowermatch(&a, RB_ROOT(&ctx->rb_root));
+		error = dmar_gas_lowermatch(&a, RB_ROOT(&domain->rb_root));
 		if (error == 0)
 			return (0);
 		KASSERT(error == ENOMEM,
@@ -495,7 +500,7 @@
 		    ("error %d from dmar_gas_lowermatch", error));
 	}
 	/* Handle upper region. */
-	if (common->highaddr >= ctx->end)
+	if (common->highaddr >= domain->end)
 		return (ENOMEM);
 	error = dmar_gas_uppermatch(&a);
 	KASSERT(error == ENOMEM,
@@ -504,13 +509,13 @@
 }
 
 static int
-dmar_gas_alloc_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_alloc_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
     u_int flags)
 {
 	struct dmar_map_entry *next, *prev;
 	bool found;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 
 	if ((entry->start & DMAR_PAGE_MASK) != 0 ||
 	    (entry->end & DMAR_PAGE_MASK) != 0)
@@ -517,13 +522,13 @@
 		return (EINVAL);
 	if (entry->start >= entry->end)
 		return (EINVAL);
-	if (entry->end >= ctx->end)
+	if (entry->end >= domain->end)
 		return (EINVAL);
 
-	next = RB_NFIND(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	KASSERT(next != NULL, ("next must be non-null %p %jx", ctx,
+	next = RB_NFIND(dmar_gas_entries_tree, &domain->rb_root, entry);
+	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
 	    (uintmax_t)entry->start));
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, next);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, next);
 	/* prev could be NULL */
 
 	/*
@@ -551,23 +556,23 @@
 
 	if (prev != NULL && prev->end > entry->start) {
 		/* This assumes that prev is the placeholder entry. */
-		dmar_gas_rb_remove(ctx, prev);
+		dmar_gas_rb_remove(domain, prev);
 		prev = NULL;
 	}
 	if (next != NULL && next->start < entry->end) {
-		dmar_gas_rb_remove(ctx, next);
+		dmar_gas_rb_remove(domain, next);
 		next = NULL;
 	}
 
-	found = dmar_gas_rb_insert(ctx, entry);
+	found = dmar_gas_rb_insert(domain, entry);
 	KASSERT(found, ("found RMRR dup %p start %jx end %jx",
-	    ctx, (uintmax_t)entry->start, (uintmax_t)entry->end));
+	    domain, (uintmax_t)entry->start, (uintmax_t)entry->end));
 	entry->flags = DMAR_MAP_ENTRY_RMRR;
 
 #ifdef INVARIANTS
 	struct dmar_map_entry *ip, *in;
-	ip = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	in = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	ip = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	in = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
 	KASSERT(prev == NULL || ip == prev,
 	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
 	    entry, entry->start, entry->end, prev,
@@ -584,47 +589,47 @@
 }
 
 void
-dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_space(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 	KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
 	    DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_MAP,
-	    ("permanent entry %p %p", ctx, entry));
+	    ("permanent entry %p %p", domain, entry));
 
-	dmar_gas_rb_remove(ctx, entry);
+	dmar_gas_rb_remove(domain, entry);
 	entry->flags &= ~DMAR_MAP_ENTRY_MAP;
 #ifdef INVARIANTS
 	if (dmar_check_free)
-		dmar_gas_check_free(ctx);
+		dmar_gas_check_free(domain);
 #endif
 }
 
 void
-dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_region(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *next, *prev;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 	KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
 	    DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_RMRR,
-	    ("non-RMRR entry %p %p", ctx, entry));
+	    ("non-RMRR entry %p %p", domain, entry));
 
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_rb_remove(ctx, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_rb_remove(domain, entry);
 	entry->flags &= ~DMAR_MAP_ENTRY_RMRR;
 
 	if (prev == NULL)
-		dmar_gas_rb_insert(ctx, ctx->first_place);
+		dmar_gas_rb_insert(domain, domain->first_place);
 	if (next == NULL)
-		dmar_gas_rb_insert(ctx, ctx->last_place);
+		dmar_gas_rb_insert(domain, domain->last_place);
 }
 
 int
-dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
-    dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
-    struct dmar_map_entry **res)
+dmar_gas_map(struct dmar_domain *domain,
+    const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+    u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res)
 {
 	struct dmar_map_entry *entry;
 	int error;
@@ -632,29 +637,31 @@
 	KASSERT((flags & ~(DMAR_GM_CANWAIT | DMAR_GM_CANSPLIT)) == 0,
 	    ("invalid flags 0x%x", flags));
 
-	entry = dmar_gas_alloc_entry(ctx, (flags & DMAR_GM_CANWAIT) != 0 ?
+	entry = dmar_gas_alloc_entry(domain, (flags & DMAR_GM_CANWAIT) != 0 ?
 	    DMAR_PGF_WAITOK : 0);
 	if (entry == NULL)
 		return (ENOMEM);
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_find_space(ctx, common, size, offset, flags, entry);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_find_space(domain, common, size, offset, flags,
+	    entry);
 	if (error == ENOMEM) {
-		DMAR_CTX_UNLOCK(ctx);
-		dmar_gas_free_entry(ctx, entry);
+		DMAR_DOMAIN_UNLOCK(domain);
+		dmar_gas_free_entry(domain, entry);
 		return (error);
 	}
 #ifdef INVARIANTS
 	if (dmar_check_free)
-		dmar_gas_check_free(ctx);
+		dmar_gas_check_free(domain);
 #endif
 	KASSERT(error == 0,
 	    ("unexpected error %d from dmar_gas_find_entry", error));
-	KASSERT(entry->end < ctx->end, ("allocated GPA %jx, max GPA %jx",
-	    (uintmax_t)entry->end, (uintmax_t)ctx->end));
+	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
+	    (uintmax_t)entry->end, (uintmax_t)domain->end));
 	entry->flags |= eflags;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 
-	error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, ma,
+	error = domain_map_buf(domain, entry->start, entry->end - entry->start,
+	    ma,
 	    ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) |
@@ -661,11 +668,11 @@
 	    ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
 	    (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
 	if (error == ENOMEM) {
-		dmar_ctx_unload_entry(entry, true);
+		dmar_domain_unload_entry(entry, true);
 		return (error);
 	}
 	KASSERT(error == 0,
-	    ("unexpected error %d from ctx_map_buf", error));
+	    ("unexpected error %d from domain_map_buf", error));
 
 	*res = entry;
 	return (0);
@@ -672,30 +679,30 @@
 }
 
 int
-dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_map_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
     u_int eflags, u_int flags, vm_page_t *ma)
 {
 	dmar_gaddr_t start;
 	int error;
 
-	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", ctx,
+	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
 	    entry, entry->flags));
 	KASSERT((flags & ~(DMAR_GM_CANWAIT)) == 0,
 	    ("invalid flags 0x%x", flags));
 
 	start = entry->start;
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_alloc_region(ctx, entry, flags);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_alloc_region(domain, entry, flags);
 	if (error != 0) {
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		return (error);
 	}
 	entry->flags |= eflags;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (entry->end == entry->start)
 		return (0);
 
-	error = ctx_map_buf(ctx, entry->start, entry->end - entry->start,
+	error = domain_map_buf(domain, entry->start, entry->end - entry->start,
 	    ma + OFF_TO_IDX(start - entry->start),
 	    ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
@@ -703,31 +710,31 @@
 	    ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
 	    (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
 	if (error == ENOMEM) {
-		dmar_ctx_unload_entry(entry, false);
+		dmar_domain_unload_entry(entry, false);
 		return (error);
 	}
 	KASSERT(error == 0,
-	    ("unexpected error %d from ctx_map_buf", error));
+	    ("unexpected error %d from domain_map_buf", error));
 
 	return (0);
 }
 
 int
-dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t end)
 {
 	struct dmar_map_entry *entry;
 	int error;
 
-	entry = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+	entry = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
 	entry->start = start;
 	entry->end = end;
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_alloc_region(ctx, entry, DMAR_GM_CANWAIT);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_alloc_region(domain, entry, DMAR_GM_CANWAIT);
 	if (error == 0)
 		entry->flags |= DMAR_MAP_ENTRY_UNMAPPED;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (error != 0)
-		dmar_gas_free_entry(ctx, entry);
+		dmar_gas_free_entry(domain, entry);
 	return (error);
 }
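
Note on the boundary calculation above: the open-coded mask in
dmar_gas_match_one() is replaced by rounddown2() from <sys/param.h>.  Both
forms are equivalent only when the boundary is a power of two, as bus_dma
boundaries are.  For reference (example values are made up):

	/*
	 * rounddown2(x, y) == (x) & ~((y) - 1)             y a power of two
	 * roundup2(x, y)   == ((x) + (y) - 1) & ~((y) - 1)
	 *
	 * e.g. for a 0x1000-byte boundary:
	 *   rounddown2(0x12345, 0x1000) == 0x12000
	 *   roundup2(0x12345, 0x1000)   == 0x13000
	 */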

Modified: trunk/sys/x86/iommu/intel_idpgtbl.c
===================================================================
--- trunk/sys/x86/iommu/intel_idpgtbl.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_idpgtbl.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_idpgtbl.c 286854 2015-08-17 18:36:16Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_idpgtbl.c 286777 2015-08-14 13:51:59Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +49,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -66,8 +67,8 @@
 #include <x86/iommu/busdma_dmar.h>
 #include <x86/iommu/intel_dmar.h>
 
-static int ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    dmar_gaddr_t size, int flags);
+static int domain_unmap_buf_locked(struct dmar_domain *domain,
+    dmar_gaddr_t base, dmar_gaddr_t size, int flags);
 
 /*
  * The cache of the identity mapping page tables for the DMARs.  Using
@@ -105,7 +106,7 @@
  *   mapped by the page table page.
  */
 static void
-ctx_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
+domain_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
     dmar_gaddr_t addr)
 {
 	vm_page_t m1;
@@ -124,7 +125,7 @@
 	pg_sz = pglvl_page_size(tbl->pglvl, lvl);
 	if (lvl != tbl->leaf) {
 		for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz)
-			ctx_idmap_nextlvl(tbl, lvl + 1, base + i, f);
+			domain_idmap_nextlvl(tbl, lvl + 1, base + i, f);
 	}
 	VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
 	pte = dmar_map_pgtbl(tbl->pgtbl_obj, idx, DMAR_PGF_WAITOK, &sf);
@@ -146,7 +147,7 @@
 			    VM_PAGE_TO_PHYS(m1)) | DMAR_PTE_R | DMAR_PTE_W;
 		}
 	}
-	/* ctx_get_idmap_pgtbl flushes CPU cache if needed. */
+	/* domain_get_idmap_pgtbl flushes CPU cache if needed. */
 	dmar_unmap_pgtbl(sf);
 	VM_OBJECT_WLOCK(tbl->pgtbl_obj);
 }
@@ -160,7 +161,7 @@
  * maxaddr is typically mapped.
  */
 vm_object_t
-ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr)
+domain_get_idmap_pgtbl(struct dmar_domain *domain, dmar_gaddr_t maxaddr)
 {
 	struct dmar_unit *unit;
 	struct idpgtbl *tbl;
@@ -173,8 +174,8 @@
 	/*
 	 * First, determine where to stop the paging structures.
 	 */
-	for (i = 0; i < ctx->pglvl; i++) {
-		if (i == ctx->pglvl - 1 || ctx_is_sp_lvl(ctx, i)) {
+	for (i = 0; i < domain->pglvl; i++) {
+		if (i == domain->pglvl - 1 || domain_is_sp_lvl(domain, i)) {
 			leaf = i;
 			break;
 		}
@@ -191,12 +192,12 @@
 	sx_slock(&idpgtbl_lock);
 	LIST_FOREACH(tbl, &idpgtbls, link) {
 		if (tbl->maxaddr >= maxaddr &&
-		    dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+		    dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
 		    tbl->leaf == leaf) {
 			res = tbl->pgtbl_obj;
 			vm_object_reference(res);
 			sx_sunlock(&idpgtbl_lock);
-			ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+			domain->pglvl = tbl->pglvl; /* XXXKIB ? */
 			goto end;
 		}
 	}
@@ -210,12 +211,12 @@
 	sx_xlock(&idpgtbl_lock);
 	LIST_FOREACH(tbl, &idpgtbls, link) {
 		if (tbl->maxaddr >= maxaddr &&
-		    dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+		    dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
 		    tbl->leaf == leaf) {
 			res = tbl->pgtbl_obj;
 			vm_object_reference(res);
 			sx_xunlock(&idpgtbl_lock);
-			ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+			domain->pglvl = tbl->pglvl; /* XXXKIB ? */
 			return (res);
 		}
 	}
@@ -224,13 +225,13 @@
 	 * Still not found, create new page table.
 	 */
 	tbl = malloc(sizeof(*tbl), M_DMAR_IDPGTBL, M_WAITOK);
-	tbl->pglvl = ctx->pglvl;
+	tbl->pglvl = domain->pglvl;
 	tbl->leaf = leaf;
 	tbl->maxaddr = maxaddr;
 	tbl->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
 	    IDX_TO_OFF(pglvl_max_pages(tbl->pglvl)), 0, 0, NULL);
 	VM_OBJECT_WLOCK(tbl->pgtbl_obj);
-	ctx_idmap_nextlvl(tbl, 0, 0, 0);
+	domain_idmap_nextlvl(tbl, 0, 0, 0);
 	VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
 	LIST_INSERT_HEAD(&idpgtbls, tbl, link);
 	res = tbl->pgtbl_obj;
@@ -251,7 +252,7 @@
 	 * If DMAR cannot look into the chipset write buffer, flush it
 	 * as well.
 	 */
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	if (!DMAR_IS_COHERENT(unit)) {
 		VM_OBJECT_WLOCK(res);
 		for (m = vm_page_lookup(res, 0); m != NULL;
@@ -320,10 +321,11 @@
  * the level lvl.
  */
 static int
-ctx_pgtbl_pte_off(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_pte_off(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
 {
 
-	base >>= DMAR_PAGE_SHIFT + (ctx->pglvl - lvl - 1) * DMAR_NPTEPGSHIFT;
+	base >>= DMAR_PAGE_SHIFT + (domain->pglvl - lvl - 1) *
+	    DMAR_NPTEPGSHIFT;
 	return (base & DMAR_PTEMASK);
 }
 
@@ -333,21 +335,24 @@
  * lvl.
  */
 static vm_pindex_t
-ctx_pgtbl_get_pindex(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_get_pindex(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
 {
 	vm_pindex_t idx, pidx;
 	int i;
 
-	KASSERT(lvl >= 0 && lvl < ctx->pglvl, ("wrong lvl %p %d", ctx, lvl));
+	KASSERT(lvl >= 0 && lvl < domain->pglvl,
+	    ("wrong lvl %p %d", domain, lvl));
 
-	for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx)
-		idx = ctx_pgtbl_pte_off(ctx, base, i) + pidx * DMAR_NPTEPG + 1;
+	for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) {
+		idx = domain_pgtbl_pte_off(domain, base, i) +
+		    pidx * DMAR_NPTEPG + 1;
+	}
 	return (idx);
 }
 
 static dmar_pte_t *
-ctx_pgtbl_map_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags,
-    vm_pindex_t *idxp, struct sf_buf **sf)
+domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
+    int flags, vm_pindex_t *idxp, struct sf_buf **sf)
 {
 	vm_page_t m;
 	struct sf_buf *sfp;
@@ -354,10 +359,10 @@
 	dmar_pte_t *pte, *ptep;
 	vm_pindex_t idx, idx1;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 	KASSERT((flags & DMAR_PGF_OBJL) != 0, ("lost PGF_OBJL"));
 
-	idx = ctx_pgtbl_get_pindex(ctx, base, lvl);
+	idx = domain_pgtbl_get_pindex(domain, base, lvl);
 	if (*sf != NULL && idx == *idxp) {
 		pte = (dmar_pte_t *)sf_buf_kva(*sf);
 	} else {
@@ -365,15 +370,16 @@
 			dmar_unmap_pgtbl(*sf);
 		*idxp = idx;
 retry:
-		pte = dmar_map_pgtbl(ctx->pgtbl_obj, idx, flags, sf);
+		pte = dmar_map_pgtbl(domain->pgtbl_obj, idx, flags, sf);
 		if (pte == NULL) {
-			KASSERT(lvl > 0, ("lost root page table page %p", ctx));
+			KASSERT(lvl > 0,
+			    ("lost root page table page %p", domain));
 			/*
 			 * Page table page does not exist, allocate
 			 * it and create a pte in the preceding page level
 			 * to reference the allocated page table page.
 			 */
-			m = dmar_pgalloc(ctx->pgtbl_obj, idx, flags |
+			m = dmar_pgalloc(domain->pgtbl_obj, idx, flags |
 			    DMAR_PGF_ZERO);
 			if (m == NULL)
 				return (NULL);
@@ -381,25 +387,26 @@
 			/*
 			 * Prevent potential free while pgtbl_obj is
 			 * unlocked in the recursive call to
-			 * ctx_pgtbl_map_pte(), if other thread did
-			 * pte write and clean while the lock if
+			 * domain_pgtbl_map_pte(), if other thread did
+			 * pte write and clean while the lock is
 			 * dropped.
 			 */
 			m->wire_count++;
 
 			sfp = NULL;
-			ptep = ctx_pgtbl_map_pte(ctx, base, lvl - 1, flags,
-			    &idx1, &sfp);
+			ptep = domain_pgtbl_map_pte(domain, base, lvl - 1,
+			    flags, &idx1, &sfp);
 			if (ptep == NULL) {
 				KASSERT(m->pindex != 0,
-				    ("loosing root page %p", ctx));
+				    ("losing root page %p", domain));
 				m->wire_count--;
-				dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
+				dmar_pgfree(domain->pgtbl_obj, m->pindex,
+				    flags);
 				return (NULL);
 			}
 			dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W |
 			    VM_PAGE_TO_PHYS(m));
-			dmar_flush_pte_to_ram(ctx->dmar, ptep);
+			dmar_flush_pte_to_ram(domain->dmar, ptep);
 			sf_buf_page(sfp)->wire_count += 1;
 			m->wire_count--;
 			dmar_unmap_pgtbl(sfp);
@@ -407,13 +414,13 @@
 			goto retry;
 		}
 	}
-	pte += ctx_pgtbl_pte_off(ctx, base, lvl);
+	pte += domain_pgtbl_pte_off(domain, base, lvl);
 	return (pte);
 }
 
 static int
-ctx_map_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    vm_page_t *ma, uint64_t pflags, int flags)
+domain_map_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags)
 {
 	dmar_pte_t *pte;
 	struct sf_buf *sf;
@@ -422,7 +429,7 @@
 	int lvl;
 	bool superpage;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 
 	base1 = base;
 	size1 = size;
@@ -432,15 +439,15 @@
 	for (sf = NULL, pi = 0; size > 0; base += pg_sz, size -= pg_sz,
 	    pi += run_sz) {
 		for (lvl = 0, c = 0, superpage = false;; lvl++) {
-			pg_sz = ctx_page_size(ctx, lvl);
+			pg_sz = domain_page_size(domain, lvl);
 			run_sz = pg_sz >> DMAR_PAGE_SHIFT;
-			if (lvl == ctx->pglvl - 1)
+			if (lvl == domain->pglvl - 1)
 				break;
 			/*
 			 * Check if the current base suitable for the
 			 * superpage mapping.  First, verify the level.
 			 */
-			if (!ctx_is_sp_lvl(ctx, lvl))
+			if (!domain_is_sp_lvl(domain, lvl))
 				continue;
 			/*
 			 * Next, look at the size of the mapping and
@@ -464,22 +471,23 @@
 			}
 		}
 		KASSERT(size >= pg_sz,
-		    ("mapping loop overflow %p %jx %jx %jx", ctx,
+		    ("mapping loop overflow %p %jx %jx %jx", domain,
 		    (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
 		KASSERT(pg_sz > 0, ("pg_sz 0 lvl %d", lvl));
-		pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
+		pte = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
 		if (pte == NULL) {
 			KASSERT((flags & DMAR_PGF_WAITOK) == 0,
-			    ("failed waitable pte alloc %p", ctx));
+			    ("failed waitable pte alloc %p", domain));
 			if (sf != NULL)
 				dmar_unmap_pgtbl(sf);
-			ctx_unmap_buf_locked(ctx, base1, base - base1, flags);
+			domain_unmap_buf_locked(domain, base1, base - base1,
+			    flags);
 			TD_PINNED_ASSERT;
 			return (ENOMEM);
 		}
 		dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags |
 		    (superpage ? DMAR_PTE_SP : 0));
-		dmar_flush_pte_to_ram(ctx->dmar, pte);
+		dmar_flush_pte_to_ram(domain->dmar, pte);
 		sf_buf_page(sf)->wire_count += 1;
 	}
 	if (sf != NULL)
@@ -489,32 +497,32 @@
 }
 
 int
-ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
+domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size,
     vm_page_t *ma, uint64_t pflags, int flags)
 {
 	struct dmar_unit *unit;
 	int error;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 
-	KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
-	    ("modifying idmap pagetable ctx %p", ctx));
+	KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+	    ("modifying idmap pagetable domain %p", domain));
 	KASSERT((base & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((size & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(size > 0, ("zero size %p %jx %jx", ctx, (uintmax_t)base,
+	KASSERT(size > 0, ("zero size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(base < (1ULL << ctx->agaw),
-	    ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
-	KASSERT(base + size < (1ULL << ctx->agaw),
-	    ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
+	KASSERT(base < (1ULL << domain->agaw),
+	    ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
+	KASSERT(base + size < (1ULL << domain->agaw),
+	    ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
 	KASSERT(base + size > base,
-	    ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+	    ("size overflow %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((pflags & (DMAR_PTE_R | DMAR_PTE_W)) != 0,
 	    ("neither read nor write %jx", (uintmax_t)pflags));
@@ -524,21 +532,21 @@
 	KASSERT((pflags & DMAR_PTE_SNP) == 0 ||
 	    (unit->hw_ecap & DMAR_ECAP_SC) != 0,
 	    ("PTE_SNP for dmar without snoop control %p %jx",
-	    ctx, (uintmax_t)pflags));
+	    domain, (uintmax_t)pflags));
 	KASSERT((pflags & DMAR_PTE_TM) == 0 ||
 	    (unit->hw_ecap & DMAR_ECAP_DI) != 0,
 	    ("PTE_TM for dmar without DIOTLB %p %jx",
-	    ctx, (uintmax_t)pflags));
+	    domain, (uintmax_t)pflags));
 	KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
 
-	DMAR_CTX_PGLOCK(ctx);
-	error = ctx_map_buf_locked(ctx, base, size, ma, pflags, flags);
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGLOCK(domain);
+	error = domain_map_buf_locked(domain, base, size, ma, pflags, flags);
+	DMAR_DOMAIN_PGUNLOCK(domain);
 	if (error != 0)
 		return (error);
 
 	if ((unit->hw_cap & DMAR_CAP_CM) != 0)
-		ctx_flush_iotlb_sync(ctx, base, size);
+		domain_flush_iotlb_sync(domain, base, size);
 	else if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) {
 		/* See 11.1 Write Buffer Flushing. */
 		DMAR_LOCK(unit);
@@ -548,11 +556,13 @@
 	return (0);
 }
 
-static void ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_fs);
+static void domain_unmap_clear_pte(struct dmar_domain *domain,
+    dmar_gaddr_t base, int lvl, int flags, dmar_pte_t *pte,
+    struct sf_buf **sf, bool free_fs);
 
 static void
-ctx_free_pgtbl_pde(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags)
+domain_free_pgtbl_pde(struct dmar_domain *domain, dmar_gaddr_t base,
+    int lvl, int flags)
 {
 	struct sf_buf *sf;
 	dmar_pte_t *pde;
@@ -559,18 +569,18 @@
 	vm_pindex_t idx;
 
 	sf = NULL;
-	pde = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
-	ctx_unmap_clear_pte(ctx, base, lvl, flags, pde, &sf, true);
+	pde = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
+	domain_unmap_clear_pte(domain, base, lvl, flags, pde, &sf, true);
 }
 
 static void
-ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl,
+domain_unmap_clear_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
     int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_sf)
 {
 	vm_page_t m;
 
 	dmar_pte_clear(&pte->pte);
-	dmar_flush_pte_to_ram(ctx->dmar, pte);
+	dmar_flush_pte_to_ram(domain->dmar, pte);
 	m = sf_buf_page(*sf);
 	if (free_sf) {
 		dmar_unmap_pgtbl(*sf);
@@ -580,13 +590,13 @@
 	if (m->wire_count != 0)
 		return;
 	KASSERT(lvl != 0,
-	    ("lost reference (lvl) on root pg ctx %p base %jx lvl %d",
-	    ctx, (uintmax_t)base, lvl));
+	    ("lost reference (lvl) on root pg domain %p base %jx lvl %d",
+	    domain, (uintmax_t)base, lvl));
 	KASSERT(m->pindex != 0,
-	    ("lost reference (idx) on root pg ctx %p base %jx lvl %d",
-	    ctx, (uintmax_t)base, lvl));
-	dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
-	ctx_free_pgtbl_pde(ctx, base, lvl - 1, flags);
+	    ("lost reference (idx) on root pg domain %p base %jx lvl %d",
+	    domain, (uintmax_t)base, lvl));
+	dmar_pgfree(domain->pgtbl_obj, m->pindex, flags);
+	domain_free_pgtbl_pde(domain, base, lvl - 1, flags);
 }
 
 /*
@@ -593,7 +603,7 @@
  * Assumes that the unmap is never partial.
  */
 static int
-ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
+domain_unmap_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size, int flags)
 {
 	dmar_pte_t *pte;
@@ -602,26 +612,26 @@
 	dmar_gaddr_t pg_sz;
 	int lvl;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 	if (size == 0)
 		return (0);
 
-	KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
-	    ("modifying idmap pagetable ctx %p", ctx));
+	KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+	    ("modifying idmap pagetable domain %p", domain));
 	KASSERT((base & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((size & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(base < (1ULL << ctx->agaw),
-	    ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
-	KASSERT(base + size < (1ULL << ctx->agaw),
-	    ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
+	KASSERT(base < (1ULL << domain->agaw),
+	    ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
+	KASSERT(base + size < (1ULL << domain->agaw),
+	    ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
 	KASSERT(base + size > base,
-	    ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+	    ("size overflow %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
 
@@ -630,26 +640,27 @@
 	TD_PREP_PINNED_ASSERT;
 
 	for (sf = NULL; size > 0; base += pg_sz, size -= pg_sz) {
-		for (lvl = 0; lvl < ctx->pglvl; lvl++) {
-			if (lvl != ctx->pglvl - 1 && !ctx_is_sp_lvl(ctx, lvl))
+		for (lvl = 0; lvl < domain->pglvl; lvl++) {
+			if (lvl != domain->pglvl - 1 &&
+			    !domain_is_sp_lvl(domain, lvl))
 				continue;
-			pg_sz = ctx_page_size(ctx, lvl);
+			pg_sz = domain_page_size(domain, lvl);
 			if (pg_sz > size)
 				continue;
-			pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags,
+			pte = domain_pgtbl_map_pte(domain, base, lvl, flags,
 			    &idx, &sf);
 			KASSERT(pte != NULL,
 			    ("sleeping or page missed %p %jx %d 0x%x",
-			    ctx, (uintmax_t)base, lvl, flags));
+			    domain, (uintmax_t)base, lvl, flags));
 			if ((pte->pte & DMAR_PTE_SP) != 0 ||
-			    lvl == ctx->pglvl - 1) {
-				ctx_unmap_clear_pte(ctx, base, lvl, flags,
-				    pte, &sf, false);
+			    lvl == domain->pglvl - 1) {
+				domain_unmap_clear_pte(domain, base, lvl,
+				    flags, pte, &sf, false);
 				break;
 			}
 		}
 		KASSERT(size >= pg_sz,
-		    ("unmapping loop overflow %p %jx %jx %jx", ctx,
+		    ("unmapping loop overflow %p %jx %jx %jx", domain,
 		    (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
 	}
 	if (sf != NULL)
@@ -664,54 +675,58 @@
 }
 
 int
-ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    int flags)
+domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, int flags)
 {
 	int error;
 
-	DMAR_CTX_PGLOCK(ctx);
-	error = ctx_unmap_buf_locked(ctx, base, size, flags);
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGLOCK(domain);
+	error = domain_unmap_buf_locked(domain, base, size, flags);
+	DMAR_DOMAIN_PGUNLOCK(domain);
 	return (error);
 }
 
 int
-ctx_alloc_pgtbl(struct dmar_ctx *ctx)
+domain_alloc_pgtbl(struct dmar_domain *domain)
 {
 	vm_page_t m;
 
-	KASSERT(ctx->pgtbl_obj == NULL, ("already initialized %p", ctx));
+	KASSERT(domain->pgtbl_obj == NULL,
+	    ("already initialized %p", domain));
 
-	ctx->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
-	    IDX_TO_OFF(pglvl_max_pages(ctx->pglvl)), 0, 0, NULL);
-	DMAR_CTX_PGLOCK(ctx);
-	m = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_WAITOK |
+	domain->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
+	    IDX_TO_OFF(pglvl_max_pages(domain->pglvl)), 0, 0, NULL);
+	DMAR_DOMAIN_PGLOCK(domain);
+	m = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_WAITOK |
 	    DMAR_PGF_ZERO | DMAR_PGF_OBJL);
 	/* No implicit free of the top level page table page. */
 	m->wire_count = 1;
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGUNLOCK(domain);
+	DMAR_LOCK(domain->dmar);
+	domain->flags |= DMAR_DOMAIN_PGTBL_INITED;
+	DMAR_UNLOCK(domain->dmar);
 	return (0);
 }
 
 void
-ctx_free_pgtbl(struct dmar_ctx *ctx)
+domain_free_pgtbl(struct dmar_domain *domain)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
-	obj = ctx->pgtbl_obj;
+	obj = domain->pgtbl_obj;
 	if (obj == NULL) {
-		KASSERT((ctx->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
-		    (ctx->flags & DMAR_CTX_IDMAP) != 0,
-		    ("lost pagetable object ctx %p", ctx));
+		KASSERT((domain->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
+		    (domain->flags & DMAR_DOMAIN_IDMAP) != 0,
+		    ("lost pagetable object domain %p", domain));
 		return;
 	}
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
-	ctx->pgtbl_obj = NULL;
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
+	domain->pgtbl_obj = NULL;
 
-	if ((ctx->flags & DMAR_CTX_IDMAP) != 0) {
+	if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0) {
 		put_idmap_pgtbl(obj);
-		ctx->flags &= ~DMAR_CTX_IDMAP;
+		domain->flags &= ~DMAR_DOMAIN_IDMAP;
 		return;
 	}
 
@@ -724,7 +739,7 @@
 }
 
 static inline uint64_t
-ctx_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
+domain_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
 {
 	uint64_t iotlbr;
 
@@ -740,7 +755,8 @@
 }
 
 void
-ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size)
+domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size)
 {
 	struct dmar_unit *unit;
 	dmar_gaddr_t isize;
@@ -747,14 +763,14 @@
 	uint64_t iotlbr;
 	int am, iro;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	KASSERT(!unit->qi_enabled, ("dmar%d: sync iotlb flush call",
 	    unit->unit));
 	iro = DMAR_ECAP_IRO(unit->hw_ecap) * 16;
 	DMAR_LOCK(unit);
 	if ((unit->hw_cap & DMAR_CAP_PSI) == 0 || size > 2 * 1024 * 1024) {
-		iotlbr = ctx_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
-		    DMAR_IOTLB_DID(ctx->domain), iro);
+		iotlbr = domain_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
+		    DMAR_IOTLB_DID(domain->domain), iro);
 		KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
 		    DMAR_IOTLB_IAIG_INVLD,
 		    ("dmar%d: invalidation failed %jx", unit->unit,
@@ -763,9 +779,9 @@
 		for (; size > 0; base += isize, size -= isize) {
 			am = calc_am(unit, base, size, &isize);
 			dmar_write8(unit, iro, base | am);
-			iotlbr = ctx_wait_iotlb_flush(unit,
-			    DMAR_IOTLB_IIRG_PAGE | DMAR_IOTLB_DID(ctx->domain),
-			    iro);
+			iotlbr = domain_wait_iotlb_flush(unit,
+			    DMAR_IOTLB_IIRG_PAGE |
+			    DMAR_IOTLB_DID(domain->domain), iro);
 			KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
 			    DMAR_IOTLB_IAIG_INVLD,
 			    ("dmar%d: PSI invalidation failed "

Added: trunk/sys/x86/iommu/intel_intrmap.c
===================================================================
--- trunk/sys/x86/iommu/intel_intrmap.c	                        (rev 0)
+++ trunk/sys/x86/iommu/intel_intrmap.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,381 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_intrmap.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <sys/vmem.h>
+#include <machine/bus.h>
+#include <machine/intr_machdep.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <x86/include/apicreg.h>
+#include <x86/include/apicvar.h>
+#include <x86/include/busdma_impl.h>
+#include <x86/iommu/intel_reg.h>
+#include <x86/iommu/busdma_dmar.h>
+#include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcivar.h>
+#include <x86/iommu/iommu_intrmap.h>
+
+static struct dmar_unit *dmar_ir_find(device_t src, uint16_t *rid,
+    int *is_dmar);
+static void dmar_ir_program_irte(struct dmar_unit *unit, u_int idx,
+    uint64_t low, uint16_t rid);
+static int dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie);
+
+int
+iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count)
+{
+	struct dmar_unit *unit;
+	vmem_addr_t vmem_res;
+	u_int idx, i;
+	int error;
+
+	unit = dmar_ir_find(src, NULL, NULL);
+	if (unit == NULL || !unit->ir_enabled) {
+		for (i = 0; i < count; i++)
+			cookies[i] = -1;
+		return (EOPNOTSUPP);
+	}
+
+	error = vmem_alloc(unit->irtids, count, M_FIRSTFIT | M_NOWAIT,
+	    &vmem_res);
+	if (error != 0) {
+		KASSERT(error != EOPNOTSUPP,
+		    ("impossible EOPNOTSUPP from vmem"));
+		return (error);
+	}
+	idx = vmem_res;
+	for (i = 0; i < count; i++)
+		cookies[i] = idx + i;
+	return (0);
+}
+
+int
+iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+    uint64_t *addr, uint32_t *data)
+{
+	struct dmar_unit *unit;
+	uint64_t low;
+	uint16_t rid;
+	int is_dmar;
+
+	unit = dmar_ir_find(src, &rid, &is_dmar);
+	if (is_dmar) {
+		KASSERT(unit == NULL, ("DMAR cannot translate itself"));
+
+		/*
+		 * See VT-d specification, 5.1.6 Remapping Hardware -
+		 * Interrupt Programming.
+		 */
+		*data = vector;
+		*addr = MSI_INTEL_ADDR_BASE | ((cpu & 0xff) << 12);
+		if (x2apic_mode)
+			*addr |= ((uint64_t)cpu & 0xffffff00) << 32;
+		else
+			KASSERT(cpu <= 0xff, ("cpu id too big %d", cpu));
+		return (0);
+	}
+	if (unit == NULL || !unit->ir_enabled || cookie == -1)
+		return (EOPNOTSUPP);
+
+	low = (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+	    DMAR_IRTE1_DST_xAPIC(cpu)) | DMAR_IRTE1_V(vector) |
+	    DMAR_IRTE1_DLM_FM | DMAR_IRTE1_TM_EDGE | DMAR_IRTE1_RH_DIRECT |
+	    DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+	dmar_ir_program_irte(unit, cookie, low, rid);
+
+	if (addr != NULL) {
+		/*
+		 * See VT-d specification, 5.1.5.2 MSI and MSI-X
+		 * Register Programming.
+		 */
+		*addr = MSI_INTEL_ADDR_BASE | ((cookie & 0x7fff) << 5) |
+		    ((cookie & 0x8000) << 2) | 0x18;
+		*data = 0;
+	}
+	return (0);
+}
+
+int
+iommu_unmap_msi_intr(device_t src, u_int cookie)
+{
+	struct dmar_unit *unit;
+
+	if (cookie == -1)
+		return (0);
+	unit = dmar_ir_find(src, NULL, NULL);
+	return (dmar_ir_free_irte(unit, cookie));
+}
+
+int
+iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+    bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo)
+{
+	struct dmar_unit *unit;
+	vmem_addr_t vmem_res;
+	uint64_t low, iorte;
+	u_int idx;
+	int error;
+	uint16_t rid;
+
+	unit = dmar_find_ioapic(ioapic_id, &rid);
+	if (unit == NULL || !unit->ir_enabled) {
+		*cookie = -1;
+		return (EOPNOTSUPP);
+	}
+
+	error = vmem_alloc(unit->irtids, 1, M_FIRSTFIT | M_NOWAIT, &vmem_res);
+	if (error != 0) {
+		KASSERT(error != EOPNOTSUPP,
+		    ("impossible EOPNOTSUPP from vmem"));
+		return (error);
+	}
+	idx = vmem_res;
+	low = 0;
+	switch (irq) {
+	case IRQ_EXTINT:
+		low |= DMAR_IRTE1_DLM_ExtINT;
+		break;
+	case IRQ_NMI:
+		low |= DMAR_IRTE1_DLM_NMI;
+		break;
+	case IRQ_SMI:
+		low |= DMAR_IRTE1_DLM_SMI;
+		break;
+	default:
+		KASSERT(vector != 0, ("No vector for IRQ %u", irq));
+		low |= DMAR_IRTE1_DLM_FM | DMAR_IRTE1_V(vector);
+		break;
+	}
+	low |= (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+	    DMAR_IRTE1_DST_xAPIC(cpu)) |
+	    (edge ? DMAR_IRTE1_TM_EDGE : DMAR_IRTE1_TM_LEVEL) |
+	    DMAR_IRTE1_RH_DIRECT | DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+	dmar_ir_program_irte(unit, idx, low, rid);
+
+	if (hi != NULL) {
+		/*
+		 * See VT-d specification, 5.1.5.1 I/OxAPIC
+		 * Programming.
+		 */
+		iorte = (1ULL << 48) | ((uint64_t)(idx & 0x7fff) << 49) |
+		    ((idx & 0x8000) != 0 ? (1 << 11) : 0) |
+		    (edge ? IOART_TRGREDG : IOART_TRGRLVL) |
+		    (activehi ? IOART_INTAHI : IOART_INTALO) |
+		    IOART_DELFIXED | vector;
+		*hi = iorte >> 32;
+		*lo = iorte;
+	}
+	*cookie = idx;
+	return (0);
+}
+
+int
+iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie)
+{
+	struct dmar_unit *unit;
+	u_int idx;
+
+	idx = *cookie;
+	if (idx == -1)
+		return (0);
+	*cookie = -1;
+	unit = dmar_find_ioapic(ioapic_id, NULL);
+	KASSERT(unit != NULL && unit->ir_enabled,
+	    ("unmap: cookie %d unit %p", idx, unit));
+	return (dmar_ir_free_irte(unit, idx));
+}
+
+static struct dmar_unit *
+dmar_ir_find(device_t src, uint16_t *rid, int *is_dmar)
+{
+	devclass_t src_class;
+	struct dmar_unit *unit;
+
+	/*
+	 * We need to determine if the interrupt source generates FSB
+	 * interrupts.  If so, the source is either the DMAR itself, in
+	 * which case interrupts are not remapped, or an HPET, whose
+	 * interrupts are remapped.  For HPET, the source id is
+	 * reported by the HPET record in the DMAR ACPI table.
+	 */
+	if (is_dmar != NULL)
+		*is_dmar = FALSE;
+	src_class = device_get_devclass(src);
+	if (src_class == devclass_find("dmar")) {
+		unit = NULL;
+		if (is_dmar != NULL)
+			*is_dmar = TRUE;
+	} else if (src_class == devclass_find("hpet")) {
+		unit = dmar_find_hpet(src, rid);
+	} else {
+		unit = dmar_find(src);
+		if (unit != NULL && rid != NULL)
+			dmar_get_requester(src, rid);
+	}
+	return (unit);
+}
+
+static void
+dmar_ir_program_irte(struct dmar_unit *unit, u_int idx, uint64_t low,
+    uint16_t rid)
+{
+	dmar_irte_t *irte;
+	uint64_t high;
+
+	KASSERT(idx < unit->irte_cnt,
+	    ("bad cookie %d %d", idx, unit->irte_cnt));
+	irte = &(unit->irt[idx]);
+	high = DMAR_IRTE2_SVT_RID | DMAR_IRTE2_SQ_RID |
+	    DMAR_IRTE2_SID_RID(rid);
+	device_printf(unit->dev,
+	    "programming irte[%d] rid %#x high %#jx low %#jx\n",
+	    idx, rid, (uintmax_t)high, (uintmax_t)low);
+	DMAR_LOCK(unit);
+	if ((irte->irte1 & DMAR_IRTE1_P) != 0) {
+		/*
+		 * The rte is already valid.  Assume that the request
+		 * is to remap the interrupt for balancing.  Only low
+		 * word of rte needs to be changed.  Assert that the
+		 * high word contains expected value.
+		 */
+		KASSERT(irte->irte2 == high,
+		    ("irte2 mismatch, %jx %jx", (uintmax_t)irte->irte2,
+		    (uintmax_t)high));
+		dmar_pte_update(&irte->irte1, low);
+	} else {
+		dmar_pte_store(&irte->irte2, high);
+		dmar_pte_store(&irte->irte1, low);
+	}
+	dmar_qi_invalidate_iec(unit, idx, 1);
+	DMAR_UNLOCK(unit);
+
+}
+
+static int
+dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie)
+{
+	dmar_irte_t *irte;
+
+	KASSERT(unit != NULL && unit->ir_enabled,
+	    ("unmap: cookie %d unit %p", cookie, unit));
+	KASSERT(cookie < unit->irte_cnt,
+	    ("bad cookie %u %u", cookie, unit->irte_cnt));
+	irte = &(unit->irt[cookie]);
+	dmar_pte_clear(&irte->irte1);
+	dmar_pte_clear(&irte->irte2);
+	DMAR_LOCK(unit);
+	dmar_qi_invalidate_iec(unit, cookie, 1);
+	DMAR_UNLOCK(unit);
+	vmem_free(unit->irtids, cookie, 1);
+	return (0);
+}
+
+static u_int
+clp2(u_int v)
+{
+
+	return (powerof2(v) ? v : 1 << fls(v));
+}
+
+int
+dmar_init_irt(struct dmar_unit *unit)
+{
+
+	if ((unit->hw_ecap & DMAR_ECAP_IR) == 0)
+		return (0);
+	unit->ir_enabled = 1;
+	TUNABLE_INT_FETCH("hw.dmar.ir", &unit->ir_enabled);
+	if (!unit->ir_enabled)
+		return (0);
+	if (!unit->qi_enabled) {
+		unit->ir_enabled = 0;
+		if (bootverbose)
+			device_printf(unit->dev,
+	     "QI disabled, disabling interrupt remapping\n");
+		return (0);
+	}
+	unit->irte_cnt = clp2(num_io_irqs);
+	unit->irt = (dmar_irte_t *)(uintptr_t)kmem_alloc_contig(kernel_arena,
+	    unit->irte_cnt * sizeof(dmar_irte_t), M_ZERO | M_WAITOK, 0,
+	    dmar_high, PAGE_SIZE, 0, DMAR_IS_COHERENT(unit) ?
+	    VM_MEMATTR_DEFAULT : VM_MEMATTR_UNCACHEABLE);
+	if (unit->irt == NULL)
+		return (ENOMEM);
+	unit->irt_phys = pmap_kextract((vm_offset_t)unit->irt);
+	unit->irtids = vmem_create("dmarirt", 0, unit->irte_cnt, 1, 0,
+	    M_FIRSTFIT | M_NOWAIT);
+	DMAR_LOCK(unit);
+	dmar_load_irt_ptr(unit);
+	dmar_qi_invalidate_iec_glob(unit);
+	DMAR_UNLOCK(unit);
+
+	/*
+	 * Initialize mappings for already configured interrupt pins.
+	 * Required, because otherwise the interrupts fault without
+	 * irtes.
+	 */
+	intr_reprogram();
+
+	DMAR_LOCK(unit);
+	dmar_enable_ir(unit);
+	DMAR_UNLOCK(unit);
+	return (0);
+}
+
+void
+dmar_fini_irt(struct dmar_unit *unit)
+{
+
+	unit->ir_enabled = 0;
+	if (unit->irt != NULL) {
+		dmar_disable_ir(unit);
+		dmar_qi_invalidate_iec_glob(unit);
+		vmem_destroy(unit->irtids);
+		kmem_free(kernel_arena, (vm_offset_t)unit->irt,
+		    unit->irte_cnt * sizeof(dmar_irte_t));
+	}
+}


Property changes on: trunk/sys/x86/iommu/intel_intrmap.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
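
The new intel_intrmap.c allocates IRTE indices from a vmem arena and hands
them back as cookies; iommu_map_msi_intr() then packs the cookie into the
remappable-format MSI address it returns to the MSI code.  Below is a
minimal user-space sketch of just that bit packing, assuming the
conventional 0xfee00000 value for MSI_INTEL_ADDR_BASE (the kernel takes the
real constant from its own headers); it illustrates the encoding only, not
the driver.

/*
 * Sketch of the remappable MSI address encoding used by
 * iommu_map_msi_intr().  The 0xfee00000 base is an assumption (the
 * usual x86 MSI window); the kernel uses MSI_INTEL_ADDR_BASE.
 */
#include <stdint.h>
#include <stdio.h>

#define	MSI_ADDR_BASE	0xfee00000u	/* assumed MSI_INTEL_ADDR_BASE */

static uint64_t
remappable_msi_addr(unsigned cookie)
{

	/*
	 * Bits 19:5 carry handle[14:0], bit 2 carries handle[15], and
	 * the constant 0x18 sets the remappable-format and SHV bits,
	 * per the VT-d MSI programming section cited in the driver.
	 */
	return (MSI_ADDR_BASE | ((uint64_t)(cookie & 0x7fff) << 5) |
	    ((cookie & 0x8000) << 2) | 0x18);
}

int
main(void)
{

	printf("cookie 0x21 -> addr %#jx, data 0\n",
	    (uintmax_t)remappable_msi_addr(0x21));
	return (0);
}
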
Modified: trunk/sys/x86/iommu/intel_qi.c
===================================================================
--- trunk/sys/x86/iommu/intel_qi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_qi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_qi.c 284019 2015-06-05 08:23:33Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_qi.c 320357 2017-06-26 12:30:39Z kib $");
 
 #include "opt_acpi.h"
 
@@ -41,7 +41,9 @@
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -70,27 +72,27 @@
 static int
 dmar_enable_qi(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd |= DMAR_GCMD_QIE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+	    != 0));
+	return (error);
 }
 
 static int
 dmar_disable_qi(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd &= ~DMAR_GCMD_QIE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+	    == 0));
+	return (error);
 }
 
 static void
@@ -170,7 +172,8 @@
 }
 
 static void
-dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq)
+dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq,
+    bool emit_wait)
 {
 	struct dmar_qi_genseq gsec;
 	uint32_t seq;
@@ -191,17 +194,21 @@
 	seq = unit->inv_waitd_seq++;
 	pseq->gen = unit->inv_waitd_gen;
 	pseq->seq = seq;
-	dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+	if (emit_wait) {
+		dmar_qi_ensure(unit, 1);
+		dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+	}
 }
 
 static void
-dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq)
+dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq,
+    bool nowait)
 {
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->inv_seq_waiters++;
 	while (!dmar_qi_seq_processed(unit, gseq)) {
-		if (cold) {
+		if (cold || nowait) {
 			cpu_spinwait();
 		} else {
 			msleep(&unit->inv_seq_waiters, &unit->lock, 0,
@@ -212,14 +219,14 @@
 }
 
 void
-dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    dmar_gaddr_t size, struct dmar_qi_genseq *pseq)
+dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, struct dmar_qi_genseq *pseq, bool emit_wait)
 {
 	struct dmar_unit *unit;
 	dmar_gaddr_t isize;
 	int am;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	DMAR_ASSERT_LOCKED(unit);
 	for (; size > 0; base += isize, size -= isize) {
 		am = calc_am(unit, base, size, &isize);
@@ -227,13 +234,10 @@
 		dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV |
 		    DMAR_IQ_DESCR_IOTLB_PAGE | DMAR_IQ_DESCR_IOTLB_DW |
 		    DMAR_IQ_DESCR_IOTLB_DR |
-		    DMAR_IQ_DESCR_IOTLB_DID(ctx->domain),
+		    DMAR_IQ_DESCR_IOTLB_DID(domain->domain),
 		    base | am);
 	}
-	if (pseq != NULL) {
-		dmar_qi_ensure(unit, 1);
-		dmar_qi_emit_wait_seq(unit, pseq);
-	}
+	dmar_qi_emit_wait_seq(unit, pseq, emit_wait);
 	dmar_qi_advance_tail(unit);
 }
 
@@ -245,9 +249,9 @@
 	DMAR_ASSERT_LOCKED(unit);
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
 
 void
@@ -259,11 +263,64 @@
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB |
 	    DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
 
+void
+dmar_qi_invalidate_iec_glob(struct dmar_unit *unit)
+{
+	struct dmar_qi_genseq gseq;
+
+	DMAR_ASSERT_LOCKED(unit);
+	dmar_qi_ensure(unit, 2);
+	dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV, 0);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	dmar_qi_advance_tail(unit);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
+}
+
+void
+dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt)
+{
+	struct dmar_qi_genseq gseq;
+	u_int c, l;
+
+	DMAR_ASSERT_LOCKED(unit);
+	KASSERT(start < unit->irte_cnt && start < start + cnt &&
+	    start + cnt <= unit->irte_cnt,
+	    ("inv iec overflow %d %d %d", unit->irte_cnt, start, cnt));
+	for (; cnt > 0; cnt -= c, start += c) {
+		l = ffs(start | cnt) - 1;
+		c = 1 << l;
+		dmar_qi_ensure(unit, 1);
+		dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV |
+		    DMAR_IQ_DESCR_IEC_IDX | DMAR_IQ_DESCR_IEC_IIDX(start) |
+		    DMAR_IQ_DESCR_IEC_IM(l), 0);
+	}
+	dmar_qi_ensure(unit, 1);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	dmar_qi_advance_tail(unit);
+
+	/*
+	 * The caller of the function, in particular,
+	 * dmar_ir_program_irte(), may be called from the context
+	 * where the sleeping is forbidden (in fact, the
+	 * intr_table_lock mutex may be held, locked from
+	 * intr_shuffle_irqs()).  Wait for the invalidation completion
+	 * using the busy wait.
+	 *
+	 * The impact on the interrupt input setup code is small; the
+	 * expected overhead is comparable with a chipset register
+	 * read.  It is more harmful to parallel DMA operations, since
+	 * we own the dmar unit lock until the whole invalidation
+	 * queue is processed, which includes requests possibly issued
+	 * before our request.
+	 */
+	dmar_qi_wait_for_seq(unit, &gseq, true);
+}
+
 int
 dmar_qi_intr(void *arg)
 {
@@ -271,7 +328,7 @@
 
 	unit = arg;
 	KASSERT(unit->qi_enabled, ("dmar%d: QI is not enabled", unit->unit));
-	taskqueue_enqueue_fast(unit->qi_taskqueue, &unit->qi_task);
+	taskqueue_enqueue(unit->qi_taskqueue, &unit->qi_task);
 	return (FILTER_HANDLED);
 }
 
@@ -289,12 +346,11 @@
 		entry = TAILQ_FIRST(&unit->tlb_flush_entries);
 		if (entry == NULL)
 			break;
-		if ((entry->gseq.gen == 0 && entry->gseq.seq == 0) ||
-		    !dmar_qi_seq_processed(unit, &entry->gseq))
+		if (!dmar_qi_seq_processed(unit, &entry->gseq))
 			break;
 		TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link);
 		DMAR_UNLOCK(unit);
-		dmar_ctx_free_entry(entry, (entry->flags &
+		dmar_domain_free_entry(entry, (entry->flags &
 		    DMAR_MAP_ENTRY_QI_NF) == 0);
 		DMAR_LOCK(unit);
 	}
@@ -324,7 +380,7 @@
 
 	TAILQ_INIT(&unit->tlb_flush_entries);
 	TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit);
-	unit->qi_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+	unit->qi_taskqueue = taskqueue_create_fast("dmarqf", M_WAITOK,
 	    taskqueue_thread_enqueue, &unit->qi_taskqueue);
 	taskqueue_start_threads(&unit->qi_taskqueue, 1, PI_AV,
 	    "dmar%d qi taskq", unit->unit);
@@ -377,9 +433,9 @@
 	DMAR_LOCK(unit);
 	/* quiesce */
 	dmar_qi_ensure(unit, 1);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 	/* only after the quiesce, disable the queue */
 	dmar_disable_qi_intr(unit);
 	dmar_disable_qi(unit);
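
dmar_qi_invalidate_iec() above splits the requested IRTE range into
naturally aligned power-of-two blocks because the IEC invalidation
descriptor addresses entries as an index plus a mask of low index bits
(DMAR_IQ_DESCR_IEC_IIDX/IM).  A stand-alone sketch of just that
decomposition, with the descriptor emission replaced by a printf:

#include <stdio.h>
#include <strings.h>

static void
emit_iec_chunks(unsigned start, unsigned cnt)
{
	unsigned c, l;

	for (; cnt > 0; cnt -= c, start += c) {
		/* Largest aligned power-of-two block starting at start. */
		l = ffs(start | cnt) - 1;
		c = 1u << l;
		printf("invalidate idx %u mask %u (%u entries)\n",
		    start, l, c);
	}
}

int
main(void)
{

	emit_iec_chunks(2, 6);	/* a block of 2 at 2, then a block of 4 at 4 */
	return (0);
}

Each chunk stays inside the requested range, at the cost of sometimes
emitting more descriptors than a smarter split would.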

Modified: trunk/sys/x86/iommu/intel_quirks.c
===================================================================
--- trunk/sys/x86/iommu/intel_quirks.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_quirks.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013, 2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_quirks.c 257251 2013-10-28 13:33:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_quirks.c 280260 2015-03-19 13:57:47Z kib $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -43,6 +43,7 @@
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -60,7 +61,7 @@
 #include <x86/iommu/intel_dmar.h>
 #include <dev/pci/pcivar.h>
 
-typedef void (*dmar_quirk_fun)(struct dmar_unit *);
+typedef void (*dmar_quirk_cpu_fun)(struct dmar_unit *);
 
 struct intel_dmar_quirk_cpu {
 	u_int ext_family;
@@ -68,17 +69,21 @@
 	u_int family_code;
 	u_int model;
 	u_int stepping;
-	dmar_quirk_fun quirk;
+	dmar_quirk_cpu_fun quirk;
 	const char *descr;
 };
 
+typedef void (*dmar_quirk_nb_fun)(struct dmar_unit *, device_t nb);
+
 struct intel_dmar_quirk_nb {
 	u_int dev_id;
 	u_int rev_no;
-	dmar_quirk_fun quirk;
+	dmar_quirk_nb_fun quirk;
 	const char *descr;
 };
 
+#define	QUIRK_NB_ALL_REV	0xffffffff
+
 static void
 dmar_match_quirks(struct dmar_unit *dmar,
     const struct intel_dmar_quirk_nb *nb_quirks, int nb_quirks_len,
@@ -100,13 +105,14 @@
 			for (i = 0; i < nb_quirks_len; i++) {
 				nb_quirk = &nb_quirks[i];
 				if (nb_quirk->dev_id == dev_id &&
-				    nb_quirk->rev_no == rev_no) {
+				    (nb_quirk->rev_no == rev_no ||
+				    nb_quirk->rev_no == QUIRK_NB_ALL_REV)) {
 					if (bootverbose) {
 						device_printf(dmar->dev,
 						    "NB IOMMU quirk %s\n",
 						    nb_quirk->descr);
 					}
-					nb_quirk->quirk(dmar);
+					nb_quirk->quirk(dmar, nb);
 				}
 			}
 		} else {
@@ -140,12 +146,29 @@
 }
 
 static void
-nb_5400_no_low_high_prot_mem(struct dmar_unit *unit)
+nb_5400_no_low_high_prot_mem(struct dmar_unit *unit, device_t nb __unused)
 {
 
 	unit->hw_cap &= ~(DMAR_CAP_PHMR | DMAR_CAP_PLMR);
 }
 
+static void
+nb_no_ir(struct dmar_unit *unit, device_t nb __unused)
+{
+
+	unit->hw_ecap &= ~(DMAR_ECAP_IR | DMAR_ECAP_EIM);
+}
+
+static void
+nb_5500_no_ir_rev13(struct dmar_unit *unit, device_t nb)
+{
+	u_int rev_no;
+
+	rev_no = pci_get_revid(nb);
+	if (rev_no <= 0x13)
+		nb_no_ir(unit, nb);
+}
+
 static const struct intel_dmar_quirk_nb pre_use_nb[] = {
 	{
 	    .dev_id = 0x4001, .rev_no = 0x20,
@@ -157,6 +180,26 @@
 	    .quirk = nb_5400_no_low_high_prot_mem,
 	    .descr = "5400 E23" /* no low/high protected memory */
 	},
+	{
+	    .dev_id = 0x3403, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3405, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3405, .rev_no = 0x22,
+	    .quirk = nb_no_ir,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3406, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
 };
 
 static void
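
The table change above lets a single quirk entry apply to every revision
of a device: a quirk now fires on an exact dev_id/rev_no match or, when
the entry uses QUIRK_NB_ALL_REV, on any revision, leaving finer checks to
the quirk function itself (as nb_5500_no_ir_rev13() does with
pci_get_revid()).  A stand-alone sketch of that matching, with
illustrative table entries (only the 0x3403 device id is taken from the
table above):

#include <stdio.h>

#define	QUIRK_NB_ALL_REV	0xffffffff

struct nb_quirk {
	unsigned dev_id;
	unsigned rev_no;
	const char *descr;
};

static const struct nb_quirk quirks[] = {
	{ 0x1234, 0x20,			"exact-revision quirk" },
	{ 0x3403, QUIRK_NB_ALL_REV,	"any-revision quirk" },
};

static void
match_quirks(unsigned dev_id, unsigned rev_no)
{
	const struct nb_quirk *q;
	unsigned i;

	for (i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++) {
		q = &quirks[i];
		if (q->dev_id == dev_id && (q->rev_no == rev_no ||
		    q->rev_no == QUIRK_NB_ALL_REV))
			printf("NB IOMMU quirk: %s\n", q->descr);
	}
}

int
main(void)
{

	match_quirks(0x3403, 0x13);	/* matches via the wildcard entry */
	match_quirks(0x1234, 0x21);	/* no match: revision differs */
	return (0);
}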

Modified: trunk/sys/x86/iommu/intel_reg.h
===================================================================
--- trunk/sys/x86/iommu/intel_reg.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_reg.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
  */
 
 #ifndef __X86_IOMMU_INTEL_REG_H

Modified: trunk/sys/x86/iommu/intel_utils.c
===================================================================
--- trunk/sys/x86/iommu/intel_utils.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_utils.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_utils.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_utils.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -47,7 +47,9 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -58,6 +60,8 @@
 #include <vm/vm_pageout.h>
 #include <machine/bus.h>
 #include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/include/apicvar.h>
 #include <x86/include/busdma_impl.h>
 #include <x86/iommu/intel_reg.h>
 #include <x86/iommu/busdma_dmar.h>
@@ -98,7 +102,6 @@
 	{.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL,
 	    .pglvl = 6}
 };
-#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0]))
 
 bool
 dmar_pglvl_supported(struct dmar_unit *unit, int pglvl)
@@ -105,7 +108,7 @@
 {
 	int i;
 
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if (sagaw_bits[i].pglvl != pglvl)
 			continue;
 		if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
@@ -115,26 +118,23 @@
 }
 
 int
-ctx_set_agaw(struct dmar_ctx *ctx, int mgaw)
+domain_set_agaw(struct dmar_domain *domain, int mgaw)
 {
 	int sagaw, i;
 
-	ctx->mgaw = mgaw;
-	sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap);
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	domain->mgaw = mgaw;
+	sagaw = DMAR_CAP_SAGAW(domain->dmar->hw_cap);
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if (sagaw_bits[i].agaw >= mgaw) {
-			ctx->agaw = sagaw_bits[i].agaw;
-			ctx->pglvl = sagaw_bits[i].pglvl;
-			ctx->awlvl = sagaw_bits[i].awlvl;
+			domain->agaw = sagaw_bits[i].agaw;
+			domain->pglvl = sagaw_bits[i].pglvl;
+			domain->awlvl = sagaw_bits[i].awlvl;
 			return (0);
 		}
 	}
-	device_printf(ctx->dmar->dev,
-	    "context request mgaw %d for pci%d:%d:%d:%d, "
-	    "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment, 
-	    pci_get_bus(ctx->ctx_tag.owner),
-	    pci_get_slot(ctx->ctx_tag.owner),
-	    pci_get_function(ctx->ctx_tag.owner), sagaw);
+	device_printf(domain->dmar->dev,
+	    "context request mgaw %d: no agaw found, sagaw %x\n",
+	    mgaw, sagaw);
 	return (EINVAL);
 }
 
@@ -150,18 +150,18 @@
 {
 	int i;
 
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if ((1ULL << sagaw_bits[i].agaw) >= maxaddr &&
 		    (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
 			break;
 	}
-	if (allow_less && i == SIZEOF_SAGAW_BITS) {
+	if (allow_less && i == nitems(sagaw_bits)) {
 		do {
 			i--;
 		} while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap)
 		    == 0);
 	}
-	if (i < SIZEOF_SAGAW_BITS)
+	if (i < nitems(sagaw_bits))
 		return (sagaw_bits[i].agaw);
 	KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d",
 	    (uintmax_t) maxaddr, allow_less));
@@ -190,7 +190,7 @@
  * the context ctx.
  */
 int
-ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl)
+domain_is_sp_lvl(struct dmar_domain *domain, int lvl)
 {
 	int alvl, cap_sps;
 	static const int sagaw_sp[] = {
@@ -200,10 +200,9 @@
 		DMAR_CAP_SPS_1T
 	};
 
-	alvl = ctx->pglvl - lvl - 1;
-	cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap);
-	return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) &&
-	    (sagaw_sp[alvl] & cap_sps) != 0);
+	alvl = domain->pglvl - lvl - 1;
+	cap_sps = DMAR_CAP_SPS(domain->dmar->hw_cap);
+	return (alvl < nitems(sagaw_sp) && (sagaw_sp[alvl] & cap_sps) != 0);
 }
 
 dmar_gaddr_t
@@ -222,16 +221,15 @@
 	KASSERT(lvl >= 0 && lvl < total_pglvl,
 	    ("total %d lvl %d", total_pglvl, lvl));
 	rlvl = total_pglvl - lvl - 1;
-	KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]),
-	    ("sizeof pg_sz lvl %d", lvl));
+	KASSERT(rlvl < nitems(pg_sz), ("sizeof pg_sz lvl %d", lvl));
 	return (pg_sz[rlvl]);
 }
 
 dmar_gaddr_t
-ctx_page_size(struct dmar_ctx *ctx, int lvl)
+domain_page_size(struct dmar_domain *domain, int lvl)
 {
 
-	return (pglvl_page_size(ctx->pglvl, lvl));
+	return (pglvl_page_size(domain->pglvl, lvl));
 }
 
 int
@@ -260,9 +258,12 @@
 dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags)
 {
 	vm_page_t m;
-	int zeroed;
+	int zeroed, aflags;
 
 	zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
+	aflags = zeroed | VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP |
+	    ((flags & DMAR_PGF_WAITOK) != 0 ? VM_ALLOC_WAITFAIL :
+	    VM_ALLOC_NOWAIT);
 	for (;;) {
 		if ((flags & DMAR_PGF_OBJL) == 0)
 			VM_OBJECT_WLOCK(obj);
@@ -272,8 +273,7 @@
 				VM_OBJECT_WUNLOCK(obj);
 			break;
 		}
-		m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0,
+		m = vm_page_alloc_contig(obj, idx, aflags, 1, 0,
 		    dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 		if ((flags & DMAR_PGF_OBJL) == 0)
 			VM_OBJECT_WUNLOCK(obj);
@@ -285,11 +285,6 @@
 		}
 		if ((flags & DMAR_PGF_WAITOK) == 0)
 			break;
-		if ((flags & DMAR_PGF_OBJL) != 0)
-			VM_OBJECT_WUNLOCK(obj);
-		VM_WAIT;
-		if ((flags & DMAR_PGF_OBJL) != 0)
-			VM_OBJECT_WLOCK(obj);
 	}
 	return (m);
 }
@@ -405,6 +400,7 @@
 dmar_load_root_entry_ptr(struct dmar_unit *unit)
 {
 	vm_page_t root_entry;
+	int error;
 
 	/*
 	 * Access to the GCMD register must be serialized while the
@@ -417,10 +413,9 @@
 	VM_OBJECT_RUNLOCK(unit->ctx_obj);
 	dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry));
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS)
+	    != 0));
+	return (error);
 }
 
 /*
@@ -430,6 +425,7 @@
 int
 dmar_inv_ctx_glob(struct dmar_unit *unit)
 {
+	int error;
 
 	/*
 	 * Access to the CCMD register must be serialized while the
@@ -445,10 +441,9 @@
 	 * writes the upper dword last.
 	 */
 	dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32)
+	    == 0));
+	return (error);
 }
 
 /*
@@ -457,7 +452,7 @@
 int
 dmar_inv_iotlb_glob(struct dmar_unit *unit)
 {
-	int reg;
+	int error, reg;
 
 	DMAR_ASSERT_LOCKED(unit);
 	KASSERT(!unit->qi_enabled, ("QI enabled"));
@@ -466,11 +461,9 @@
 	/* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. */
 	dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT |
 	    DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
-	    DMAR_IOTLB_IVT32) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
+	    DMAR_IOTLB_IVT32) == 0));
+	return (error);
 }
 
 /*
@@ -480,6 +473,7 @@
 int
 dmar_flush_write_bufs(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 
@@ -490,38 +484,86 @@
 	    ("dmar%d: no RWBF", unit->unit));
 
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS)
+	    != 0));
+	return (error);
 }
 
 int
 dmar_enable_translation(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd |= DMAR_GCMD_TE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+	    != 0));
+	return (error);
 }
 
 int
 dmar_disable_translation(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd &= ~DMAR_GCMD_TE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+	    == 0));
+	return (error);
 }
 
+int
+dmar_load_irt_ptr(struct dmar_unit *unit)
+{
+	uint64_t irta, s;
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	irta = unit->irt_phys;
+	if (DMAR_X2APIC(unit))
+		irta |= DMAR_IRTA_EIME;
+	s = fls(unit->irte_cnt) - 2;
+	KASSERT(unit->irte_cnt >= 2 && s <= DMAR_IRTA_S_MASK &&
+	    powerof2(unit->irte_cnt),
+	    ("IRTA_REG_S overflow %x", unit->irte_cnt));
+	irta |= s;
+	dmar_write8(unit, DMAR_IRTA_REG, irta);
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SIRTP);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRTPS)
+	    != 0));
+	return (error);
+}
+
+int
+dmar_enable_ir(struct dmar_unit *unit)
+{
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	unit->hw_gcmd |= DMAR_GCMD_IRE;
+	unit->hw_gcmd &= ~DMAR_GCMD_CFI;
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+	    != 0));
+	return (error);
+}
+
+int
+dmar_disable_ir(struct dmar_unit *unit)
+{
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	unit->hw_gcmd &= ~DMAR_GCMD_IRE;
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+	    == 0));
+	return (error);
+}
+
 #define BARRIER_F				\
 	u_int f_done, f_inproc, f_wakeup;	\
 						\
@@ -573,18 +615,62 @@
 }
 
 int dmar_match_verbose;
+int dmar_batch_coalesce = 100;
+struct timespec dmar_hw_timeout = {
+	.tv_sec = 0,
+	.tv_nsec = 1000000
+};
 
-static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL,
-    "");
-SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN,
+static const uint64_t d = 1000000000;
+
+void
+dmar_update_timeout(uint64_t newval)
+{
+
+	/* XXXKIB not atomic */
+	dmar_hw_timeout.tv_sec = newval / d;
+	dmar_hw_timeout.tv_nsec = newval % d;
+}
+
+uint64_t
+dmar_get_timeout(void)
+{
+
+	return ((uint64_t)dmar_hw_timeout.tv_sec * d +
+	    dmar_hw_timeout.tv_nsec);
+}
+
+static int
+dmar_timeout_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int error;
+
+	val = dmar_get_timeout();
+	error = sysctl_handle_long(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	dmar_update_timeout(val);
+	return (error);
+}
+
+static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
+SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
     &dmar_tbl_pagecnt, 0,
     "Count of pages used for DMAR pagetables");
-SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
     &dmar_match_verbose, 0,
     "Verbose matching of the PCI devices to DMAR paths");
+SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RWTUN,
+    &dmar_batch_coalesce, 0,
+    "Number of qi batches between interrupt");
+SYSCTL_PROC(_hw_dmar, OID_AUTO, timeout,
+    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+    dmar_timeout_sysctl, "QU",
+    "Timeout for command wait, in nanoseconds");
 #ifdef INVARIANTS
 int dmar_check_free;
-SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
     &dmar_check_free, 0,
     "Check the GPA RBtree for free_down and free_after validity");
 #endif
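
The unbounded cpu_spinwait() loops in this file are replaced by
DMAR_WAIT_UNTIL (defined in intel_dmar.h, not visible in this hunk),
which presumably bounds the wait by the new hw.dmar.timeout value
maintained by dmar_update_timeout() and dmar_get_timeout().  A
stand-alone sketch of just the nanosecond/timespec conversion behind
that sysctl, using the same 1 ms default as the dmar_hw_timeout
initializer:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define	NS_PER_SEC	1000000000ULL

static struct timespec hw_timeout = { .tv_sec = 0, .tv_nsec = 1000000 };

static void
timeout_update(uint64_t newval)
{

	hw_timeout.tv_sec = newval / NS_PER_SEC;
	hw_timeout.tv_nsec = newval % NS_PER_SEC;
}

static uint64_t
timeout_get(void)
{

	return ((uint64_t)hw_timeout.tv_sec * NS_PER_SEC +
	    hw_timeout.tv_nsec);
}

int
main(void)
{

	timeout_update(2500000000ULL);		/* 2.5 seconds */
	printf("timeout = %ju ns\n", (uintmax_t)timeout_get());
	return (0);
}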

Added: trunk/sys/x86/iommu/iommu_intrmap.h
===================================================================
--- trunk/sys/x86/iommu/iommu_intrmap.h	                        (rev 0)
+++ trunk/sys/x86/iommu/iommu_intrmap.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,44 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/iommu/iommu_intrmap.h 280260 2015-03-19 13:57:47Z kib $
+ */
+
+#ifndef __X86_IOMMU_IOMMU_INTRMAP_H
+#define	__X86_IOMMU_IOMMU_INTRMAP_H
+
+int iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count);
+int iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+    uint64_t *addr, uint32_t *data);
+int iommu_unmap_msi_intr(device_t src, u_int cookie);
+int iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+    bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo);
+int iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie);
+
+#endif


Property changes on: trunk/sys/x86/iommu/iommu_intrmap.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/isa/atpic.c
===================================================================
--- trunk/sys/x86/isa/atpic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atpic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,10 +30,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atpic.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atpic.c 340016 2018-11-01 18:34:26Z jhb $");
 
 #include "opt_auto_eoi.h"
 #include "opt_isa.h"
+#include "opt_mca.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -55,9 +56,12 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #include <isa/isavar.h>
+#ifdef DEV_MCA
+#include <i386/bios/mca_machdep.h>
+#endif
 
 #ifdef __amd64__
 #define	SDT_ATPIC	SDT_SYSIGT
@@ -70,12 +74,12 @@
 #define	MASTER	0
 #define	SLAVE	1
 
+#define	IMEN_MASK(ai)		(IRQ_MASK((ai)->at_irq))
+
 #define	NUM_ISA_IRQS		16
 
 static void	atpic_init(void *dummy);
 
-unsigned int imen;	/* XXX */
-
 inthand_t
 	IDTVEC(atpic_intr0), IDTVEC(atpic_intr1), IDTVEC(atpic_intr2),
 	IDTVEC(atpic_intr3), IDTVEC(atpic_intr4), IDTVEC(atpic_intr5),
@@ -83,19 +87,42 @@
 	IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
 	IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
 	IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until pti comes */
+inthand_t
+	IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+	IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+	IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+	IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+	IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+	IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+	IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+	IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
 
 #define	IRQ(ap, ai)	((ap)->at_irqbase + (ai)->at_irq)
 
-#define	ATPIC(io, base, eoi, imenptr)					\
-     	{ { atpic_enable_source, atpic_disable_source, (eoi),		\
-	    atpic_enable_intr, atpic_disable_intr, atpic_vector,	\
-	    atpic_source_pending, NULL,	atpic_resume, atpic_config_intr,\
-	    atpic_assign_cpu }, (io), (base), IDT_IO_INTS + (base),	\
-	    (imenptr) }
+#define	ATPIC(io, base, eoi) {						\
+		.at_pic = {						\
+			.pic_register_sources = atpic_register_sources,	\
+			.pic_enable_source = atpic_enable_source,	\
+			.pic_disable_source = atpic_disable_source,	\
+			.pic_eoi_source = (eoi),			\
+			.pic_enable_intr = atpic_enable_intr,		\
+			.pic_disable_intr = atpic_disable_intr,		\
+			.pic_vector = atpic_vector,			\
+			.pic_source_pending = atpic_source_pending,	\
+			.pic_resume = atpic_resume,			\
+			.pic_config_intr = atpic_config_intr,		\
+			.pic_assign_cpu = atpic_assign_cpu		\
+		},							\
+		.at_ioaddr = (io),					\
+		.at_irqbase = (base),					\
+		.at_intbase = IDT_IO_INTS + (base),			\
+		.at_imen = 0xff,					\
+	}
 
 #define	INTSRC(irq)							\
 	{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ),	\
-	    (irq) % 8 }
+	    IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
 
 struct atpic {
 	struct pic at_pic;
@@ -102,12 +129,12 @@
 	int	at_ioaddr;
 	int	at_irqbase;
 	uint8_t	at_intbase;
-	uint8_t	*at_imen;
+	uint8_t	at_imen;
 };
 
 struct atpic_intsrc {
 	struct intsrc at_intsrc;
-	inthand_t *at_intr;
+	inthand_t *at_intr, *at_intr_pti;
 	int	at_irq;			/* Relative to PIC base. */
 	enum intr_trigger at_trigger;
 	u_long	at_count;
@@ -114,6 +141,7 @@
 	u_long	at_straycount;
 };
 
+static void atpic_register_sources(struct pic *pic);
 static void atpic_enable_source(struct intsrc *isrc);
 static void atpic_disable_source(struct intsrc *isrc, int eoi);
 static void atpic_eoi_master(struct intsrc *isrc);
@@ -129,8 +157,8 @@
 static void i8259_init(struct atpic *pic, int slave);
 
 static struct atpic atpics[] = {
-	ATPIC(IO_ICU1, 0, atpic_eoi_master, (uint8_t *)&imen),
-	ATPIC(IO_ICU2, 8, atpic_eoi_slave, ((uint8_t *)&imen) + 1)
+	ATPIC(IO_ICU1, 0, atpic_eoi_master),
+	ATPIC(IO_ICU2, 8, atpic_eoi_slave)
 };
 
 static struct atpic_intsrc atintrs[] = {
@@ -152,7 +180,7 @@
 	INTSRC(15),
 };
 
-CTASSERT(sizeof(atintrs) / sizeof(atintrs[0]) == NUM_ISA_IRQS);
+CTASSERT(nitems(atintrs) == NUM_ISA_IRQS);
 
 static __inline void
 _atpic_eoi_master(struct intsrc *isrc)
@@ -184,6 +212,42 @@
 }
 
 static void
+atpic_register_sources(struct pic *pic)
+{
+	struct atpic *ap = (struct atpic *)pic;
+	struct atpic_intsrc *ai;
+	int i;
+
+	/*
+	 * If any of the ISA IRQs have an interrupt source already, then
+	 * assume that the I/O APICs are being used and don't register any
+	 * of our interrupt sources.  This makes sure we don't accidentally
+	 * use mixed mode.  The "accidental" use could otherwise occur on
+	 * machines that route the ACPI SCI interrupt to a different ISA
+	 * IRQ (at least one machine routes it to IRQ 13) thus disabling
+	 * that APIC ISA routing and allowing the ATPIC source for that IRQ
+	 * to leak through.  We used to depend on this feature for routing
+	 * IRQ0 via mixed mode, but now we don't use mixed mode at all.
+	 *
+	 * To avoid the slave not register sources after the master
+	 * To avoid the slave failing to register its sources after the
+	 * master has registered its own, register all IRQs when this
+	 * function is called on the master.
+	if (ap != &atpics[MASTER])
+		return;
+	for (i = 0; i < NUM_ISA_IRQS; i++)
+		if (intr_lookup_source(i) != NULL)
+			return;
+
+	/* Loop through all interrupt sources and add them. */
+	for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
+		if (i == ICU_SLAVEID)
+			continue;
+		intr_register_source(&ai->at_intsrc);
+	}
+}
+
+static void
 atpic_enable_source(struct intsrc *isrc)
 {
 	struct atpic_intsrc *ai = (struct atpic_intsrc *)isrc;
@@ -190,9 +254,9 @@
 	struct atpic *ap = (struct atpic *)isrc->is_pic;
 
 	spinlock_enter();
-	if (*ap->at_imen & IMEN_MASK(ai)) {
-		*ap->at_imen &= ~IMEN_MASK(ai);
-		outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+	if (ap->at_imen & IMEN_MASK(ai)) {
+		ap->at_imen &= ~IMEN_MASK(ai);
+		outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
 	}
 	spinlock_exit();
 }
@@ -205,8 +269,8 @@
 
 	spinlock_enter();
 	if (ai->at_trigger != INTR_TRIGGER_EDGE) {
-		*ap->at_imen |= IMEN_MASK(ai);
-		outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+		ap->at_imen |= IMEN_MASK(ai);
+		outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
 	}
 
 	/*
@@ -400,7 +464,7 @@
 		outb(imr_addr, MASTER_MODE);
 
 	/* Set interrupt enable mask. */
-	outb(imr_addr, *pic->at_imen);
+	outb(imr_addr, pic->at_imen);
 
 	/* Reset is finished, default to IRR on read. */
 	outb(pic->at_ioaddr, OCW3_SEL | OCW3_RR);
@@ -420,7 +484,6 @@
 	int i;
 
 	/* Start off with all interrupts disabled. */
-	imen = 0xffff;
 	i8259_init(&atpics[MASTER], 0);
 	i8259_init(&atpics[SLAVE], 1);
 	atpic_enable_source((struct intsrc *)&atintrs[ICU_SLAVEID]);
@@ -432,7 +495,8 @@
 		ai->at_intsrc.is_count = &ai->at_count;
 		ai->at_intsrc.is_straycount = &ai->at_straycount;
 		setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
-		    ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+		    ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+		    SEL_KPL, GSEL_ATPIC);
 	}
 
 #ifdef DEV_MCA
@@ -492,8 +556,6 @@
 static void
 atpic_init(void *dummy __unused)
 {
-	struct atpic_intsrc *ai;
-	int i;
 
 	/*
 	 * Register our PICs, even if we aren't going to use any of their
@@ -503,29 +565,10 @@
 	    intr_register_pic(&atpics[1].at_pic) != 0)
 		panic("Unable to register ATPICs");
 
-	/*
-	 * If any of the ISA IRQs have an interrupt source already, then
-	 * assume that the APICs are being used and don't register any
-	 * of our interrupt sources.  This makes sure we don't accidentally
-	 * use mixed mode.  The "accidental" use could otherwise occur on
-	 * machines that route the ACPI SCI interrupt to a different ISA
-	 * IRQ (at least one machines routes it to IRQ 13) thus disabling
-	 * that APIC ISA routing and allowing the ATPIC source for that IRQ
-	 * to leak through.  We used to depend on this feature for routing
-	 * IRQ0 via mixed mode, but now we don't use mixed mode at all.
-	 */
-	for (i = 0; i < NUM_ISA_IRQS; i++)
-		if (intr_lookup_source(i) != NULL)
-			return;
-
-	/* Loop through all interrupt sources and add them. */
-	for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
-		if (i == ICU_SLAVEID)
-			continue;
-		intr_register_source(&ai->at_intsrc);
-	}
+	if (num_io_irqs == 0)
+		num_io_irqs = NUM_ISA_IRQS;
 }
-SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_SECOND + 1, atpic_init, NULL);
+SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_FOURTH, atpic_init, NULL);
 
 void
 atpic_handle_intr(u_int vector, struct trapframe *frame)
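
The atpic change above gives each 8259 its own 8-bit at_imen instead of
sharing a global 16-bit imen through byte pointers, so enabling or
disabling a source only touches the mask byte of the PIC that owns the
IRQ.  A user-space sketch of that mask handling, with the outb() to the
IMR replaced by a printf:

#include <stdint.h>
#include <stdio.h>

struct pic8259 {
	const char *name;
	uint8_t	imen;			/* all sources masked initially */
};

static struct pic8259 pics[2] = {
	{ "master", 0xff },
	{ "slave",  0xff },
};

static void
pic_enable(unsigned irq)
{
	struct pic8259 *ap = &pics[irq / 8];
	uint8_t bit = 1u << (irq % 8);

	if (ap->imen & bit) {
		ap->imen &= ~bit;
		printf("%s IMR <- %#04x\n", ap->name, ap->imen);
	}
}

static void
pic_disable(unsigned irq)
{
	struct pic8259 *ap = &pics[irq / 8];
	uint8_t bit = 1u << (irq % 8);

	ap->imen |= bit;
	printf("%s IMR <- %#04x\n", ap->name, ap->imen);
}

int
main(void)
{

	pic_enable(1);		/* unmask IRQ 1 on the master */
	pic_enable(12);		/* unmask IRQ 12 on the slave */
	pic_disable(12);
	return (0);
}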

Modified: trunk/sys/x86/isa/atrtc.c
===================================================================
--- trunk/sys/x86/isa/atrtc.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atrtc.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -25,12 +25,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $
+ * $FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $");
 
+#include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
@@ -53,10 +54,24 @@
 #endif
 #include <machine/intr_machdep.h>
 #include "clock_if.h"
+#ifdef DEV_ACPI
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <dev/acpica/acpivar.h>
+#include <machine/md_var.h>
+#endif
 
-#define	RTC_LOCK	do { if (!kdb_active) mtx_lock_spin(&clock_lock); } while (0)
-#define	RTC_UNLOCK	do { if (!kdb_active) mtx_unlock_spin(&clock_lock); } while (0)
+/*
+ * atrtc_lock protects low-level access to individual hardware registers.
+ * atrtc_time_lock protects the entire sequence of accessing multiple registers
+ * to read or write the date and time.
+ */
+static struct mtx atrtc_lock;
+MTX_SYSINIT(atrtc_lock_init, &atrtc_lock, "atrtc", MTX_SPIN);
 
+struct mtx atrtc_time_lock;
+MTX_SYSINIT(atrtc_time_lock_init, &atrtc_time_lock, "atrtc_time", MTX_DEF);
+
 int	atrtcclock_disable = 0;
 
 static	int	rtc_reg = -1;
@@ -63,16 +78,19 @@
 static	u_char	rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF;
 static	u_char	rtc_statusb = RTCSB_24HR;
 
+#ifdef DEV_ACPI
+#define	_COMPONENT	ACPI_TIMER
+ACPI_MODULE_NAME("ATRTC")
+#endif
+
 /*
  * RTC support routines
  */
 
-int
-rtcin(int reg)
+static inline u_char
+rtcin_locked(int reg)
 {
-	u_char val;
 
-	RTC_LOCK;
 	if (rtc_reg != reg) {
 		inb(0x84);
 		outb(IO_RTC, reg);
@@ -79,16 +97,13 @@
 		rtc_reg = reg;
 		inb(0x84);
 	}
-	val = inb(IO_RTC + 1);
-	RTC_UNLOCK;
-	return (val);
+	return (inb(IO_RTC + 1));
 }
 
-void
-writertc(int reg, u_char val)
+static inline void
+rtcout_locked(int reg, u_char val)
 {
 
-	RTC_LOCK;
 	if (rtc_reg != reg) {
 		inb(0x84);
 		outb(IO_RTC, reg);
@@ -97,21 +112,36 @@
 	}
 	outb(IO_RTC + 1, val);
 	inb(0x84);
-	RTC_UNLOCK;
 }
 
-static __inline int
-readrtc(int port)
+int
+rtcin(int reg)
 {
-	return(bcd2bin(rtcin(port)));
+	u_char val;
+
+	mtx_lock_spin(&atrtc_lock);
+	val = rtcin_locked(reg);
+	mtx_unlock_spin(&atrtc_lock);
+	return (val);
 }
 
+void
+writertc(int reg, u_char val)
+{
+
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(reg, val);
+	mtx_unlock_spin(&atrtc_lock);
+}
+
 static void
 atrtc_start(void)
 {
 
-	writertc(RTC_STATUSA, rtc_statusa);
-	writertc(RTC_STATUSB, RTCSB_24HR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSA, rtc_statusa);
+	rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 static void
@@ -127,8 +157,10 @@
 {
 
 	rtc_statusb |= RTCSB_PINTR;
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 static void
@@ -136,8 +168,10 @@
 {
 
 	rtc_statusb &= ~RTCSB_PINTR;
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 void
@@ -145,11 +179,13 @@
 {
 
 	/* Restore all of the RTC's "status" (actually, control) registers. */
-	rtcin(RTC_STATUSA);	/* dummy to get rtc_reg set */
-	writertc(RTC_STATUSB, RTCSB_24HR);
-	writertc(RTC_STATUSA, rtc_statusa);
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcin_locked(RTC_STATUSA);	/* dummy to get rtc_reg set */
+	rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+	rtcout_locked(RTC_STATUSA, rtc_statusa);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 /**********************************************************************
@@ -162,6 +198,9 @@
 	struct resource *intr_res;
 	void *intr_handler;
 	struct eventtimer et;
+#ifdef DEV_ACPI
+	ACPI_HANDLE acpi_handle;
+#endif
 };
 
 static int
@@ -216,7 +255,145 @@
 	return(flag ? FILTER_HANDLED : FILTER_STRAY);
 }
 
+#ifdef DEV_ACPI
 /*
+ *  ACPI RTC CMOS address space handler
+ */
+#define	ATRTC_LAST_REG	0x40
+
+static void
+rtcin_region(int reg, void *buf, int len)
+{
+	u_char *ptr = buf;
+
+	/* Drop lock after each IO as intr and settime have greater priority */
+	while (len-- > 0)
+		*ptr++ = rtcin(reg++) & 0xff;
+}
+
+static void
+rtcout_region(int reg, const void *buf, int len)
+{
+	const u_char *ptr = buf;
+
+	while (len-- > 0)
+		writertc(reg++, *ptr++);
+}
+
+static bool
+atrtc_check_cmos_access(bool is_read, ACPI_PHYSICAL_ADDRESS addr, UINT32 len)
+{
+
+	/* Block address space wrapping on out-of-bound access */
+	if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG)
+		return (false);
+
+	if (is_read) {
+		/* Reading 0x0C will muck with interrupts */
+		if (addr <= RTC_INTR && addr + len > RTC_INTR)
+			return (false);
+	} else {
+		/*
+		 * Allow single-byte writes to alarm registers and
+		 * multi-byte writes to addr >= 0x30, else deny.
+		 */
+		if (!((len == 1 && (addr == RTC_SECALRM ||
+				    addr == RTC_MINALRM ||
+				    addr == RTC_HRSALRM)) ||
+		      addr >= 0x30))
+			return (false);
+	}
+	return (true);
+}
+
+static ACPI_STATUS
+atrtc_acpi_cmos_handler(UINT32 func, ACPI_PHYSICAL_ADDRESS addr,
+    UINT32 bitwidth, UINT64 *value, void *context, void *region_context)
+{
+	device_t dev = context;
+	UINT32 bytewidth = howmany(bitwidth, 8);
+	bool is_read = func == ACPI_READ;
+
+	/* ACPICA is very verbose on CMOS handler failures, so we, too */
+#define	CMOS_HANDLER_ERR(fmt, ...) \
+	device_printf(dev, "ACPI [SystemCMOS] handler: " fmt, ##__VA_ARGS__)
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
+
+	if (value == NULL) {
+		CMOS_HANDLER_ERR("NULL parameter\n");
+		return (AE_BAD_PARAMETER);
+	}
+	if (bitwidth == 0 || (bitwidth & 0x07) != 0) {
+		CMOS_HANDLER_ERR("Invalid bitwidth: %u\n", bitwidth);
+		return (AE_BAD_PARAMETER);
+	}
+	if (!atrtc_check_cmos_access(is_read, addr, bytewidth)) {
+		CMOS_HANDLER_ERR("%s access rejected: addr=%#04jx, len=%u\n",
+		    is_read ? "Read" : "Write", (uintmax_t)addr, bytewidth);
+		return (AE_BAD_PARAMETER);
+	}
+
+	switch (func) {
+	case ACPI_READ:
+		rtcin_region(addr, value, bytewidth);
+		break;
+	case ACPI_WRITE:
+		rtcout_region(addr, value, bytewidth);
+		break;
+	default:
+		CMOS_HANDLER_ERR("Invalid function: %u\n", func);
+		return (AE_BAD_PARAMETER);
+	}
+
+	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
+	    "ACPI RTC CMOS %s access: addr=%#04x, len=%u, val=%*D\n",
+	    is_read ? "read" : "write", (unsigned)addr, bytewidth,
+	    bytewidth, value, " ");
+
+	return (AE_OK);
+}
+
+static int
+atrtc_reg_acpi_cmos_handler(device_t dev)
+{
+	struct atrtc_softc *sc = device_get_softc(dev);
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+	/* Don't handle address space events if driver is disabled. */
+	if (acpi_disabled("atrtc"))
+		return (ENXIO);
+
+	sc->acpi_handle = acpi_get_handle(dev);
+	if (sc->acpi_handle == NULL ||
+	    ACPI_FAILURE(AcpiInstallAddressSpaceHandler(sc->acpi_handle,
+	      ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler, NULL, dev))) {
+		sc->acpi_handle = NULL;
+		device_printf(dev,
+		    "Can't register ACPI CMOS address space handler\n");
+		return (ENXIO);
+        }
+
+        return (0);
+}
+
+static int
+atrtc_unreg_acpi_cmos_handler(device_t dev)
+{
+	struct atrtc_softc *sc = device_get_softc(dev);
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+	if (sc->acpi_handle != NULL)
+		AcpiRemoveAddressSpaceHandler(sc->acpi_handle,
+		    ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler);
+
+	return (0);
+}
+#endif	/* DEV_ACPI */
+
+/*
  * Attach to the ISA PnP descriptors for the timer and realtime clock.
  */
 static struct isa_pnp_id atrtc_ids[] = {
@@ -242,7 +419,7 @@
 atrtc_attach(device_t dev)
 {
 	struct atrtc_softc *sc;
-	u_long s;
+	rman_res_t s;
 	int i;
 
 	sc = device_get_softc(dev);
@@ -288,6 +465,37 @@
 }
 
 static int
+atrtc_isa_attach(device_t dev)
+{
+
+	return (atrtc_attach(dev));
+}
+
+#ifdef DEV_ACPI
+static int
+atrtc_acpi_attach(device_t dev)
+{
+	int ret;
+
+	ret = atrtc_attach(dev);
+	if (ret)
+		return (ret);
+
+	(void)atrtc_reg_acpi_cmos_handler(dev);
+
+	return (0);
+}
+
+static int
+atrtc_acpi_detach(device_t dev)
+{
+
+	(void)atrtc_unreg_acpi_cmos_handler(dev);
+	return (0);
+}
+#endif	/* DEV_ACPI */
+
+static int
 atrtc_resume(device_t dev)
 {
 
@@ -298,28 +506,38 @@
 static int
 atrtc_settime(device_t dev __unused, struct timespec *ts)
 {
-	struct clocktime ct;
+	struct bcd_clocktime bct;
 
-	clock_ts_to_ct(ts, &ct);
+	clock_ts_to_bcd(ts, &bct, false);
+	clock_dbgprint_bcd(dev, CLOCK_DBG_WRITE, &bct);
 
-	/* Disable RTC updates and interrupts. */
-	writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
+	mtx_lock(&atrtc_time_lock);
+	mtx_lock_spin(&atrtc_lock);
 
-	writertc(RTC_SEC, bin2bcd(ct.sec)); 		/* Write back Seconds */
-	writertc(RTC_MIN, bin2bcd(ct.min)); 		/* Write back Minutes */
-	writertc(RTC_HRS, bin2bcd(ct.hour));		/* Write back Hours   */
+	/* Disable RTC updates and interrupts.  */
+	rtcout_locked(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
 
-	writertc(RTC_WDAY, ct.dow + 1);			/* Write back Weekday */
-	writertc(RTC_DAY, bin2bcd(ct.day));		/* Write back Day */
-	writertc(RTC_MONTH, bin2bcd(ct.mon));           /* Write back Month   */
-	writertc(RTC_YEAR, bin2bcd(ct.year % 100));	/* Write back Year    */
+	/* Write all the time registers. */
+	rtcout_locked(RTC_SEC,   bct.sec);
+	rtcout_locked(RTC_MIN,   bct.min);
+	rtcout_locked(RTC_HRS,   bct.hour);
+	rtcout_locked(RTC_WDAY,  bct.dow + 1);
+	rtcout_locked(RTC_DAY,   bct.day);
+	rtcout_locked(RTC_MONTH, bct.mon);
+	rtcout_locked(RTC_YEAR,  bct.year & 0xff);
 #ifdef USE_RTC_CENTURY
-	writertc(RTC_CENTURY, bin2bcd(ct.year / 100));	/* ... and Century    */
+	rtcout_locked(RTC_CENTURY, bct.year >> 8);
 #endif
 
-	/* Reenable RTC updates and interrupts. */
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	/*
+	 * Re-enable RTC updates and interrupts.
+	 */
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+
+	mtx_unlock_spin(&atrtc_lock);
+	mtx_unlock(&atrtc_time_lock);
+
 	return (0);
 }
 
@@ -326,7 +544,7 @@
 static int
 atrtc_gettime(device_t dev, struct timespec *ts)
 {
-	struct clocktime ct;
+	struct bcd_clocktime bct;
 
 	/* Look if we have a RTC present and the time is valid */
 	if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) {
@@ -341,32 +559,32 @@
 	 * to make sure that no more than 240us pass after we start reading,
 	 * and try again if so.
 	 */
+	mtx_lock(&atrtc_time_lock);
 	while (rtcin(RTC_STATUSA) & RTCSA_TUP)
 		continue;
-	critical_enter();
-	ct.nsec = 0;
-	ct.sec = readrtc(RTC_SEC);
-	ct.min = readrtc(RTC_MIN);
-	ct.hour = readrtc(RTC_HRS);
-	ct.day = readrtc(RTC_DAY);
-	ct.dow = readrtc(RTC_WDAY) - 1;
-	ct.mon = readrtc(RTC_MONTH);
-	ct.year = readrtc(RTC_YEAR);
+	mtx_lock_spin(&atrtc_lock);
+	bct.sec  = rtcin_locked(RTC_SEC);
+	bct.min  = rtcin_locked(RTC_MIN);
+	bct.hour = rtcin_locked(RTC_HRS);
+	bct.day  = rtcin_locked(RTC_DAY);
+	bct.mon  = rtcin_locked(RTC_MONTH);
+	bct.year = rtcin_locked(RTC_YEAR);
 #ifdef USE_RTC_CENTURY
-	ct.year += readrtc(RTC_CENTURY) * 100;
-#else
-	ct.year += (ct.year < 80 ? 2000 : 1900);
+	bct.year |= rtcin_locked(RTC_CENTURY) << 8;
 #endif
-	critical_exit();
-	/* Set dow = -1 because some clocks don't set it correctly. */
-	ct.dow = -1;
-	return (clock_ct_to_ts(&ct, ts));
+	mtx_unlock_spin(&atrtc_lock);
+	mtx_unlock(&atrtc_time_lock);
+	/* dow is unused in timespec conversion and we have no nsec info. */
+	bct.dow  = 0;
+	bct.nsec = 0;
+	clock_dbgprint_bcd(dev, CLOCK_DBG_READ, &bct);
+	return (clock_bcd_to_ts(&bct, ts, false));
 }
 
-static device_method_t atrtc_methods[] = {
+static device_method_t atrtc_isa_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		atrtc_probe),
-	DEVMETHOD(device_attach,	atrtc_attach),
+	DEVMETHOD(device_attach,	atrtc_isa_attach),
 	DEVMETHOD(device_detach,	bus_generic_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
@@ -380,26 +598,38 @@
 	{ 0, 0 }
 };
 
-static driver_t atrtc_driver = {
+static driver_t atrtc_isa_driver = {
 	"atrtc",
-	atrtc_methods,
+	atrtc_isa_methods,
 	sizeof(struct atrtc_softc),
 };
 
-static devclass_t atrtc_devclass;
+#ifdef DEV_ACPI
+static device_method_t atrtc_acpi_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		atrtc_probe),
+	DEVMETHOD(device_attach,	atrtc_acpi_attach),
+	DEVMETHOD(device_detach,	atrtc_acpi_detach),
+		/* XXX stop statclock? */
+	DEVMETHOD(device_resume,	atrtc_resume),
 
-DRIVER_MODULE(atrtc, isa, atrtc_driver, atrtc_devclass, 0, 0);
-DRIVER_MODULE(atrtc, acpi, atrtc_driver, atrtc_devclass, 0, 0);
+	/* clock interface */
+	DEVMETHOD(clock_gettime,	atrtc_gettime),
+	DEVMETHOD(clock_settime,	atrtc_settime),
 
-#include "opt_ddb.h"
-#ifdef DDB
-#include <ddb/ddb.h>
+	{ 0, 0 }
+};
 
-DB_SHOW_COMMAND(rtc, rtc)
-{
-	printf("%02x/%02x/%02x %02x:%02x:%02x, A = %02x, B = %02x, C = %02x\n",
-		rtcin(RTC_YEAR), rtcin(RTC_MONTH), rtcin(RTC_DAY),
-		rtcin(RTC_HRS), rtcin(RTC_MIN), rtcin(RTC_SEC),
-		rtcin(RTC_STATUSA), rtcin(RTC_STATUSB), rtcin(RTC_INTR));
-}
-#endif /* DDB */
+static driver_t atrtc_acpi_driver = {
+	"atrtc",
+	atrtc_acpi_methods,
+	sizeof(struct atrtc_softc),
+};
+#endif	/* DEV_ACPI */
+
+static devclass_t atrtc_devclass;
+
+DRIVER_MODULE(atrtc, isa, atrtc_isa_driver, atrtc_devclass, 0, 0);
+#ifdef DEV_ACPI
+DRIVER_MODULE(atrtc, acpi, atrtc_acpi_driver, atrtc_devclass, 0, 0);
+#endif
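
atrtc_check_cmos_access() above is the policy gate for the new ACPI
SystemCMOS address space handler: out-of-range accesses are rejected
rather than wrapped, reads may not cover status register C (which would
eat pending RTC interrupts), and writes are limited to single-byte alarm
updates or the region at and above 0x30.  The sketch below reproduces
that logic stand-alone; the register offsets used here (alarms at
0x01/0x03/0x05, status C at 0x0c) are assumptions based on the usual
MC146818 layout, while the kernel uses its RTC_* macros.

#include <stdbool.h>
#include <stdio.h>

#define	RTC_SECALRM	0x01
#define	RTC_MINALRM	0x03
#define	RTC_HRSALRM	0x05
#define	RTC_INTR	0x0c	/* status C; reading it clears interrupts */
#define	ATRTC_LAST_REG	0x40

static bool
cmos_access_ok(bool is_read, unsigned addr, unsigned len)
{

	/* Reject out-of-range accesses instead of letting them wrap. */
	if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG)
		return (false);
	if (is_read) {
		/* Reading status C would eat pending RTC interrupts. */
		if (addr <= RTC_INTR && addr + len > RTC_INTR)
			return (false);
	} else {
		/*
		 * Allow single-byte writes to the alarm registers and
		 * multi-byte writes at or above 0x30; deny the rest.
		 */
		if (!((len == 1 && (addr == RTC_SECALRM ||
		    addr == RTC_MINALRM || addr == RTC_HRSALRM)) ||
		    addr >= 0x30))
			return (false);
	}
	return (true);
}

int
main(void)
{

	printf("read  0x00 len 4: %d\n", cmos_access_ok(true, 0x00, 4));
	printf("read  0x0b len 2: %d\n", cmos_access_ok(true, 0x0b, 2));
	printf("write 0x01 len 1: %d\n", cmos_access_ok(false, 0x01, 1));
	printf("write 0x32 len 2: %d\n", cmos_access_ok(false, 0x32, 2));
	return (0);
}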

Modified: trunk/sys/x86/isa/clock.c
===================================================================
--- trunk/sys/x86/isa/clock.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/clock.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/clock.c 254373 2013-08-15 17:21:06Z brooks $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/clock.c 331722 2018-03-29 02:50:57Z eadler $");
 
 /*
  * Routines to handle clock hardware.
@@ -66,6 +66,7 @@
 #include <machine/intr_machdep.h>
 #include <machine/ppireg.h>
 #include <machine/timerreg.h>
+#include <x86/init.h>
 
 #ifdef PC98
 #include <pc98/pc98/pc98_machdep.h>
@@ -98,7 +99,7 @@
 int	i8254_max_count;
 static int i8254_timecounter = 1;
 
-struct mtx clock_lock;
+static	struct mtx clock_lock;
 static	struct intsrc *i8254_intsrc;
 static	uint16_t i8254_lastcount;
 static	uint16_t i8254_offset;
@@ -140,6 +141,15 @@
 static	unsigned i8254_get_timecount(struct timecounter *tc);
 static	void	set_i8254_freq(int mode, uint32_t period);
 
+void
+clock_init(void)
+{
+	/* Init the clock lock */
+	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
+	/* Init the clock in order to use DELAY */
+	init_ops.early_clock_source_init();
+}
+
 static int
 clkintr(void *arg)
 {
@@ -157,7 +167,7 @@
 		mtx_unlock_spin(&clock_lock);
 	}
 
-	if (sc && sc->et.et_active && sc->mode != MODE_STOP)
+	if (sc->et.et_active && sc->mode != MODE_STOP)
 		sc->et.et_event_cb(&sc->et, sc->et.et_arg);
 
 #ifdef DEV_MCA
@@ -248,54 +258,6 @@
 	return ((high << 8) | low);
 }
 
-#ifndef DELAYDEBUG
-static u_int
-get_tsc(__unused struct timecounter *tc)
-{
-
-	return (rdtsc32());
-}
-
-static __inline int
-delay_tc(int n)
-{
-	struct timecounter *tc;
-	timecounter_get_t *func;
-	uint64_t end, freq, now;
-	u_int last, mask, u;
-
-	tc = timecounter;
-	freq = atomic_load_acq_64(&tsc_freq);
-	if (tsc_is_invariant && freq != 0) {
-		func = get_tsc;
-		mask = ~0u;
-	} else {
-		if (tc->tc_quality <= 0)
-			return (0);
-		func = tc->tc_get_timecount;
-		mask = tc->tc_counter_mask;
-		freq = tc->tc_frequency;
-	}
-	now = 0;
-	end = freq * n / 1000000;
-	if (func == get_tsc)
-		sched_pin();
-	last = func(tc) & mask;
-	do {
-		cpu_spinwait();
-		u = func(tc) & mask;
-		if (u < last)
-			now += mask - last + u + 1;
-		else
-			now += u - last;
-		last = u;
-	} while (now < end);
-	if (func == get_tsc)
-		sched_unpin();
-	return (1);
-}
-#endif
-
 /*
  * Wait "n" microseconds.
  * Relies on timer 1 counting down from (i8254_freq / hz)
@@ -302,7 +264,7 @@
  * Note: timer had better have been programmed before this is first used!
  */
 void
-DELAY(int n)
+i8254_delay(int n)
 {
 	int delta, prev_tick, tick, ticks_left;
 #ifdef DELAYDEBUG
@@ -318,9 +280,6 @@
 	}
 	if (state == 1)
 		printf("DELAY(%d)...", n);
-#else
-	if (delay_tc(n))
-		return;
 #endif
 	/*
 	 * Read the counter first, so that the rest of the setup overhead is
@@ -500,7 +459,6 @@
 i8254_init(void)
 {
 
-	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
 #ifdef PC98
 	if (pc98_machine_type & M_8M)
 		i8254_freq = 1996800L; /* 1.9968 MHz */
@@ -518,8 +476,27 @@
 void
 cpu_initclocks(void)
 {
+#ifdef EARLY_AP_STARTUP
+	struct thread *td;
+	int i;
 
+	td = curthread;
 	cpu_initclocks_bsp();
+	CPU_FOREACH(i) {
+		if (i == 0)
+			continue;
+		thread_lock(td);
+		sched_bind(td, i);
+		thread_unlock(td);
+		cpu_initclocks_ap();
+	}
+	thread_lock(td);
+	if (sched_is_bound(td))
+		sched_unbind(td);
+	thread_unlock(td);
+#else
+	cpu_initclocks_bsp();
+#endif
 }
 
 static int
@@ -699,7 +676,7 @@
 attimer_attach(device_t dev)
 {
 	struct attimer_softc *sc;
-	u_long s;
+	rman_res_t s;
 	int i;
 
 	attimer_sc = sc = device_get_softc(dev);
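
The relocated lock initialization aside, i8254_delay() still waits by counting timer ticks rather than microseconds.  A small userland sketch of the tick arithmetic; the 1193182 Hz figure is the customary i8254 input clock and should be treated as an assumption here:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t i8254_freq = 1193182;	/* Hz, assumed nominal clock */
	int n = 500000;				/* i8254_delay(500000): 0.5 s */

	/* Ticks the counter must advance to cover n microseconds. */
	uint64_t ticks_left = (uint64_t)n * i8254_freq / 1000000;

	printf("delay of %d us needs %llu i8254 ticks\n",
	    n, (unsigned long long)ticks_left);
	return (0);
}

The kernel routine additionally folds in i8254_max_count wraparound, which this sketch leaves out.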

Modified: trunk/sys/x86/isa/elcr.c
===================================================================
--- trunk/sys/x86/isa/elcr.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/elcr.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/elcr.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/elcr.c 261520 2014-02-05 18:13:27Z jhb $");
 
 /*
  * The ELCR is a register that controls the trigger mode and polarity of

Modified: trunk/sys/x86/isa/icu.h
===================================================================
--- trunk/sys/x86/isa/icu.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/icu.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)icu.h	5.6 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/isa/icu.h 233031 2012-03-16 12:13:44Z nyan $
+ * $FreeBSD: stable/11/sys/x86/isa/icu.h 339928 2018-10-30 19:10:41Z jhb $
  */
 
 /*
@@ -88,7 +88,6 @@
 #endif
 
 #define	IRQ_MASK(irq)		(1 << (irq))
-#define	IMEN_MASK(ai)		(IRQ_MASK((ai)->at_irq))
 
 void	atpic_handle_intr(u_int vector, struct trapframe *frame);
 void	atpic_startup(void);

Modified: trunk/sys/x86/isa/isa.c
===================================================================
--- trunk/sys/x86/isa/isa.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa.c 221526 2011-05-06 13:48:53Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa.c 295832 2016-02-20 01:32:58Z jhibbits $");
 
 /*-
  * Modifications for Intel architecture by Garrett A. Wollman.
@@ -89,13 +89,13 @@
  */
 struct resource *
 isa_alloc_resource(device_t bus, device_t child, int type, int *rid,
-		   u_long start, u_long end, u_long count, u_int flags)
+		   rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	/*
 	 * Consider adding a resource definition.
 	 */
 	int passthrough = (device_get_parent(child) != bus);
-	int isdefault = (start == 0UL && end == ~0UL);
+	int isdefault = RMAN_IS_DEFAULT_RANGE(start, end);
 	struct isa_device* idev = DEVTOISA(child);
 	struct resource_list *rl = &idev->id_resources;
 	struct resource_list_entry *rle;
@@ -242,3 +242,8 @@
  * On this platform, isa can also attach to the legacy bus.
  */
 DRIVER_MODULE(isa, legacy, isa_driver, isa_devclass, 0, 0);
+
+/*
+ * Attach the ISA bus to the xenpv bus in order to get syscons.
+ */
+DRIVER_MODULE(isa, xenpv, isa_driver, isa_devclass, 0, 0);
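
The isdefault test above now goes through RMAN_IS_DEFAULT_RANGE() because the range arguments are rman_res_t instead of u_long.  A rough standalone sketch of the check, assuming the usual FreeBSD 11 definitions of RM_MAX_END and the macro:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t rman_res_t;			/* assumption: 64-bit ranges */
#define	RM_MAX_END			(~(rman_res_t)0)
#define	RMAN_IS_DEFAULT_RANGE(s, e)	((s) == 0 && (e) == RM_MAX_END)

int
main(void)
{

	printf("wildcard range: %d\n",
	    RMAN_IS_DEFAULT_RANGE((rman_res_t)0, RM_MAX_END));
	printf("explicit range: %d\n",
	    RMAN_IS_DEFAULT_RANGE((rman_res_t)0x3f8, (rman_res_t)0x3ff));
	return (0);
}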

Modified: trunk/sys/x86/isa/isa_dma.c
===================================================================
--- trunk/sys/x86/isa/isa_dma.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa_dma.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa_dma.c 233675 2012-03-29 18:58:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa_dma.c 332304 2018-04-08 20:52:09Z emaste $");
 
 /*
  * code to manage AT bus
@@ -62,7 +62,7 @@
 #include <isa/isavar.h>
 #include <isa/isa_dmareg.h>
 
-#define	ISARAM_END	RAM_END
+#define	ISARAM_END	0x1000000
 
 static int isa_dmarangecheck(caddr_t va, u_int length, int chan);
 
@@ -145,8 +145,7 @@
  * in open() or during its initialization.
  */
 int
-isa_dma_acquire(chan)
-	int chan;
+isa_dma_acquire(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)
@@ -171,8 +170,7 @@
  * during close() or during its shutdown.
  */
 void
-isa_dma_release(chan)
-	int chan;
+isa_dma_release(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)
@@ -206,8 +204,7 @@
  * external dma control by a board.
  */
 void
-isa_dmacascade(chan)
-	int chan;
+isa_dmacascade(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)

Modified: trunk/sys/x86/isa/nmi.c
===================================================================
--- trunk/sys/x86/isa/nmi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/nmi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/nmi.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/nmi.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include "opt_mca.h"
 

Modified: trunk/sys/x86/isa/orm.c
===================================================================
--- trunk/sys/x86/isa/orm.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/orm.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/orm.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/orm.c 299392 2016-05-10 22:28:06Z bz $");
 
 /*
  * Driver to take care of holes in ISA I/O memory occupied
@@ -59,7 +59,7 @@
 	{ 0,		NULL },
 };
 
-#define MAX_ROMS	16
+#define MAX_ROMS	32
 
 struct orm_softc {
 	int		rnum;
@@ -92,6 +92,9 @@
 	struct orm_softc	*sc;
 	u_int8_t		buf[3];
 
+	if (resource_disabled("orm", 0))
+		return;
+
 	child = BUS_ADD_CHILD(parent, ISA_ORDER_SENSITIVE, "orm", -1);
 	device_set_driver(child, driver);
 	isa_set_logicalid(child, ORM_ID);
@@ -98,7 +101,7 @@
 	isa_set_vendorid(child, ORM_ID);
 	sc = device_get_softc(child);
 	sc->rnum = 0;
-	while (chunk < IOMEM_END) {
+	while (sc->rnum < MAX_ROMS && chunk < IOMEM_END) {
 		bus_set_resource(child, SYS_RES_MEMORY, sc->rnum, chunk,
 		    IOMEM_STEP);
 		rid = sc->rnum;

Modified: trunk/sys/x86/pci/pci_bus.c
===================================================================
--- trunk/sys/x86/pci/pci_bus.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/pci_bus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/pci_bus.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/pci_bus.c 294883 2016-01-27 02:23:54Z jhibbits $");
 
 #include "opt_cpu.h"
 
@@ -525,7 +525,7 @@
 			device_probe_and_attach(pir);
 	}
 #endif
-	device_add_child(dev, "pci", bus);
+	device_add_child(dev, "pci", -1);
 	return bus_generic_attach(dev);
 }
 
@@ -576,12 +576,11 @@
 SYSCTL_DECL(_hw_pci);
 
 static unsigned long host_mem_start = 0x80000000;
-TUNABLE_ULONG("hw.pci.host_mem_start", &host_mem_start);
 SYSCTL_ULONG(_hw_pci, OID_AUTO, host_mem_start, CTLFLAG_RDTUN, &host_mem_start,
     0, "Limit the host bridge memory to being above this address.");
 
-u_long
-hostb_alloc_start(int type, u_long start, u_long end, u_long count)
+rman_res_t
+hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count)
 {
 
 	if (start + count - 1 != end) {
@@ -595,7 +594,7 @@
 
 struct resource *
 legacy_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
@@ -611,7 +610,7 @@
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
 int
 legacy_pcib_adjust_resource(device_t dev, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 
 	if (type == PCI_RES_BUS)

Modified: trunk/sys/x86/pci/qpi.c
===================================================================
--- trunk/sys/x86/pci/qpi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/qpi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,14 +27,14 @@
  */
 
 /*
- * This driver provides a psuedo-bus to enumerate the PCI buses
- * present on a sytem using a QPI chipset.  It creates a qpi0 bus that
- * is a child of nexus0 and then creates two Host-PCI bridges as a
+ * This driver provides a pseudo-bus to enumerate the PCI buses
+ * present on a system using a QPI chipset.  It creates a qpi0 bus that
+ * is a child of nexus0 and then creates Host-PCI bridges as a
  * child of that.
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/qpi.c 283927 2015-06-02 19:20:39Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/qpi.c 323609 2017-09-15 09:03:01Z kib $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -64,17 +64,23 @@
 static void
 qpi_identify(driver_t *driver, device_t parent)
 {
+	int do_qpi;
 
-        /* Check CPUID to ensure this is an i7 CPU of some sort. */
-        if (!(cpu_vendor_id == CPU_VENDOR_INTEL &&
-	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
-	    (CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2c)))
-                return;
+	/* Check CPUID to ensure this is an i7 CPU of some sort. */
+	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
+	    CPUID_TO_FAMILY(cpu_id) != 0x6)
+		return;
 
-        /* PCI config register access is required. */
-        if (pci_cfgregopen() == 0)
-                return;
+	/* Only discover buses with configuration devices if allowed by user */
+	do_qpi = 0;
+	TUNABLE_INT_FETCH("hw.attach_intel_csr_pci", &do_qpi);
+	if (!do_qpi)
+		return;
 
+	/* PCI config register access is required. */
+	if (pci_cfgregopen() == 0)
+		return;
+
 	/* Add a qpi bus device. */
 	if (BUS_ADD_CHILD(parent, 20, "qpi", -1) == NULL)
 		panic("Failed to add qpi bus");
@@ -98,6 +104,7 @@
 	struct qpi_device *qdev;
 	device_t child;
 	uint32_t devid;
+	int s;
 
 	/*
 	 * If a PCI bus already exists for this bus number, then
@@ -107,18 +114,23 @@
 		return (EEXIST);
 
 	/*
-	 * Attempt to read the device id for device 0, function 0 on
-	 * the bus.  A value of 0xffffffff means that the bus is not
-	 * present.
+	 * Attempt to read the device id for every slot, function 0 on
+	 * the bus.  If all read values are 0xffffffff this means that
+	 * the bus is not present.
 	 */
-	devid = pci_cfgregread(bus, 0, 0, PCIR_DEVVENDOR, 4);
+	for (s = 0; s <= PCI_SLOTMAX; s++) {
+		devid = pci_cfgregread(bus, s, 0, PCIR_DEVVENDOR, 4);
+		if (devid != 0xffffffff)
+			break;
+	}
 	if (devid == 0xffffffff)
 		return (ENOENT);
 
 	if ((devid & 0xffff) != 0x8086) {
-		device_printf(dev,
-		    "Device at pci%d.0.0 has non-Intel vendor 0x%x\n", bus,
-		    devid & 0xffff);
+		if (bootverbose)
+			device_printf(dev,
+			    "Device at pci%d.%d.0 has non-Intel vendor 0x%x\n",
+			    bus, s, devid & 0xffff);
 		return (ENXIO);
 	}
 
@@ -138,12 +150,12 @@
 	int bus;
 
 	/*
-	 * Each processor socket has a dedicated PCI bus counting down from
-	 * 255.  We keep probing buses until one fails.
+	 * Each processor socket has a dedicated PCI bus, sometimes
+	 * not enumerated by ACPI.  Probe all unattached buses from 0
+	 * to 255.
 	 */
-	for (bus = 255;; bus--)
-		if (qpi_probe_pcib(dev, bus) != 0)
-			break;
+	for (bus = PCI_BUSMAX; bus >= 0; bus--)
+		qpi_probe_pcib(dev, bus);
 
 	return (bus_generic_attach(dev));
 }
@@ -219,8 +231,8 @@
 qpi_pcib_attach(device_t dev)
 {
 
-	device_add_child(dev, "pci", pcib_get_bus(dev));      
-        return (bus_generic_attach(dev));
+	device_add_child(dev, "pci", -1);
+	return (bus_generic_attach(dev));
 }
 
 static int
@@ -242,7 +254,7 @@
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
 static struct resource *
 qpi_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 
 	if (type == PCI_RES_BUS)
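
qpi_probe_pcib() now walks every slot on the candidate bus looking for a readable vendor/device word instead of trusting slot 0 alone.  A minimal userland sketch of that scan, with a stubbed reader standing in for pci_cfgregread(); the stub and its contents are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define	PCI_SLOTMAX	31
#define	NO_DEVICE	0xffffffffu

/* Stand-in for pci_cfgregread(bus, slot, 0, PCIR_DEVVENDOR, 4). */
static uint32_t
cfgread_devvendor(int bus, int slot)
{

	/* Pretend only bus 254, slot 8 answers, with an Intel vendor id. */
	if (bus == 254 && slot == 8)
		return (0x00008086);
	return (NO_DEVICE);
}

static int
bus_present(int bus)
{
	uint32_t devid = NO_DEVICE;
	int s;

	for (s = 0; s <= PCI_SLOTMAX; s++) {
		devid = cfgread_devvendor(bus, s);
		if (devid != NO_DEVICE)
			break;
	}
	return (devid != NO_DEVICE && (devid & 0xffff) == 0x8086);
}

int
main(void)
{

	printf("bus 254: %s\n", bus_present(254) ? "present" : "absent");
	printf("bus 253: %s\n", bus_present(253) ? "present" : "absent");
	return (0);
}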

Added: trunk/sys/x86/x86/autoconf.c
===================================================================
--- trunk/sys/x86/x86/autoconf.c	                        (rev 0)
+++ trunk/sys/x86/x86/autoconf.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,162 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)autoconf.c	7.1 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/autoconf.c 332304 2018-04-08 20:52:09Z emaste $");
+
+/*
+ * Setup the system to run on the current machine.
+ *
+ * Configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring.  Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+#include "opt_bootp.h"
+#include "opt_isa.h"
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/reboot.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cons.h>
+
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+
+#ifdef PC98
+#include <machine/bootinfo.h>
+#endif
+#include <machine/md_var.h>
+
+#ifdef DEV_ISA
+#include <isa/isavar.h>
+
+device_t isa_bus_device = 0;
+#endif
+
+static void	configure_first(void *);
+static void	configure(void *);
+static void	configure_final(void *);
+
+SYSINIT(configure1, SI_SUB_CONFIGURE, SI_ORDER_FIRST, configure_first, NULL);
+/* SI_ORDER_SECOND is hookable */
+SYSINIT(configure2, SI_SUB_CONFIGURE, SI_ORDER_THIRD, configure, NULL);
+/* SI_ORDER_MIDDLE is hookable */
+SYSINIT(configure3, SI_SUB_CONFIGURE, SI_ORDER_ANY, configure_final, NULL);
+
+/*
+ * Determine i/o configuration for a machine.
+ */
+static void
+configure_first(void *dummy)
+{
+
+	/* nexus0 is the top of the x86 device tree */
+	device_add_child(root_bus, "nexus", 0);
+}
+
+static void
+configure(void *dummy)
+{
+
+	/* initialize new bus architecture */
+	root_bus_configure();
+
+#ifdef DEV_ISA
+	/*
+	 * Explicitly probe and attach ISA last.  The isa bus saves
+	 * it's device node at attach time for us here.
+	 * its device node at attach time for us here.
+	 */
+	if (isa_bus_device)
+		isa_probe_children(isa_bus_device);
+#endif
+}
+
+static void
+configure_final(void *dummy)
+{
+
+	cninit_finish(); 
+
+	if (bootverbose) {
+#ifdef PC98
+		int i;
+
+		/*
+		 * Print out the BIOS's idea of the disk geometries.
+		 */
+		printf("BIOS Geometries:\n");
+		for (i = 0; i < N_BIOS_GEOM; i++) {
+			unsigned long bios_geom;
+			int max_cylinder, max_head, max_sector;
+
+			bios_geom = bootinfo.bi_bios_geom[i];
+
+			/*
+			 * XXX the bootstrap punts a 1200K floppy geometry
+			 * when the get-disk-geometry interrupt fails.  Skip
+			 * drives that have this geometry.
+			 */
+			if (bios_geom == 0x4f020f)
+				continue;
+
+			printf(" %x:%08lx ", i, bios_geom);
+			max_cylinder = bios_geom >> 16;
+			max_head = (bios_geom >> 8) & 0xff;
+			max_sector = bios_geom & 0xff;
+			printf(
+		"0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n",
+			       max_cylinder, max_cylinder + 1,
+			       max_head, max_head + 1,
+			       max_sector, max_sector);
+		}
+		printf(" %d accounted for\n", bootinfo.bi_n_bios_used);
+#endif
+
+		printf("Device configuration finished.\n");
+	}
+	cold = 0;
+}


Property changes on: trunk/sys/x86/x86/autoconf.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
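
The three configure stages in autoconf.c are sequenced purely by SYSINIT ordering within SI_SUB_CONFIGURE, with SI_ORDER_SECOND and SI_ORDER_MIDDLE deliberately left free for hooks.  A hedged kernel-side sketch of hooking in between configure_first() and configure(); the function and its name are hypothetical and the fragment is not compilable on its own:

static void
my_early_hook(void *dummy)
{

	/* Runs after nexus0 is added but before root_bus_configure(). */
	printf("early configure hook\n");
}
SYSINIT(myearlyhook, SI_SUB_CONFIGURE, SI_ORDER_SECOND, my_early_hook, NULL);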
Modified: trunk/sys/x86/x86/bus_machdep.c
===================================================================
--- trunk/sys/x86/x86/bus_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/bus_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/bus_machdep.c 287126 2015-08-25 14:39:40Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/bus_machdep.c 286667 2015-08-12 15:26:32Z marcel $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/x86/x86/busdma_bounce.c
===================================================================
--- trunk/sys/x86/x86/busdma_bounce.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_bounce.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_bounce.c 318977 2017-05-27 08:17:59Z hselasky $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_bounce.c 343361 2019-01-23 20:49:14Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -80,7 +80,8 @@
 	vm_offset_t	vaddr;		/* kva of bounce buffer */
 	bus_addr_t	busaddr;	/* Physical address */
 	vm_offset_t	datavaddr;	/* kva of client data */
-	bus_addr_t	dataaddr;	/* client physical address */
+	vm_offset_t	dataoffs;	/* page offset of client data */
+	vm_page_t	datapage[2];	/* physical page(s) of client data */
 	bus_size_t	datacount;	/* client data count */
 	STAILQ_ENTRY(bounce_page) links;
 };
@@ -135,10 +136,9 @@
 static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				int commit);
 static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
-				  vm_offset_t vaddr, bus_addr_t addr,
-				  bus_size_t size);
+				  vm_offset_t vaddr, bus_addr_t addr1,
+				  bus_addr_t addr2, bus_size_t size);
 static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
-int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
 static void _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				    pmap_t pmap, void *buf, bus_size_t buflen,
 				    int flags);
@@ -148,11 +148,6 @@
 static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				     int flags);
 
-#ifdef XEN
-#undef pmap_kextract
-#define pmap_kextract pmap_kextract_ma
-#endif
-
 /*
  * Allocate a device specific dma_tag.
  */
@@ -494,7 +489,8 @@
 		while (buflen != 0) {
 			sgsize = MIN(buflen, dmat->common.maxsegsz);
 			if (bus_dma_run_filter(&dmat->common, curaddr)) {
-				sgsize = MIN(sgsize, PAGE_SIZE);
+				sgsize = MIN(sgsize,
+				    PAGE_SIZE - (curaddr & PAGE_MASK));
 				map->pagesneeded++;
 			}
 			curaddr += sgsize;
@@ -544,6 +540,51 @@
 	}
 }
 
+static void
+_bus_dmamap_count_ma(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma,
+    int ma_offs, bus_size_t buflen, int flags)
+{
+	bus_size_t sg_len, max_sgsize;
+	int page_index;
+	vm_paddr_t paddr;
+
+	if ((map != &nobounce_dmamap && map->pagesneeded == 0)) {
+		CTR4(KTR_BUSDMA, "lowaddr= %d Maxmem= %d, boundary= %d, "
+		    "alignment= %d", dmat->common.lowaddr,
+		    ptoa((vm_paddr_t)Maxmem),
+		    dmat->common.boundary, dmat->common.alignment);
+		CTR3(KTR_BUSDMA, "map= %p, nobouncemap= %p, pagesneeded= %d",
+		    map, &nobounce_dmamap, map->pagesneeded);
+
+		/*
+		 * Count the number of bounce pages
+		 * needed in order to complete this transfer
+		 */
+		page_index = 0;
+		while (buflen > 0) {
+			paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+			sg_len = PAGE_SIZE - ma_offs;
+			max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+			sg_len = MIN(sg_len, max_sgsize);
+			if (bus_dma_run_filter(&dmat->common, paddr) != 0) {
+				sg_len = roundup2(sg_len,
+				    dmat->common.alignment);
+				sg_len = MIN(sg_len, max_sgsize);
+				KASSERT((sg_len & (dmat->common.alignment - 1))
+				    == 0, ("Segment size is not aligned"));
+				map->pagesneeded++;
+			}
+			if (((ma_offs + sg_len) & ~PAGE_MASK) != 0)
+				page_index++;
+			ma_offs = (ma_offs + sg_len) & PAGE_MASK;
+			KASSERT(buflen >= sg_len,
+			    ("Segment length overruns original buffer"));
+			buflen -= sg_len;
+		}
+		CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded);
+	}
+}
+
 static int
 _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags)
 {
@@ -648,8 +689,8 @@
 		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
 		    map->pagesneeded != 0 &&
 		    bus_dma_run_filter(&dmat->common, curaddr)) {
-			sgsize = MIN(sgsize, PAGE_SIZE);
-			curaddr = add_bounce_page(dmat, map, 0, curaddr,
+			sgsize = MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK));
+			curaddr = add_bounce_page(dmat, map, 0, curaddr, 0,
 			    sgsize);
 		}
 		sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs,
@@ -677,7 +718,7 @@
 {
 	bus_size_t sgsize, max_sgsize;
 	bus_addr_t curaddr;
-	vm_offset_t vaddr;
+	vm_offset_t kvaddr, vaddr;
 	int error;
 
 	if (map == NULL)
@@ -700,22 +741,25 @@
 		/*
 		 * Get the physical address for this segment.
 		 */
-		if (pmap == kernel_pmap)
+		if (pmap == kernel_pmap) {
 			curaddr = pmap_kextract(vaddr);
-		else
+			kvaddr = vaddr;
+		} else {
 			curaddr = pmap_extract(pmap, vaddr);
+			kvaddr = 0;
+		}
 
 		/*
 		 * Compute the segment size, and adjust counts.
 		 */
 		max_sgsize = MIN(buflen, dmat->common.maxsegsz);
-		sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK);
+		sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
 		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
 		    map->pagesneeded != 0 &&
 		    bus_dma_run_filter(&dmat->common, curaddr)) {
 			sgsize = roundup2(sgsize, dmat->common.alignment);
 			sgsize = MIN(sgsize, max_sgsize);
-			curaddr = add_bounce_page(dmat, map, vaddr, curaddr,
+			curaddr = add_bounce_page(dmat, map, kvaddr, curaddr, 0,
 			    sgsize);
 		} else {
 			sgsize = MIN(sgsize, max_sgsize);
@@ -734,6 +778,88 @@
 	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
 }
 
+static int
+bounce_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct vm_page **ma, bus_size_t buflen, int ma_offs, int flags,
+    bus_dma_segment_t *segs, int *segp)
+{
+	vm_paddr_t paddr, next_paddr;
+	int error, page_index;
+	bus_size_t sgsize, max_sgsize;
+
+	if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
+		/*
+		 * If we have to keep the offset of each page this function
+		 * is not suitable, switch back to bus_dmamap_load_ma_triv
+		 * which is going to do the right thing in this case.
+		 */
+		error = bus_dmamap_load_ma_triv(dmat, map, ma, buflen, ma_offs,
+		    flags, segs, segp);
+		return (error);
+	}
+
+	if (map == NULL)
+		map = &nobounce_dmamap;
+
+	if (segs == NULL)
+		segs = dmat->segments;
+
+	if ((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) {
+		_bus_dmamap_count_ma(dmat, map, ma, ma_offs, buflen, flags);
+		if (map->pagesneeded != 0) {
+			error = _bus_dmamap_reserve_pages(dmat, map, flags);
+			if (error)
+				return (error);
+		}
+	}
+
+	page_index = 0;
+	while (buflen > 0) {
+		/*
+		 * Compute the segment size, and adjust counts.
+		 */
+		paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+		max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+		sgsize = PAGE_SIZE - ma_offs;
+		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+		    map->pagesneeded != 0 &&
+		    bus_dma_run_filter(&dmat->common, paddr)) {
+			sgsize = roundup2(sgsize, dmat->common.alignment);
+			sgsize = MIN(sgsize, max_sgsize);
+			KASSERT((sgsize & (dmat->common.alignment - 1)) == 0,
+			    ("Segment size is not aligned"));
+			/*
+			 * Check if two pages of the user provided buffer
+			 * are used.
+			 */
+			if ((ma_offs + sgsize) > PAGE_SIZE)
+				next_paddr =
+				    VM_PAGE_TO_PHYS(ma[page_index + 1]);
+			else
+				next_paddr = 0;
+			paddr = add_bounce_page(dmat, map, 0, paddr,
+			    next_paddr, sgsize);
+		} else {
+			sgsize = MIN(sgsize, max_sgsize);
+		}
+		sgsize = _bus_dmamap_addseg(dmat, map, paddr, sgsize, segs,
+		    segp);
+		if (sgsize == 0)
+			break;
+		KASSERT(buflen >= sgsize,
+		    ("Segment length overruns original buffer"));
+		buflen -= sgsize;
+		if (((ma_offs + sgsize) & ~PAGE_MASK) != 0)
+			page_index++;
+		ma_offs = (ma_offs + sgsize) & PAGE_MASK;
+	}
+
+	/*
+	 * Did we fit?
+	 */
+	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
+}
+
 static void
 bounce_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
@@ -779,6 +905,8 @@
     bus_dmasync_op_t op)
 {
 	struct bounce_page *bpage;
+	vm_offset_t datavaddr, tempvaddr;
+	bus_size_t datacount1, datacount2;
 
 	if (map == NULL || (bpage = STAILQ_FIRST(&map->bpages)) == NULL)
 		return;
@@ -792,13 +920,40 @@
 
 	if ((op & BUS_DMASYNC_PREWRITE) != 0) {
 		while (bpage != NULL) {
-			if (bpage->datavaddr != 0) {
-				bcopy((void *)bpage->datavaddr,
-				    (void *)bpage->vaddr, bpage->datacount);
-			} else {
-				physcopyout(bpage->dataaddr,
-				    (void *)bpage->vaddr, bpage->datacount);
+			tempvaddr = 0;
+			datavaddr = bpage->datavaddr;
+			datacount1 = bpage->datacount;
+			if (datavaddr == 0) {
+				tempvaddr =
+				    pmap_quick_enter_page(bpage->datapage[0]);
+				datavaddr = tempvaddr | bpage->dataoffs;
+				datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+				    datacount1);
 			}
+
+			bcopy((void *)datavaddr,
+			    (void *)bpage->vaddr, datacount1);
+
+			if (tempvaddr != 0)
+				pmap_quick_remove_page(tempvaddr);
+
+			if (bpage->datapage[1] == 0) {
+				KASSERT(datacount1 == bpage->datacount,
+		("Mismatch between data size and provided memory space"));
+				goto next_w;
+			}
+
+			/*
+			 * We are dealing with an unmapped buffer that spans
+			 * two pages.
+			 */
+			datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+			datacount2 = bpage->datacount - datacount1;
+			bcopy((void *)datavaddr,
+			    (void *)(bpage->vaddr + datacount1), datacount2);
+			pmap_quick_remove_page(datavaddr);
+
+next_w:
 			bpage = STAILQ_NEXT(bpage, links);
 		}
 		dmat->bounce_zone->total_bounced++;
@@ -806,14 +961,40 @@
 
 	if ((op & BUS_DMASYNC_POSTREAD) != 0) {
 		while (bpage != NULL) {
-			if (bpage->datavaddr != 0) {
-				bcopy((void *)bpage->vaddr,
-				    (void *)bpage->datavaddr,
-				    bpage->datacount);
-			} else {
-				physcopyin((void *)bpage->vaddr,
-				    bpage->dataaddr, bpage->datacount);
+			tempvaddr = 0;
+			datavaddr = bpage->datavaddr;
+			datacount1 = bpage->datacount;
+			if (datavaddr == 0) {
+				tempvaddr =
+				    pmap_quick_enter_page(bpage->datapage[0]);
+				datavaddr = tempvaddr | bpage->dataoffs;
+				datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+				    datacount1);
 			}
+
+			bcopy((void *)bpage->vaddr, (void *)datavaddr,
+			    datacount1);
+
+			if (tempvaddr != 0)
+				pmap_quick_remove_page(tempvaddr);
+
+			if (bpage->datapage[1] == 0) {
+				KASSERT(datacount1 == bpage->datacount,
+		("Mismatch between data size and provided memory space"));
+				goto next_r;
+			}
+
+			/*
+			 * We are dealing with an unmapped buffer that spans
+			 * two pages.
+			 */
+			datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+			datacount2 = bpage->datacount - datacount1;
+			bcopy((void *)(bpage->vaddr + datacount1),
+			    (void *)datavaddr, datacount2);
+			pmap_quick_remove_page(datavaddr);
+
+next_r:
 			bpage = STAILQ_NEXT(bpage, links);
 		}
 		dmat->bounce_zone->total_bounced++;
@@ -979,7 +1160,7 @@
 
 static bus_addr_t
 add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
-		bus_addr_t addr, bus_size_t size)
+		bus_addr_t addr1, bus_addr_t addr2, bus_size_t size)
 {
 	struct bounce_zone *bz;
 	struct bounce_page *bpage;
@@ -1009,11 +1190,16 @@
 
 	if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
 		/* Page offset needs to be preserved. */
-		bpage->vaddr |= addr & PAGE_MASK;
-		bpage->busaddr |= addr & PAGE_MASK;
+		bpage->vaddr |= addr1 & PAGE_MASK;
+		bpage->busaddr |= addr1 & PAGE_MASK;
+		KASSERT(addr2 == 0,
+	("Trying to bounce multiple pages with BUS_DMA_KEEP_PG_OFFSET"));
 	}
 	bpage->datavaddr = vaddr;
-	bpage->dataaddr = addr;
+	bpage->datapage[0] = PHYS_TO_VM_PAGE(addr1);
+	KASSERT((addr2 & PAGE_MASK) == 0, ("Second page is not aligned"));
+	bpage->datapage[1] = PHYS_TO_VM_PAGE(addr2);
+	bpage->dataoffs = addr1 & PAGE_MASK;
 	bpage->datacount = size;
 	STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
 	return (bpage->busaddr);
@@ -1085,7 +1271,7 @@
 	.mem_free = bounce_bus_dmamem_free,
 	.load_phys = bounce_bus_dmamap_load_phys,
 	.load_buffer = bounce_bus_dmamap_load_buffer,
-	.load_ma = bus_dmamap_load_ma_triv,
+	.load_ma = bounce_bus_dmamap_load_ma,
 	.map_waitok = bounce_bus_dmamap_waitok,
 	.map_complete = bounce_bus_dmamap_complete,
 	.map_unload = bounce_bus_dmamap_unload,
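
Several hunks in this file replace MIN(sgsize, PAGE_SIZE) with MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK)) so that a bounced segment never crosses a page boundary when the client address is not page aligned.  A quick userland check of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>
#include <stdint.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(PAGE_SIZE - 1)

static uint64_t
clamp_to_page(uint64_t curaddr, uint64_t sgsize)
{
	/* Bytes remaining in the page that contains curaddr. */
	uint64_t left = PAGE_SIZE - (curaddr & PAGE_MASK);

	return (sgsize < left ? sgsize : left);
}

int
main(void)
{

	/* Unaligned start: only 0x100 bytes are left in this page. */
	printf("0x%llx\n",
	    (unsigned long long)clamp_to_page(0x12345f00, 0x2000));
	/* Page-aligned start: a whole page may be used. */
	printf("0x%llx\n",
	    (unsigned long long)clamp_to_page(0x12346000, 0x2000));
	return (0);
}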

Modified: trunk/sys/x86/x86/busdma_machdep.c
===================================================================
--- trunk/sys/x86/x86/busdma_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_machdep.c 259511 2013-12-17 13:39:50Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_machdep.c 257230 2013-10-27 22:05:10Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Added: trunk/sys/x86/x86/cpu_machdep.c
===================================================================
--- trunk/sys/x86/x86/cpu_machdep.c	                        (rev 0)
+++ trunk/sys/x86/x86/cpu_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1359 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/cpu_machdep.c 355701 2019-12-13 06:54:41Z scottl $");
+
+#include "opt_atpic.h"
+#include "opt_compat.h"
+#include "opt_cpu.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_isa.h"
+#include "opt_kdb.h"
+#include "opt_kstack_pages.h"
+#include "opt_maxmem.h"
+#include "opt_mp_watchdog.h"
+#include "opt_perfmon.h"
+#include "opt_platform.h"
+#ifdef __i386__
+#include "opt_apic.h"
+#include "opt_xbox.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+#include <machine/mp_watchdog.h>
+#ifdef PERFMON
+#include <machine/perfmon.h>
+#endif
+#include <machine/tss.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+#ifdef CPU_ELAN
+#include <machine/elan_mmcr.h>
+#endif
+#include <x86/acpica_machdep.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#ifndef PC98
+#include <isa/isareg.h>
+#endif
+
+#define	STATE_RUNNING	0x0
+#define	STATE_MWAIT	0x1
+#define	STATE_SLEEPING	0x2
+
+#ifdef SMP
+static u_int	cpu_reset_proxyid;
+static volatile u_int	cpu_reset_proxy_active;
+#endif
+
+struct msr_op_arg {
+	u_int msr;
+	int op;
+	uint64_t arg1;
+};
+
+static void
+x86_msr_op_one(void *argp)
+{
+	struct msr_op_arg *a;
+	uint64_t v;
+
+	a = argp;
+	switch (a->op) {
+	case MSR_OP_ANDNOT:
+		v = rdmsr(a->msr);
+		v &= ~a->arg1;
+		wrmsr(a->msr, v);
+		break;
+	case MSR_OP_OR:
+		v = rdmsr(a->msr);
+		v |= a->arg1;
+		wrmsr(a->msr, v);
+		break;
+	case MSR_OP_WRITE:
+		wrmsr(a->msr, a->arg1);
+		break;
+	}
+}
+
+#define	MSR_OP_EXMODE_MASK	0xf0000000
+#define	MSR_OP_OP_MASK		0x000000ff
+
+void
+x86_msr_op(u_int msr, u_int op, uint64_t arg1)
+{
+	struct thread *td;
+	struct msr_op_arg a;
+	u_int exmode;
+	int bound_cpu, i, is_bound;
+
+	a.op = op & MSR_OP_OP_MASK;
+	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
+	    a.op == MSR_OP_WRITE);
+	exmode = op & MSR_OP_EXMODE_MASK;
+	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
+	    exmode == MSR_OP_RENDEZVOUS);
+	a.msr = msr;
+	a.arg1 = arg1;
+	switch (exmode) {
+	case MSR_OP_LOCAL:
+		x86_msr_op_one(&a);
+		break;
+	case MSR_OP_SCHED:
+		td = curthread;
+		thread_lock(td);
+		is_bound = sched_is_bound(td);
+		bound_cpu = td->td_oncpu;
+		CPU_FOREACH(i) {
+			sched_bind(td, i);
+			x86_msr_op_one(&a);
+		}
+		if (is_bound)
+			sched_bind(td, bound_cpu);
+		else
+			sched_unbind(td);
+		thread_unlock(td);
+		break;
+	case MSR_OP_RENDEZVOUS:
+		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
+		break;
+	}
+}
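
x86_msr_op() packs the MSR operation into the low bits and the execution mode into the high bits of a single op word, which is why callers further down (hw_ibrs_recalculate(), hw_ssb_set()) can simply OR the two together.  A small userland sketch of the packing; the two masks come from the definitions above, while the numeric mode and op values are hypothetical placeholders:

#include <stdio.h>
#include <stdint.h>

#define	MSR_OP_EXMODE_MASK	0xf0000000u
#define	MSR_OP_OP_MASK		0x000000ffu

/* Hypothetical encodings, for illustration only. */
#define	OP_OR			0x2u
#define	MODE_RENDEZVOUS		0x30000000u

int
main(void)
{
	uint32_t op = MODE_RENDEZVOUS | OP_OR;	/* what a caller passes */

	printf("op field:  %#x\n", (unsigned)(op & MSR_OP_OP_MASK));
	printf("exec mode: %#x\n", (unsigned)(op & MSR_OP_EXMODE_MASK));
	return (0);
}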
+
+/*
+ * Machine dependent boot() routine
+ *
+ * I haven't seen anything to put here yet
+ * Possibly some stuff might be grafted back here from boot()
+ */
+void
+cpu_boot(int howto)
+{
+}
+
+/*
+ * Flush the D-cache for non-DMA I/O so that the I-cache can
+ * be made coherent later.
+ */
+void
+cpu_flush_dcache(void *ptr, size_t len)
+{
+	/* Not applicable */
+}
+
+void
+acpi_cpu_c1(void)
+{
+
+	__asm __volatile("sti; hlt");
+}
+
+/*
+ * Use mwait to pause execution while waiting for an interrupt or
+ * another thread to signal that there is more work.
+ *
+ * NOTE: Interrupts will cause a wakeup; however, this function does
+ * not enable interrupt handling. The caller is responsible to enable
+ * interrupts.
+ */
+void
+acpi_cpu_idle_mwait(uint32_t mwait_hint)
+{
+	int *state;
+	uint64_t v;
+
+	/*
+	 * A comment in a Linux patch claims that 'CPUs run faster with
+	 * speculation protection disabled. All CPU threads in a core
+	 * must disable speculation protection for it to be
+	 * disabled. Disable it while we are idle so the other
+	 * hyperthread can run fast.'
+	 *
+	 * XXXKIB.  Software coordination mode should be supported,
+	 * but all Intel CPUs provide hardware coordination.
+	 */
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
+	    ("cpu_mwait_cx: wrong monitorbuf state"));
+	atomic_store_int(state, STATE_MWAIT);
+	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
+		v = rdmsr(MSR_IA32_SPEC_CTRL);
+		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
+		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
+	} else {
+		v = 0;
+	}
+	cpu_monitor(state, 0, 0);
+	if (atomic_load_int(state) == STATE_MWAIT)
+		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+
+	/*
+	 * SSB cannot be disabled while we sleep, or rather, if it was
+	 * disabled, the sysctl thread will bind to our cpu to tweak
+	 * MSR.
+	 */
+	if (v != 0)
+		wrmsr(MSR_IA32_SPEC_CTRL, v);
+
+	/*
+	 * We should exit on any event that interrupts mwait, because
+	 * that event might be a wanted interrupt.
+	 */
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+/* Get current clock frequency for the given cpu id. */
+int
+cpu_est_clockrate(int cpu_id, uint64_t *rate)
+{
+	uint64_t tsc1, tsc2;
+	uint64_t acnt, mcnt, perf;
+	register_t reg;
+
+	if (pcpu_find(cpu_id) == NULL || rate == NULL)
+		return (EINVAL);
+#ifdef __i386__
+	if ((cpu_feature & CPUID_TSC) == 0)
+		return (EOPNOTSUPP);
+#endif
+
+	/*
+	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
+	 * DELAY(9) based logic fails.
+	 */
+	if (tsc_is_invariant && !tsc_perf_stat)
+		return (EOPNOTSUPP);
+
+#ifdef SMP
+	if (smp_cpus > 1) {
+		/* Schedule ourselves on the indicated cpu. */
+		thread_lock(curthread);
+		sched_bind(curthread, cpu_id);
+		thread_unlock(curthread);
+	}
+#endif
+
+	/* Calibrate by measuring a short delay. */
+	reg = intr_disable();
+	if (tsc_is_invariant) {
+		wrmsr(MSR_MPERF, 0);
+		wrmsr(MSR_APERF, 0);
+		tsc1 = rdtsc();
+		DELAY(1000);
+		mcnt = rdmsr(MSR_MPERF);
+		acnt = rdmsr(MSR_APERF);
+		tsc2 = rdtsc();
+		intr_restore(reg);
+		perf = 1000 * acnt / mcnt;
+		*rate = (tsc2 - tsc1) * perf;
+	} else {
+		tsc1 = rdtsc();
+		DELAY(1000);
+		tsc2 = rdtsc();
+		intr_restore(reg);
+		*rate = (tsc2 - tsc1) * 1000;
+	}
+
+#ifdef SMP
+	if (smp_cpus > 1) {
+		thread_lock(curthread);
+		sched_unbind(curthread);
+		thread_unlock(curthread);
+	}
+#endif
+
+	return (0);
+}
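
cpu_est_clockrate() estimates the effective core frequency from the TSC delta across a DELAY(1000) window (about 1 ms) and, when the TSC is invariant, scales it by the APERF/MPERF ratio.  A worked userland example of that arithmetic; all counter values are made up:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Hypothetical readings taken around a 1 ms delay. */
	uint64_t tsc1 = 0, tsc2 = 2000000;	/* 2e6 TSC ticks per ms */
	uint64_t acnt = 3, mcnt = 2;		/* APERF/MPERF ratio of 1.5 */

	uint64_t perf = 1000 * acnt / mcnt;
	uint64_t rate = (tsc2 - tsc1) * perf;	/* Hz */

	printf("estimated rate: %llu Hz (~%.1f GHz)\n",
	    (unsigned long long)rate, rate / 1e9);
	return (0);
}

With these numbers the nominal TSC rate is 2 GHz and the APERF/MPERF ratio lifts the estimate to 3 GHz, matching the (tsc2 - tsc1) * perf expression in the function above.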
+
+/*
+ * Shutdown the CPU as much as possible
+ */
+void
+cpu_halt(void)
+{
+	for (;;)
+		halt();
+}
+
+static void
+cpu_reset_real(void)
+{
+	struct region_descriptor null_idt;
+#ifndef PC98
+	int b;
+#endif
+
+	disable_intr();
+#ifdef CPU_ELAN
+	if (elan_mmcr != NULL)
+		elan_mmcr->RESCFG = 1;
+#endif
+#ifdef __i386__
+	if (cpu == CPU_GEODE1100) {
+		/* Attempt Geode's own reset */
+		outl(0xcf8, 0x80009044ul);
+		outl(0xcfc, 0xf);
+	}
+#endif
+#ifdef PC98
+	/*
+	 * Attempt to do a CPU reset via CPU reset port.
+	 */
+	if ((inb(0x35) & 0xa0) != 0xa0) {
+		outb(0x37, 0x0f);		/* SHUT0 = 0. */
+		outb(0x37, 0x0b);		/* SHUT1 = 0. */
+	}
+	outb(0xf0, 0x00);			/* Reset. */
+#else
+#if !defined(BROKEN_KEYBOARD_RESET)
+	/*
+	 * Attempt to do a CPU reset via the keyboard controller,
+	 * do not turn off GateA20, as any machine that fails
+	 * to do the reset here would then end up in no man's land.
+	 */
+	outb(IO_KBD + 4, 0xFE);
+	DELAY(500000);	/* wait 0.5 sec to see if that did it */
+#endif
+
+	/*
+	 * Attempt to force a reset via the Reset Control register at
+	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
+	 * transitions from 0 to 1.  Bit 1 selects the type of reset
+	 * to attempt: 0 selects a "soft" reset, and 1 selects a
+	 * "hard" reset.  We try a "hard" reset.  The first write sets
+	 * bit 1 to select a "hard" reset and clears bit 2.  The
+	 * second write forces a 0 -> 1 transition in bit 2 to trigger
+	 * a reset.
+	 */
+	outb(0xcf9, 0x2);
+	outb(0xcf9, 0x6);
+	DELAY(500000);  /* wait 0.5 sec to see if that did it */
+
+	/*
+	 * Attempt to force a reset via the Fast A20 and Init register
+	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
+	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
+	 * preserve bit 1 while setting bit 0.  We also must clear bit
+	 * 0 before setting it if it isn't already clear.
+	 */
+	b = inb(0x92);
+	if (b != 0xff) {
+		if ((b & 0x1) != 0)
+			outb(0x92, b & 0xfe);
+		outb(0x92, b | 0x1);
+		DELAY(500000);  /* wait 0.5 sec to see if that did it */
+	}
+#endif /* PC98 */
+
+	printf("No known reset method worked, attempting CPU shutdown\n");
+	DELAY(1000000); /* wait 1 sec for printf to complete */
+
+	/* Wipe the IDT. */
+	null_idt.rd_limit = 0;
+	null_idt.rd_base = 0;
+	lidt(&null_idt);
+
+	/* "good night, sweet prince .... <THUNK!>" */
+	breakpoint();
+
+	/* NOTREACHED */
+	while(1);
+}
+
+#ifdef SMP
+static void
+cpu_reset_proxy(void)
+{
+
+	cpu_reset_proxy_active = 1;
+	while (cpu_reset_proxy_active == 1)
+		ia32_pause(); /* Wait for other cpu to see that we've started */
+
+	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
+	DELAY(1000000);
+	cpu_reset_real();
+}
+#endif
+
+void
+cpu_reset(void)
+{
+#ifdef SMP
+	cpuset_t map;
+	u_int cnt;
+
+	if (smp_started) {
+		map = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &map);
+		CPU_NAND(&map, &stopped_cpus);
+		if (!CPU_EMPTY(&map)) {
+			printf("cpu_reset: Stopping other CPUs\n");
+			stop_cpus(map);
+		}
+
+		if (PCPU_GET(cpuid) != 0) {
+			cpu_reset_proxyid = PCPU_GET(cpuid);
+			cpustop_restartfunc = cpu_reset_proxy;
+			cpu_reset_proxy_active = 0;
+			printf("cpu_reset: Restarting BSP\n");
+
+			/* Restart CPU #0. */
+			CPU_SETOF(0, &started_cpus);
+			wmb();
+
+			cnt = 0;
+			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+				ia32_pause();
+				cnt++;	/* Wait for BSP to announce restart */
+			}
+			if (cpu_reset_proxy_active == 0) {
+				printf("cpu_reset: Failed to restart BSP\n");
+			} else {
+				cpu_reset_proxy_active = 2;
+				while (1)
+					ia32_pause();
+				/* NOTREACHED */
+			}
+		}
+
+		DELAY(1000000);
+	}
+#endif
+	cpu_reset_real();
+	/* NOTREACHED */
+}
+
+bool
+cpu_mwait_usable(void)
+{
+
+	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
+}
+
+void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
+static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
+static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
+SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
+    0, "Use MONITOR/MWAIT for short idle");
+
+#ifndef PC98
+static void
+cpu_idle_acpi(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_SLEEPING);
+
+	/* See comments in cpu_idle_hlt(). */
+	disable_intr();
+	if (sched_runnable())
+		enable_intr();
+	else if (cpu_idle_hook)
+		cpu_idle_hook(sbt);
+	else
+		acpi_cpu_c1();
+	atomic_store_int(state, STATE_RUNNING);
+}
+#endif /* !PC98 */
+
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_SLEEPING);
+
+	/*
+	 * Since we may be in a critical section from cpu_idle(), if
+	 * an interrupt fires during that critical section we may have
+	 * a pending preemption.  If the CPU halts, then that thread
+	 * may not execute until a later interrupt awakens the CPU.
+	 * To handle this race, check for a runnable thread after
+	 * disabling interrupts and immediately return if one is
+	 * found.  Also, we must absolutely guarantee that hlt is
+	 * the next instruction after sti.  This ensures that any
+	 * interrupt that fires after the call to disable_intr() will
+	 * immediately awaken the CPU from hlt.  Finally, please note
+	 * that on x86 this works fine because interrupts are enabled only
+	 * after the instruction following sti completes, while IF is set
+	 * to 1 immediately, allowing the hlt instruction to acknowledge the
+	 * interrupt.
+	 */
+	disable_intr();
+	if (sched_runnable())
+		enable_intr();
+	else
+		acpi_cpu_c1();
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_MWAIT);
+
+	/* See comments in cpu_idle_hlt(). */
+	disable_intr();
+	if (sched_runnable()) {
+		atomic_store_int(state, STATE_RUNNING);
+		enable_intr();
+		return;
+	}
+
+	cpu_monitor(state, 0, 0);
+	if (atomic_load_int(state) == STATE_MWAIT)
+		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+	else
+		enable_intr();
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_spin(sbintime_t sbt)
+{
+	int *state;
+	int i;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_RUNNING);
+
+	/*
+	 * The sched_runnable() call is racy, but since we check it in a
+	 * loop, missing it once has little impact, if any (and it is much
+	 * better than not checking at all).
+	 */
+	for (i = 0; i < 1000; i++) {
+		if (sched_runnable())
+			return;
+		cpu_spinwait();
+	}
+}
+
+/*
+ * C1E renders the local APIC timer dead, so we disable it by
+ * reading the Interrupt Pending Message register and clearing
+ * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
+ * 
+ * Reference:
+ *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
+ *   #32559 revision 3.00+
+ */
+#define	MSR_AMDK8_IPM		0xc0010055
+#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
+#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
+#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
+
+void
+cpu_probe_amdc1e(void)
+{
+
+	/*
+	 * Detect the presence of C1E capability mostly on latest
+	 * dual-cores (or future) k8 family.
+	 */
+	if (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    (cpu_id & 0x00000f00) == 0x00000f00 &&
+	    (cpu_id & 0x0fff0000) >=  0x00040000) {
+		cpu_ident_amdc1e = 1;
+	}
+}
+
+#if defined(__i386__) && defined(PC98)
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
+#else
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
+#endif
+
+void
+cpu_idle(int busy)
+{
+	uint64_t msr;
+	sbintime_t sbt = -1;
+
+	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
+	    busy, curcpu);
+#ifdef MP_WATCHDOG
+	ap_watchdog(PCPU_GET(cpuid));
+#endif
+
+	/* If we are busy - try to use fast methods. */
+	if (busy) {
+		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
+			cpu_idle_mwait(busy);
+			goto out;
+		}
+	}
+
+	/* If we have time - switch timers into idle mode. */
+	if (!busy) {
+		critical_enter();
+		sbt = cpu_idleclock();
+	}
+
+	/* Apply AMD APIC timer C1E workaround. */
+	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
+		msr = rdmsr(MSR_AMDK8_IPM);
+		if (msr & AMDK8_CMPHALT)
+			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
+	}
+
+	/* Call main idle method. */
+	cpu_idle_fn(sbt);
+
+	/* Switch timers back into active mode. */
+	if (!busy) {
+		cpu_activeclock();
+		critical_exit();
+	}
+out:
+	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
+	    busy, curcpu);
+}
+
+static int cpu_idle_apl31_workaround;
+SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
+    &cpu_idle_apl31_workaround, 0,
+    "Apollo Lake APL31 MWAIT bug workaround");
+
+int
+cpu_idle_wakeup(int cpu)
+{
+	int *state;
+
+	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
+	switch (atomic_load_int(state)) {
+	case STATE_SLEEPING:
+		return (0);
+	case STATE_MWAIT:
+		atomic_store_int(state, STATE_RUNNING);
+		return (cpu_idle_apl31_workaround ? 0 : 1);
+	case STATE_RUNNING:
+		return (1);
+	default:
+		panic("bad monitor state");
+		return (1);
+	}
+}
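
The idle entry points and cpu_idle_wakeup() above cooperate through the per-CPU monitorbuf word: the idler only commits to mwait while the word still reads STATE_MWAIT, and a waker simply stores STATE_RUNNING.  A minimal userland sketch of that handshake using C11 atomics and a spin loop in place of monitor/mwait; this is an analogy, not the kernel mechanism:

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

#define	STATE_RUNNING	0x0
#define	STATE_MWAIT	0x1

static _Atomic int state = STATE_RUNNING;

static void *
idler(void *arg)
{

	atomic_store(&state, STATE_MWAIT);
	/* Stand-in for cpu_monitor()/cpu_mwait(): wait until woken. */
	while (atomic_load(&state) == STATE_MWAIT)
		;
	printf("idler: woken up\n");
	atomic_store(&state, STATE_RUNNING);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, idler, NULL);
	/* Waker side, as in cpu_idle_wakeup(): wait for MWAIT, then flip. */
	while (atomic_load(&state) != STATE_MWAIT)
		;
	atomic_store(&state, STATE_RUNNING);
	pthread_join(t, NULL);
	return (0);
}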
+
+/*
+ * Ordered by speed/power consumption.
+ */
+static struct {
+	void	*id_fn;
+	char	*id_name;
+	int	id_cpuid2_flag;
+} idle_tbl[] = {
+	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
+	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
+	    .id_cpuid2_flag = CPUID2_MON },
+	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
+#if !defined(__i386__) || !defined(PC98)
+	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
+#endif
+};
+
+static int
+idle_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+	char *avail, *p;
+	int error;
+	int i;
+
+	avail = malloc(256, M_TEMP, M_WAITOK);
+	p = avail;
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_cpuid2_flag != 0 &&
+		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+			continue;
+#if !defined(__i386__) || !defined(PC98)
+		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+		    cpu_idle_hook == NULL)
+			continue;
+#endif
+		p += sprintf(p, "%s%s", p != avail ? ", " : "",
+		    idle_tbl[i].id_name);
+	}
+	error = sysctl_handle_string(oidp, avail, 0, req);
+	free(avail, M_TEMP);
+	return (error);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
+    0, 0, idle_sysctl_available, "A", "list of available idle functions");
+
+static bool
+cpu_idle_selector(const char *new_idle_name)
+{
+	int i;
+
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_cpuid2_flag != 0 &&
+		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+			continue;
+#if !defined(__i386__) || !defined(PC98)
+		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+		    cpu_idle_hook == NULL)
+			continue;
+#endif
+		if (strcmp(idle_tbl[i].id_name, new_idle_name))
+			continue;
+		cpu_idle_fn = idle_tbl[i].id_fn;
+		if (bootverbose)
+			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
+		return (true);
+	}
+	return (false);
+}
+
+static int
+cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16], *p;
+	int error, i;
+
+	p = "unknown";
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_fn == cpu_idle_fn) {
+			p = idle_tbl[i].id_name;
+			break;
+		}
+	}
+	strncpy(buf, p, sizeof(buf));
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	return (cpu_idle_selector(buf) ? 0 : EINVAL);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+    cpu_idle_sysctl, "A", "currently selected idle function");
+
+static void
+cpu_idle_tun(void *unused __unused)
+{
+	char tunvar[16];
+
+	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
+		cpu_idle_selector(tunvar);
+	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
+		/* Ryzen errata 1057, 1109. */
+		cpu_idle_selector("hlt");
+		idle_mwait = 0;
+	}
+
+	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
+		/*
+		 * Apollo Lake errata APL31 (public errata APL30).
+		 * Stores to the armed address range may not trigger
+		 * MWAIT to resume execution.  OS needs to use
+		 * interrupts to wake processors from MWAIT-induced
+		 * sleep states.
+		 */
+		cpu_idle_apl31_workaround = 1;
+	}
+	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
+}
+SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
+
+static int panic_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
+    &panic_on_nmi, 0,
+    "Panic on NMI raised by hardware failure");
+int nmi_is_broadcast = 1;
+SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
+    &nmi_is_broadcast, 0,
+    "Chipset NMI is broadcast");
+#ifdef KDB
+int kdb_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
+    &kdb_on_nmi, 0,
+    "Go to KDB on NMI with unknown source");
+#endif
+
+void
+nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
+{
+	bool claimed = false;
+
+#ifdef DEV_ISA
+	/* machine/parity/power fail/"kitchen sink" faults */
+	if (isa_nmi(frame->tf_err)) {
+		claimed = true;
+		if (panic_on_nmi)
+			panic("NMI indicates hardware failure");
+	}
+#endif /* DEV_ISA */
+#ifdef KDB
+	if (!claimed && kdb_on_nmi) {
+		/*
+		 * NMI can be hooked up to a pushbutton for debugging.
+		 */
+		printf("NMI/cpu%d ... going to debugger\n", cpu);
+		kdb_trap(type, 0, frame);
+	}
+#endif /* KDB */
+}
+
+void
+nmi_handle_intr(u_int type, struct trapframe *frame)
+{
+
+#ifdef SMP
+	if (nmi_is_broadcast) {
+		nmi_call_kdb_smp(type, frame);
+		return;
+	}
+#endif
+	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
+}
+
+int hw_ibrs_active;
+int hw_ibrs_disable = 1;
+
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
+    "Indirect Branch Restricted Speculation active");
+
+void
+hw_ibrs_recalculate(void)
+{
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
+		x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL |
+		    (hw_ibrs_disable ? MSR_OP_ANDNOT : MSR_OP_OR),
+		    IA32_SPEC_CTRL_IBRS);
+		return;
+	}
+	hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
+	    !hw_ibrs_disable;
+}
+
+static int
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_ibrs_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	hw_ibrs_disable = val != 0;
+	hw_ibrs_recalculate();
+	return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
+    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
+    "Disable Indirect Branch Restricted Speculation");
+
+int hw_ssb_active;
+int hw_ssb_disable;
+
+SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
+    &hw_ssb_active, 0,
+    "Speculative Store Bypass Disable active");
+
+static void
+hw_ssb_set(bool enable, bool for_all_cpus)
+{
+
+	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
+		hw_ssb_active = 0;
+		return;
+	}
+	hw_ssb_active = enable;
+	x86_msr_op(MSR_IA32_SPEC_CTRL,
+	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+	    (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
+}
+
+void
+hw_ssb_recalculate(bool all_cpus)
+{
+
+	switch (hw_ssb_disable) {
+	default:
+		hw_ssb_disable = 0;
+		/* FALLTHROUGH */
+	case 0: /* off */
+		hw_ssb_set(false, all_cpus);
+		break;
+	case 1: /* on */
+		hw_ssb_set(true, all_cpus);
+		break;
+	case 2: /* auto */
+		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
+		    false : true, all_cpus);
+		break;
+	}
+}
+
+static int
+hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_ssb_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	hw_ssb_disable = val;
+	hw_ssb_recalculate(true);
+	return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    hw_ssb_disable_handler, "I",
+    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
+
+int hw_mds_disable;
+
+/*
+ * Handler for Microarchitectural Data Sampling issues.  Really not a
+ * pointer to a C function: on amd64 the code must not change any CPU
+ * architectural state except possibly %rflags. Also, it is always
+ * called with interrupts disabled.
+ */
+void mds_handler_void(void);
+void mds_handler_verw(void);
+void mds_handler_ivb(void);
+void mds_handler_bdw(void);
+void mds_handler_skl_sse(void);
+void mds_handler_skl_avx(void);
+void mds_handler_skl_avx512(void);
+void mds_handler_silvermont(void);
+void (*mds_handler)(void) = mds_handler_void;
+
+static int
+sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	if (mds_handler == mds_handler_void)
+		state = "inactive";
+	else if (mds_handler == mds_handler_verw)
+		state = "VERW";
+	else if (mds_handler == mds_handler_ivb)
+		state = "software IvyBridge";
+	else if (mds_handler == mds_handler_bdw)
+		state = "software Broadwell";
+	else if (mds_handler == mds_handler_skl_sse)
+		state = "software Skylake SSE";
+	else if (mds_handler == mds_handler_skl_avx)
+		state = "software Skylake AVX";
+	else if (mds_handler == mds_handler_skl_avx512)
+		state = "software Skylake AVX512";
+	else if (mds_handler == mds_handler_silvermont)
+		state = "software Silvermont";
+	else
+		state = "unknown";
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_hw_mds_disable_state_handler, "A",
+    "Microarchitectural Data Sampling Mitigation state");
+
+_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
+
+void
+hw_mds_recalculate(void)
+{
+	struct pcpu *pc;
+	vm_offset_t b64;
+	u_long xcr0;
+	int i;
+
+	/*
+	 * Allow the user to force the VERW variant even if MD_CLEAR is not
+	 * reported.  For instance, a hypervisor might unknowingly
+	 * filter the cap out.
+	 * For similar reasons, and for testing, allow enabling the
+	 * mitigation even when the RDCL_NO or MDS_NO caps are reported.
+	 */
+	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
+	    ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO |
+	    IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) {
+		mds_handler = mds_handler_void;
+	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
+	    hw_mds_disable == 3) || hw_mds_disable == 1) {
+		mds_handler = mds_handler_verw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Nehalem, SandyBridge, IvyBridge
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(672, M_TEMP,
+				    M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_ivb;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
+	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Haswell, Broadwell
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(1536, M_TEMP,
+				    M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_bdw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
+	    CPUID_STEPPING) <= 5) ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
+	    CPUID_STEPPING) <= 0xb) ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
+	    CPUID_STEPPING) <= 0xc)) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
+		 * CascadeLake
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(6 * 1024,
+				    M_TEMP, M_WAITOK);
+				b64 = (vm_offset_t)malloc(64 + 63,
+				    M_TEMP, M_WAITOK);
+				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
+				bzero(pc->pc_mds_buf64, 64);
+			}
+		}
+		xcr0 = rxcr(0);
+		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
+		    (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
+			mds_handler = mds_handler_skl_avx512;
+		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
+		    (cpu_feature2 & CPUID2_AVX) != 0)
+			mds_handler = mds_handler_skl_avx;
+		else
+			mds_handler = mds_handler_skl_sse;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
+		/* Silvermont, Airmont */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL)
+				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
+		}
+		mds_handler = mds_handler_silvermont;
+	} else {
+		hw_mds_disable = 0;
+		mds_handler = mds_handler_void;
+	}
+}
+
+static void
+hw_mds_recalculate_boot(void *arg __unused)
+{
+
+	hw_mds_recalculate();
+}
+SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
+
+static int
+sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_mds_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < 0 || val > 3)
+		return (EINVAL);
+	hw_mds_disable = val;
+	hw_mds_recalculate();
+	return (0);
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_mds_disable_handler, "I",
+    "Microarchitectural Data Sampling Mitigation "
+    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
+
+
+/*
+ * Intel Transactional Memory Asynchronous Abort Mitigation
+ * CVE-2019-11135
+ */
+int x86_taa_enable;
+int x86_taa_state;
+enum {
+	TAA_NONE	= 0,	/* No mitigation enabled */
+	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
+	TAA_VERW	= 2,	/* Use VERW mitigation */
+	TAA_AUTO	= 3,	/* Automatically select the mitigation */
+
+	/* The states below are not selectable by the operator */
+
+	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
+	TAA_NOT_PRESENT	= 5	/* TSX is not present */
+};
+
+static void
+taa_set(bool enable, bool all)
+{
+
+	x86_msr_op(MSR_IA32_TSX_CTRL,
+	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+	    (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
+	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
+}
+
+void
+x86_taa_recalculate(void)
+{
+	static int taa_saved_mds_disable = 0;
+	int taa_need = 0, taa_state = 0;
+	int mds_disable = 0, need_mds_recalc = 0;
+
+	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
+	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
+	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
+		/* TSX is not present */
+		x86_taa_state = TAA_NOT_PRESENT;
+		return;
+	}
+
+	/* Check to see what mitigation options the CPU gives us */
+	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
+		/* CPU is not susceptible to TAA */
+		taa_need = TAA_TAA_UC;
+	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
+		/*
+		 * CPU can turn off TSX.  This is the next best option
+		 * if TAA_NO hardware mitigation isn't present
+		 */
+		taa_need = TAA_TSX_DISABLE;
+	} else {
+		/* No TSX/TAA specific remedies are available. */
+		if (x86_taa_enable == TAA_TSX_DISABLE) {
+			if (bootverbose)
+				printf("TSX control not available\n");
+			return;
+		} else
+			taa_need = TAA_VERW;
+	}
+
+	/* Can we automatically take action, or are we being forced? */
+	if (x86_taa_enable == TAA_AUTO)
+		taa_state = taa_need;
+	else
+		taa_state = x86_taa_enable;
+
+	/* No state change, nothing to do */
+	if (taa_state == x86_taa_state) {
+		if (bootverbose)
+			printf("No TSX change made\n");
+		return;
+	}
+
+	/* Does the MSR need to be turned on or off? */
+	if (taa_state == TAA_TSX_DISABLE)
+		taa_set(true, true);
+	else if (x86_taa_state == TAA_TSX_DISABLE)
+		taa_set(false, true);
+
+	/* Does MDS need to be set to turn on VERW? */
+	if (taa_state == TAA_VERW) {
+		taa_saved_mds_disable = hw_mds_disable;
+		mds_disable = hw_mds_disable = 1;
+		need_mds_recalc = 1;
+	} else if (x86_taa_state == TAA_VERW) {
+		mds_disable = hw_mds_disable = taa_saved_mds_disable;
+		need_mds_recalc = 1;
+	}
+	if (need_mds_recalc) {
+		hw_mds_recalculate();
+		if (mds_disable != hw_mds_disable) {
+			if (bootverbose)
+				printf("Cannot change MDS state for TAA\n");
+			/* Don't update our state */
+			return;
+		}
+	}
+
+	x86_taa_state = taa_state;
+	return;
+}
+
+static void
+taa_recalculate_boot(void * arg __unused)
+{
+
+	x86_taa_recalculate();
+}
+SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
+
+SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
+	"TSX Asynchronous Abort Mitigation");
+
+static int
+sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = x86_taa_enable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < TAA_NONE || val > TAA_AUTO)
+		return (EINVAL);
+	x86_taa_enable = val;
+	x86_taa_recalculate();
+	return (0);
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_taa_handler, "I",
+    "TAA Mitigation enablement control "
+    "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
+
+static int
+sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	switch (x86_taa_state) {
+	case TAA_NONE:
+		state = "inactive";
+		break;
+	case TAA_TSX_DISABLE:
+		state = "TSX disabled";
+		break;
+	case TAA_VERW:
+		state = "VERW";
+		break;
+	case TAA_TAA_UC:
+		state = "Mitigated in microcode";
+		break;
+	case TAA_NOT_PRESENT:
+		state = "TSX not present";
+		break;
+	default:
+		state = "unknown";
+	}
+
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_taa_state_handler, "A",
+    "TAA Mitigation state");
+
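Note: the speculation-control knobs added above (hw.ibrs_disable, hw.spec_store_bypass_disable, hw.mds_disable and machdep.mitigations.taa.enable) all follow the same CTLTYPE_INT/CTLFLAG_RWTUN handler shape: copy the current value, run sysctl_handle_int(), return early on error or on a pure read, validate where needed, store the new value and call the matching recalculate routine. A minimal userland sketch that exercises one of them through the standard sysctlbyname(3) interface could look like the following; the knob name comes from the diff, while the chosen value and the error handling are purely illustrative.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	/* Read the current MDS mitigation selection (0..3). */
	if (sysctlbyname("hw.mds_disable", &val, &len, NULL, 0) == -1)
		err(1, "sysctlbyname(hw.mds_disable)");
	printf("hw.mds_disable is currently %d\n", val);

	/* Request AUTO (3); the handler above rejects values outside 0..3. */
	val = 3;
	if (sysctlbyname("hw.mds_disable", NULL, NULL, &val, sizeof(val)) == -1)
		err(1, "setting hw.mds_disable");
	return (0);
}

Writing the knob at runtime runs hw_mds_recalculate(), so the handler actually selected can be checked afterwards through the read-only hw.mds_disable_state string.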


Property changes on: trunk/sys/x86/x86/cpu_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
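One detail in the Skylake/Kaby Lake branch of hw_mds_recalculate() above is how the 64-byte aligned scratch buffer for the AVX/AVX512 flush handlers is produced: the code over-allocates 64 + 63 bytes and rounds the pointer up with roundup2(), since the allocation itself is not guaranteed to come back 64-byte aligned. A standalone userland sketch of the same trick follows; the buffer size matches the diff, everything else (headers, printing) is illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Power-of-two round-up, same shape as the kernel's roundup2(). */
#define	roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	void *raw;
	uintptr_t aligned;

	/* Over-allocate by alignment - 1 so a 64-byte aligned window fits. */
	raw = malloc(64 + 63);
	if (raw == NULL)
		return (1);
	aligned = roundup2((uintptr_t)raw, 64);
	printf("raw %p -> 64-byte aligned %#jx\n", raw, (uintmax_t)aligned);
	free(raw);
	return (0);
}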
Added: trunk/sys/x86/x86/delay.c
===================================================================
--- trunk/sys/x86/x86/delay.c	                        (rev 0)
+++ trunk/sys/x86/x86/delay.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,138 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz and Don Ahn.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)clock.c	7.2 (Berkeley) 5/12/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/delay.c 340270 2018-11-08 22:42:55Z jhb $");
+
+/* Generic x86 routines to handle delay */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/sched.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <x86/init.h>
+
+static void
+delay_tsc(int n)
+{
+	uint64_t end, now;
+
+	/*
+	 * Pin the current thread to ensure correct behavior if the TSCs
+	 * on different CPUs are not in sync.
+	 */
+	sched_pin();
+	now = rdtsc();
+	end = now + tsc_freq * n / 1000000;
+	do {
+		cpu_spinwait();
+		now = rdtsc();
+	} while (now < end);
+	sched_unpin();
+}
+
+static int
+delay_tc(int n)
+{
+	struct timecounter *tc;
+	timecounter_get_t *func;
+	uint64_t end, freq, now;
+	u_int last, mask, u;
+
+	/*
+	 * Only use the TSC if it is P-state invariant.  If the TSC is
+	 * not P-state invariant and the CPU is not running at the
+	 * "full" P-state, then the TSC will increment at some rate
+	 * less than tsc_freq and delay_tsc() will wait too long.
+	 */
+	if (tsc_is_invariant && tsc_freq != 0) {
+		delay_tsc(n);
+		return (1);
+	}
+	tc = timecounter;
+	if (tc->tc_quality <= 0)
+		return (0);
+	func = tc->tc_get_timecount;
+	mask = tc->tc_counter_mask;
+	freq = tc->tc_frequency;
+	now = 0;
+	end = freq * n / 1000000;
+	last = func(tc) & mask;
+	do {
+		cpu_spinwait();
+		u = func(tc) & mask;
+		if (u < last)
+			now += mask - last + u + 1;
+		else
+			now += u - last;
+		last = u;
+	} while (now < end);
+	return (1);
+}
+
+void
+DELAY(int n)
+{
+
+	if (delay_tc(n))
+		return;
+
+	init_ops.early_delay(n);
+}
+
+void
+cpu_lock_delay(void)
+{
+
+	/*
+	 * Use TSC to wait for a usec if present, otherwise fall back
+	 * to reading from port 0x84.  We can't call into timecounters
+	 * for this delay since timecounters might use spin locks.
+	 *
+	 * Note that unlike delay_tc(), this uses the TSC even if it
+	 * is not P-state invariant.  For this function it is ok to
+	 * wait even a few usecs.
+	 */
+	if (tsc_freq != 0)
+		delay_tsc(1);
+	else
+		inb(0x84);
+}


Property changes on: trunk/sys/x86/x86/delay.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
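delay_tc() above has to tolerate timecounters whose hardware register is narrower than 64 bits: tc_counter_mask limits the usable width, and the accumulation wraps the difference whenever the counter rolls over. The same step, pulled out into a self-contained helper with two worked cases; the 24-bit mask is only an example.

#include <stdint.h>
#include <stdio.h>

/*
 * Wrap-safe delta between two reads of a free-running counter that is
 * only mask + 1 counts wide, mirroring the accumulation in delay_tc().
 */
static uint64_t
counter_delta(uint32_t last, uint32_t now, uint32_t mask)
{

	last &= mask;
	now &= mask;
	if (now < last)
		return ((uint64_t)mask - last + now + 1);	/* rolled over */
	return (now - last);
}

int
main(void)
{
	uint32_t mask = 0xffffff;	/* an example 24-bit counter */

	/* No wrap: 0x200 - 0x100 = 256 ticks. */
	printf("%ju\n", (uintmax_t)counter_delta(0x100, 0x200, mask));
	/* Wrap: from 0xfffff0 up to 0x000010 is 32 ticks. */
	printf("%ju\n", (uintmax_t)counter_delta(0xfffff0, 0x10, mask));
	return (0);
}

The TSC path needs none of this because rdtsc() is 64 bits wide; there the target is simply end = now + tsc_freq * n / 1000000, so a 10 microsecond delay on a 3 GHz invariant TSC spins for about 30,000 ticks.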
Modified: trunk/sys/x86/x86/dump_machdep.c
===================================================================
--- trunk/sys/x86/x86/dump_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/dump_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,355 +26,30 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/dump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/dump_machdep.c 276772 2015-01-07 01:01:39Z markj $");
 
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
-#include <sys/systm.h>
 #include <sys/conf.h>
-#include <sys/cons.h>
+#include <sys/kerneldump.h>
 #include <sys/sysctl.h>
-#include <sys/kernel.h>
-#include <sys/kerneldump.h>
-#include <sys/watchdog.h>
+#include <sys/systm.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
-#include <machine/elf.h>
-#include <machine/md_var.h>
 
-#ifdef __amd64__
-#define	KERNELDUMP_VERSION	KERNELDUMP_AMD64_VERSION
-#define	EM_VALUE		EM_X86_64
-#else
-#define	KERNELDUMP_VERSION	KERNELDUMP_I386_VERSION
-#define	EM_VALUE		EM_386
-#endif
-
-CTASSERT(sizeof(struct kerneldumpheader) == 512);
-
 int do_minidump = 1;
-TUNABLE_INT("debug.minidump", &do_minidump);
-SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RW, &do_minidump, 0,
+SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RWTUN, &do_minidump, 0,
     "Enable mini crash dumps");
 
-/*
- * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
- * is to protect us from metadata and to protect metadata from us.
- */
-#define	SIZEOF_METADATA		(64*1024)
-
-#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
-#define	DEV_ALIGN(x)	(((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
-
-struct md_pa {
-	vm_paddr_t md_start;
-	vm_paddr_t md_size;
-};
-
-typedef int callback_t(struct md_pa *, int, void *);
-
-static struct kerneldumpheader kdh;
-static off_t dumplo, fileofs;
-
-/* Handle buffered writes. */
-static char buffer[DEV_BSIZE];
-static size_t fragsz;
-
-/* 20 phys_avail entry pairs correspond to 10 md_pa's */
-static struct md_pa dump_map[10];
-
-static void
-md_pa_init(void)
-{
-	int n, idx;
-
-	bzero(dump_map, sizeof(dump_map));
-	for (n = 0; n < sizeof(dump_map) / sizeof(dump_map[0]); n++) {
-		idx = n * 2;
-		if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0)
-			break;
-		dump_map[n].md_start = dump_avail[idx];
-		dump_map[n].md_size = dump_avail[idx + 1] - dump_avail[idx];
-	}
-}
-
-static struct md_pa *
-md_pa_first(void)
-{
-
-	return (&dump_map[0]);
-}
-
-static struct md_pa *
-md_pa_next(struct md_pa *mdp)
-{
-
-	mdp++;
-	if (mdp->md_size == 0)
-		mdp = NULL;
-	return (mdp);
-}
-
-static int
-buf_write(struct dumperinfo *di, char *ptr, size_t sz)
-{
-	size_t len;
-	int error;
-
-	while (sz) {
-		len = DEV_BSIZE - fragsz;
-		if (len > sz)
-			len = sz;
-		bcopy(ptr, buffer + fragsz, len);
-		fragsz += len;
-		ptr += len;
-		sz -= len;
-		if (fragsz == DEV_BSIZE) {
-			error = dump_write(di, buffer, 0, dumplo,
-			    DEV_BSIZE);
-			if (error)
-				return error;
-			dumplo += DEV_BSIZE;
-			fragsz = 0;
-		}
-	}
-
-	return (0);
-}
-
-static int
-buf_flush(struct dumperinfo *di)
-{
-	int error;
-
-	if (fragsz == 0)
-		return (0);
-
-	error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE);
-	dumplo += DEV_BSIZE;
-	fragsz = 0;
-	return (error);
-}
-
-#define PG2MB(pgs) ((pgs + (1 << 8) - 1) >> 8)
-
-static int
-cb_dumpdata(struct md_pa *mdp, int seqnr, void *arg)
-{
-	struct dumperinfo *di = (struct dumperinfo*)arg;
-	vm_paddr_t a, pa;
-	void *va;
-	uint64_t pgs;
-	size_t counter, sz, chunk;
-	int i, c, error, twiddle;
-	u_int maxdumppgs;
-
-	error = 0;	/* catch case in which chunk size is 0 */
-	counter = 0;	/* Update twiddle every 16MB */
-	twiddle = 0;
-	va = 0;
-	pgs = mdp->md_size / PAGE_SIZE;
-	pa = mdp->md_start;
-	maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS);
-	if (maxdumppgs == 0)	/* seatbelt */
-		maxdumppgs = 1;
-
-	printf("  chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs),
-	    (uintmax_t)pgs);
-
-	while (pgs) {
-		chunk = pgs;
-		if (chunk > maxdumppgs)
-			chunk = maxdumppgs;
-		sz = chunk << PAGE_SHIFT;
-		counter += sz;
-		if (counter >> 24) {
-			printf(" %ju", (uintmax_t)PG2MB(pgs));
-			counter &= (1<<24) - 1;
-		}
-		for (i = 0; i < chunk; i++) {
-			a = pa + i * PAGE_SIZE;
-			va = pmap_kenter_temporary(trunc_page(a), i);
-		}
-
-		wdog_kern_pat(WD_LASTVAL);
-
-		error = dump_write(di, va, 0, dumplo, sz);
-		if (error)
-			break;
-		dumplo += sz;
-		pgs -= chunk;
-		pa += sz;
-
-		/* Check for user abort. */
-		c = cncheckc();
-		if (c == 0x03)
-			return (ECANCELED);
-		if (c != -1)
-			printf(" (CTRL-C to abort) ");
-	}
-	printf(" ... %s\n", (error) ? "fail" : "ok");
-	return (error);
-}
-
-static int
-cb_dumphdr(struct md_pa *mdp, int seqnr, void *arg)
-{
-	struct dumperinfo *di = (struct dumperinfo*)arg;
-	Elf_Phdr phdr;
-	uint64_t size;
-	int error;
-
-	size = mdp->md_size;
-	bzero(&phdr, sizeof(phdr));
-	phdr.p_type = PT_LOAD;
-	phdr.p_flags = PF_R;			/* XXX */
-	phdr.p_offset = fileofs;
-	phdr.p_vaddr = mdp->md_start;
-	phdr.p_paddr = mdp->md_start;
-	phdr.p_filesz = size;
-	phdr.p_memsz = size;
-	phdr.p_align = PAGE_SIZE;
-
-	error = buf_write(di, (char*)&phdr, sizeof(phdr));
-	fileofs += phdr.p_filesz;
-	return (error);
-}
-
-static int
-cb_size(struct md_pa *mdp, int seqnr, void *arg)
-{
-	uint64_t *sz = (uint64_t*)arg;
-
-	*sz += (uint64_t)mdp->md_size;
-	return (0);
-}
-
-static int
-foreach_chunk(callback_t cb, void *arg)
-{
-	struct md_pa *mdp;
-	int error, seqnr;
-
-	seqnr = 0;
-	mdp = md_pa_first();
-	while (mdp != NULL) {
-		error = (*cb)(mdp, seqnr++, arg);
-		if (error)
-			return (-error);
-		mdp = md_pa_next(mdp);
-	}
-	return (seqnr);
-}
-
 void
-dumpsys(struct dumperinfo *di)
+dumpsys_map_chunk(vm_paddr_t pa, size_t chunk, void **va)
 {
-	Elf_Ehdr ehdr;
-	uint64_t dumpsize;
-	off_t hdrgap;
-	size_t hdrsz;
-	int error;
+	int i;
+	vm_paddr_t a;
 
-	if (do_minidump) {
-		minidumpsys(di);
-		return;
+	for (i = 0; i < chunk; i++) {
+		a = pa + i * PAGE_SIZE;
+		*va = pmap_kenter_temporary(trunc_page(a), i);
 	}
-	bzero(&ehdr, sizeof(ehdr));
-	ehdr.e_ident[EI_MAG0] = ELFMAG0;
-	ehdr.e_ident[EI_MAG1] = ELFMAG1;
-	ehdr.e_ident[EI_MAG2] = ELFMAG2;
-	ehdr.e_ident[EI_MAG3] = ELFMAG3;
-	ehdr.e_ident[EI_CLASS] = ELF_CLASS;
-#if BYTE_ORDER == LITTLE_ENDIAN
-	ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
-#else
-	ehdr.e_ident[EI_DATA] = ELFDATA2MSB;
-#endif
-	ehdr.e_ident[EI_VERSION] = EV_CURRENT;
-	ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE;	/* XXX big picture? */
-	ehdr.e_type = ET_CORE;
-	ehdr.e_machine = EM_VALUE;
-	ehdr.e_phoff = sizeof(ehdr);
-	ehdr.e_flags = 0;
-	ehdr.e_ehsize = sizeof(ehdr);
-	ehdr.e_phentsize = sizeof(Elf_Phdr);
-	ehdr.e_shentsize = sizeof(Elf_Shdr);
-
-	md_pa_init();
-
-	/* Calculate dump size. */
-	dumpsize = 0L;
-	ehdr.e_phnum = foreach_chunk(cb_size, &dumpsize);
-	hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize;
-	fileofs = MD_ALIGN(hdrsz);
-	dumpsize += fileofs;
-	hdrgap = fileofs - DEV_ALIGN(hdrsz);
-
-	/* Determine dump offset on device. */
-	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
-		error = ENOSPC;
-		goto fail;
-	}
-	dumplo = di->mediaoffset + di->mediasize - dumpsize;
-	dumplo -= sizeof(kdh) * 2;
-
-	mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_VERSION, dumpsize,
-	    di->blocksize);
-
-	printf("Dumping %llu MB (%d chunks)\n", (long long)dumpsize >> 20,
-	    ehdr.e_phnum);
-
-	/* Dump leader */
-	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
-	if (error)
-		goto fail;
-	dumplo += sizeof(kdh);
-
-	/* Dump ELF header */
-	error = buf_write(di, (char*)&ehdr, sizeof(ehdr));
-	if (error)
-		goto fail;
-
-	/* Dump program headers */
-	error = foreach_chunk(cb_dumphdr, di);
-	if (error < 0)
-		goto fail;
-	buf_flush(di);
-
-	/*
-	 * All headers are written using blocked I/O, so we know the
-	 * current offset is (still) block aligned. Skip the alignement
-	 * in the file to have the segment contents aligned at page
-	 * boundary. We cannot use MD_ALIGN on dumplo, because we don't
-	 * care and may very well be unaligned within the dump device.
-	 */
-	dumplo += hdrgap;
-
-	/* Dump memory chunks (updates dumplo) */
-	error = foreach_chunk(cb_dumpdata, di);
-	if (error < 0)
-		goto fail;
-
-	/* Dump trailer */
-	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
-	if (error)
-		goto fail;
-
-	/* Signal completion, signoff and exit stage left. */
-	dump_write(di, NULL, 0, 0, 0);
-	printf("\nDump complete\n");
-	return;
-
- fail:
-	if (error < 0)
-		error = -error;
-
-	if (error == ECANCELED)
-		printf("\nDump aborted\n");
-	else if (error == ENOSPC)
-		printf("\nDump failed. Partition too small.\n");
-	else
-		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
 }
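Most of the legacy full-dump path deleted above revolved around two rounding helpers, MD_ALIGN() (round up to a page) and DEV_ALIGN() (round up to a device block), and the hdrgap pad that keeps segment data page aligned inside the dump file. A worked instance of that arithmetic, assuming the usual 4 KiB page, a 512-byte device block and a made-up header size:

#include <stdint.h>
#include <stdio.h>

#define	PAGE_MASK	4095		/* assuming 4 KiB pages */
#define	DEV_BSIZE	512
#define	MD_ALIGN(x)	(((int64_t)(x) + PAGE_MASK) & ~PAGE_MASK)
#define	DEV_ALIGN(x)	(((int64_t)(x) + (DEV_BSIZE - 1)) & ~(DEV_BSIZE - 1))

int
main(void)
{
	int64_t hdrsz = 5000;		/* made-up ELF + program header size */
	int64_t fileofs = MD_ALIGN(hdrsz);
	int64_t hdrgap = fileofs - DEV_ALIGN(hdrsz);

	/* fileofs = 8192, hdrgap = 8192 - 5120 = 3072 bytes of pad. */
	printf("fileofs=%jd hdrgap=%jd\n", (intmax_t)fileofs, (intmax_t)hdrgap);
	return (0);
}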

Modified: trunk/sys/x86/x86/fdt_machdep.c
===================================================================
--- trunk/sys/x86/x86/fdt_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/fdt_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/fdt_machdep.c 250840 2013-05-21 03:05:49Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/fdt_machdep.c 287000 2015-08-21 15:57:57Z royger $");
 
 #include "opt_platform.h"
 
@@ -55,7 +55,7 @@
 	mdp = preload_search_by_type("elf kernel");
 	if (mdp == NULL)
 		mdp = preload_search_by_type("elf32 kernel");
-	dtbp = (mdp != NULL) ? MD_FETCH(mdp, MODINFOMD_DTBP, void *) : NULL;
+	dtbp = MD_FETCH(mdp, MODINFOMD_DTBP, void *);
 
 #if defined(FDT_DTB_STATIC)
 	/*

Modified: trunk/sys/x86/x86/identcpu.c
===================================================================
--- trunk/sys/x86/x86/identcpu.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/identcpu.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/identcpu.c 332743 2018-04-19 00:11:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/identcpu.c 354658 2019-11-12 19:35:46Z scottl $");
 
 #include "opt_cpu.h"
 
@@ -84,9 +84,46 @@
 static void print_via_padlock_info(void);
 static void print_vmx_info(void);
 
+#ifdef __i386__
+int	cpu;			/* Are we 386, 386sx, 486, etc? */
 int	cpu_class;
+#endif
+u_int	cpu_feature;		/* Feature flags */
+u_int	cpu_feature2;		/* Feature flags */
+u_int	amd_feature;		/* AMD feature flags */
+u_int	amd_feature2;		/* AMD feature flags */
+u_int	amd_pminfo;		/* AMD advanced power management info */
+u_int	amd_extended_feature_extensions;
+u_int	via_feature_rng;	/* VIA RNG features */
+u_int	via_feature_xcrypt;	/* VIA ACE features */
+u_int	cpu_high;		/* Highest arg to CPUID */
+u_int	cpu_exthigh;		/* Highest arg to extended CPUID */
+u_int	cpu_id;			/* Stepping ID */
+u_int	cpu_procinfo;		/* HyperThreading Info / Brand Index / CLFLUSH */
+u_int	cpu_procinfo2;		/* Multicore info */
+char	cpu_vendor[20];		/* CPU Origin code */
+u_int	cpu_vendor_id;		/* CPU vendor ID */
+u_int	cpu_fxsr;		/* SSE enabled */
+u_int	cpu_mxcsr_mask;		/* Valid bits in mxcsr */
+u_int	cpu_clflush_line_size = 32;
+u_int	cpu_stdext_feature;	/* %ebx */
+u_int	cpu_stdext_feature2;	/* %ecx */
+u_int	cpu_stdext_feature3;	/* %edx */
+uint64_t cpu_ia32_arch_caps;
+u_int	cpu_max_ext_state_size;
+u_int	cpu_mon_mwait_flags;	/* MONITOR/MWAIT flags (CPUID.05H.ECX) */
+u_int	cpu_mon_min_size;	/* MONITOR minimum range size, bytes */
+u_int	cpu_mon_max_size;	/* MONITOR maximum range size, bytes */
+u_int	cpu_maxphyaddr;		/* Max phys addr width in bits */
 char machine[] = MACHINE;
 
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
+    &via_feature_rng, 0,
+    "VIA RNG feature available in CPU");
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD,
+    &via_feature_xcrypt, 0,
+    "VIA xcrypt feature available in CPU");
+
 #ifdef __amd64__
 #ifdef SCTL_MASK32
 extern int adaptive_machine_arch;
@@ -109,8 +146,8 @@
 	return (error);
 
 }
-SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD,
-    NULL, 0, sysctl_hw_machine, "A", "Machine class");
+SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine, "A", "Machine class");
 #else
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD,
     machine, 0, "Machine class");
@@ -117,7 +154,7 @@
 #endif
 
 static char cpu_model[128];
-SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD,
+SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD | CTLFLAG_MPSAFE,
     cpu_model, 0, "Machine model");
 
 static int hw_clockrate;
@@ -126,8 +163,8 @@
 
 u_int hv_high;
 char hv_vendor[16];
-SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD, hv_vendor, 0,
-    "Hypervisor vendor");
+SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD | CTLFLAG_MPSAFE, hv_vendor,
+    0, "Hypervisor vendor");
 
 static eventhandler_tag tsc_post_tag;
 
@@ -147,13 +184,11 @@
 	NULL,
 	"Intel Pentium 4"
 };
-#endif
 
 static struct {
 	char	*cpu_name;
 	int	cpu_class;
 } cpus[] = {
-#ifdef __i386__
 	{ "Intel 80286",	CPUCLASS_286 },		/* CPU_286   */
 	{ "i386SX",		CPUCLASS_386 },		/* CPU_386SX */
 	{ "i386DX",		CPUCLASS_386 },		/* CPU_386   */
@@ -171,11 +206,8 @@
 	{ "Pentium II",		CPUCLASS_686 },		/* CPU_PII */
 	{ "Pentium III",	CPUCLASS_686 },		/* CPU_PIII */
 	{ "Pentium 4",		CPUCLASS_686 },		/* CPU_P4 */
-#else
-	{ "Clawhammer",		CPUCLASS_K8 },		/* CPU_CLAWHAMMER */
-	{ "Sledgehammer",	CPUCLASS_K8 },		/* CPU_SLEDGEHAMMER */
+};
 #endif
-};
 
 static struct {
 	char	*vendor;
@@ -205,9 +237,13 @@
 	u_int regs[4], i;
 	char *brand;
 
+	printf("CPU: ");
+#ifdef __i386__
 	cpu_class = cpus[cpu].cpu_class;
-	printf("CPU: ");
 	strncpy(cpu_model, cpus[cpu].cpu_name, sizeof (cpu_model));
+#else
+	strncpy(cpu_model, "Hammer", sizeof (cpu_model));
+#endif
 
 	/* Check for extended CPUID information and a processor name. */
 	if (cpu_exthigh >= 0x80000004) {
@@ -660,8 +696,8 @@
 		    (intmax_t)(tsc_freq + 4999) / 1000000,
 		    (u_int)((tsc_freq + 4999) / 10000) % 100);
 	}
+#ifdef __i386__
 	switch(cpu_class) {
-#ifdef __i386__
 	case CPUCLASS_286:
 		printf("286");
 		break;
@@ -683,14 +719,12 @@
 		printf("686");
 		break;
 #endif
-#else
-	case CPUCLASS_K8:
-		printf("K8");
-		break;
-#endif
 	default:
 		printf("Unknown");	/* will panic below... */
 	}
+#else
+	printf("K8");
+#endif
 	printf("-class CPU)\n");
 	if (*cpu_vendor)
 		printf("  Origin=\"%s\"", cpu_vendor);
@@ -914,6 +948,7 @@
 				       "\020PQE"
 				       /* AVX512 Foundation */
 				       "\021AVX512F"
+				       "\022AVX512DQ"
 				       /* Enhanced NRBG */
 				       "\023RDSEED"
 				       /* ADCX + ADOX */
@@ -920,12 +955,17 @@
 				       "\024ADX"
 				       /* Supervisor Mode Access Prevention */
 				       "\025SMAP"
+				       "\026AVX512IFMA"
+				       "\027PCOMMIT"
 				       "\030CLFLUSHOPT"
+				       "\031CLWB"
 				       "\032PROCTRACE"
 				       "\033AVX512PF"
 				       "\034AVX512ER"
 				       "\035AVX512CD"
 				       "\036SHA"
+				       "\037AVX512BW"
+				       "\040AVX512VL"
 				       );
 			}
 
@@ -934,14 +974,35 @@
 				    cpu_stdext_feature2,
 				       "\020"
 				       "\001PREFETCHWT1"
+				       "\002AVX512VBMI"
 				       "\003UMIP"
 				       "\004PKU"
 				       "\005OSPKE"
+				       "\006WAITPKG"
+				       "\011GFNI"
 				       "\027RDPID"
+				       "\032CLDEMOTE"
+				       "\034MOVDIRI"
+				       "\035MOVDIRI64B"
 				       "\037SGXLC"
 				       );
 			}
 
+			if (cpu_stdext_feature3 != 0) {
+				printf("\n  Structured Extended Features3=0x%b",
+				    cpu_stdext_feature3,
+				       "\020"
+				       "\013MD_CLEAR"
+				       "\016TSXFA"
+				       "\033IBPB"
+				       "\034STIBP"
+				       "\035L1DFL"
+				       "\036ARCH_CAP"
+				       "\037CORE_CAP"
+				       "\040SSBD"
+				       );
+			}
+
 			if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
 				cpuid_count(0xd, 0x1, regs);
 				if (regs[0] != 0) {
@@ -955,6 +1016,31 @@
 				}
 			}
 
+			if (cpu_ia32_arch_caps != 0) {
+				printf("\n  IA32_ARCH_CAPS=0x%b",
+				    (u_int)cpu_ia32_arch_caps,
+				       "\020"
+				       "\001RDCL_NO"
+				       "\002IBRS_ALL"
+				       "\003RSBA"
+				       "\004SKIP_L1DFL_VME"
+				       "\005SSB_NO"
+				       "\006MDS_NO"
+				       "\010TSX_CTRL"
+				       "\011TAA_NO"
+				       );
+			}
+
+			if (amd_extended_feature_extensions != 0) {
+				printf("\n  "
+				    "AMD Extended Feature Extensions ID EBX="
+				    "0x%b", amd_extended_feature_extensions,
+				    "\020"
+				    "\001CLZERO"
+				    "\002IRPerf"
+				    "\003XSaveErPtr");
+			}
+
 			if (via_feature_rng != 0 || via_feature_xcrypt != 0)
 				print_via_padlock_info();
 
@@ -1008,11 +1094,11 @@
 	print_hypervisor_info();
 }
 
+#ifdef __i386__
 void
 panicifcpuunsupported(void)
 {
 
-#ifdef __i386__
 #if !defined(lint)
 #if !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU)
 #error This kernel is not configured for one of the supported CPUs
@@ -1019,17 +1105,11 @@
 #endif
 #else /* lint */
 #endif /* lint */
-#else /* __amd64__ */
-#ifndef HAMMER
-#error "You need to specify a cpu type"
-#endif
-#endif
 	/*
 	 * Now that we have told the user what they have,
 	 * let them know if that machine type isn't configured.
 	 */
 	switch (cpu_class) {
-#ifdef __i386__
 	case CPUCLASS_286:	/* a 286 should not make it this far, anyway */
 	case CPUCLASS_386:
 #if !defined(I486_CPU)
@@ -1041,12 +1121,6 @@
 #if !defined(I686_CPU)
 	case CPUCLASS_686:
 #endif
-#else /* __amd64__ */
-	case CPUCLASS_X86:
-#ifndef HAMMER
-	case CPUCLASS_K8:
-#endif
-#endif
 		panic("CPU class not configured");
 	default:
 		break;
@@ -1053,7 +1127,6 @@
 	}
 }
 
-#ifdef __i386__
 static	volatile u_int trap_by_rdmsr;
 
 /*
@@ -1210,7 +1283,6 @@
 
 SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL);
 
-#ifndef XEN
 static const char *const vm_bnames[] = {
 	"QEMU",				/* QEMU */
 	"Plex86",			/* Plex86 */
@@ -1270,6 +1342,10 @@
 				vm_guest = VM_GUEST_VMWARE;
 			else if (strcmp(hv_vendor, "Microsoft Hv") == 0)
 				vm_guest = VM_GUEST_HV;
+			else if (strcmp(hv_vendor, "KVMKVMKVM") == 0)
+				vm_guest = VM_GUEST_KVM;
+			else if (strcmp(hv_vendor, "bhyve bhyve") == 0)
+				vm_guest = VM_GUEST_BHYVE;
 		}
 		return;
 	}
@@ -1277,7 +1353,7 @@
 	/*
 	 * Examine SMBIOS strings for older hypervisors.
 	 */
-	p = getenv("smbios.system.serial");
+	p = kern_getenv("smbios.system.serial");
 	if (p != NULL) {
 		if (strncmp(p, "VMware-", 7) == 0 || strncmp(p, "VMW", 3) == 0) {
 			vmware_hvcall(VMW_HVCMD_GETVERSION, regs);
@@ -1294,7 +1370,7 @@
 	 * XXX: Some of these entries may not be needed since they were
 	 * added to FreeBSD before the checks above.
 	 */
-	p = getenv("smbios.bios.vendor");
+	p = kern_getenv("smbios.bios.vendor");
 	if (p != NULL) {
 		for (i = 0; vm_bnames[i] != NULL; i++)
 			if (strcmp(p, vm_bnames[i]) == 0) {
@@ -1304,7 +1380,7 @@
 			}
 		freeenv(p);
 	}
-	p = getenv("smbios.system.product");
+	p = kern_getenv("smbios.system.product");
 	if (p != NULL) {
 		for (i = 0; vm_pnames[i] != NULL; i++)
 			if (strcmp(p, vm_pnames[i]) == 0) {
@@ -1315,7 +1391,6 @@
 		freeenv(p);
 	}
 }
-#endif
 
 bool
 fix_cpuid(void)
@@ -1360,9 +1435,8 @@
 	return (false);
 }
 
-#ifdef __amd64__
 void
-identify_cpu(void)
+identify_cpu1(void)
 {
 	u_int regs[4];
 
@@ -1379,8 +1453,34 @@
 	cpu_feature = regs[3];
 	cpu_feature2 = regs[2];
 }
-#endif
 
+void
+identify_cpu2(void)
+{
+	u_int regs[4], cpu_stdext_disable;
+
+	if (cpu_high >= 7) {
+		cpuid_count(7, 0, regs);
+		cpu_stdext_feature = regs[1];
+
+		/*
+		 * Some hypervisors fail to filter out unsupported
+		 * extended features.  Allow disabling extensions
+		 * whose activation requires setting a bit in CR4 and
+		 * which VM monitors do not support.
+		 */
+		cpu_stdext_disable = 0;
+		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+		cpu_stdext_feature &= ~cpu_stdext_disable;
+
+		cpu_stdext_feature2 = regs[2];
+		cpu_stdext_feature3 = regs[3];
+
+		if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0)
+			cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
+	}
+}
+
 /*
  * Final stage of CPU identification.
  */
@@ -1387,7 +1487,7 @@
 void
 finishidentcpu(void)
 {
-	u_int regs[4], cpu_stdext_disable;
+	u_int regs[4];
 #ifdef __i386__
 	u_char ccr3;
 #endif
@@ -1406,26 +1506,8 @@
 		cpu_mon_max_size = regs[1] &  CPUID5_MON_MAX_SIZE;
 	}
 
-	if (cpu_high >= 7) {
-		cpuid_count(7, 0, regs);
-		cpu_stdext_feature = regs[1];
+	identify_cpu2();
 
-		/*
-		 * Some hypervisors fail to filter out unsupported
-		 * extended features.  For now, disable the
-		 * extensions, activation of which requires setting a
-		 * bit in CR4, and which VM monitors do not support.
-		 */
-		if (cpu_feature2 & CPUID2_HV) {
-			cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
-			    CPUID_STDEXT_SMEP;
-		} else
-			cpu_stdext_disable = 0;
-		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
-		cpu_stdext_feature &= ~cpu_stdext_disable;
-		cpu_stdext_feature2 = regs[2];
-	}
-
 #ifdef __i386__
 	if (cpu_high > 0 &&
 	    (cpu_vendor_id == CPU_VENDOR_INTEL ||
@@ -1457,6 +1539,7 @@
 	if (cpu_exthigh >= 0x80000008) {
 		do_cpuid(0x80000008, regs);
 		cpu_maxphyaddr = regs[0] & 0xff;
+		amd_extended_feature_extensions = regs[1];
 		cpu_procinfo2 = regs[2];
 	} else {
 		cpu_maxphyaddr = (cpu_feature & CPUID_PAE) != 0 ? 36 : 32;
@@ -1550,18 +1633,26 @@
 			return;
 		}
 	}
-#else
-	/* XXX */
-	cpu = CPU_CLAWHAMMER;
 #endif
 }
 
+int
+pti_get_default(void)
+{
+
+	if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
+		return (0);
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
+		return (0);
+	return (1);
+}
+
 static u_int
 find_cpu_vendor_id(void)
 {
 	int	i;
 
-	for (i = 0; i < sizeof(cpu_vendors) / sizeof(cpu_vendors[0]); i++)
+	for (i = 0; i < nitems(cpu_vendors); i++)
 		if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0)
 			return (cpu_vendors[i].vendor_id);
 	return (0);
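The feature banners printed above use the kernel printf(9) "%b" conversion: the leading \020 byte selects hexadecimal output and each following \NNN octal escape gives a 1-based bit number, so "\021AVX512F" (17) tags bit 16 of CPUID leaf 7 EBX. A small userland re-implementation of that decoding, only to make the encoding concrete; this is a sketch, not the kernel's printf code.

#include <stdio.h>

/*
 * Decode a kernel-style "%b" bit-name string: fmt[0] is the numeric base
 * (\20 means hex; this sketch always prints hex), then each entry is a
 * raw bit-number byte, counted from 1, followed by the bit's name.
 */
static void
print_bits(unsigned int value, const char *fmt)
{
	int any = 0;

	printf("0x%x", value);
	fmt++;					/* skip the base byte */
	while (*fmt != '\0') {
		int bit = (unsigned char)*fmt++;
		const char *name = fmt;

		while (*fmt > 32)		/* name runs until next bit byte */
			fmt++;
		if (value & (1u << (bit - 1)))
			printf("%c%.*s", any++ ? ',' : '<',
			    (int)(fmt - name), name);
	}
	if (any)
		printf(">");
	printf("\n");
}

int
main(void)
{
	/* Bits 16 (AVX512F) and 18 (RDSEED) set, matching the table above. */
	print_bits((1u << 16) | (1u << 18), "\20\021AVX512F\023RDSEED");
	return (0);
}

Run as written it prints 0x50000<AVX512F,RDSEED>, matching the dmesg style of the banners above.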

Modified: trunk/sys/x86/x86/intr_machdep.c
===================================================================
--- trunk/sys/x86/x86/intr_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/intr_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/x86/intr_machdep.c 307244 2016-10-14 02:03:53Z sephe $
+ * $FreeBSD: stable/11/sys/x86/x86/intr_machdep.c 340016 2018-11-01 18:34:26Z jhb $
  */
 
 /*
@@ -37,6 +37,7 @@
 
 #include "opt_atpic.h"
 #include "opt_ddb.h"
+#include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,6 +45,7 @@
 #include <sys/ktr.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
@@ -50,6 +52,7 @@
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/vmmeter.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
 #include <machine/smp.h>
@@ -65,7 +68,7 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #endif
 
@@ -74,22 +77,26 @@
 typedef void (*mask_fn)(void *);
 
 static int intrcnt_index;
-static struct intsrc *interrupt_sources[NUM_IO_INTS];
+static struct intsrc **interrupt_sources;
 static struct sx intrsrc_lock;
 static struct mtx intrpic_lock;
 static struct mtx intrcnt_lock;
 static TAILQ_HEAD(pics_head, pic) pics;
+u_int num_io_irqs;
 
-#ifdef SMP
+#if defined(SMP) && !defined(EARLY_AP_STARTUP)
 static int assign_cpu;
 #endif
 
-u_long intrcnt[INTRCNT_COUNT];
-char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)];
+u_long *intrcnt;
+char *intrnames;
 size_t sintrcnt = sizeof(intrcnt);
 size_t sintrnames = sizeof(intrnames);
+int nintrcnt;
 
-static int	intr_assign_cpu(void *arg, u_char cpu);
+static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
+
+static int	intr_assign_cpu(void *arg, int cpu);
 static void	intr_disable_src(void *arg);
 static void	intr_init(void *__dummy);
 static int	intr_pic_registered(struct pic *pic);
@@ -97,6 +104,18 @@
 static void	intrcnt_updatename(struct intsrc *is);
 static void	intrcnt_register(struct intsrc *is);
 
+/*
+ * SYSINIT levels for SI_SUB_INTR:
+ *
+ * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
+ * SI_ORDER_SECOND: Xen PICs
+ * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
+ * SI_ORDER_FOURTH: Add 8259A PICs
+ * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
+ * SI_ORDER_MIDDLE: SMP interrupt counters
+ * SI_ORDER_ANY: Enable interrupts on BSP
+ */
+
 static int
 intr_pic_registered(struct pic *pic)
 {
@@ -132,6 +151,56 @@
 }
 
 /*
+ * Allocate interrupt source arrays and register interrupt sources
+ * once the number of interrupts is known.
+ */
+static void
+intr_init_sources(void *arg)
+{
+	struct pic *pic;
+
+	MPASS(num_io_irqs > 0);
+
+	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
+	    M_INTR, M_WAITOK | M_ZERO);
+
+	/*
+	 * - 1 ??? dummy counter.
+	 * - 2 counters for each I/O interrupt.
+	 * - 1 counter for each CPU for lapic timer.
+	 * - 1 counter for each CPU for the Hyper-V vmbus driver.
+	 * - 8 counters for each CPU for IPI counters for SMP.
+	 */
+	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
+#ifdef COUNT_IPIS
+	if (mp_ncpus > 1)
+		nintrcnt += 8 * mp_ncpus;
+#endif
+	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
+	    M_ZERO);
+	intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
+	    M_ZERO);
+	sintrcnt = nintrcnt * sizeof(u_long);
+	sintrnames = nintrcnt * (MAXCOMLEN + 1);
+
+	intrcnt_setname("???", 0);
+	intrcnt_index = 1;
+
+	/*
+	 * NB: intrpic_lock is not held here to avoid LORs due to
+	 * malloc() in intr_register_source().  However, we are still
+	 * single-threaded at this point in startup so the list of
+	 * PICs shouldn't change.
+	 */
+	TAILQ_FOREACH(pic, &pics, pics) {
+		if (pic->pic_register_sources != NULL)
+			pic->pic_register_sources(pic);
+	}
+}
+SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
+    NULL);
+
+/*
  * Register a new interrupt source with the global interrupt system.
  * The global interrupts need to be disabled when this function is
  * called.
@@ -143,6 +212,8 @@
 
 	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
 	vector = isrc->is_pic->pic_vector(isrc);
+	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
+	    num_io_irqs));
 	if (interrupt_sources[vector] != NULL)
 		return (EEXIST);
 	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
@@ -168,6 +239,8 @@
 intr_lookup_source(int vector)
 {
 
+	if (vector < 0 || vector >= num_io_irqs)
+		return (NULL);
 	return (interrupt_sources[vector]);
 }
 
@@ -308,17 +381,24 @@
 }
 
 static int
-intr_assign_cpu(void *arg, u_char cpu)
+intr_assign_cpu(void *arg, int cpu)
 {
 #ifdef SMP
 	struct intsrc *isrc;
 	int error;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+
+	/* Nothing to do if there is only a single CPU. */
+	if (mp_ncpus > 1 && cpu != NOCPU) {
+#else
 	/*
 	 * Don't do anything during early boot.  We will pick up the
 	 * assignment once the APs are started.
 	 */
 	if (assign_cpu && cpu != NOCPU) {
+#endif
 		isrc = arg;
 		sx_xlock(&intrsrc_lock);
 		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
@@ -353,6 +433,7 @@
 
 	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
 	mtx_lock_spin(&intrcnt_lock);
+	MPASS(intrcnt_index + 2 <= nintrcnt);
 	is->is_index = intrcnt_index;
 	intrcnt_index += 2;
 	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
@@ -369,6 +450,7 @@
 {
 
 	mtx_lock_spin(&intrcnt_lock);
+	MPASS(intrcnt_index < nintrcnt);
 	*countp = &intrcnt[intrcnt_index];
 	intrcnt_setname(name, intrcnt_index);
 	intrcnt_index++;
@@ -379,8 +461,6 @@
 intr_init(void *dummy __unused)
 {
 
-	intrcnt_setname("???", 0);
-	intrcnt_index = 1;
 	TAILQ_INIT(&pics);
 	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
 	sx_init(&intrsrc_lock, "intrsrc");
@@ -388,6 +468,21 @@
 }
 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
 
+static void
+intr_init_final(void *dummy __unused)
+{
+
+	/*
+	 * Enable interrupts on the BSP after all of the interrupt
+	 * controllers are initialized.  Device interrupts are still
+	 * disabled in the interrupt controllers until interrupt
+	 * handlers are registered.  Interrupts are enabled on each AP
+	 * after their first context switch.
+	 */
+	enable_intr();
+}
+SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
+
 #ifndef DEV_ATPIC
 /* Initialize the two 8259A's to a known-good shutdown state. */
 void
@@ -427,6 +522,23 @@
 	return (0);
 }
 
+void
+intr_reprogram(void)
+{
+	struct intsrc *is;
+	u_int v;
+
+	sx_xlock(&intrsrc_lock);
+	for (v = 0; v < num_io_irqs; v++) {
+		is = interrupt_sources[v];
+		if (is == NULL)
+			continue;
+		if (is->is_pic->pic_reprogram_pin != NULL)
+			is->is_pic->pic_reprogram_pin(is);
+	}
+	sx_xunlock(&intrsrc_lock);
+}
+
 #ifdef DDB
 /*
  * Dump data about interrupt handlers
@@ -434,7 +546,8 @@
 DB_SHOW_COMMAND(irqs, db_show_irqs)
 {
 	struct intsrc **isrc;
-	int i, verbose;
+	u_int i;
+	int verbose;
 
 	if (strcmp(modif, "v") == 0)
 		verbose = 1;
@@ -441,7 +554,7 @@
 	else
 		verbose = 0;
 	isrc = interrupt_sources;
-	for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++)
+	for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
 		if (*isrc != NULL)
 			db_dump_intr_event((*isrc)->is_event, verbose);
 }
@@ -453,7 +566,7 @@
  * allocate CPUs round-robin.
  */
 
-static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
+cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
 static int current_cpu;
 
 /*
@@ -465,9 +578,15 @@
 {
 	u_int apic_id;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+	if (mp_ncpus == 1)
+		return (PCPU_GET(apic_id));
+#else
 	/* Leave all interrupts on the BSP during boot. */
 	if (!assign_cpu)
 		return (PCPU_GET(apic_id));
+#endif
 
 	mtx_lock_spin(&icu_lock);
 	apic_id = cpu_apic_ids[current_cpu];
@@ -509,6 +628,7 @@
 	CPU_SET(cpu, &intr_cpus);
 }
 
+#ifndef EARLY_AP_STARTUP
 /*
  * Distribute all the interrupt sources among the available CPUs once the
  * AP's have been launched.
@@ -517,15 +637,8 @@
 intr_shuffle_irqs(void *arg __unused)
 {
 	struct intsrc *isrc;
-	int i;
+	u_int i;
 
-#ifdef XEN
-	/*
-	 * Doesn't work yet
-	 */
-	return;
-#endif
-
 	/* Don't bother on UP. */
 	if (mp_ncpus == 1)
 		return;
@@ -533,7 +646,7 @@
 	/* Round-robin assign a CPU to each enabled source. */
 	sx_xlock(&intrsrc_lock);
 	assign_cpu = 1;
-	for (i = 0; i < NUM_IO_INTS; i++) {
+	for (i = 0; i < num_io_irqs; i++) {
 		isrc = interrupt_sources[i];
 		if (isrc != NULL && isrc->is_handlers > 0) {
 			/*
@@ -556,6 +669,7 @@
 }
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
     NULL);
+#endif
 #else
 /*
  * Always route interrupts to the current processor in the UP case.
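The counter-budget comment in intr_init_sources() above is plain arithmetic: one dummy slot, two slots per I/O interrupt, two per CPU, plus eight more per CPU when COUNT_IPIS is configured on an SMP kernel. A tiny sketch of the same computation with made-up numbers (48 I/O IRQs, 4 CPUs; the figures are illustrative only):

#include <stdio.h>

/* Mirror the counter-budget arithmetic from intr_init_sources(). */
static int
count_intrcnt(unsigned int num_io_irqs, int mp_ncpus, int count_ipis)
{
	int n;

	n = 1 + num_io_irqs * 2 + mp_ncpus * 2;	/* dummy + 2/IRQ + 2/CPU */
	if (count_ipis && mp_ncpus > 1)
		n += 8 * mp_ncpus;		/* per-CPU IPI counters */
	return (n);
}

int
main(void)
{
	/* Example: 48 I/O IRQs, 4 CPUs -> 1 + 96 + 8 = 105 (137 with IPIs). */
	printf("%d %d\n", count_intrcnt(48, 4, 0), count_intrcnt(48, 4, 1));
	return (0);
}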

Modified: trunk/sys/x86/x86/io_apic.c
===================================================================
--- trunk/sys/x86/x86/io_apic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/io_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,8 +26,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/io_apic.c 330959 2018-03-14 23:59:52Z marius $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/io_apic.c 340016 2018-11-01 18:34:26Z jhb $");
 
+#include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
@@ -38,6 +39,7 @@
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
+#include <sys/rman.h>
 #include <sys/sysctl.h>
 
 #include <dev/pci/pcireg.h>
@@ -49,9 +51,10 @@
 #include <x86/apicreg.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/resource.h>
 #include <machine/segments.h>
+#include <x86/iommu/iommu_intrmap.h>
 
 #define IOAPIC_ISA_INTS		16
 #define	IOAPIC_MEM_REGION	32
@@ -58,11 +61,6 @@
 #define	IOAPIC_REDTBL_LO(i)	(IOAPIC_REDTBL + (i) * 2)
 #define	IOAPIC_REDTBL_HI(i)	(IOAPIC_REDTBL_LO(i) + 1)
 
-#define	IRQ_EXTINT		(NUM_IO_INTS + 1)
-#define	IRQ_NMI			(NUM_IO_INTS + 2)
-#define	IRQ_SMI			(NUM_IO_INTS + 3)
-#define	IRQ_DISABLED		(NUM_IO_INTS + 4)
-
 static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
 
 /*
@@ -81,15 +79,16 @@
 
 struct ioapic_intsrc {
 	struct intsrc io_intsrc;
-	u_int io_irq;
+	int io_irq;
 	u_int io_intpin:8;
 	u_int io_vector:8;
-	u_int io_cpu:8;
+	u_int io_cpu;
 	u_int io_activehi:1;
 	u_int io_edgetrigger:1;
 	u_int io_masked:1;
 	int io_bus:4;
 	uint32_t io_lowreg;
+	u_int io_remap_cookie;
 };
 
 struct ioapic {
@@ -98,9 +97,13 @@
 	u_int io_apic_id:4;
 	u_int io_intbase:8;		/* System Interrupt base */
 	u_int io_numintr:8;
+	u_int io_haseoi:1;
 	volatile ioapic_t *io_addr;	/* XXX: should use bus_space */
 	vm_paddr_t io_paddr;
 	STAILQ_ENTRY(ioapic) io_next;
+	device_t pci_dev;		/* matched pci device, if found */
+	struct resource *pci_wnd;	/* BAR 0, should be same or alias to
+					   io_paddr */
 	struct ioapic_intsrc io_pins[0];
 };
 
@@ -108,6 +111,7 @@
 static void	ioapic_write(volatile ioapic_t *apic, int reg, u_int val);
 static const char *ioapic_bus_string(int bus_type);
 static void	ioapic_print_irq(struct ioapic_intsrc *intpin);
+static void	ioapic_register_sources(struct pic *pic);
 static void	ioapic_enable_source(struct intsrc *isrc);
 static void	ioapic_disable_source(struct intsrc *isrc, int eoi);
 static void	ioapic_eoi_source(struct intsrc *isrc);
@@ -120,27 +124,79 @@
 static void	ioapic_resume(struct pic *pic, bool suspend_cancelled);
 static int	ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void	ioapic_program_intpin(struct ioapic_intsrc *intpin);
+static void	ioapic_reprogram_intpin(struct intsrc *isrc);
 
 static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
-struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source,
-			       ioapic_eoi_source, ioapic_enable_intr,
-			       ioapic_disable_intr, ioapic_vector,
-			       ioapic_source_pending, NULL, ioapic_resume,
-			       ioapic_config_intr, ioapic_assign_cpu };
+struct pic ioapic_template = {
+	.pic_register_sources = ioapic_register_sources,
+	.pic_enable_source = ioapic_enable_source,
+	.pic_disable_source = ioapic_disable_source,
+	.pic_eoi_source = ioapic_eoi_source,
+	.pic_enable_intr = ioapic_enable_intr,
+	.pic_disable_intr = ioapic_disable_intr,
+	.pic_vector = ioapic_vector,
+	.pic_source_pending = ioapic_source_pending,
+	.pic_suspend = NULL,
+	.pic_resume = ioapic_resume,
+	.pic_config_intr = ioapic_config_intr,
+	.pic_assign_cpu = ioapic_assign_cpu,
+	.pic_reprogram_pin = ioapic_reprogram_intpin,
+};
 
-static int next_ioapic_base;
+static u_int next_ioapic_base;
 static u_int next_id;
 
-static SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
 static int enable_extint;
 SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0,
     "Enable the ExtINT pin in the first I/O APIC");
-TUNABLE_INT("hw.apic.enable_extint", &enable_extint);
 
-static __inline void
-_ioapic_eoi_source(struct intsrc *isrc)
+static void
+_ioapic_eoi_source(struct intsrc *isrc, int locked)
 {
+	struct ioapic_intsrc *src;
+	struct ioapic *io;
+	volatile uint32_t *apic_eoi;
+	uint32_t low1;
+
 	lapic_eoi();
+	if (!lapic_eoi_suppression)
+		return;
+	src = (struct ioapic_intsrc *)isrc;
+	if (src->io_edgetrigger)
+		return;
+	io = (struct ioapic *)isrc->is_pic;
+
+	/*
+	 * Handle targeted EOI for level-triggered pins, if broadcast
+	 * EOI suppression is supported by LAPICs.
+	 */
+	if (io->io_haseoi) {
+		/*
+		 * If IOAPIC has EOI Register, simply write vector
+		 * number into the reg.
+		 */
+		apic_eoi = (volatile uint32_t *)((volatile char *)
+		    io->io_addr + IOAPIC_EOIR);
+		*apic_eoi = src->io_vector;
+	} else {
+		/*
+	 * Otherwise, if the IO-APIC is too old to provide EOIR,
+	 * do what Intel did for the Linux kernel.  Temporarily
+		 * switch the pin to edge-trigger and back, masking
+		 * the pin during the trick.
+		 */
+		if (!locked)
+			mtx_lock_spin(&icu_lock);
+		low1 = src->io_lowreg;
+		low1 &= ~IOART_TRGRLVL;
+		low1 |= IOART_TRGREDG | IOART_INTMSET;
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+		    low1);
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+		    src->io_lowreg);
+		if (!locked)
+			mtx_unlock_spin(&icu_lock);
+	}
 }
 
 static u_int
@@ -195,7 +251,7 @@
 		printf("SMI");
 		break;
 	default:
-		printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus),
+		printf("%s IRQ %d", ioapic_bus_string(intpin->io_bus),
 		    intpin->io_irq);
 	}
 }
@@ -233,7 +289,7 @@
 	}
 
 	if (eoi == PIC_EOI)
-		_ioapic_eoi_source(isrc);
+		_ioapic_eoi_source(isrc, 1);
 
 	mtx_unlock_spin(&icu_lock);
 }
@@ -242,7 +298,7 @@
 ioapic_eoi_source(struct intsrc *isrc)
 {
 
-	_ioapic_eoi_source(isrc);
+	_ioapic_eoi_source(isrc, 0);
 }
 
 /*
@@ -254,6 +310,9 @@
 {
 	struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic;
 	uint32_t low, high;
+#ifdef ACPI_DMAR
+	int error;
+#endif
 
 	/*
 	 * If a pin is completely invalid or if it is valid but hasn't
@@ -260,7 +319,7 @@
 	 * been enabled yet, just ensure that the pin is masked.
 	 */
 	mtx_assert(&icu_lock, MA_OWNED);
-	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS &&
+	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq >= 0 &&
 	    intpin->io_vector == 0)) {
 		low = ioapic_read(io->io_addr,
 		    IOAPIC_REDTBL_LO(intpin->io_intpin));
@@ -268,9 +327,34 @@
 			ioapic_write(io->io_addr,
 			    IOAPIC_REDTBL_LO(intpin->io_intpin),
 			    low | IOART_INTMSET);
+#ifdef ACPI_DMAR
+		mtx_unlock_spin(&icu_lock);
+		iommu_unmap_ioapic_intr(io->io_apic_id,
+		    &intpin->io_remap_cookie);
+		mtx_lock_spin(&icu_lock);
+#endif
 		return;
 	}
 
+#ifdef ACPI_DMAR
+	mtx_unlock_spin(&icu_lock);
+	error = iommu_map_ioapic_intr(io->io_apic_id,
+	    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+	    intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie,
+	    &high, &low);
+	mtx_lock_spin(&icu_lock);
+	if (error == 0) {
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin),
+		    high);
+		intpin->io_lowreg = low;
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
+		    low);
+		return;
+	} else if (error != EOPNOTSUPP) {
+		return;
+	}
+#endif
+
 	/*
 	 * Set the destination.  Note that with Intel interrupt remapping,
 	 * the previously reserved bits 55:48 now have a purpose so ensure
@@ -318,6 +402,15 @@
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
 }
 
+static void
+ioapic_reprogram_intpin(struct intsrc *isrc)
+{
+
+	mtx_lock_spin(&icu_lock);
+	ioapic_program_intpin((struct ioapic_intsrc *)isrc);
+	mtx_unlock_spin(&icu_lock);
+}
+
 static int
 ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
@@ -537,6 +630,8 @@
 	io = malloc(sizeof(struct ioapic) +
 	    numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
 	io->io_pic = ioapic_template;
+	io->pci_dev = NULL;
+	io->pci_wnd = NULL;
 	mtx_lock_spin(&icu_lock);
 	io->io_id = next_id++;
 	io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
@@ -557,11 +652,29 @@
 		    io->io_id, intbase, next_ioapic_base);
 	io->io_intbase = intbase;
 	next_ioapic_base = intbase + numintr;
+	if (next_ioapic_base > num_io_irqs)
+		num_io_irqs = next_ioapic_base;
 	io->io_numintr = numintr;
 	io->io_addr = apic;
 	io->io_paddr = addr;
 
+	if (bootverbose) {
+		printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id,
+		    (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR)
+		    >> MAXREDIRSHIFT);
+	}
 	/*
+	 * The summary information about IO-APIC versions is taken from
+	 * the Linux kernel source:
+	 *     0Xh     82489DX
+	 *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+	 *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
+	 *     30h-FFh Reserved
+	 * IO-APICs with version >= 0x20 have working EOIR register.
+	 */
+	io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20;
+
+	/*
 	 * Initialize pins.  Start off with interrupts disabled.  Default
 	 * to active-hi and edge-triggered for ISA interrupts and active-lo
 	 * and level-triggered for all others.
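
The io_haseoi test above keys off the low byte of the IO-APIC version register, and the redirection-entry count sits one byte higher. A small decode of an assumed register value, with illustrative masks standing in for the apicreg.h ones:

#include <stdint.h>
#include <stdio.h>

#define VER_VERSION	0x000000ff	/* illustrative mask */
#define VER_MAXREDIR	0x00ff0000	/* illustrative mask */
#define MAXREDIRSHIFT	16

int
main(void)
{
	uint32_t value = 0x00170020;	/* assumed: 24 pins, version 0x20 */
	int haseoi;

	haseoi = (value & VER_VERSION) >= 0x20;
	printf("ver 0x%02x maxredir 0x%02x haseoi %d numintr %d\n",
	    value & VER_VERSION,
	    (value & VER_MAXREDIR) >> MAXREDIRSHIFT, haseoi,
	    (int)((value & VER_MAXREDIR) >> MAXREDIRSHIFT) + 1);
	return (0);
}
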
@@ -599,6 +712,15 @@
 		intpin->io_cpu = PCPU_GET(apic_id);
 		value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
 		ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
+#ifdef ACPI_DMAR
+		/* dummy, but sets cookie */
+		mtx_unlock_spin(&icu_lock);
+		iommu_map_ioapic_intr(io->io_apic_id,
+		    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+		    intpin->io_activehi, intpin->io_irq,
+		    &intpin->io_remap_cookie, NULL, NULL);
+		mtx_lock_spin(&icu_lock);
+#endif
 	}
 	mtx_unlock_spin(&icu_lock);
 
@@ -640,7 +762,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || vector < 0)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_irq = vector;
 	if (bootverbose)
@@ -659,7 +781,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	if (io->io_pins[pin].io_bus == bus_type)
 		return (0);
@@ -680,7 +802,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_NMI)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_NMI;
@@ -703,7 +825,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_SMI)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_SMI;
@@ -726,7 +848,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_EXTINT)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_EXTINT;
@@ -751,7 +873,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	activehi = (pol == INTR_POLARITY_HIGH);
 	if (io->io_pins[pin].io_activehi == activehi)
@@ -772,7 +894,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 	if (io->io_pins[pin].io_edgetrigger == edgetrigger)
@@ -808,14 +930,26 @@
 
 	/*
 	 * Reprogram pins to handle special case pins (such as NMI and
-	 * SMI) and register valid pins as interrupt sources.
+	 * SMI) and disable normal pins until a handler is registered.
 	 */
 	intr_register_pic(&io->io_pic);
+	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++)
+		ioapic_reprogram_intpin(&pin->io_intsrc);
+}
+
+/*
+ * Add interrupt sources for I/O APIC interrupt pins.
+ */
+static void
+ioapic_register_sources(struct pic *pic)
+{
+	struct ioapic_intsrc *pin;
+	struct ioapic *io;
+	int i;
+
+	io = (struct ioapic *)pic;
 	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) {
-		mtx_lock_spin(&icu_lock);
-		ioapic_program_intpin(pin);
-		mtx_unlock_spin(&icu_lock);
-		if (pin->io_irq < NUM_IO_INTS)
+		if (pin->io_irq >= 0)
 			intr_register_source(&pin->io_intsrc);
 	}
 }
@@ -846,7 +980,72 @@
 static int
 ioapic_pci_attach(device_t dev)
 {
+	struct resource *res;
+	volatile ioapic_t *apic;
+	struct ioapic *io;
+	int rid;
+	u_int apic_id;
 
+	/*
+	 * Try to match the enumerated ioapic.  Match BAR start
+	 * against io_paddr.  In case the PCI window is not the same
+	 * as the MADT-reported io window but only an alias, read the
+	 * APIC ID from the mapped BAR and match against it.
+	 */
+	rid = PCIR_BAR(0);
+	res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
+	    RF_ACTIVE | RF_SHAREABLE);
+	if (res == NULL) {
+		if (bootverbose)
+			device_printf(dev, "cannot activate BAR0\n");
+		return (ENXIO);
+	}
+	apic = (volatile ioapic_t *)rman_get_virtual(res);
+	if (rman_get_size(res) < IOAPIC_WND_SIZE) {
+		if (bootverbose)
+			device_printf(dev,
+			    "BAR0 too small (%jd) for IOAPIC window\n",
+			    (uintmax_t)rman_get_size(res));
+		goto fail;
+	}
+	mtx_lock_spin(&icu_lock);
+	apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
+	/* First match by io window address */
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_paddr == (vm_paddr_t)rman_get_start(res))
+			goto found;
+	}
+	/* Then by apic id */
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_apic_id == apic_id)
+			goto found;
+	}
+	mtx_unlock_spin(&icu_lock);
+	if (bootverbose)
+		device_printf(dev,
+		    "cannot match pci bar apic id %d against MADT\n",
+		    apic_id);
+fail:
+	bus_release_resource(dev, SYS_RES_MEMORY, rid, res);
+	return (ENXIO);
+found:
+	KASSERT(io->pci_dev == NULL,
+	    ("ioapic %d pci_dev not NULL", io->io_id));
+	KASSERT(io->pci_wnd == NULL,
+	    ("ioapic %d pci_wnd not NULL", io->io_id));
+
+	io->pci_dev = dev;
+	io->pci_wnd = res;
+	if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) ||
+	    io->io_apic_id != apic_id)) {
+		device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d "
+		    "MADT id %d paddr@%jx\n",
+		    pci_get_domain(dev), pci_get_bus(dev),
+		    pci_get_slot(dev), pci_get_function(dev),
+		    (uintmax_t)rman_get_start(res), apic_id,
+		    io->io_apic_id, (uintmax_t)io->io_paddr);
+	}
+	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
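ioapic_pci_attach() above matches the PCI-enumerated IO-APIC against the MADT-enumerated list twice: first by the BAR's physical start address, then, in case the BAR is only an alias of the MADT window, by the APIC ID read through that BAR. A compact model of the two-pass search over a toy table:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct io_entry {
	uint64_t paddr;		/* MADT-reported window */
	unsigned apic_id;	/* ID read at enumeration time */
};

static struct io_entry table[] = {
	{ 0xfec00000, 8 },
	{ 0xfec01000, 9 },
};

static struct io_entry *
match(uint64_t bar_start, unsigned bar_apic_id)
{
	size_t i;

	/* First pass: match by window address. */
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].paddr == bar_start)
			return (&table[i]);
	/* Second pass: the BAR may be an alias, so match by APIC ID. */
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].apic_id == bar_apic_id)
			return (&table[i]);
	return (NULL);
}

int
main(void)
{
	struct io_entry *io = match(0xfee00000, 9);	/* aliased window */

	if (io != NULL)
		printf("matched MADT id %u paddr 0x%jx\n", io->apic_id,
		    (uintmax_t)io->paddr);
	return (0);
}
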
@@ -863,6 +1062,28 @@
 static devclass_t ioapic_devclass;
 DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
 
+int
+ioapic_get_rid(u_int apic_id, uint16_t *ridp)
+{
+	struct ioapic *io;
+	uintptr_t rid;
+	int error;
+
+	mtx_lock_spin(&icu_lock);
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_apic_id == apic_id)
+			break;
+	}
+	mtx_unlock_spin(&icu_lock);
+	if (io == NULL || io->pci_dev == NULL)
+		return (EINVAL);
+	error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid);
+	if (error != 0)
+		return (error);
+	*ridp = rid;
+	return (0);
+}
+
 /*
  * A new-bus driver to consume the memory resources associated with
  * the APICs in the system.  On some systems ACPI or PnPBIOS system
@@ -896,19 +1117,11 @@
 {
 	int error;
 
-#ifdef PAE
-	/*
-	 * Resources use long's to track resources, so we can't
-	 * include memory regions above 4GB.
-	 */
-	if (base >= ~0ul)
-		return;
-#endif
 	error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
 	if (error)
 		panic("apic_add_resource: resource %d failed set with %d", rid,
 		    error);
-	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
+	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE);
 }
 
 static int
@@ -918,7 +1131,7 @@
 	int i;
 
 	/* Reserve the local APIC. */
-	apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t));
+	apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION);
 	i = 1;
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION);

Modified: trunk/sys/x86/x86/legacy.c
===================================================================
--- trunk/sys/x86/x86/legacy.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/legacy.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,7 +33,7 @@
 #include "opt_mca.h"
 #endif
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
 
 /*
  * This code implements a system driver for legacy systems that do not

Modified: trunk/sys/x86/x86/local_apic.c
===================================================================
--- trunk/sys/x86/x86/local_apic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/local_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,11 +33,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/local_apic.c 314662 2017-03-04 12:04:24Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/local_apic.c 351757 2019-09-03 16:27:23Z emaste $");
 
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 
 #include "opt_ddb.h"
 
@@ -51,6 +50,7 @@
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
+#include <sys/sysctl.h>
 #include <sys/timeet.h>
 
 #include <vm/vm.h>
@@ -58,14 +58,16 @@
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
+#include <machine/cpufunc.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
+#include <x86/init.h>
 
 #ifdef DDB
 #include <sys/interrupt.h>
@@ -88,12 +90,24 @@
 CTASSERT(APIC_LOCAL_INTS == 240);
 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
 
-/* Magic IRQ values for the timer and syscalls. */
-#define	IRQ_TIMER	(NUM_IO_INTS + 1)
-#define	IRQ_SYSCALL	(NUM_IO_INTS + 2)
-#define	IRQ_DTRACE_RET	(NUM_IO_INTS + 3)
-#define	IRQ_EVTCHN	(NUM_IO_INTS + 4)
+/*
+ * I/O interrupts use non-negative IRQ values.  The negative values
+ * below are used to mark unused IDT entries or IDT entries reserved
+ * for a non-I/O interrupt.
+ */
+#define	IRQ_FREE	-1
+#define	IRQ_TIMER	-2
+#define	IRQ_SYSCALL	-3
+#define	IRQ_DTRACE_RET	-4
+#define	IRQ_EVTCHN	-5
 
+enum lat_timer_mode {
+	LAT_MODE_UNDEF =	0,
+	LAT_MODE_PERIODIC =	1,
+	LAT_MODE_ONESHOT =	2,
+	LAT_MODE_DEADLINE =	3,
+};
+
 /*
  * Support for local APICs.  Local APICs manage interrupts on each
  * individual processor as opposed to I/O APICs which receive interrupts
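
The old scheme parked the timer, syscall, DTrace-return and event-channel markers above NUM_IO_INTS; moving them into the negative range makes "is this a real I/O interrupt" a plain sign test, which is what the io_irq < 0 checks elsewhere in this commit rely on. A tiny classifier using the same sentinels:

#include <stdio.h>

#define IRQ_FREE	-1
#define IRQ_TIMER	-2
#define IRQ_SYSCALL	-3
#define IRQ_DTRACE_RET	-4
#define IRQ_EVTCHN	-5

static const char *
classify(int irq)
{
	if (irq >= 0)
		return ("I/O interrupt");
	switch (irq) {
	case IRQ_FREE:
		return ("unused IDT entry");
	case IRQ_TIMER:
		return ("LAPIC timer");
	case IRQ_SYSCALL:
		return ("syscall gate");
	case IRQ_DTRACE_RET:
		return ("DTrace return");
	case IRQ_EVTCHN:
		return ("Xen event channel");
	default:
		return ("unknown sentinel");
	}
}

int
main(void)
{
	int samples[] = { 17, IRQ_FREE, IRQ_TIMER };
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%d -> %s\n", samples[i], classify(samples[i]));
	return (0);
}
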
@@ -114,14 +128,16 @@
 
 struct lapic {
 	struct lvt la_lvts[APIC_LVT_MAX + 1];
+	struct lvt la_elvts[APIC_ELVT_MAX + 1];
 	u_int la_id:8;
 	u_int la_cluster:4;
 	u_int la_cluster_id:2;
 	u_int la_present:1;
 	u_long *la_timer_count;
-	u_long la_timer_period;
-	u_int la_timer_mode;
-	uint32_t lvt_timer_cache;
+	uint64_t la_timer_period;
+	enum lat_timer_mode la_timer_mode;
+	uint32_t lvt_timer_base;
+	uint32_t lvt_timer_last;
 	/* Include IDT_SYSCALL to make indexing easier. */
 	int la_ioint_irqs[APIC_NUM_IOINTS + 1];
 } static lapics[MAX_APIC_ID + 1];
@@ -137,6 +153,14 @@
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
+/* Global defaults for AMD local APIC ELVT entries. */
+static struct lvt elvts[APIC_ELVT_MAX + 1] = {
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+};
+
 static inthand_t *ioint_handlers[] = {
 	NULL,			/* 0 - 31 */
 	IDTVEC(apic_isr1),	/* 32 - 63 */
@@ -148,6 +172,16 @@
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
+static inthand_t *ioint_pti_handlers[] = {
+	NULL,			/* 0 - 31 */
+	IDTVEC(apic_isr1_pti),	/* 32 - 63 */
+	IDTVEC(apic_isr2_pti),	/* 64 - 95 */
+	IDTVEC(apic_isr3_pti),	/* 96 - 127 */
+	IDTVEC(apic_isr4_pti),	/* 128 - 159 */
+	IDTVEC(apic_isr5_pti),	/* 160 - 191 */
+	IDTVEC(apic_isr6_pti),	/* 192 - 223 */
+	IDTVEC(apic_isr7_pti),	/* 224 - 255 */
+};
 
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
@@ -154,42 +188,223 @@
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
 
-volatile lapic_t *lapic;
+volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
-static u_long lapic_timer_divisor;
+int x2apic_mode;
+int lapic_eoi_suppression;
+static int lapic_timer_tsc_deadline;
+static u_long lapic_timer_divisor, count_freq;
 static struct eventtimer lapic_et;
 #ifdef SMP
 static uint64_t lapic_ipi_wait_mult;
 #endif
 
+SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
+SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
+    &lapic_eoi_suppression, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
+    &lapic_timer_tsc_deadline, 0, "");
+
+static void lapic_calibrate_initcount(struct lapic *la);
+static void lapic_calibrate_deadline(struct lapic *la);
+
+static uint32_t
+lapic_read32(enum LAPIC_REGISTERS reg)
+{
+	uint32_t res;
+
+	if (x2apic_mode) {
+		res = rdmsr32(MSR_APIC_000 + reg);
+	} else {
+		res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
+	}
+	return (res);
+}
+
+static void
+lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+	if (x2apic_mode) {
+		mfence();
+		lfence();
+		wrmsr(MSR_APIC_000 + reg, val);
+	} else {
+		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+	}
+}
+
+static void
+lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+	if (x2apic_mode) {
+		wrmsr(MSR_APIC_000 + reg, val);
+	} else {
+		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+	}
+}
+
+#ifdef SMP
+static uint64_t
+lapic_read_icr(void)
+{
+	uint64_t v;
+	uint32_t vhi, vlo;
+
+	if (x2apic_mode) {
+		v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
+	} else {
+		vhi = lapic_read32(LAPIC_ICR_HI);
+		vlo = lapic_read32(LAPIC_ICR_LO);
+		v = ((uint64_t)vhi << 32) | vlo;
+	}
+	return (v);
+}
+
+static uint64_t
+lapic_read_icr_lo(void)
+{
+
+	return (lapic_read32(LAPIC_ICR_LO));
+}
+
+static void
+lapic_write_icr(uint32_t vhi, uint32_t vlo)
+{
+	uint64_t v;
+
+	if (x2apic_mode) {
+		v = ((uint64_t)vhi << 32) | vlo;
+		mfence();
+		wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
+	} else {
+		lapic_write32(LAPIC_ICR_HI, vhi);
+		lapic_write32(LAPIC_ICR_LO, vlo);
+	}
+}
+#endif /* SMP */
+
+static void
+native_lapic_enable_x2apic(void)
+{
+	uint64_t apic_base;
+
+	apic_base = rdmsr(MSR_APICBASE);
+	apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
+	wrmsr(MSR_APICBASE, apic_base);
+}
+
+static bool
+native_lapic_is_x2apic(void)
+{
+	uint64_t apic_base;
+
+	apic_base = rdmsr(MSR_APICBASE);
+	return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
+	    (APICBASE_X2APIC | APICBASE_ENABLED));
+}
+
 static void	lapic_enable(void);
 static void	lapic_resume(struct pic *pic, bool suspend_cancelled);
-static void	lapic_timer_oneshot(struct lapic *,
-		    u_int count, int enable_int);
-static void	lapic_timer_periodic(struct lapic *,
-		    u_int count, int enable_int);
+static void	lapic_timer_oneshot(struct lapic *);
+static void	lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
+static void	lapic_timer_periodic(struct lapic *);
+static void	lapic_timer_deadline(struct lapic *);
 static void	lapic_timer_stop(struct lapic *);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 static int	lapic_et_start(struct eventtimer *et,
-    sbintime_t first, sbintime_t period);
+		    sbintime_t first, sbintime_t period);
 static int	lapic_et_stop(struct eventtimer *et);
+static u_int	apic_idt_to_irq(u_int apic_id, u_int vector);
+static void	lapic_set_tpr(u_int vector);
 
 struct pic lapic_pic = { .pic_resume = lapic_resume };
 
+/* Forward declarations for apic_ops */
+static void	native_lapic_create(u_int apic_id, int boot_cpu);
+static void	native_lapic_init(vm_paddr_t addr);
+static void	native_lapic_xapic_mode(void);
+static void	native_lapic_setup(int boot);
+static void	native_lapic_dump(const char *str);
+static void	native_lapic_disable(void);
+static void	native_lapic_eoi(void);
+static int	native_lapic_id(void);
+static int	native_lapic_intr_pending(u_int vector);
+static u_int	native_apic_cpuid(u_int apic_id);
+static u_int	native_apic_alloc_vector(u_int apic_id, u_int irq);
+static u_int	native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
+		    u_int count, u_int align);
+static void 	native_apic_disable_vector(u_int apic_id, u_int vector);
+static void 	native_apic_enable_vector(u_int apic_id, u_int vector);
+static void 	native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
+static void 	native_lapic_set_logical_id(u_int apic_id, u_int cluster,
+		    u_int cluster_id);
+static int 	native_lapic_enable_pmc(void);
+static void 	native_lapic_disable_pmc(void);
+static void 	native_lapic_reenable_pmc(void);
+static void 	native_lapic_enable_cmc(void);
+static int 	native_lapic_enable_mca_elvt(void);
+static int 	native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
+		    u_char masked);
+static int 	native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
+		    uint32_t mode);
+static int 	native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
+		    enum intr_polarity pol);
+static int 	native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+		    enum intr_trigger trigger);
+#ifdef SMP
+static void 	native_lapic_ipi_raw(register_t icrlo, u_int dest);
+static void 	native_lapic_ipi_vectored(u_int vector, int dest);
+static int 	native_lapic_ipi_wait(int delay);
+#endif /* SMP */
+static int	native_lapic_ipi_alloc(inthand_t *ipifunc);
+static void	native_lapic_ipi_free(int vector);
+
+struct apic_ops apic_ops = {
+	.create			= native_lapic_create,
+	.init			= native_lapic_init,
+	.xapic_mode		= native_lapic_xapic_mode,
+	.is_x2apic		= native_lapic_is_x2apic,
+	.setup			= native_lapic_setup,
+	.dump			= native_lapic_dump,
+	.disable		= native_lapic_disable,
+	.eoi			= native_lapic_eoi,
+	.id			= native_lapic_id,
+	.intr_pending		= native_lapic_intr_pending,
+	.set_logical_id		= native_lapic_set_logical_id,
+	.cpuid			= native_apic_cpuid,
+	.alloc_vector		= native_apic_alloc_vector,
+	.alloc_vectors		= native_apic_alloc_vectors,
+	.enable_vector		= native_apic_enable_vector,
+	.disable_vector		= native_apic_disable_vector,
+	.free_vector		= native_apic_free_vector,
+	.enable_pmc		= native_lapic_enable_pmc,
+	.disable_pmc		= native_lapic_disable_pmc,
+	.reenable_pmc		= native_lapic_reenable_pmc,
+	.enable_cmc		= native_lapic_enable_cmc,
+	.enable_mca_elvt	= native_lapic_enable_mca_elvt,
+#ifdef SMP
+	.ipi_raw		= native_lapic_ipi_raw,
+	.ipi_vectored		= native_lapic_ipi_vectored,
+	.ipi_wait		= native_lapic_ipi_wait,
+#endif
+	.ipi_alloc		= native_lapic_ipi_alloc,
+	.ipi_free		= native_lapic_ipi_free,
+	.set_lvt_mask		= native_lapic_set_lvt_mask,
+	.set_lvt_mode		= native_lapic_set_lvt_mode,
+	.set_lvt_polarity	= native_lapic_set_lvt_polarity,
+	.set_lvt_triggermode	= native_lapic_set_lvt_triggermode,
+};
+
 static uint32_t
-lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
 {
-	struct lvt *lvt;
 
-	KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin));
-	if (la->la_lvts[pin].lvt_active)
-		lvt = &la->la_lvts[pin];
-	else
-		lvt = &lvts[pin];
-
 	value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
 	    APIC_LVT_VECTOR);
 	if (lvt->lvt_edgetrigger == 0)
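
lapic_read32()/lapic_write32() above hide the two register encodings: in xAPIC mode a register index becomes an MMIO offset (index times the 0x10 stride) from the mapped page, in x2APIC mode the same index is added to the MSR base. A sketch of just that address arithmetic, assuming the usual 0x10 stride, 0x800 MSR base and 0xfee00000 default window:

#include <stdint.h>
#include <stdio.h>

#define MSR_X2APIC_BASE		0x800	/* x2APIC MSR range base (assumed) */
#define LAPIC_MEM_STRIDE	0x10	/* xAPIC register stride (assumed) */
#define LAPIC_EOI_REG		0x0b	/* EOI register index (MMIO 0xb0) */

int
main(void)
{
	uint64_t mmio_base = 0xfee00000;	/* assumed default window */
	int x2apic_mode = 0;

	if (x2apic_mode)
		printf("wrmsr(0x%x, 0)\n", MSR_X2APIC_BASE + LAPIC_EOI_REG);
	else
		printf("store 0 to 0x%jx\n",
		    (uintmax_t)(mmio_base + LAPIC_EOI_REG * LAPIC_MEM_STRIDE));
	return (0);
}
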
@@ -204,7 +419,7 @@
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
-		if (!lvt->lvt_edgetrigger) {
+		if (!lvt->lvt_edgetrigger && bootverbose) {
 			printf("lapic%u: Forcing LINT%u to edge trigger\n",
 			    la->la_id, pin);
 			value &= ~APIC_LVT_TM;
@@ -220,23 +435,70 @@
 	return (value);
 }
 
+static uint32_t
+lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+{
+	struct lvt *lvt;
+
+	KASSERT(pin <= APIC_LVT_MAX,
+	    ("%s: pin %u out of range", __func__, pin));
+	if (la->la_lvts[pin].lvt_active)
+		lvt = &la->la_lvts[pin];
+	else
+		lvt = &lvts[pin];
+
+	return (lvt_mode_impl(la, lvt, pin, value));
+}
+
+static uint32_t
+elvt_mode(struct lapic *la, u_int idx, uint32_t value)
+{
+	struct lvt *elvt;
+
+	KASSERT(idx <= APIC_ELVT_MAX,
+	    ("%s: idx %u out of range", __func__, idx));
+
+	elvt = &la->la_elvts[idx];
+	KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
+	KASSERT(elvt->lvt_edgetrigger,
+	    ("%s: ELVT%u is not edge triggered", __func__, idx));
+	KASSERT(elvt->lvt_activehi,
+	    ("%s: ELVT%u is not active high", __func__, idx));
+	return (lvt_mode_impl(la, elvt, idx, value));
+}
+
 /*
  * Map the local APIC and setup necessary interrupt vectors.
  */
-void
-lapic_init(vm_paddr_t addr)
+static void
+native_lapic_init(vm_paddr_t addr)
 {
 #ifdef SMP
 	uint64_t r, r1, r2, rx;
 #endif
+	uint32_t ver;
 	u_int regs[4];
 	int i, arat;
 
-	/* Map the local APIC and setup the spurious interrupt handler. */
+	/*
+	 * Enable x2APIC mode if possible. Map the local APIC
+	 * registers page.
+	 *
+	 * Keep the LAPIC registers page mapped uncached for x2APIC
+	 * mode too, to have the direct map page attribute set to
+	 * uncached.  This is needed to work around CPU errata present
+	 * on all Intel processors.
+	 */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
 	lapic_paddr = addr;
-	lapic = pmap_mapdev(addr, sizeof(lapic_t));
+	lapic_map = pmap_mapdev(addr, PAGE_SIZE);
+	if (x2apic_mode) {
+		native_lapic_enable_x2apic();
+		lapic_map = NULL;
+	}
+
+	/* Setup the spurious interrupt handler. */
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 
@@ -247,15 +509,18 @@
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
-	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
-	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
-	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+	    SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
@@ -264,6 +529,9 @@
 			do_cpuid(0x06, regs);
 			if ((regs[0] & CPUTPM1_ARAT) != 0)
 				arat = 1;
+		} else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+		    CPUID_TO_FAMILY(cpu_id) >= 0x12) {
+			arat = 1;
 		}
 		bzero(&lapic_et, sizeof(lapic_et));
 		lapic_et.et_name = "LAPIC";
@@ -272,8 +540,16 @@
 		lapic_et.et_quality = 600;
 		if (!arat) {
 			lapic_et.et_flags |= ET_FLAGS_C3STOP;
-			lapic_et.et_quality -= 200;
+			lapic_et.et_quality = 100;
 		}
+		if ((cpu_feature & CPUID_TSC) != 0 &&
+		    (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
+		    tsc_is_invariant && tsc_freq != 0) {
+			lapic_timer_tsc_deadline = 1;
+			TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
+			    &lapic_timer_tsc_deadline);
+		}
+
 		lapic_et.et_frequency = 0;
 		/* We don't know frequency yet, so trying to guess. */
 		lapic_et.et_min_period = 0x00001000LL;
@@ -284,6 +560,29 @@
 		et_register(&lapic_et);
 	}
 
+	/*
+	 * Set lapic_eoi_suppression after lapic_enable(), to not
+	 * enable suppression in the hardware prematurely.  Note that
+	 * we enable suppression by default even when the system has
+	 * only one IO-APIC, since otherwise the EOI is broadcast to
+	 * all APIC agents, including CPUs.
+	 *
+	 * It seems that at least some KVM versions report the
+	 * EOI_SUPPRESSION bit, but auto-EOI does not work.
+	 */
+	ver = lapic_read32(LAPIC_VERSION);
+	if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
+		lapic_eoi_suppression = 1;
+		if (vm_guest == VM_GUEST_KVM) {
+			if (bootverbose)
+				printf(
+		       "KVM -- disabling lapic eoi suppression\n");
+			lapic_eoi_suppression = 0;
+		}
+		TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
+		    &lapic_eoi_suppression);
+	}
+
 #ifdef SMP
 #define	LOOPS	100000
 	/*
@@ -299,20 +598,22 @@
 	 */
 	KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
 	    ("TSC not initialized"));
-	r = rdtsc();
-	for (rx = 0; rx < LOOPS; rx++) {
-		(void)lapic->icr_lo;
-		ia32_pause();
+	if (!x2apic_mode) {
+		r = rdtsc();
+		for (rx = 0; rx < LOOPS; rx++) {
+			(void)lapic_read_icr_lo();
+			ia32_pause();
+		}
+		r = rdtsc() - r;
+		r1 = tsc_freq * LOOPS;
+		r2 = r * 1000000;
+		lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
+		if (bootverbose) {
+			printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
+			    "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
+			    (uintmax_t)r, (uintmax_t)tsc_freq);
+		}
 	}
-	r = rdtsc() - r;
-	r1 = tsc_freq * LOOPS;
-	r2 = r * 1000000;
-	lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
-	if (bootverbose) {
-		printf("LAPIC: ipi_wait() us multiplier %ju (r %ju tsc %ju)\n",
-		    (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r,
-		    (uintmax_t)tsc_freq);
-	}
 #undef LOOPS
 #endif /* SMP */
 }
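
The reworked block above calibrates lapic_ipi_wait_mult by timing LOOPS uncached ICR_LO reads with the TSC; the multiplier is how many of those reads fit into one microsecond. The arithmetic, worked with assumed numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t tsc_freq = 2400000000ULL;	/* assumed 2.4 GHz TSC */
	uint64_t loops = 100000;		/* LOOPS in the hunk above */
	uint64_t delta = 12000000;		/* assumed cycles for all reads */
	uint64_t r1, r2, mult;

	/* reads per microsecond = loops / ((delta / tsc_freq) * 1e6) */
	r1 = tsc_freq * loops;
	r2 = delta * 1000000;
	mult = r1 >= r2 ? r1 / r2 : 1;
	printf("ipi_wait() us multiplier %" PRIu64 "\n", mult);
	return (0);
}
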
@@ -320,8 +621,8 @@
 /*
  * Create a local APIC instance.
  */
-void
-lapic_create(u_int apic_id, int boot_cpu)
+static void
+native_lapic_create(u_int apic_id, int boot_cpu)
 {
 	int i;
 
@@ -344,8 +645,12 @@
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
+	for (i = 0; i <= APIC_ELVT_MAX; i++) {
+		lapics[apic_id].la_elvts[i] = elvts[i];
+		lapics[apic_id].la_elvts[i].lvt_active = 0;
+	}
 	for (i = 0; i <= APIC_NUM_IOINTS; i++)
-	    lapics[apic_id].la_ioint_irqs[i] = -1;
+	    lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE;
 	lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
 	lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
 	    IRQ_TIMER;
@@ -363,41 +668,100 @@
 #endif
 }
 
+static inline uint32_t
+amd_read_ext_features(void)
+{
+	uint32_t version;
+
+	if (cpu_vendor_id != CPU_VENDOR_AMD)
+		return (0);
+	version = lapic_read32(LAPIC_VERSION);
+	if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
+		return (lapic_read32(LAPIC_EXT_FEATURES));
+	else
+		return (0);
+}
+
+static inline uint32_t
+amd_read_elvt_count(void)
+{
+	uint32_t extf;
+	uint32_t count;
+
+	extf = amd_read_ext_features();
+	count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
+	count = min(count, APIC_ELVT_MAX + 1);
+	return (count);
+}
+
 /*
  * Dump contents of local APIC registers
  */
-void
-lapic_dump(const char* str)
+static void
+native_lapic_dump(const char* str)
 {
+	uint32_t version;
 	uint32_t maxlvt;
+	uint32_t extf;
+	int elvt_count;
+	int i;
 
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	version = lapic_read32(LAPIC_VERSION);
+	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
-	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n",
-	    lapic->id, lapic->version, lapic->ldr, lapic->dfr);
-	printf("  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
-	    lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
+	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
+	    lapic_read32(LAPIC_ID), version,
+	    lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
+	if ((cpu_feature2 & CPUID2_X2APIC) != 0)
+		printf(" x2APIC: %d", x2apic_mode);
+	printf("\n  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
+	    lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
+	    lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
-	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error);
+	    lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
+	    lapic_read32(LAPIC_LVT_ERROR));
 	if (maxlvt >= APIC_LVT_PMC)
-		printf(" pmc: 0x%08x", lapic->lvt_pcint);
+		printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
 	printf("\n");
 	if (maxlvt >= APIC_LVT_CMCI)
-		printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
+		printf("   cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
+	extf = amd_read_ext_features();
+	if (extf != 0) {
+		printf("   AMD ext features: 0x%08x\n", extf);
+		elvt_count = amd_read_elvt_count();
+		for (i = 0; i < elvt_count; i++)
+			printf("   AMD elvt%d: 0x%08x\n", i,
+			    lapic_read32(LAPIC_EXT_LVT0 + i));
+	}
 }
 
-void
-lapic_setup(int boot)
+static void
+native_lapic_xapic_mode(void)
 {
+	register_t saveintr;
+
+	saveintr = intr_disable();
+	if (x2apic_mode)
+		native_lapic_enable_x2apic();
+	intr_restore(saveintr);
+}
+
+static void
+native_lapic_setup(int boot)
+{
 	struct lapic *la;
-	u_int32_t maxlvt;
+	uint32_t version;
+	uint32_t maxlvt;
 	register_t saveintr;
-	char buf[MAXCOMLEN + 1];
+	int elvt_count;
+	int i;
 
+	saveintr = intr_disable();
+
 	la = &lapics[lapic_id()];
 	KASSERT(la->la_present, ("missing APIC structure"));
-	saveintr = intr_disable();
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	version = lapic_read32(LAPIC_VERSION);
+	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 
 	/* Initialize the TPR to allow all interrupts. */
 	lapic_set_tpr(0);
@@ -406,54 +770,103 @@
 	lapic_enable();
 
 	/* Program LINT[01] LVT entries. */
-	lapic->lvt_lint0 = lvt_mode(la, APIC_LVT_LINT0, lapic->lvt_lint0);
-	lapic->lvt_lint1 = lvt_mode(la, APIC_LVT_LINT1, lapic->lvt_lint1);
+	lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
+	    lapic_read32(LAPIC_LVT_LINT0)));
+	lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
+	    lapic_read32(LAPIC_LVT_LINT1)));
 
 	/* Program the PMC LVT entry if present. */
-	if (maxlvt >= APIC_LVT_PMC)
-		lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+	if (maxlvt >= APIC_LVT_PMC) {
+		lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+		    LAPIC_LVT_PCINT));
+	}
 
-	/* Program timer LVT and setup handler. */
-	la->lvt_timer_cache = lapic->lvt_timer =
-	    lvt_mode(la, APIC_LVT_TIMER, lapic->lvt_timer);
-	if (boot) {
-		snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
-		intrcnt_add(buf, &la->la_timer_count);
+	/* Program timer LVT. */
+	la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
+	    lapic_read32(LAPIC_LVT_TIMER));
+	la->lvt_timer_last = la->lvt_timer_base;
+	lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);
+
+	/* Calibrate the timer parameters using BSP. */
+	if (boot && IS_BSP()) {
+		lapic_calibrate_initcount(la);
+		if (lapic_timer_tsc_deadline)
+			lapic_calibrate_deadline(la);
 	}
 
 	/* Setup the timer if configured. */
-	if (la->la_timer_mode != 0) {
+	if (la->la_timer_mode != LAT_MODE_UNDEF) {
 		KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
-		lapic_timer_set_divisor(lapic_timer_divisor);
-		if (la->la_timer_mode == 1)
-			lapic_timer_periodic(la, la->la_timer_period, 1);
-		else
-			lapic_timer_oneshot(la, la->la_timer_period, 1);
+		switch (la->la_timer_mode) {
+		case LAT_MODE_PERIODIC:
+			lapic_timer_set_divisor(lapic_timer_divisor);
+			lapic_timer_periodic(la);
+			break;
+		case LAT_MODE_ONESHOT:
+			lapic_timer_set_divisor(lapic_timer_divisor);
+			lapic_timer_oneshot(la);
+			break;
+		case LAT_MODE_DEADLINE:
+			lapic_timer_deadline(la);
+			break;
+		default:
+			panic("corrupted la_timer_mode %p %d", la,
+			    la->la_timer_mode);
+		}
 	}
 
 	/* Program error LVT and clear any existing errors. */
-	lapic->lvt_error = lvt_mode(la, APIC_LVT_ERROR, lapic->lvt_error);
-	lapic->esr = 0;
+	lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
+	    lapic_read32(LAPIC_LVT_ERROR)));
+	lapic_write32(LAPIC_ESR, 0);
 
 	/* XXX: Thermal LVT */
 
 	/* Program the CMCI LVT entry if present. */
-	if (maxlvt >= APIC_LVT_CMCI)
-		lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci);
+	if (maxlvt >= APIC_LVT_CMCI) {
+		lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
+		    lapic_read32(LAPIC_LVT_CMCI)));
+	}
 
+	elvt_count = amd_read_elvt_count();
+	for (i = 0; i < elvt_count; i++) {
+		if (la->la_elvts[i].lvt_active)
+			lapic_write32(LAPIC_EXT_LVT0 + i,
+			    elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
+	}
+
 	intr_restore(saveintr);
 }
 
-void
-lapic_reenable_pmc(void)
+static void
+native_lapic_intrcnt(void *dummy __unused)
 {
+	struct pcpu *pc;
+	struct lapic *la;
+	char buf[MAXCOMLEN + 1];
+
+	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+		la = &lapics[pc->pc_apic_id];
+		if (!la->la_present)
+		    continue;
+
+		snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid);
+		intrcnt_add(buf, &la->la_timer_count);
+	}
+}
+SYSINIT(native_lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, native_lapic_intrcnt,
+    NULL);
+
+static void
+native_lapic_reenable_pmc(void)
+{
 #ifdef HWPMC_HOOKS
 	uint32_t value;
 
-	value =  lapic->lvt_pcint;
+	value = lapic_read32(LAPIC_LVT_PCINT);
 	value &= ~APIC_LVT_M;
-	lapic->lvt_pcint = value;
+	lapic_write32(LAPIC_LVT_PCINT, value);
 #endif
 }
 
@@ -464,27 +877,32 @@
 	struct lapic *la;
 
 	la = &lapics[lapic_id()];
-	lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+	lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+	    lapic_read32(LAPIC_LVT_PCINT)));
 }
 #endif
 
-int
-lapic_enable_pmc(void)
+static int
+native_lapic_enable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return (0);
 
 	/* Fail if the PMC LVT is not present. */
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return (0);
 
 	lvts[APIC_LVT_PMC].lvt_masked = 0;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
+#else
 #ifdef SMP
 	/*
 	 * If hwpmc was loaded at boot time then the APs may not be
@@ -496,6 +914,7 @@
 	else
 #endif
 		lapic_update_pmc(NULL);
+#endif
 	return (1);
 #else
 	return (0);
@@ -502,18 +921,18 @@
 #endif
 }
 
-void
-lapic_disable_pmc(void)
+static void
+native_lapic_disable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return;
 
 	/* Fail if the PMC LVT is not present. */
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return;
 
@@ -527,45 +946,89 @@
 #endif
 }
 
+static void
+lapic_calibrate_initcount(struct lapic *la)
+{
+	u_long value;
+
+	/* Start off with a divisor of 2 (power on reset default). */
+	lapic_timer_divisor = 2;
+	/* Try to calibrate the local APIC timer. */
+	do {
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
+		DELAY(1000000);
+		value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
+		if (value != APIC_TIMER_MAX_COUNT)
+			break;
+		lapic_timer_divisor <<= 1;
+	} while (lapic_timer_divisor <= 128);
+	if (lapic_timer_divisor > 128)
+		panic("lapic: Divisor too big");
+	if (bootverbose) {
+		printf("lapic: Divisor %lu, Frequency %lu Hz\n",
+		    lapic_timer_divisor, value);
+	}
+	count_freq = value;
+}
+
+static void
+lapic_calibrate_deadline(struct lapic *la __unused)
+{
+
+	if (bootverbose) {
+		printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
+		    (uintmax_t)tsc_freq);
+	}
+}
+
+static void
+lapic_change_mode(struct eventtimer *et, struct lapic *la,
+    enum lat_timer_mode newmode)
+{
+
+	if (la->la_timer_mode == newmode)
+		return;
+	switch (newmode) {
+	case LAT_MODE_PERIODIC:
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		et->et_frequency = count_freq;
+		break;
+	case LAT_MODE_DEADLINE:
+		et->et_frequency = tsc_freq;
+		break;
+	case LAT_MODE_ONESHOT:
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		et->et_frequency = count_freq;
+		break;
+	default:
+		panic("lapic_change_mode %d", newmode);
+	}
+	la->la_timer_mode = newmode;
+	et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
+	et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
+}
+
 static int
 lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
 {
 	struct lapic *la;
-	u_long value;
 
 	la = &lapics[PCPU_GET(apic_id)];
-	if (et->et_frequency == 0) {
-		/* Start off with a divisor of 2 (power on reset default). */
-		lapic_timer_divisor = 2;
-		/* Try to calibrate the local APIC timer. */
-		do {
-			lapic_timer_set_divisor(lapic_timer_divisor);
-			lapic_timer_oneshot(la, APIC_TIMER_MAX_COUNT, 0);
-			DELAY(1000000);
-			value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer;
-			if (value != APIC_TIMER_MAX_COUNT)
-				break;
-			lapic_timer_divisor <<= 1;
-		} while (lapic_timer_divisor <= 128);
-		if (lapic_timer_divisor > 128)
-			panic("lapic: Divisor too big");
-		if (bootverbose)
-			printf("lapic: Divisor %lu, Frequency %lu Hz\n",
-			    lapic_timer_divisor, value);
-		et->et_frequency = value;
-		et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
-		et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
-	}
-	if (la->la_timer_mode == 0)
-		lapic_timer_set_divisor(lapic_timer_divisor);
 	if (period != 0) {
-		la->la_timer_mode = 1;
-		la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32;
-		lapic_timer_periodic(la, la->la_timer_period, 1);
+		lapic_change_mode(et, la, LAT_MODE_PERIODIC);
+		la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
+		    32;
+		lapic_timer_periodic(la);
+	} else if (lapic_timer_tsc_deadline) {
+		lapic_change_mode(et, la, LAT_MODE_DEADLINE);
+		la->la_timer_period = (et->et_frequency * first) >> 32;
+		lapic_timer_deadline(la);
 	} else {
-		la->la_timer_mode = 2;
-		la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32;
-		lapic_timer_oneshot(la, la->la_timer_period, 1);
+		lapic_change_mode(et, la, LAT_MODE_ONESHOT);
+		la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
+		    32;
+		lapic_timer_oneshot(la);
 	}
 	return (0);
 }
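
lapic_calibrate_initcount() above starts at the power-on divisor of 2, arms a masked one-shot at the maximum count, waits one second, and doubles the divisor whenever the counter ran all the way down within that second; the ticks consumed at the first usable divisor become count_freq. A self-contained simulation of that loop, assuming a 100 MHz timer input clock:

#include <stdint.h>
#include <stdio.h>

#define TIMER_MAX_COUNT	0xffffffffu

int
main(void)
{
	uint64_t bus_freq = 100000000;	/* assumed 100 MHz timer input */
	uint64_t divisor, ticks, value;

	for (divisor = 2; divisor <= 128; divisor <<= 1) {
		/* Ticks the down-counter would consume in one second. */
		ticks = bus_freq / divisor;
		value = ticks >= TIMER_MAX_COUNT ? TIMER_MAX_COUNT : ticks;
		/* Counter did not run out: this divisor is measurable. */
		if (value != TIMER_MAX_COUNT)
			break;
	}
	if (divisor > 128) {
		printf("divisor too big\n");
		return (1);
	}
	printf("Divisor %ju, Frequency %ju Hz\n", (uintmax_t)divisor,
	    (uintmax_t)value);
	return (0);
}
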
@@ -573,34 +1036,37 @@
 static int
 lapic_et_stop(struct eventtimer *et)
 {
-	struct lapic *la = &lapics[PCPU_GET(apic_id)];
+	struct lapic *la;
 
-	la->la_timer_mode = 0;
+	la = &lapics[PCPU_GET(apic_id)];
 	lapic_timer_stop(la);
+	la->la_timer_mode = LAT_MODE_UNDEF;
 	return (0);
 }
 
-void
-lapic_disable(void)
+static void
+native_lapic_disable(void)
 {
 	uint32_t value;
 
 	/* Software disable the local APIC. */
-	value = lapic->svr;
+	value = lapic_read32(LAPIC_SVR);
 	value &= ~APIC_SVR_SWEN;
-	lapic->svr = value;
+	lapic_write32(LAPIC_SVR, value);
 }
 
 static void
 lapic_enable(void)
 {
-	u_int32_t value;
+	uint32_t value;
 
 	/* Program the spurious vector to enable the local APIC. */
-	value = lapic->svr;
+	value = lapic_read32(LAPIC_SVR);
 	value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
-	value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT);
-	lapic->svr = value;
+	value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
+	if (lapic_eoi_suppression)
+		value |= APIC_SVR_EOI_SUPPRESSION;
+	lapic_write32(LAPIC_SVR, value);
 }
 
 /* Reset the local APIC on the BSP during resume. */
@@ -611,34 +1077,36 @@
 	lapic_setup(0);
 }
 
-int
-lapic_id(void)
+static int
+native_lapic_id(void)
 {
+	uint32_t v;
 
-	KASSERT(lapic != NULL, ("local APIC is not mapped"));
-	return (lapic->id >> APIC_ID_SHIFT);
+	KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
+	v = lapic_read32(LAPIC_ID);
+	if (!x2apic_mode)
+		v >>= APIC_ID_SHIFT;
+	return (v);
 }
 
-int
-lapic_intr_pending(u_int vector)
+static int
+native_lapic_intr_pending(u_int vector)
 {
-	volatile u_int32_t *irr;
+	uint32_t irr;
 
 	/*
-	 * The IRR registers are an array of 128-bit registers each of
-	 * which only describes 32 interrupts in the low 32 bits..  Thus,
-	 * we divide the vector by 32 to get the 128-bit index.  We then
-	 * multiply that index by 4 to get the equivalent index from
-	 * treating the IRR as an array of 32-bit registers.  Finally, we
-	 * modulus the vector by 32 to determine the individual bit to
-	 * test.
+	 * The IRR registers are an array of registers each of which
+	 * only describes 32 interrupts in the low 32 bits.  Thus, we
+	 * divide the vector by 32 to get the register index.
+	 * Finally, we take the vector modulo 32 to determine the
+	 * individual bit to test.
 	 */
-	irr = &lapic->irr0;
-	return (irr[(vector / 32) * 4] & 1 << (vector % 32));
+	irr = lapic_read32(LAPIC_IRR0 + vector / 32);
+	return (irr & 1 << (vector % 32));
 }
 
-void
-lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+static void
+native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 	struct lapic *la;
 
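The rewritten lapic_intr_pending() above boils down to two pieces of arithmetic: vector / 32 picks the 32-bit IRR word, vector % 32 picks the bit inside it. A quick worked example:

#include <stdio.h>

int
main(void)
{
	unsigned vector = 0x41;		/* example interrupt vector 65 */
	unsigned reg = vector / 32;	/* IRR word index: 2 */
	unsigned bit = vector % 32;	/* bit inside that word: 1 */

	printf("vector 0x%x -> IRR%u bit %u (mask 0x%08x)\n",
	    vector, reg, bit, 1u << bit);
	return (0);
}
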
@@ -653,8 +1121,8 @@
 	la->la_cluster_id = cluster_id;
 }
 
-int
-lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
+static int
+native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
 {
 
 	if (pin > APIC_LVT_MAX)
@@ -676,8 +1144,8 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
+static int
+native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
 {
 	struct lvt *lvt;
 
@@ -732,8 +1200,8 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
+static int
+native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
 {
 
 	if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
@@ -757,8 +1225,9 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger)
+static int
+native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
+     enum intr_trigger trigger)
 {
 
 	if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
@@ -786,25 +1255,25 @@
  * Adjust the TPR of the current CPU so that it blocks all interrupts below
  * the passed in vector.
  */
-void
+static void
 lapic_set_tpr(u_int vector)
 {
 #ifdef CHEAP_TPR
-	lapic->tpr = vector;
+	lapic_write32(LAPIC_TPR, vector);
 #else
-	u_int32_t tpr;
+	uint32_t tpr;
 
-	tpr = lapic->tpr & ~APIC_TPR_PRIO;
+	tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
 	tpr |= vector;
-	lapic->tpr = tpr;
+	lapic_write32(LAPIC_TPR, tpr);
 #endif
 }
 
-void
-lapic_eoi(void)
+static void
+native_lapic_eoi(void)
 {
 
-	lapic->eoi = 0;
+	lapic_write32_nofence(LAPIC_EOI, 0);
 }
 
 void
@@ -864,48 +1333,82 @@
 {
 
 	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
-	KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) /
-	    sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor));
-	lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1];
+	KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
+		("lapic: invalid divisor %u", divisor));
+	lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
 }
 
 static void
-lapic_timer_oneshot(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot(struct lapic *la)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
-	value &= ~APIC_LVTT_TM;
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_ONE_SHOT;
-	if (enable_int)
-		value &= ~APIC_LVT_M;
-	lapic->lvt_timer = value;
-	lapic->icr_timer = count;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
-lapic_timer_periodic(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
+	value = la->lvt_timer_base;
 	value &= ~APIC_LVTT_TM;
+	value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, count);
+}
+
+static void
+lapic_timer_periodic(struct lapic *la)
+{
+	uint32_t value;
+
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_PERIODIC;
-	if (enable_int)
-		value &= ~APIC_LVT_M;
-	lapic->lvt_timer = value;
-	lapic->icr_timer = count;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
+lapic_timer_deadline(struct lapic *la)
+{
+	uint32_t value;
+
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
+	value |= APIC_LVTT_TM_TSCDLT;
+	if (value != la->lvt_timer_last) {
+		la->lvt_timer_last = value;
+		lapic_write32_nofence(LAPIC_LVT_TIMER, value);
+		if (!x2apic_mode)
+			mfence();
+	}
+	wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
+}
+
+static void
 lapic_timer_stop(struct lapic *la)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
-	value &= ~APIC_LVTT_TM;
-	value |= APIC_LVT_M;
-	lapic->lvt_timer = value;
+	if (la->la_timer_mode == LAT_MODE_DEADLINE) {
+		wrmsr(MSR_TSC_DEADLINE, 0);
+		mfence();
+	} else {
+		value = la->lvt_timer_base;
+		value &= ~APIC_LVTT_TM;
+		value |= APIC_LVT_M;
+		la->lvt_timer_last = value;
+		lapic_write32(LAPIC_LVT_TIMER, value);
+	}
 }
 
 void
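Each timer mode above is selected by rewriting the timer LVT: clear the mode field and the mask bit, then or in one-shot, periodic, or TSC-deadline mode (stopping sets the mask bit back instead); deadline mode then arms MSR_TSC_DEADLINE with rdtsc() plus the period. A sketch of the LVT word manipulation, using illustrative field encodings rather than the apicreg.h constants:

#include <stdint.h>
#include <stdio.h>

/* Illustrative field encodings; the real ones live in apicreg.h. */
#define LVTT_TM		(0x3u << 17)	/* timer mode field */
#define LVTT_ONE_SHOT	(0x0u << 17)
#define LVTT_PERIODIC	(0x1u << 17)
#define LVTT_TSCDLT	(0x2u << 17)
#define LVT_M		(1u << 16)	/* mask bit */

static uint32_t
timer_lvt(uint32_t base, uint32_t mode, int masked)
{
	uint32_t value = base;

	value &= ~(LVTT_TM | LVT_M);	/* drop old mode and mask */
	value |= mode;
	if (masked)
		value |= LVT_M;
	return (value);
}

int
main(void)
{
	uint32_t base = LVT_M | 0xd1;	/* masked, vector 0xd1 */

	printf("periodic 0x%08x oneshot 0x%08x deadline 0x%08x stop 0x%08x\n",
	    timer_lvt(base, LVTT_PERIODIC, 0),
	    timer_lvt(base, LVTT_ONE_SHOT, 0),
	    timer_lvt(base, LVTT_TSCDLT, 0),
	    timer_lvt(base, LVTT_ONE_SHOT, 1));
	return (0);
}
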
@@ -922,13 +1425,13 @@
  * is called prior to lapic_setup() during boot, this just needs to unmask
  * this CPU's LVT_CMCI entry.
  */
-void
-lapic_enable_cmc(void)
+static void
+native_lapic_enable_cmc(void)
 {
 	u_int apic_id;
 
 #ifdef DEV_ATPIC
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return;
 #endif
 	apic_id = PCPU_GET(apic_id);
@@ -940,10 +1443,41 @@
 		printf("lapic%u: CMCI unmasked\n", apic_id);
 }
 
+static int
+native_lapic_enable_mca_elvt(void)
+{
+	u_int apic_id;
+	uint32_t value;
+	int elvt_count;
+
+#ifdef DEV_ATPIC
+	if (lapic_map == NULL)
+		return (-1);
+#endif
+
+	apic_id = PCPU_GET(apic_id);
+	KASSERT(lapics[apic_id].la_present,
+	    ("%s: missing APIC %u", __func__, apic_id));
+	elvt_count = amd_read_elvt_count();
+	if (elvt_count <= APIC_ELVT_MCA)
+		return (-1);
+
+	value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
+	if ((value & APIC_LVT_M) == 0) {
+		printf("AMD MCE Thresholding Extended LVT is already active\n");
+		return (-1);
+	}
+	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
+	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
+	if (bootverbose)
+		printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
+	return (APIC_ELVT_MCA);
+}
+
 void
 lapic_handle_error(void)
 {
-	u_int32_t esr;
+	uint32_t esr;
 
 	/*
 	 * Read the contents of the error status register.  Write to
@@ -951,15 +1485,15 @@
 	 * to update its value to indicate any errors that have
 	 * occurred since the previous write to the register.
 	 */
-	lapic->esr = 0;
-	esr = lapic->esr;
+	lapic_write32(LAPIC_ESR, 0);
+	esr = lapic_read32(LAPIC_ESR);
 
 	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
 	lapic_eoi();
 }
 
-u_int
-apic_cpuid(u_int apic_id)
+static u_int
+native_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return apic_cpuids[apic_id];
@@ -969,12 +1503,12 @@
 }
 
 /* Request a free IDT vector to be used by the specified IRQ. */
-u_int
-apic_alloc_vector(u_int apic_id, u_int irq)
+static u_int
+native_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 	u_int vector;
 
-	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+	KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
 
 	/*
 	 * Search for a free vector.  Currently we just use a very simple
@@ -982,7 +1516,7 @@
 	 */
 	mtx_lock_spin(&icu_lock);
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
-		if (lapics[apic_id].la_ioint_irqs[vector] != -1)
+		if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE)
 			continue;
 		lapics[apic_id].la_ioint_irqs[vector] = irq;
 		mtx_unlock_spin(&icu_lock);
@@ -998,8 +1532,8 @@
  * aligned on a boundary of 'align'.  If the request cannot be
  * satisfied, 0 is returned.
  */
-u_int
-apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+static u_int
+native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 	u_int first, run, vector;
 
@@ -1008,7 +1542,7 @@
 	KASSERT(align >= count, ("align < count"));
 #ifdef INVARIANTS
 	for (run = 0; run < count; run++)
-		KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
+		KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u",
 		    irqs[run], run));
 #endif
 
@@ -1022,7 +1556,7 @@
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
 
 		/* Vector is in use, end run. */
-		if (lapics[apic_id].la_ioint_irqs[vector] != -1) {
+		if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) {
 			run = 0;
 			first = 0;
 			continue;
@@ -1058,8 +1592,8 @@
  * which do not have the vector configured would report spurious interrupts
  * should it fire.
  */
-void
-apic_enable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1069,12 +1603,12 @@
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
-	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
-	    GSEL_APIC);
+	setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 }
 
-void
-apic_disable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1089,13 +1623,14 @@
 	 * We can not currently clear the idt entry because other cpus
 	 * may have a valid vector at this offset.
 	 */
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 #endif
 }
 
 /* Release an APIC vector when it's no longer in use. */
-void
-apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+static void
+native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 	struct thread *td;
 
@@ -1102,7 +1637,7 @@
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
-	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+	KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
 	KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
 	    irq, ("IRQ mismatch"));
 #ifdef KDTRACE_HOOKS
@@ -1123,7 +1658,7 @@
 		thread_unlock(td);
 	}
 	mtx_lock_spin(&icu_lock);
-	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
+	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE;
 	mtx_unlock_spin(&icu_lock);
 	if (!rebooting) {
 		thread_lock(td);
@@ -1133,7 +1668,7 @@
 }
 
 /* Map an IDT vector (APIC) to an IRQ (interrupt source). */
-u_int
+static u_int
 apic_idt_to_irq(u_int apic_id, u_int vector)
 {
 	int irq;
@@ -1174,7 +1709,7 @@
 		db_printf("Interrupts bound to lapic %u\n", apic_id);
 		for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
 			irq = lapics[apic_id].la_ioint_irqs[i];
-			if (irq == -1 || irq == IRQ_SYSCALL)
+			if (irq == IRQ_FREE || irq == IRQ_SYSCALL)
 				continue;
 #ifdef KDTRACE_HOOKS
 			if (irq == IRQ_DTRACE_RET)
@@ -1187,7 +1722,7 @@
 			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
 			if (irq == IRQ_TIMER)
 				db_printf("lapic timer\n");
-			else if (irq < NUM_IO_INTS) {
+			else if (irq < num_io_irqs) {
 				isrc = intr_lookup_source(irq);
 				if (isrc == NULL || verbose == 0)
 					db_printf("IRQ %u\n", irq);
@@ -1224,48 +1759,49 @@
 	uint32_t v;
 
 	db_printf("lapic ID = %d\n", lapic_id());
-	v = lapic->version;
+	v = lapic_read32(LAPIC_VERSION);
 	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
 	    v & 0xf);
 	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
-	v = lapic->svr;
+	v = lapic_read32(LAPIC_SVR);
 	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
 	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
-	db_printf("TPR      = %02x\n", lapic->tpr);
+	db_printf("TPR      = %02x\n", lapic_read32(LAPIC_TPR));
 
-#define dump_field(prefix, index)					\
-	dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index,	\
+#define dump_field(prefix, regn, index)					\
+	dump_mask(__XSTRING(prefix ## index), 				\
+	    lapic_read32(LAPIC_ ## regn ## index),			\
 	    index * 32)
 
 	db_printf("In-service Interrupts:\n");
-	dump_field(isr, 0);
-	dump_field(isr, 1);
-	dump_field(isr, 2);
-	dump_field(isr, 3);
-	dump_field(isr, 4);
-	dump_field(isr, 5);
-	dump_field(isr, 6);
-	dump_field(isr, 7);
+	dump_field(isr, ISR, 0);
+	dump_field(isr, ISR, 1);
+	dump_field(isr, ISR, 2);
+	dump_field(isr, ISR, 3);
+	dump_field(isr, ISR, 4);
+	dump_field(isr, ISR, 5);
+	dump_field(isr, ISR, 6);
+	dump_field(isr, ISR, 7);
 
 	db_printf("TMR Interrupts:\n");
-	dump_field(tmr, 0);
-	dump_field(tmr, 1);
-	dump_field(tmr, 2);
-	dump_field(tmr, 3);
-	dump_field(tmr, 4);
-	dump_field(tmr, 5);
-	dump_field(tmr, 6);
-	dump_field(tmr, 7);
+	dump_field(tmr, TMR, 0);
+	dump_field(tmr, TMR, 1);
+	dump_field(tmr, TMR, 2);
+	dump_field(tmr, TMR, 3);
+	dump_field(tmr, TMR, 4);
+	dump_field(tmr, TMR, 5);
+	dump_field(tmr, TMR, 6);
+	dump_field(tmr, TMR, 7);
 
 	db_printf("IRR Interrupts:\n");
-	dump_field(irr, 0);
-	dump_field(irr, 1);
-	dump_field(irr, 2);
-	dump_field(irr, 3);
-	dump_field(irr, 4);
-	dump_field(irr, 5);
-	dump_field(irr, 6);
-	dump_field(irr, 7);
+	dump_field(irr, IRR, 0);
+	dump_field(irr, IRR, 1);
+	dump_field(irr, IRR, 2);
+	dump_field(irr, IRR, 3);
+	dump_field(irr, IRR, 4);
+	dump_field(irr, IRR, 5);
+	dump_field(irr, IRR, 6);
+	dump_field(irr, IRR, 7);
 
 #undef dump_field
 }
@@ -1391,20 +1927,18 @@
 	 * Local APIC must be registered before other PICs and pseudo PICs
 	 * for proper suspend/resume order.
 	 */
-#ifndef XEN
 	intr_register_pic(&lapic_pic);
-#endif
 
 	retval = best_enum->apic_setup_io();
 	if (retval != 0)
 		printf("%s: Failed to setup I/O APICs: returned %d\n",
 		    best_enum->apic_name, retval);
-#ifdef XEN
-	return;
-#endif
+
 	/*
-	 * Finish setting up the local APIC on the BSP once we know how to
-	 * properly program the LINT pins.
+	 * Finish setting up the local APIC on the BSP once we know
+	 * how to properly program the LINT pins.  In particular, this
+	 * enables the EOI suppression mode, if the LAPIC supports it
+	 * and the user did not disable it.
 	 */
 	lapic_setup(1);
 	if (bootverbose)
@@ -1411,9 +1945,13 @@
 		lapic_dump("BSP");
 
 	/* Enable the MSI "pic". */
-	msi_init();
+	init_ops.msi_init();
+
+#ifdef XENHVM
+	xen_intr_alloc_irqs();
+#endif
 }
-SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL);
+SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
 
 #ifdef SMP
 /*
@@ -1426,13 +1964,18 @@
  * Wait delay microseconds for IPI to be sent.  If delay is -1, we
  * wait forever.
  */
-int
-lapic_ipi_wait(int delay)
+static int
+native_lapic_ipi_wait(int delay)
 {
 	uint64_t rx;
 
+	/* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
+	if (x2apic_mode)
+		return (1);
+
 	for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
-		if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE)
+		if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
+		    APIC_DELSTAT_IDLE)
 			return (1);
 		ia32_pause();
 	}
@@ -1439,33 +1982,51 @@
 	return (0);
 }
 
-void
-lapic_ipi_raw(register_t icrlo, u_int dest)
+static void
+native_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
-	register_t value, saveintr;
+	uint64_t icr;
+	uint32_t vhi, vlo;
+	register_t saveintr;
 
 	/* XXX: Need more sanity checking of icrlo? */
-	KASSERT(lapic != NULL, ("%s called too early", __func__));
-	KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+	KASSERT(x2apic_mode || lapic_map != NULL,
+	    ("%s called too early", __func__));
+	KASSERT(x2apic_mode ||
+	    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 	    ("%s: invalid dest field", __func__));
 	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
 	    ("%s: reserved bits set in ICR LO register", __func__));
 
 	/* Set destination in ICR HI register if it is being used. */
-	saveintr = intr_disable();
+	if (!x2apic_mode) {
+		saveintr = intr_disable();
+		icr = lapic_read_icr();
+	}
+
 	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
-		value = lapic->icr_hi;
-		value &= ~APIC_ID_MASK;
-		value |= dest << APIC_ID_SHIFT;
-		lapic->icr_hi = value;
+		if (x2apic_mode) {
+			vhi = dest;
+		} else {
+			vhi = icr >> 32;
+			vhi &= ~APIC_ID_MASK;
+			vhi |= dest << APIC_ID_SHIFT;
+		}
+	} else {
+		vhi = 0;
 	}
 
 	/* Program the contents of the IPI and dispatch it. */
-	value = lapic->icr_lo;
-	value &= APIC_ICRLO_RESV_MASK;
-	value |= icrlo;
-	lapic->icr_lo = value;
-	intr_restore(saveintr);
+	if (x2apic_mode) {
+		vlo = icrlo;
+	} else {
+		vlo = icr;
+		vlo &= APIC_ICRLO_RESV_MASK;
+		vlo |= icrlo;
+	}
+	lapic_write_icr(vhi, vlo);
+	if (!x2apic_mode)
+		intr_restore(saveintr);
 }
 
 #define	BEFORE_SPIN	50000
@@ -1473,8 +2034,8 @@
 #define	AFTER_SPIN	50
 #endif
 
-void
-lapic_ipi_vectored(u_int vector, int dest)
+static void
+native_lapic_ipi_vectored(u_int vector, int dest)
 {
 	register_t icrlo, destfield;
 
@@ -1484,11 +2045,10 @@
 	icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;
 
 	/*
-	 * IPI_STOP_HARD is just a "fake" vector used to send a NMI.
-	 * Use special rules regard NMI if passed, otherwise specify
-	 * the vector.
+	 * NMI IPIs are just fake vectors used to send an NMI.  Use special rules
+	 * regarding NMIs if passed, otherwise specify the vector.
 	 */
-	if (vector == IPI_STOP_HARD)
+	if (vector >= IPI_NMI_FIRST)
 		icrlo |= APIC_DELMODE_NMI;
 	else
 		icrlo |= vector | APIC_DELMODE_FIXED;
@@ -1504,7 +2064,8 @@
 		icrlo |= APIC_DEST_ALLESELF;
 		break;
 	default:
-		KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+		KASSERT(x2apic_mode ||
+		    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 		    ("%s: invalid destination 0x%x", __func__, dest));
 		destfield = dest;
 	}
@@ -1541,10 +2102,70 @@
 		printf("APIC: IPI might be stuck\n");
 #else /* !needsattention */
 		/* Wait until the message is sent, without a timeout. */
-		while (lapic->icr_lo & APIC_DELSTAT_PEND)
+		while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
 			ia32_pause();
 #endif /* needsattention */
 	}
 #endif /* DETECT_DEADLOCK */
 }
+
 #endif /* SMP */
+
+/*
+ * Since the IDT is shared by all CPUs, the IPI slot update needs to be
+ * globally visible.
+ *
+ * Consider the case where an IPI is generated immediately after allocation:
+ *     vector = lapic_ipi_alloc(ipifunc);
+ *     ipi_selected(other_cpus, vector);
+ *
+ * In xAPIC mode a write to ICR_LO has serializing semantics because the
+ * APIC page is mapped as an uncached region. In x2APIC mode there is an
+ * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
+ * the IDT slot update is globally visible before the IPI is delivered.
+ */
+static int
+native_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+	struct gate_descriptor *ip;
+	long func;
+	int idx, vector;
+
+	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+	    ("invalid ipifunc %p", ipifunc));
+
+	vector = -1;
+	mtx_lock_spin(&icu_lock);
+	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
+		ip = &idt[idx];
+		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+		if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
+		    (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
+			vector = idx;
+			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
+			break;
+		}
+	}
+	mtx_unlock_spin(&icu_lock);
+	return (vector);
+}
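+
As a hedged illustration of the allocation pattern described in the comment above (the handler name "myipi" and the consumer function are assumptions for this sketch, not part of the change), an in-kernel user of the dynamic IPI range might look like:

    /* Sketch only: "myipi" is an assumed low-level entry point declared
     * elsewhere in assembly; IDTVEC() expands it to the usual Xmyipi name. */
    extern inthand_t IDTVEC(myipi);

    static void
    example_dynamic_ipi(void)
    {
    	int vector;

    	vector = lapic_ipi_alloc(&IDTVEC(myipi));
    	if (vector < 0)
    		return;			/* no free slot in the dynamic range */
    	/* The IDT slot update is globally visible before delivery. */
    	ipi_all_but_self(vector);
    	/* ... wait until no CPU can still receive the vector ... */
    	lapic_ipi_free(vector);
    }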
+
+static void
+native_lapic_ipi_free(int vector)
+{
+	struct gate_descriptor *ip;
+	long func;
+
+	KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
+	    ("%s: invalid vector %d", __func__, vector));
+
+	mtx_lock_spin(&icu_lock);
+	ip = &idt[vector];
+	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+	    func != (uintptr_t)&IDTVEC(rsvd_pti),
+	    ("invalid idtfunc %#lx", func));
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
+	mtx_unlock_spin(&icu_lock);
+}

Modified: trunk/sys/x86/x86/mca.c
===================================================================
--- trunk/sys/x86/x86/mca.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mca.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mca.c 314667 2017-03-04 13:03:31Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mca.c 333159 2018-05-02 07:38:38Z kib $");
 
 #ifdef __amd64__
 #define	DEV_APIC
@@ -53,7 +53,7 @@
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
@@ -76,6 +76,11 @@
 	int	max_threshold;
 	time_t	last_intr;
 };
+
+struct amd_et_state {
+	int	cur_threshold;
+	time_t	last_intr;
+};
 #endif
 
 struct mca_internal {
@@ -93,22 +98,20 @@
     "Machine Check Architecture");
 
 static int mca_enabled = 1;
-TUNABLE_INT("hw.mca.enabled", &mca_enabled);
 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
     "Administrative toggle for machine check support");
 
 static int amd10h_L1TP = 1;
-TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
 
 static int intel6h_HSD131;
-TUNABLE_INT("hw.mca.intel6h_hsd131", &intel6h_HSD131);
 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
     "Administrative toggle for logging of spurious corrected errors");
 
 int workaround_erratum383;
-SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
+SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
+    &workaround_erratum383, 0,
     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
 
 static STAILQ_HEAD(, mca_internal) mca_freelist;
@@ -121,8 +124,18 @@
 static struct mtx mca_lock;
 
 #ifdef DEV_APIC
-static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
+static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
+static struct amd_et_state *amd_et_state;	/* Indexed by cpuid. */
 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
+
+static int amd_elvt = -1;
+
+static inline bool
+amd_thresholding_supported(void)
+{
+	return (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
+}
 #endif
 
 static int
@@ -511,8 +524,8 @@
 	STAILQ_INSERT_TAIL(&mca_records, rec, link);
 	mca_count++;
 	mtx_unlock_spin(&mca_lock);
-	if (mode == CMCI)
-		taskqueue_enqueue_fast(mca_tq, &mca_refill_task);
+	if (mode == CMCI && !cold)
+		taskqueue_enqueue(mca_tq, &mca_refill_task);
 }
 
 #ifdef DEV_APIC
@@ -524,19 +537,15 @@
  * cmc_throttle seconds or the periodic scan.  If a periodic scan
  * finds that the threshold is too high, it is lowered.
  */
-static void
-cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+static int
+update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
+    int cur_threshold, int max_threshold)
 {
-	struct cmc_state *cc;
-	uint64_t ctl;
 	u_int delta;
-	int count, limit;
+	int limit;
 
-	/* Fetch the current limit for this bank. */
-	cc = &cmc_state[PCPU_GET(cpuid)][bank];
-	ctl = rdmsr(MSR_MC_CTL2(bank));
-	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
-	delta = (u_int)(time_uptime - cc->last_intr);
+	delta = (u_int)(time_uptime - last_intr);
+	limit = cur_threshold;
 
 	/*
 	 * If an interrupt was received less than cmc_throttle seconds
@@ -545,16 +554,11 @@
 	 * double the threshold up to the max.
 	 */
 	if (mode == CMCI && valid) {
-		limit = ctl & MC_CTL2_THRESHOLD;
 		if (delta < cmc_throttle && count >= limit &&
-		    limit < cc->max_threshold) {
-			limit = min(limit << 1, cc->max_threshold);
-			ctl &= ~MC_CTL2_THRESHOLD;
-			ctl |= limit;
-			wrmsr(MSR_MC_CTL2(bank), ctl);
+		    limit < max_threshold) {
+			limit = min(limit << 1, max_threshold);
 		}
-		cc->last_intr = time_uptime;
-		return;
+		return (limit);
 	}
 
 	/*
@@ -562,11 +566,11 @@
 	 * should be lowered.
 	 */
 	if (mode != POLLED)
-		return;
+		return (limit);
 
 	/* If a CMCI occurred recently, do nothing for now. */
 	if (delta < cmc_throttle)
-		return;
+		return (limit);
 
 	/*
 	 * Compute a new limit based on the average rate of events per
@@ -573,20 +577,70 @@
 	 * cmc_throttle seconds since the last interrupt.
 	 */
 	if (valid) {
-		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
 		limit = count * cmc_throttle / delta;
 		if (limit <= 0)
 			limit = 1;
-		else if (limit > cc->max_threshold)
-			limit = cc->max_threshold;
-	} else
+		else if (limit > max_threshold)
+			limit = max_threshold;
+	} else {
 		limit = 1;
-	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+	}
+	return (limit);
+}
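+
For the polled branch above, the new limit is simply count * cmc_throttle / delta, clamped to [1, max_threshold].  A small stand-alone sketch of that arithmetic (user-space and purely illustrative; the max threshold value passed in is an arbitrary assumption):

    #include <stdio.h>

    /* User-space copy of the polled-rate calculation in update_threshold(). */
    static int
    polled_limit(int count, int throttle, int delta, int max_threshold)
    {
    	int limit;

    	limit = count * throttle / delta;
    	if (limit <= 0)
    		limit = 1;
    	else if (limit > max_threshold)
    		limit = max_threshold;
    	return (limit);
    }

    int
    main(void)
    {
    	/* 30 corrected errors over 120s with a 60s throttle -> threshold 15. */
    	printf("%d\n", polled_limit(30, 60, 120, 0x7fff));
    	return (0);
    }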
+
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+	struct cmc_state *cc;
+	uint64_t ctl;
+	int cur_threshold, new_threshold;
+	int count;
+
+	/* Fetch the current limit for this bank. */
+	cc = &cmc_state[PCPU_GET(cpuid)][bank];
+	ctl = rdmsr(MSR_MC_CTL2(bank));
+	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+	cur_threshold = ctl & MC_CTL2_THRESHOLD;
+
+	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+	    cur_threshold, cc->max_threshold);
+
+	if (mode == CMCI && valid)
+		cc->last_intr = time_uptime;
+	if (new_threshold != cur_threshold) {
 		ctl &= ~MC_CTL2_THRESHOLD;
-		ctl |= limit;
+		ctl |= new_threshold;
 		wrmsr(MSR_MC_CTL2(bank), ctl);
 	}
 }
+
+static void
+amd_thresholding_update(enum scan_mode mode, int bank, int valid)
+{
+	struct amd_et_state *cc;
+	uint64_t misc;
+	int new_threshold;
+	int count;
+
+	KASSERT(bank == MC_AMDNB_BANK,
+	    ("%s: unexpected bank %d", __func__, bank));
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	misc = rdmsr(MSR_MC_MISC(bank));
+	count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
+	count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);
+
+	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+	    cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);
+
+	cc->cur_threshold = new_threshold;
+	misc &= ~MC_MISC_AMDNB_CNT_MASK;
+	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+	    << MC_MISC_AMDNB_CNT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_OVERFLOW;
+	wrmsr(MSR_MC_MISC(bank), misc);
+	if (mode == CMCI && valid)
+		cc->last_intr = time_uptime;
+}
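+
The MC4_MISC counter field counts up toward MC_MISC_AMDNB_CNT_MAX and is preloaded with MAX - threshold, so it overflows (and raises the configured LVT interrupt) after the chosen number of additional errors; the subtraction above recovers how many errors were counted since the preload.  A stand-alone sketch of that encoding (the 0xfff maximum here is an assumed stand-in, not the value from the header):

    #include <stdio.h>

    #define	CNT_MAX	0xfff	/* assumed stand-in for MC_MISC_AMDNB_CNT_MAX */

    int
    main(void)
    {
    	int threshold = 32;			/* interrupt after 32 errors */
    	int preload = CNT_MAX - threshold;	/* written to the counter field */
    	int raw = preload + 7;			/* hardware counted 7 errors since */

    	/* Mirrors the recovery step in amd_thresholding_update(). */
    	printf("errors since preload: %d\n", raw - (CNT_MAX - threshold));
    	return (0);
    }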
 #endif
 
 /*
@@ -600,7 +654,7 @@
  * count of the number of valid MC records found.
  */
 static int
-mca_scan(enum scan_mode mode)
+mca_scan(enum scan_mode mode, int *recoverablep)
 {
 	struct mca_record rec;
 	uint64_t mcg_cap, ucmask;
@@ -641,13 +695,19 @@
 		 * If this is a bank this CPU monitors via CMCI,
 		 * update the threshold.
 		 */
-		if (PCPU_GET(cmci_mask) & 1 << i)
-			cmci_update(mode, i, valid, &rec);
+		if (PCPU_GET(cmci_mask) & 1 << i) {
+			if (cmc_state != NULL)
+				cmci_update(mode, i, valid, &rec);
+			else
+				amd_thresholding_update(mode, i, valid);
+		}
 #endif
 	}
 	if (mode == POLLED)
 		mca_fill_freelist();
-	return (mode == MCE ? recoverable : count);
+	if (recoverablep != NULL)
+		*recoverablep = recoverable;
+	return (count);
 }
 
 /*
@@ -669,7 +729,7 @@
 	CPU_FOREACH(cpu) {
 		sched_bind(td, cpu);
 		thread_unlock(td);
-		count += mca_scan(POLLED);
+		count += mca_scan(POLLED, NULL);
 		thread_lock(td);
 		sched_unbind(td);
 	}
@@ -690,7 +750,7 @@
 mca_periodic_scan(void *arg)
 {
 
-	taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+	taskqueue_enqueue(mca_tq, &mca_scan_task);
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
 
@@ -704,7 +764,7 @@
 	if (error)
 		return (error);
 	if (i)
-		taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+		taskqueue_enqueue(mca_tq, &mca_scan_task);
 	return (0);
 }
 
@@ -717,6 +777,9 @@
 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
 	    taskqueue_thread_enqueue, &mca_tq);
 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
+
+	/* CMCIs during boot may have claimed items from the freelist. */
+	mca_fill_freelist();
 }
 SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
 
@@ -729,7 +792,11 @@
 
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
+#ifdef EARLY_AP_STARTUP
+SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
+#else
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
+#endif
 
 #ifdef DEV_APIC
 static void
@@ -747,6 +814,18 @@
 	    &cmc_throttle, 0, sysctl_positive_int, "I",
 	    "Interval in seconds to throttle corrected MC interrupts");
 }
+
+static void
+amd_thresholding_setup(void)
+{
+
+	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
+	    M_MCA, M_WAITOK | M_ZERO);
+	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	    &cmc_throttle, 0, sysctl_positive_int, "I",
+	    "Interval in seconds to throttle corrected MC interrupts");
+}
 #endif
 
 static void
@@ -785,6 +864,8 @@
 #ifdef DEV_APIC
 	if (mcg_cap & MCG_CAP_CMCI_P)
 		cmci_setup();
+	else if (amd_thresholding_supported())
+		amd_thresholding_setup();
 #endif
 }
 
@@ -859,6 +940,82 @@
 	ctl |= MC_CTL2_CMCI_EN | 1;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 }
+
+static void
+amd_thresholding_start(struct amd_et_state *cc)
+{
+	uint64_t misc;
+
+	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
+	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+	misc &= ~MC_MISC_AMDNB_INT_MASK;
+	misc |= MC_MISC_AMDNB_INT_LVT;
+	misc &= ~MC_MISC_AMDNB_LVT_MASK;
+	misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_CNT_MASK;
+	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+	    << MC_MISC_AMDNB_CNT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_OVERFLOW;
+	misc |= MC_MISC_AMDNB_CNTEN;
+
+	wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
+}
+
+static void
+amd_thresholding_init(void)
+{
+	struct amd_et_state *cc;
+	uint64_t misc;
+
+	/* The counter must be valid and present. */
+	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+	if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
+	    (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
+		return;
+
+	/* The register should not be locked. */
+	if ((misc & MC_MISC_AMDNB_LOCK) != 0)
+		return;
+
+	/*
+	 * If counter is enabled then either the firmware or another CPU
+	 * has already claimed it.
+	 */
+	if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
+		return;
+
+	/*
+	 * Configure an Extended Interrupt LVT register for reporting
+	 * counter overflows if that feature is supported and the first
+	 * extended register is available.
+	 */
+	amd_elvt = lapic_enable_mca_elvt();
+	if (amd_elvt < 0)
+		return;
+
+	/* Re-use Intel CMC support infrastructure. */
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	cc->cur_threshold = 1;
+	amd_thresholding_start(cc);
+
+	/* Mark the NB bank as monitored. */
+	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
+}
+
+static void
+amd_thresholding_resume(void)
+{
+	struct amd_et_state *cc;
+
+	/* Nothing to do if this CPU doesn't monitor the NB bank. */
+	if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
+		return;
+
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	cc->last_intr = 0;
+	cc->cur_threshold = 1;
+	amd_thresholding_start(cc);
+}
 #endif
 
 /*
@@ -884,7 +1041,7 @@
 		if (mcg_cap & MCG_CAP_CTL_P)
 			/* Enable MCA features. */
 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
-		if (PCPU_GET(cpuid) == 0 && boot)
+		if (IS_BSP() && boot)
 			mca_setup(mcg_cap);
 
 		/*
@@ -900,6 +1057,14 @@
 			if ((mask & (1UL << 5)) == 0)
 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
 		}
+
+		/*
+		 * cmci_monitor() must not be executed simultaneously
+		 * by several CPUs.
+		 */
+		if (boot)
+			mtx_lock_spin(&mca_lock);
+
 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
 			/* By default enable logging of all errors. */
 			ctl = 0xffffffffffffffffUL;
@@ -934,10 +1099,30 @@
 			/* Clear all errors. */
 			wrmsr(MSR_MC_STATUS(i), 0);
 		}
+		if (boot)
+			mtx_unlock_spin(&mca_lock);
 
 #ifdef DEV_APIC
-		if (PCPU_GET(cmci_mask) != 0 && boot)
+		/*
+		 * AMD processors from families 10h-16h provide support
+		 * for Machine Check Error Thresholding.  These processors
+		 * maintain counters of MC errors that can be configured
+		 * to generate an interrupt when a counter overflows.
+		 * The counters are all associated with Bank 4 and each
+		 * of them covers a group of errors reported via that bank.
+		 * At the moment only the DRAM Error Threshold Group is
+		 * supported.
+		 */
+		if (amd_thresholding_supported() &&
+		    (mcg_cap & MCG_CAP_COUNT) >= 4) {
+			if (boot)
+				amd_thresholding_init();
+			else
+				amd_thresholding_resume();
+		} else if (PCPU_GET(cmci_mask) != 0 && boot) {
 			lapic_enable_cmc();
+		}
 #endif
 	}
 
@@ -978,7 +1163,7 @@
 mca_intr(void)
 {
 	uint64_t mcg_status;
-	int old_count, recoverable;
+	int recoverable, count;
 
 	if (!(cpu_feature & CPUID_MCA)) {
 		/*
@@ -992,8 +1177,7 @@
 	}
 
 	/* Scan the banks and check for any non-recoverable errors. */
-	old_count = mca_count;
-	recoverable = mca_scan(MCE);
+	count = mca_scan(MCE, &recoverable);
 	mcg_status = rdmsr(MSR_MCG_STATUS);
 	if (!(mcg_status & MCG_STATUS_RIPV))
 		recoverable = 0;
@@ -1000,12 +1184,11 @@
 
 	if (!recoverable) {
 		/*
-		 * Wait for at least one error to be logged before
-		 * panic'ing.  Some errors will assert a machine check
-		 * on all CPUs, but only certain CPUs will find a valid
-		 * bank to log.
+		 * Only panic if the error was detected local to this CPU.
+		 * Some errors will assert a machine check on all CPUs, but
+		 * only certain CPUs will find a valid bank to log.
 		 */
-		while (mca_count == old_count)
+		while (count == 0)
 			cpu_spinwait();
 
 		panic("Unrecoverable machine check exception");
@@ -1027,7 +1210,7 @@
 	 * Serialize MCA bank scanning to prevent collisions from
 	 * sibling threads.
 	 */
-	count = mca_scan(CMCI);
+	count = mca_scan(CMCI, NULL);
 
 	/* If we found anything, log them to the console. */
 	if (count != 0) {

Added: trunk/sys/x86/x86/mp_watchdog.c
===================================================================
--- trunk/sys/x86/x86/mp_watchdog.c	                        (rev 0)
+++ trunk/sys/x86/x86/mp_watchdog.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,211 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2004 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/x86/mp_watchdog.c 303912 2016-08-10 13:38:44Z kib $
+ */
+
+#include "opt_mp_watchdog.h"
+#include "opt_sched.h"
+
+#ifdef SCHED_ULE
+#error MP_WATCHDOG cannot currently be used with SCHED_ULE
+#endif
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+#include <machine/mp_watchdog.h>
+
+/*
+ * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work
+ * from being scheduled there, and uses it as a "watchdog" to detect kernel
+ * failure on other CPUs.  This is made reasonable by inclusion of logical
+ * processors in Xeon hardware.  The watchdog is configured by setting the
+ * debug.watchdog sysctl/tunable to the CPU of interest.  A callout will then
+ * begin executing, resetting a timer that is gradually lowered by the watching
+ * thread.  If the timer reaches 0, the watchdog fires by either dropping
+ * directly to the debugger, or by sending an NMI IPI to the boot processor.
+ * This is a somewhat less efficient substitute for dedicated watchdog
+ * hardware, but can be quite an effective tool for debugging hangs.
+ *
+ * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but
+ * doesn't yet.
+ */
+static int	watchdog_cpu = -1;
+static int	watchdog_dontfire = 1;
+static int	watchdog_timer = -1;
+static int	watchdog_nmi = 1;
+
+SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RWTUN, &watchdog_nmi, 0,
+    "IPI the boot processor with an NMI to enter the debugger");
+
+static struct callout	watchdog_callout;
+
+static void watchdog_change(int wdcpu);
+
+/*
+ * Number of seconds before the watchdog will fire if the callout fails to
+ * reset the timer.
+ */
+#define	WATCHDOG_THRESHOLD	10
+
+static void
+watchdog_init(void *arg)
+{
+
+	callout_init(&watchdog_callout, 1);
+	if (watchdog_cpu != -1)
+		watchdog_change(watchdog_cpu);
+}
+
+/*
+ * This callout resets a timer until the watchdog kicks in.  It acquires some
+ * critical locks to make sure things haven't gotten wedged with those locks
+ * held.
+ */
+static void
+watchdog_function(void *arg)
+{
+
+	/*
+	 * Since the timer ran, we must not be wedged.  Acquire some critical
+	 * locks to make sure.  Then reset the timer.
+	 */
+	mtx_lock(&Giant);
+	watchdog_timer = WATCHDOG_THRESHOLD;
+	mtx_unlock(&Giant);
+	callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL);
+}
+SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL);
+
+static void
+watchdog_change(int wdcpu)
+{
+
+	if (wdcpu == -1 || wdcpu == 0xffffffff) {
+		/*
+		 * Disable the watchdog.
+		 */
+		watchdog_cpu = -1;
+		watchdog_dontfire = 1;
+		callout_stop(&watchdog_callout);
+		printf("watchdog stopped\n");
+	} else {
+		watchdog_timer = WATCHDOG_THRESHOLD;
+		watchdog_dontfire = 0;
+		watchdog_cpu = wdcpu;
+		callout_reset(&watchdog_callout, 1 * hz, watchdog_function,
+		    NULL);
+	}
+}
+
+/*
+ * This sysctl sets which CPU is the watchdog CPU.  Set to -1 or 0xffffffff
+ * to disable the watchdog.
+ */
+static int
+sysctl_watchdog(SYSCTL_HANDLER_ARGS)
+{
+	int error, temp;
+
+	temp = watchdog_cpu;
+	error = sysctl_handle_int(oidp, &temp, 0, req);
+	if (error)
+		return (error);
+
+	if (req->newptr != NULL)
+		watchdog_change(temp);
+	return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+    sysctl_watchdog, "I", "");
+
+/*
+ * Drop into the debugger by sending an IPI NMI to the boot processor.
+ */
+static void
+watchdog_ipi_nmi(void)
+{
+
+	/*
+	 * Deliver NMI to the boot processor.  Why not?
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI,
+	    boot_cpu_id);
+	lapic_ipi_wait(-1);
+}
+
+/*
+ * ap_watchdog() is called by the SMP idle loop code.  It works on the same
+ * premise that the disabling of logical processors does: that if the cpu is
+ * idle, then it can ignore the world from then on, as nothing will be
+ * scheduled on it.  Leaving aside multi-runqueue schedulers (SCHED_ULE) and
+ * explicit process migration (sched_bind()), this is not an unreasonable
+ * assumption.
+ */
+void
+ap_watchdog(u_int cpuid)
+{
+	char old_pcomm[MAXCOMLEN + 1];
+	struct proc *p;
+
+	if (watchdog_cpu != cpuid)
+		return;
+
+	printf("watchdog started on cpu %d\n", cpuid);
+	p = curproc;
+	bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1);
+	snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid);
+	while (1) {
+		DELAY(1000000);				/* One second. */
+		if (watchdog_cpu != cpuid)
+			break;
+		atomic_subtract_int(&watchdog_timer, 1);
+		if (watchdog_timer < 4)
+			printf("Watchdog timer: %d\n", watchdog_timer);
+		if (watchdog_timer == 0 && watchdog_dontfire == 0) {
+			printf("Watchdog firing!\n");
+			watchdog_dontfire = 1;
+			if (watchdog_nmi)
+				watchdog_ipi_nmi();
+			else
+				kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog");
+		}
+	}
+	bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1);
+	printf("watchdog stopped on cpu %d\n", cpuid);
+}
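+
To make the timing above concrete, here is a hedged user-space simulation of the interplay between watchdog_function() (which reloads the timer every second while the system is healthy) and the ap_watchdog() loop (which decrements it once per second); the wedge at second 3 is an assumption for illustration:

    #include <stdio.h>

    #define	WATCHDOG_THRESHOLD	10	/* same value as above */

    int
    main(void)
    {
    	int timer = WATCHDOG_THRESHOLD;
    	int second, callout_alive = 1;

    	for (second = 1; timer > 0; second++) {
    		if (second == 3)		/* assume Giant wedges here */
    			callout_alive = 0;
    		if (callout_alive)
    			timer = WATCHDOG_THRESHOLD;	/* watchdog_function() */
    		timer--;				/* ap_watchdog() tick */
    	}
    	/* Fires roughly WATCHDOG_THRESHOLD seconds after the last reload. */
    	printf("watchdog would fire at second %d\n", second - 1);
    	return (0);
    }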


Property changes on: trunk/sys/x86/x86/mp_watchdog.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/mp_x86.c
===================================================================
--- trunk/sys/x86/x86/mp_x86.c	                        (rev 0)
+++ trunk/sys/x86/x86/mp_x86.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1640 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * Copyright (c) 2003, by Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 349958 2019-07-12 22:31:12Z jhb $");
+
+#ifdef __i386__
+#include "opt_apic.h"
+#endif
+#include "opt_cpu.h"
+#include "opt_kstack_pages.h"
+#include "opt_pmap.h"
+#include "opt_sched.h"
+#include "opt_smp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cons.h>	/* cngetc() */
+#include <sys/cpuset.h>
+#ifdef GPROF 
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
+#include <x86/apicreg.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <x86/ucode.h>
+
+/* lock region used by kernel profiling */
+int	mcount_lock;
+
+int	mp_naps;		/* # of application processors */
+int	boot_cpu_id = -1;	/* designated BSP */
+
+extern	struct pcpu __pcpu[];
+
+/* AP uses this during bootstrap.  Do not staticize.  */
+char *bootSTK;
+int bootAP;
+
+/* Free these after use */
+void *bootstacks[MAXCPU];
+void *dpcpu;
+
+struct pcb stoppcbs[MAXCPU];
+struct susppcb **susppcbs;
+
+#ifdef COUNT_IPIS
+/* Interrupt counts. */
+static u_long *ipi_preempt_counts[MAXCPU];
+static u_long *ipi_ast_counts[MAXCPU];
+u_long *ipi_invltlb_counts[MAXCPU];
+u_long *ipi_invlrng_counts[MAXCPU];
+u_long *ipi_invlpg_counts[MAXCPU];
+u_long *ipi_invlcache_counts[MAXCPU];
+u_long *ipi_rendezvous_counts[MAXCPU];
+static u_long *ipi_hardclock_counts[MAXCPU];
+#endif
+
+/* Default cpu_ops implementation. */
+struct cpu_ops cpu_ops;
+
+/*
+ * Local data and functions.
+ */
+
+static volatile cpuset_t ipi_stop_nmi_pending;
+
+volatile cpuset_t resuming_cpus;
+volatile cpuset_t toresume_cpus;
+
+/* used to hold the AP's until we are ready to release them */
+struct mtx ap_boot_mtx;
+
+/* Set to 1 once we're ready to let the APs out of the pen. */
+volatile int aps_ready = 0;
+
+/*
+ * Store data from cpu_add() until later in the boot when we actually setup
+ * the APs.
+ */
+struct cpu_info cpu_info[MAX_APIC_ID + 1];
+int apic_cpuids[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
+
+/* Holds pending bitmap based IPIs per CPU */
+volatile u_int cpu_ipi_pending[MAXCPU];
+
+static void	release_aps(void *dummy);
+static void	cpustop_handler_post(u_int cpu);
+
+static int	hyperthreading_allowed = 1;
+SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
+	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
+
+static struct topo_node topo_root;
+
+static int pkg_id_shift;
+static int core_id_shift;
+static int disabled_cpus;
+
+struct cache_info {
+	int	id_shift;
+	int	present;
+} static caches[MAX_CACHE_LEVELS];
+
+void
+mem_range_AP_init(void)
+{
+
+	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
+		mem_range_softc.mr_op->initAP(&mem_range_softc);
+}
+
+/*
+ * Round up to the next power of two, if necessary, and then
+ * take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+mask_width(u_int x)
+{
+
+	return (fls(x << (1 - powerof2(x))) - 1);
+}
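+
A quick hedged illustration of mask_width(): it returns the number of APIC-ID bits needed to distinguish x entities, i.e. ceil(log2(x)), or -1 for zero.  A user-space copy for experimentation (fls(3) comes from <strings.h> on FreeBSD; powerof2() is expanded inline here):

    #include <stdio.h>
    #include <strings.h>	/* fls(3) */

    /* Stand-alone copy of mask_width(). */
    static int
    mask_width(unsigned int x)
    {
    	int pow2 = (((x - 1) & x) == 0);	/* powerof2(x); true for 0 too */

    	return (fls(x << (1 - pow2)) - 1);
    }

    int
    main(void)
    {
    	/* 0 -> -1, 1 -> 0, 4 -> 2, 6 -> 3: six threads need three ID bits. */
    	printf("%d %d %d %d\n", mask_width(0), mask_width(1),
    	    mask_width(4), mask_width(6));
    	return (0);
    }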
+
+/*
+ * Add a cache level to the cache topology description.
+ */
+static int
+add_deterministic_cache(int type, int level, int share_count)
+{
+
+	if (type == 0)
+		return (0);
+	if (type > 3) {
+		printf("unexpected cache type %d\n", type);
+		return (1);
+	}
+	if (type == 2) /* ignore instruction cache */
+		return (1);
+	if (level == 0 || level > MAX_CACHE_LEVELS) {
+		printf("unexpected cache level %d\n", level);
+		return (1);
+	}
+
+	if (caches[level - 1].present) {
+		printf("WARNING: multiple entries for L%u data cache\n", level);
+		printf("%u => %u\n", caches[level - 1].id_shift,
+		    mask_width(share_count));
+	}
+	caches[level - 1].id_shift = mask_width(share_count);
+	caches[level - 1].present = 1;
+
+	if (caches[level - 1].id_shift > pkg_id_shift) {
+		printf("WARNING: L%u data cache covers more "
+		    "APIC IDs than a package\n", level);
+		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
+		caches[level - 1].id_shift = pkg_id_shift;
+	}
+	if (caches[level - 1].id_shift < core_id_shift) {
+		printf("WARNING: L%u data cache covers less "
+		    "APIC IDs than a core\n", level);
+		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
+		caches[level - 1].id_shift = core_id_shift;
+	}
+
+	return (1);
+}
+
+/*
+ * Determine topology of processing units and caches for AMD CPUs.
+ * See:
+ *  - AMD CPUID Specification (Publication # 25481)
+ *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
+ *  - BKDG For AMD Family 10h Processors (Publication # 31116)
+ *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
+ *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
+ *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
+ */
+static void
+topo_probe_amd(void)
+{
+	u_int p[4];
+	uint64_t v;
+	int level;
+	int nodes_per_socket;
+	int share_count;
+	int type;
+	int i;
+
+	/* No multi-core capability. */
+	if ((amd_feature2 & AMDID2_CMP) == 0)
+		return;
+
+	/* For families 10h and newer. */
+	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
+	    AMDID_COREID_SIZE_SHIFT;
+
+	/* For 0Fh family. */
+	if (pkg_id_shift == 0)
+		pkg_id_shift =
+		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
+
+	/*
+	 * Families prior to 16h define the following value as
+	 * cores per compute unit and we don't really care about the AMD
+	 * compute units at the moment.  Perhaps we should treat them as
+	 * cores and cores within the compute units as hardware threads,
+	 * but that's up for debate.
+	 * Later families define the value as threads per compute unit,
+	 * so we are following AMD's nomenclature here.
+	 */
+	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
+	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
+		cpuid_count(0x8000001e, 0, p);
+		share_count = ((p[1] >> 8) & 0xff) + 1;
+		core_id_shift = mask_width(share_count);
+	}
+
+	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
+		for (i = 0; ; i++) {
+			cpuid_count(0x8000001d, i, p);
+			type = p[0] & 0x1f;
+			level = (p[0] >> 5) & 0x7;
+			share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+			if (!add_deterministic_cache(type, level, share_count))
+				break;
+		}
+	} else {
+		if (cpu_exthigh >= 0x80000005) {
+			cpuid_count(0x80000005, 0, p);
+			if (((p[2] >> 24) & 0xff) != 0) {
+				caches[0].id_shift = 0;
+				caches[0].present = 1;
+			}
+		}
+		if (cpu_exthigh >= 0x80000006) {
+			cpuid_count(0x80000006, 0, p);
+			if (((p[2] >> 16) & 0xffff) != 0) {
+				caches[1].id_shift = 0;
+				caches[1].present = 1;
+			}
+			if (((p[3] >> 18) & 0x3fff) != 0) {
+				nodes_per_socket = 1;
+				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
+					/*
+					 * Handle multi-node processors that
+					 * have multiple chips, each with its
+					 * own L3 cache, on the same die.
+					 */
+					v = rdmsr(0xc001100c);
+					nodes_per_socket = 1 + ((v >> 3) & 0x7);
+				}
+				caches[2].id_shift =
+				    pkg_id_shift - mask_width(nodes_per_socket);
+				caches[2].present = 1;
+			}
+		}
+	}
+}
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 1 and Leaf 4, if supported.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
+ *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0x4(void)
+{
+	u_int p[4];
+	int max_cores;
+	int max_logical;
+
+	/* Both zero and one here mean one logical processor per package. */
+	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
+	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
+	if (max_logical <= 1)
+		return;
+
+	if (cpu_high >= 0x4) {
+		cpuid_count(0x04, 0, p);
+		max_cores = ((p[0] >> 26) & 0x3f) + 1;
+	} else
+		max_cores = 1;
+
+	core_id_shift = mask_width(max_logical/max_cores);
+	KASSERT(core_id_shift >= 0,
+	    ("intel topo: max_cores > max_logical\n"));
+	pkg_id_shift = core_id_shift + mask_width(max_cores);
+}
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 11, if supported.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
+ *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0xb(void)
+{
+	u_int p[4];
+	int bits;
+	int type;
+	int i;
+
+	/* Fall back if CPU leaf 11 doesn't really exist. */
+	cpuid_count(0x0b, 0, p);
+	if (p[1] == 0) {
+		topo_probe_intel_0x4();
+		return;
+	}
+
+	/* We only support three levels for now. */
+	for (i = 0; ; i++) {
+		cpuid_count(0x0b, i, p);
+
+		bits = p[0] & 0x1f;
+		type = (p[2] >> 8) & 0xff;
+
+		if (type == 0)
+			break;
+
+		/* TODO: check for duplicate (re-)assignment */
+		if (type == CPUID_TYPE_SMT)
+			core_id_shift = bits;
+		else if (type == CPUID_TYPE_CORE)
+			pkg_id_shift = bits;
+		else
+			printf("unknown CPU level type %d\n", type);
+	}
+
+	if (pkg_id_shift < core_id_shift) {
+		printf("WARNING: core covers more APIC IDs than a package\n");
+		core_id_shift = pkg_id_shift;
+	}
+}
+
+/*
+ * Determine topology of caches for Intel CPUs.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer’s Manual
+ *    Volume 2A: Instruction Set Reference, A-M,
+ *    CPUID instruction
+ */
+static void
+topo_probe_intel_caches(void)
+{
+	u_int p[4];
+	int level;
+	int share_count;
+	int type;
+	int i;
+
+	if (cpu_high < 0x4) {
+		/*
+		 * Available cache level and sizes can be determined
+		 * via CPUID leaf 2, but that requires a huge table of hardcoded
+		 * values, so for now just assume L1 and L2 caches potentially
+		 * shared only by HTT processing units, if HTT is present.
+		 */
+		caches[0].id_shift = pkg_id_shift;
+		caches[0].present = 1;
+		caches[1].id_shift = pkg_id_shift;
+		caches[1].present = 1;
+		return;
+	}
+
+	for (i = 0; ; i++) {
+		cpuid_count(0x4, i, p);
+		type = p[0] & 0x1f;
+		level = (p[0] >> 5) & 0x7;
+		share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+		if (!add_deterministic_cache(type, level, share_count))
+			break;
+	}
+}
+
+/*
+ * Determine topology of processing units and caches for Intel CPUs.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ */
+static void
+topo_probe_intel(void)
+{
+
+	/*
+	 * Note that 0x1 <= cpu_high < 4 case should be
+	 * compatible with topo_probe_intel_0x4() logic when
+	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
+	 * or it should trigger the fallback otherwise.
+	 */
+	if (cpu_high >= 0xb)
+		topo_probe_intel_0xb();
+	else if (cpu_high >= 0x1)
+		topo_probe_intel_0x4();
+
+	topo_probe_intel_caches();
+}
+
+/*
+ * Topology information is queried only on the BSP, on which this
+ * code runs and for which it can query CPUID information.
+ * Then topology is extrapolated on all packages using an
+ * assumption that the APIC ID to hardware component ID mapping is
+ * homogeneous.
+ * That doesn't necessarily imply that the topology is uniform.
+ */
+void
+topo_probe(void)
+{
+	static int cpu_topo_probed = 0;
+	struct x86_topo_layer {
+		int type;
+		int subtype;
+		int id_shift;
+	} topo_layers[MAX_CACHE_LEVELS + 3];
+	struct topo_node *parent;
+	struct topo_node *node;
+	int layer;
+	int nlayers;
+	int node_id;
+	int i;
+
+	if (cpu_topo_probed)
+		return;
+
+	CPU_ZERO(&logical_cpus_mask);
+
+	if (mp_ncpus <= 1)
+		; /* nothing */
+	else if (cpu_vendor_id == CPU_VENDOR_AMD)
+		topo_probe_amd();
+	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
+		topo_probe_intel();
+
+	KASSERT(pkg_id_shift >= core_id_shift,
+	    ("bug in APIC topology discovery"));
+
+	nlayers = 0;
+	bzero(topo_layers, sizeof(topo_layers));
+
+	topo_layers[nlayers].type = TOPO_TYPE_PKG;
+	topo_layers[nlayers].id_shift = pkg_id_shift;
+	if (bootverbose)
+		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
+	nlayers++;
+
+	/*
+	 * Consider all caches to be within a package/chip
+	 * and "in front" of all sub-components like
+	 * cores and hardware threads.
+	 */
+	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
+		if (caches[i].present) {
+			KASSERT(caches[i].id_shift <= pkg_id_shift,
+				("bug in APIC topology discovery"));
+			KASSERT(caches[i].id_shift >= core_id_shift,
+				("bug in APIC topology discovery"));
+
+			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
+			topo_layers[nlayers].subtype = i + 1;
+			topo_layers[nlayers].id_shift = caches[i].id_shift;
+			if (bootverbose)
+				printf("L%u cache ID shift: %u\n",
+				    topo_layers[nlayers].subtype,
+				    topo_layers[nlayers].id_shift);
+			nlayers++;
+		}
+	}
+
+	if (pkg_id_shift > core_id_shift) {
+		topo_layers[nlayers].type = TOPO_TYPE_CORE;
+		topo_layers[nlayers].id_shift = core_id_shift;
+		if (bootverbose)
+			printf("Core ID shift: %u\n",
+			    topo_layers[nlayers].id_shift);
+		nlayers++;
+	}
+
+	topo_layers[nlayers].type = TOPO_TYPE_PU;
+	topo_layers[nlayers].id_shift = 0;
+	nlayers++;
+
+	topo_init_root(&topo_root);
+	for (i = 0; i <= MAX_APIC_ID; ++i) {
+		if (!cpu_info[i].cpu_present)
+			continue;
+
+		parent = &topo_root;
+		for (layer = 0; layer < nlayers; ++layer) {
+			node_id = i >> topo_layers[layer].id_shift;
+			parent = topo_add_node_by_hwid(parent, node_id,
+			    topo_layers[layer].type,
+			    topo_layers[layer].subtype);
+		}
+	}
+
+	parent = &topo_root;
+	for (layer = 0; layer < nlayers; ++layer) {
+		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
+		node = topo_find_node_by_hwid(parent, node_id,
+		    topo_layers[layer].type,
+		    topo_layers[layer].subtype);
+		topo_promote_child(node);
+		parent = node;
+	}
+
+	cpu_topo_probed = 1;
+}
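+
As a hedged example of how the discovered ID shifts partition an APIC ID (the values pkg_id_shift = 4 and core_id_shift = 1 are assumptions, roughly what an 8-core/16-thread package would yield; the APIC ID 0x1b is likewise made up):

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned int apic_id = 0x1b;		 /* assumed example value */
    	int pkg_id_shift = 4, core_id_shift = 1; /* assumed discovered shifts */
    	unsigned int core_mask = (1u << (pkg_id_shift - core_id_shift)) - 1;
    	unsigned int smt_mask = (1u << core_id_shift) - 1;

    	/* Package, core within the package, SMT thread within the core. */
    	printf("pkg %u core %u thread %u\n", apic_id >> pkg_id_shift,
    	    (apic_id >> core_id_shift) & core_mask, apic_id & smt_mask);
    	return (0);
    }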
+
+/*
+ * Assign logical CPU IDs to local APICs.
+ */
+void
+assign_cpu_ids(void)
+{
+	struct topo_node *node;
+	u_int smt_mask;
+
+	smt_mask = (1u << core_id_shift) - 1;
+
+	/*
+	 * Assign CPU IDs to local APIC IDs and disable any CPUs
+	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
+	 */
+	mp_ncpus = 0;
+	TOPO_FOREACH(node, &topo_root) {
+		if (node->type != TOPO_TYPE_PU)
+			continue;
+
+		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
+			cpu_info[node->hwid].cpu_hyperthread = 1;
+
+		if (resource_disabled("lapic", node->hwid)) {
+			if (node->hwid != boot_cpu_id)
+				cpu_info[node->hwid].cpu_disabled = 1;
+			else
+				printf("Cannot disable BSP, APIC ID = %d\n",
+				    node->hwid);
+		}
+
+		if (!hyperthreading_allowed &&
+		    cpu_info[node->hwid].cpu_hyperthread)
+			cpu_info[node->hwid].cpu_disabled = 1;
+
+		if (mp_ncpus >= MAXCPU)
+			cpu_info[node->hwid].cpu_disabled = 1;
+
+		if (cpu_info[node->hwid].cpu_disabled) {
+			disabled_cpus++;
+			continue;
+		}
+
+		cpu_apic_ids[mp_ncpus] = node->hwid;
+		apic_cpuids[node->hwid] = mp_ncpus;
+		topo_set_pu_id(node, mp_ncpus);
+		mp_ncpus++;
+	}
+
+	KASSERT(mp_maxid >= mp_ncpus - 1,
+	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+	    mp_ncpus));
+}
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+	struct topo_node *node;
+	const char *hyperthread;
+	int pkg_count;
+	int cores_per_pkg;
+	int thrs_per_core;
+
+	printf("FreeBSD/SMP: ");
+	if (topo_analyze(&topo_root, 1, &pkg_count,
+	    &cores_per_pkg, &thrs_per_core)) {
+		printf("%d package(s)", pkg_count);
+		if (cores_per_pkg > 0)
+			printf(" x %d core(s)", cores_per_pkg);
+		if (thrs_per_core > 1)
+		    printf(" x %d hardware threads", thrs_per_core);
+	} else {
+		printf("Non-uniform topology");
+	}
+	printf("\n");
+
+	if (disabled_cpus) {
+		printf("FreeBSD/SMP Online: ");
+		if (topo_analyze(&topo_root, 0, &pkg_count,
+		    &cores_per_pkg, &thrs_per_core)) {
+			printf("%d package(s)", pkg_count);
+			if (cores_per_pkg > 0)
+				printf(" x %d core(s)", cores_per_pkg);
+			if (thrs_per_core > 1)
+			    printf(" x %d hardware threads", thrs_per_core);
+		} else {
+			printf("Non-uniform topology");
+		}
+		printf("\n");
+	}
+
+	if (!bootverbose)
+		return;
+
+	TOPO_FOREACH(node, &topo_root) {
+		switch (node->type) {
+		case TOPO_TYPE_PKG:
+			printf("Package HW ID = %u (%#x)\n",
+			    node->hwid, node->hwid);
+			break;
+		case TOPO_TYPE_CORE:
+			printf("\tCore HW ID = %u (%#x)\n",
+			    node->hwid, node->hwid);
+			break;
+		case TOPO_TYPE_PU:
+			if (cpu_info[node->hwid].cpu_hyperthread)
+				hyperthread = "/HT";
+			else
+				hyperthread = "";
+
+			if (node->subtype == 0)
+				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
+				    "(disabled)\n", hyperthread, node->hwid,
+				    node->hwid);
+			else if (node->id == 0)
+				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
+				    node->hwid, node->hwid);
+			else
+				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
+				    node->id, hyperthread, node->hwid,
+				    node->hwid);
+			break;
+		default:
+			/* ignored */
+			break;
+		}
+	}
+}
+
+/*
+ * Add a scheduling group, a group of logical processors sharing
+ * a particular cache (and thus having an affinity), to the scheduling
+ * topology.
+ * This function recursively works on lower level caches.
+ */
+static void
+x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
+{
+	struct topo_node *node;
+	int nchildren;
+	int ncores;
+	int i;
+
+	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
+	    ("x86topo_add_sched_group: bad type: %u", root->type));
+	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
+	cg_root->cg_count = root->cpu_count;
+	if (root->type == TOPO_TYPE_SYSTEM)
+		cg_root->cg_level = CG_SHARE_NONE;
+	else
+		cg_root->cg_level = root->subtype;
+
+	/*
+	 * Check how many core nodes we have under the given root node.
+	 * If we have multiple logical processors, but not multiple
+	 * cores, then those processors must be hardware threads.
+	 */
+	ncores = 0;
+	node = root;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CORE) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+
+		ncores++;
+		node = topo_next_nonchild_node(root, node);
+	}
+
+	if (cg_root->cg_level != CG_SHARE_NONE &&
+	    root->cpu_count > 1 && ncores < 2)
+		cg_root->cg_flags = CG_FLAG_SMT;
+
+	/*
+	 * Find out how many cache nodes we have under the given root node.
+	 * We ignore cache nodes that cover all the same processors as the
+	 * root node.  Also, we do not descend below found cache nodes.
+	 * That is, we count top-level "non-redundant" caches under the root
+	 * node.
+	 */
+	nchildren = 0;
+	node = root;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CACHE ||
+		    (root->type != TOPO_TYPE_SYSTEM &&
+		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+		nchildren++;
+		node = topo_next_nonchild_node(root, node);
+	}
+
+	cg_root->cg_child = smp_topo_alloc(nchildren);
+	cg_root->cg_children = nchildren;
+
+	/*
+	 * Now find again the same cache nodes as above and recursively
+	 * build scheduling topologies for them.
+	 */
+	node = root;
+	i = 0;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CACHE ||
+		    (root->type != TOPO_TYPE_SYSTEM &&
+		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+		cg_root->cg_child[i].cg_parent = cg_root;
+		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
+		i++;
+		node = topo_next_nonchild_node(root, node);
+	}
+}
+
+/*
+ * Build the MI scheduling topology from the discovered hardware topology.
+ */
+struct cpu_group *
+cpu_topo(void)
+{
+	struct cpu_group *cg_root;
+
+	if (mp_ncpus <= 1)
+		return (smp_topo_none());
+
+	cg_root = smp_topo_alloc(1);
+	x86topo_add_sched_group(&topo_root, cg_root);
+	return (cg_root);
+}
+
+
+/*
+ * Add a logical CPU to the topology.
+ */
+void
+cpu_add(u_int apic_id, char boot_cpu)
+{
+
+	if (apic_id > MAX_APIC_ID) {
+		panic("SMP: APIC ID %d too high", apic_id);
+		return;
+	}
+	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
+	    apic_id));
+	cpu_info[apic_id].cpu_present = 1;
+	if (boot_cpu) {
+		KASSERT(boot_cpu_id == -1,
+		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
+		    boot_cpu_id));
+		boot_cpu_id = apic_id;
+		cpu_info[apic_id].cpu_bsp = 1;
+	}
+	if (mp_ncpus < MAXCPU) {
+		mp_ncpus++;
+		mp_maxid = mp_ncpus - 1;
+	}
+	if (bootverbose)
+		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
+		    "AP");
+}
+
+void
+cpu_mp_setmaxid(void)
+{
+
+	/*
+	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
+	 * If there were no calls to cpu_add() assume this is a UP system.
+	 */
+	if (mp_ncpus == 0)
+		mp_ncpus = 1;
+}
+
+int
+cpu_mp_probe(void)
+{
+
+	/*
+	 * Always record BSP in CPU map so that the mbuf init code works
+	 * correctly.
+	 */
+	CPU_SETOF(0, &all_cpus);
+	return (mp_ncpus > 1);
+}
+
+/*
+ * AP CPU's call this to initialize themselves.
+ */
+void
+init_secondary_tail(void)
+{
+	u_int cpuid;
+
+	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
+
+	/*
+	 * On real hardware, switch to x2apic mode if possible.  Do it
+	 * after aps_ready was signalled, to avoid manipulating the
+	 * mode while BSP might still want to send some IPI to us
+	 * (second startup IPI is ignored on modern hardware etc).
+	 */
+	lapic_xapic_mode();
+
+	/* Initialize the PAT MSR. */
+	pmap_init_pat();
+
+	/* set up CPU registers and state */
+	cpu_setregs();
+
+	/* set up SSE/NX */
+	initializecpu();
+
+	/* set up FPU state on the AP */
+#ifdef __amd64__
+	fpuinit();
+#else
+	npxinit(false);
+#endif
+
+	if (cpu_ops.cpu_init)
+		cpu_ops.cpu_init();
+
+	/* A quick check from sanity claus */
+	cpuid = PCPU_GET(cpuid);
+	if (PCPU_GET(apic_id) != lapic_id()) {
+		printf("SMP: cpuid = %d\n", cpuid);
+		printf("SMP: actual apic_id = %d\n", lapic_id());
+		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
+		panic("cpuid mismatch! boom!!");
+	}
+
+	/* Initialize curthread. */
+	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
+	PCPU_SET(curthread, PCPU_GET(idlethread));
+
+	mtx_lock_spin(&ap_boot_mtx);
+
+	mca_init();
+
+	/* Init local apic for irq's */
+	lapic_setup(1);
+
+	/* Set memory range attributes for this CPU to match the BSP */
+	mem_range_AP_init();
+
+	smp_cpus++;
+
+	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
+	printf("SMP: AP CPU #%d Launched!\n", cpuid);
+
+	/* Determine if we are a logical CPU. */
+	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
+		CPU_SET(cpuid, &logical_cpus_mask);
+
+	if (bootverbose)
+		lapic_dump("AP");
+
+	if (smp_cpus == mp_ncpus) {
+		/* enable IPI's, tlb shootdown, freezes etc */
+		atomic_store_rel_int(&smp_started, 1);
+	}
+
+#ifdef __amd64__
+	/*
+	 * Enable global pages TLB extension
+	 * This also implicitly flushes the TLB 
+	 */
+	load_cr4(rcr4() | CR4_PGE);
+	if (pmap_pcid_enabled)
+		load_cr4(rcr4() | CR4_PCIDE);
+	load_ds(_udatasel);
+	load_es(_udatasel);
+	load_fs(_ufssel);
+#endif
+
+	mtx_unlock_spin(&ap_boot_mtx);
+
+	/* Wait until all the AP's are up. */
+	while (atomic_load_acq_int(&smp_started) == 0)
+		ia32_pause();
+
+#ifndef EARLY_AP_STARTUP
+	/* Start per-CPU event timers. */
+	cpu_initclocks_ap();
+#endif
+
+	sched_throw(NULL);
+
+	panic("scheduler returned us to %s", __func__);
+	/* NOTREACHED */
+}
+
+/*******************************************************************
+ * local functions and data
+ */
+
+/*
+ * We tell the I/O APIC code about all the CPUs we want to receive
+ * interrupts.  If we don't want certain CPUs to receive IRQs we
+ * can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
+ */
+void
+set_interrupt_apic_ids(void)
+{
+	u_int i, apic_id;
+
+	for (i = 0; i < MAXCPU; i++) {
+		apic_id = cpu_apic_ids[i];
+		if (apic_id == -1)
+			continue;
+		if (cpu_info[apic_id].cpu_bsp)
+			continue;
+		if (cpu_info[apic_id].cpu_disabled)
+			continue;
+
+		/* Don't let hyperthreads service interrupts. */
+		if (cpu_info[apic_id].cpu_hyperthread)
+			continue;
+
+		intr_add_cpu(i);
+	}
+}
+
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
+	/*
+	 * This attempts to follow the algorithm described in the
+	 * Intel Multiprocessor Specification v1.4 in section B.4.
+	 * For each IPI, we allow the local APIC ~20us to deliver the
+	 * IPI.  If that times out, we panic.
+	 */
+
+	/*
+	 * first we do an INIT IPI: this INIT IPI might be run, resetting
+	 * and running the target CPU. OR this INIT IPI might be latched (P5
+	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
+	 * ignored.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+	lapic_ipi_wait(100);
+
+	/* Explicitly deassert the INIT IPI. */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
+	    apic_id);
+
+	DELAY(10000);		/* wait ~10mS */
+
+	/*
+	 * next we do a STARTUP IPI: the previous INIT IPI might still be
+	 * latched, (P5 bug) this 1st STARTUP would then terminate
+	 * immediately, and the previously started INIT IPI would continue. OR
+	 * the previous INIT IPI has already run. and this STARTUP IPI will
+	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
+	 * will run.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+	    vector, apic_id);
+	if (!lapic_ipi_wait(100))
+		panic("Failed to deliver first STARTUP IPI to APIC %d",
+		    apic_id);
+	DELAY(200);		/* wait ~200uS */
+
+	/*
+	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+	 * recognized after hardware RESET or INIT IPI.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+	    vector, apic_id);
+	if (!lapic_ipi_wait(100))
+		panic("Failed to deliver second STARTUP IPI to APIC %d",
+		    apic_id);
+
+	DELAY(200);		/* wait ~200uS */
+}
+
+/*
+ * Send an IPI to specified CPU handling the bitmap logic.
+ */
+void
+ipi_send_cpu(int cpu, u_int ipi)
+{
+	u_int bitmap, old_pending, new_pending;
+
+	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
+
+	if (IPI_IS_BITMAPED(ipi)) {
+		bitmap = 1 << ipi;
+		ipi = IPI_BITMAP_VECTOR;
+		do {
+			old_pending = cpu_ipi_pending[cpu];
+			new_pending = old_pending | bitmap;
+		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
+		    old_pending, new_pending));	
+		if (old_pending)
+			return;
+	}
+	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+}
+
+void
+ipi_bitmap_handler(struct trapframe frame)
+{
+	struct trapframe *oldframe;
+	struct thread *td;
+	int cpu = PCPU_GET(cpuid);
+	u_int ipi_bitmap;
+
+	critical_enter();
+	td = curthread;
+	td->td_intr_nesting_level++;
+	oldframe = td->td_intr_frame;
+	td->td_intr_frame = &frame;
+	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
+	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+#ifdef COUNT_IPIS
+		(*ipi_preempt_counts[cpu])++;
+#endif
+		sched_preempt(td);
+	}
+	if (ipi_bitmap & (1 << IPI_AST)) {
+#ifdef COUNT_IPIS
+		(*ipi_ast_counts[cpu])++;
+#endif
+		/* Nothing to do for AST */
+	}
+	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
+#ifdef COUNT_IPIS
+		(*ipi_hardclock_counts[cpu])++;
+#endif
+		hardclockintr();
+	}
+	td->td_intr_frame = oldframe;
+	td->td_intr_nesting_level--;
+	critical_exit();
+}
+
+/*
+ * send an IPI to a set of cpus.
+ */
+void
+ipi_selected(cpuset_t cpus, u_int ipi)
+{
+	int cpu;
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
+
+	while ((cpu = CPU_FFS(&cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &cpus);
+		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+		ipi_send_cpu(cpu, ipi);
+	}
+}
+
+/*
+ * send an IPI to a specific CPU.
+ */
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
+
+	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+	ipi_send_cpu(cpu, ipi);
+}
+
+/*
+ * Send an IPI to all CPUs except ourself.
+ */
+void
+ipi_all_but_self(u_int ipi)
+{
+	cpuset_t other_cpus;
+
+	other_cpus = all_cpus;
+	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	if (IPI_IS_BITMAPED(ipi)) {
+		ipi_selected(other_cpus, ipi);
+		return;
+	}
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help to identify the source.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
+
+	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+}
+
+int
+ipi_nmi_handler(void)
+{
+	u_int cpuid;
+
+	/*
+	 * Since there is no simple way to identify an NMI's source, assume
+	 * that if the bit for the current CPU is set in the global pending
+	 * bitword, an IPI_STOP_HARD has been issued and should be handled.
+	 */
+	cpuid = PCPU_GET(cpuid);
+	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
+		return (1);
+
+	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
+	cpustop_handler();
+	return (0);
+}
+
+int nmi_kdb_lock;
+
+void
+nmi_call_kdb_smp(u_int type, struct trapframe *frame)
+{
+	int cpu;
+	bool call_post;
+
+	cpu = PCPU_GET(cpuid);
+	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
+		nmi_call_kdb(cpu, type, frame);
+		call_post = false;
+	} else {
+		savectx(&stoppcbs[cpu]);
+		CPU_SET_ATOMIC(cpu, &stopped_cpus);
+		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
+			ia32_pause();
+		call_post = true;
+	}
+	atomic_store_rel_int(&nmi_kdb_lock, 0);
+	if (call_post)
+		cpustop_handler_post(cpu);
+}
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+	u_int cpu;
+
+	cpu = PCPU_GET(cpuid);
+
+	savectx(&stoppcbs[cpu]);
+
+	/* Indicate that we are stopped */
+	CPU_SET_ATOMIC(cpu, &stopped_cpus);
+
+	/* Wait for restart */
+	while (!CPU_ISSET(cpu, &started_cpus))
+	    ia32_pause();
+
+	cpustop_handler_post(cpu);
+}
+
+static void
+cpustop_handler_post(u_int cpu)
+{
+
+	CPU_CLR_ATOMIC(cpu, &started_cpus);
+	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+
+#if defined(__amd64__) && defined(DDB)
+	amd64_db_resume_dbreg();
+#endif
+
+	if (cpu == 0 && cpustop_restartfunc != NULL) {
+		cpustop_restartfunc();
+		cpustop_restartfunc = NULL;
+	}
+}
+
+/*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+	u_int cpu;
+
+	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
+	cpu = PCPU_GET(cpuid);
+	if (savectx(&susppcbs[cpu]->sp_pcb)) {
+#ifdef __amd64__
+		fpususpend(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxsuspend(susppcbs[cpu]->sp_fpususpend);
+#endif
+		/*
+		 * suspended_cpus is cleared shortly after each AP is restarted
+		 * by a Startup IPI, so that the BSP can proceed to restarting
+		 * the next AP.
+		 *
+		 * resuming_cpus gets cleared when the AP completes
+		 * initialization after having been released by the BSP.
+		 * resuming_cpus is probably not the best name for the
+		 * variable, because it is actually a set of processors that
+		 * haven't resumed yet and haven't necessarily started resuming.
+		 *
+		 * Note that suspended_cpus is meaningful only for ACPI suspend
+		 * as it's not really used for Xen suspend since the APs are
+		 * automatically restored to the running state and the correct
+		 * context.  For the same reason resumectx is never called in
+		 * that case.
+		 */
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
+		CPU_SET_ATOMIC(cpu, &resuming_cpus);
+
+		/*
+		 * Invalidate the cache after setting the global status bits.
+		 * The last AP to set its bit may end up being an Owner of the
+		 * corresponding cache line in the MOESI protocol.  The AP may
+		 * be stopped before the cache line is written back to main
+		 * memory.
+		 */
+		wbinvd();
+	} else {
+#ifdef __amd64__
+		fpuresume(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxresume(susppcbs[cpu]->sp_fpususpend);
+#endif
+		pmap_init_pat();
+		initializecpu();
+		PCPU_SET(switchtime, 0);
+		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we have restarted and restored the context. */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	}
+
+	/* Wait for resume directive */
+	while (!CPU_ISSET(cpu, &toresume_cpus))
+		ia32_pause();
+
+	/* Re-apply microcode updates. */
+	ucode_reload();
+
+	if (cpu_ops.cpu_resume)
+		cpu_ops.cpu_resume();
+#ifdef __amd64__
+	if (vmm_resume_p)
+		vmm_resume_p();
+#endif
+
+	/* Resume MCA and local APIC */
+	lapic_xapic_mode();
+	mca_resume();
+	lapic_setup(0);
+
+	/* Indicate that we are resumed */
+	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
+	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
+}
+
+
+void
+invlcache_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since wbinvd is a serializing instruction.  Without the
+	 * temporary, we would wait for wbinvd to complete, then the read
+	 * would execute, then the dependent write, which must complete
+	 * before returning from the interrupt.
+	 */
+	generation = smp_tlb_generation;
+	wbinvd();
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+/*
+ * This is called once the rest of the system is up and running and we're
+ * ready to let the APs out of the pen.
+ */
+static void
+release_aps(void *dummy __unused)
+{
+
+	if (mp_ncpus == 1) 
+		return;
+	atomic_store_rel_int(&aps_ready, 1);
+	while (smp_started == 0)
+		ia32_pause();
+}
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
+
+#ifdef COUNT_IPIS
+/*
+ * Setup interrupt counters for IPI handlers.
+ */
+static void
+mp_ipi_intrcnt(void *dummy)
+{
+	char buf[64];
+	int i;
+
+	CPU_FOREACH(i) {
+		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
+		intrcnt_add(buf, &ipi_invltlb_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
+		intrcnt_add(buf, &ipi_invlrng_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
+		intrcnt_add(buf, &ipi_invlpg_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
+		intrcnt_add(buf, &ipi_invlcache_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
+		intrcnt_add(buf, &ipi_preempt_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
+		intrcnt_add(buf, &ipi_ast_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
+		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
+		intrcnt_add(buf, &ipi_hardclock_counts[i]);
+	}		
+}
+SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
+#endif
+
+/*
+ * Flush the TLB on the other CPUs.
+ */
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+pmap_t smp_tlb_pmap;
+volatile uint32_t smp_tlb_generation;
+
+#ifdef __amd64__
+#define	read_eflags() read_rflags()
+#endif
+
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2)
+{
+	cpuset_t other_cpus;
+	volatile uint32_t *p_cpudone;
+	uint32_t generation;
+	int cpu;
+
+	/*
+	 * Check for other cpus.  Return if none.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			return;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			return;
+	}
+
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
+	smp_tlb_addr1 = addr1;
+	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
+	generation = ++smp_tlb_generation;
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(vector);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+			    cpu, vector);
+			ipi_send_cpu(cpu, vector);
+		}
+	}
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+		while (*p_cpudone != generation)
+			ia32_pause();
+	}
+	mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_global++;
+#endif
+	}
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_page++;
+#endif
+	}
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
+		    addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_range++;
+		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+	}
+}
+
+void
+smp_cache_flush(void)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
+		    0, 0);
+	}
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+	uint32_t generation;
+  
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since invalidating the TLB is a serializing operation.
+	 */
+	generation = smp_tlb_generation;
+	if (smp_tlb_pmap == kernel_pmap)
+		invltlb_glob();
+	else
+		invltlb();
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+
+	PCPU_SET(smp_tlb_done, generation);
+}


Property changes on: trunk/sys/x86/x86/mp_x86.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
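
For reference, the generation handshake used by smp_targeted_tlb_shootdown() and the
invl*_handler() routines above can be modeled in isolation.  The following is a minimal
userland sketch, not part of the patch: the initiator bumps a shared generation, signals
the workers, and spins until each worker has published that generation as done, mirroring
the pc_smp_tlb_done wait loop.  The pthread setup and all names are invented for the
example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define	NWORKERS	4

static atomic_uint tlb_generation;
static atomic_uint tlb_done[NWORKERS];
static atomic_int  shutting_down;

/* Worker: rough model of invltlb_handler().  Read the generation before
 * the (serializing) flush would run, then publish it as done. */
static void *
worker(void *arg)
{
	int id = *(int *)arg;
	unsigned int seen = 0;

	while (!atomic_load(&shutting_down)) {
		unsigned int gen = atomic_load(&tlb_generation);
		if (gen != seen) {
			/* ... the local TLB invalidation would go here ... */
			seen = gen;
			atomic_store(&tlb_done[id], gen);
		}
	}
	return (NULL);
}

/* Initiator: rough model of smp_targeted_tlb_shootdown(). */
static void
shootdown(void)
{
	unsigned int gen = atomic_fetch_add(&tlb_generation, 1) + 1;
	int i;

	for (i = 0; i < NWORKERS; i++)
		while (atomic_load(&tlb_done[i]) != gen)
			;	/* spin, like the ia32_pause() loop */
}

int
main(void)
{
	pthread_t tids[NWORKERS];
	int ids[NWORKERS], i;

	for (i = 0; i < NWORKERS; i++) {
		ids[i] = i;
		pthread_create(&tids[i], NULL, worker, &ids[i]);
	}
	for (i = 0; i < 3; i++)
		shootdown();
	printf("three shootdown rounds completed\n");
	atomic_store(&shutting_down, 1);
	for (i = 0; i < NWORKERS; i++)
		pthread_join(tids[i], NULL);
	return (0);
}
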
Modified: trunk/sys/x86/x86/mptable.c
===================================================================
--- trunk/sys/x86/x86/mptable.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable.c 262141 2014-02-18 01:15:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable.c 261087 2014-01-23 20:10:22Z jhb $");
 
 #include "opt_mptable_force_htt.h"
 #include <sys/param.h>
@@ -51,7 +51,7 @@
 #include <x86/mptable.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/md_var.h>
 #ifdef NEW_PCIB
 #include <machine/resource.h>
@@ -79,6 +79,13 @@
 typedef	void mptable_entry_handler(u_char *entry, void *arg);
 typedef	void mptable_extended_entry_handler(ext_entry_ptr entry, void *arg);
 
+/* descriptions of MP table entries */
+typedef struct BASETABLE_ENTRY {
+	uint8_t	type;
+	uint8_t	length;
+	uint8_t	name[16];
+}       basetable_entry;
+
 static basetable_entry basetable_entry_types[] =
 {
 	{0, 20, "Processor"},

Modified: trunk/sys/x86/x86/mptable_pci.c
===================================================================
--- trunk/sys/x86/x86/mptable_pci.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable_pci.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable_pci.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable_pci.c 294883 2016-01-27 02:23:54Z jhibbits $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -70,13 +70,13 @@
 #ifdef NEW_PCIB
 	mptable_pci_host_res_init(dev);
 #endif
-	device_add_child(dev, "pci", pcib_get_bus(dev));
+	device_add_child(dev, "pci", -1);
 	return (bus_generic_attach(dev));
 }
 
 #ifdef NEW_PCIB
 static int
-mptable_is_isa_range(u_long start, u_long end)
+mptable_is_isa_range(rman_res_t start, rman_res_t end)
 {
 
 	if (end >= 0x10000)
@@ -89,7 +89,7 @@
 }
 
 static int
-mptable_is_vga_range(u_long start, u_long end)
+mptable_is_vga_range(rman_res_t start, rman_res_t end)
 {
 	if (end >= 0x10000)
 		return (0);
@@ -102,7 +102,7 @@
 
 static struct resource *
 mptable_hostb_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct mptable_hostb_softc *sc;
 
@@ -143,7 +143,7 @@
 
 static int
 mptable_hostb_adjust_resource(device_t dev, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 	struct mptable_hostb_softc *sc;
 

Modified: trunk/sys/x86/x86/msi.c
===================================================================
--- trunk/sys/x86/x86/msi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/msi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -36,11 +36,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/msi.c 333126 2018-04-30 20:29:28Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/msi.c 344912 2019-03-08 01:04:19Z jhb $");
 
+#include "opt_acpi.h"
+
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -52,7 +55,8 @@
 #include <machine/md_var.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/iommu/iommu_intrmap.h>
 #include <machine/specialreg.h>
 #include <dev/pci/pcivar.h>
 
@@ -113,10 +117,11 @@
 	u_int msi_irq;			/* IRQ cookie. */
 	u_int msi_msix;			/* MSI-X message. */
 	u_int msi_vector:8;		/* IDT vector. */
-	u_int msi_cpu:8;		/* Local APIC ID. (g) */
+	u_int msi_cpu;			/* Local APIC ID. (g) */
 	u_int msi_count:8;		/* Messages in this group. (g) */
 	u_int msi_maxcount:8;		/* Alignment for this group. (g) */
-	int *msi_irqs;			/* Group's IRQ list. (g) */
+	u_int *msi_irqs;		/* Group's IRQ list. (g) */
+	u_int msi_remap_cookie;
 };
 
 static void	msi_create_source(void);
@@ -131,11 +136,27 @@
 		    enum intr_polarity pol);
 static int	msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
 
-struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source,
-		       msi_enable_intr, msi_disable_intr, msi_vector,
-		       msi_source_pending, NULL, NULL, msi_config_intr,
-		       msi_assign_cpu };
+struct pic msi_pic = {
+	.pic_enable_source = msi_enable_source,
+	.pic_disable_source = msi_disable_source,
+	.pic_eoi_source = msi_eoi_source,
+	.pic_enable_intr = msi_enable_intr,
+	.pic_disable_intr = msi_disable_intr,
+	.pic_vector = msi_vector,
+	.pic_source_pending = msi_source_pending,
+	.pic_suspend = NULL,
+	.pic_resume = NULL,
+	.pic_config_intr = msi_config_intr,
+	.pic_assign_cpu = msi_assign_cpu,
+	.pic_reprogram_pin = NULL,
+};
 
+u_int first_msi_irq;
+
+u_int num_msi_irqs = 512;
+SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, &num_msi_irqs, 0,
+    "Number of IRQs reserved for MSI and MSI-X interrupts");
+
 #ifdef SMP
 /**
  * Xen hypervisors prior to 4.6.0 do not properly handle updates to
@@ -153,7 +174,7 @@
 #endif
 
 static int msi_enabled;
-static int msi_last_irq;
+static u_int msi_last_irq;
 static struct mtx msi_lock;
 
 static void
@@ -314,6 +335,14 @@
 	}
 #endif
 
+	if (num_msi_irqs == 0)
+		return;
+
+	first_msi_irq = max(MINIMUM_MSI_INT, num_io_irqs);
+	if (num_msi_irqs > UINT_MAX - first_msi_irq)
+		panic("num_msi_irqs too high");
+	num_io_irqs = first_msi_irq + num_msi_irqs;
+
 	msi_enabled = 1;
 	intr_register_pic(&msi_pic);
 	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
@@ -326,11 +355,11 @@
 	u_int irq;
 
 	mtx_lock(&msi_lock);
-	if (msi_last_irq >= NUM_MSI_INTS) {
+	if (msi_last_irq >= num_msi_irqs) {
 		mtx_unlock(&msi_lock);
 		return;
 	}
-	irq = msi_last_irq + FIRST_MSI_INT;
+	irq = msi_last_irq + first_msi_irq;
 	msi_last_irq++;
 	mtx_unlock(&msi_lock);
 
@@ -348,8 +377,12 @@
 msi_alloc(device_t dev, int count, int maxcount, int *irqs)
 {
 	struct msi_intsrc *msi, *fsrc;
-	u_int cpu;
-	int cnt, i, *mirqs, vector;
+	u_int cpu, *mirqs;
+	int cnt, i, vector;
+#ifdef ACPI_DMAR
+	u_int cookies[count];
+	int error;
+#endif
 
 	if (!msi_enabled)
 		return (ENXIO);
@@ -363,7 +396,7 @@
 
 	/* Try to find 'count' free IRQs. */
 	cnt = 0;
-	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+	for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
@@ -382,7 +415,7 @@
 	/* Do we need to create some new sources? */
 	if (cnt < count) {
 		/* If we would exceed the max, give up. */
-		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
+		if (i + (count - cnt) > first_msi_irq + num_msi_irqs) {
 			mtx_unlock(&msi_lock);
 			free(mirqs, M_MSI);
 			return (ENXIO);
@@ -409,6 +442,24 @@
 		return (ENOSPC);
 	}
 
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	error = iommu_alloc_msi_intr(dev, cookies, count);
+	mtx_lock(&msi_lock);
+	if (error == EOPNOTSUPP)
+		error = 0;
+	if (error != 0) {
+		for (i = 0; i < count; i++)
+			apic_free_vector(cpu, vector + i, irqs[i]);
+		free(mirqs, M_MSI);
+		return (error);
+	}
+	for (i = 0; i < count; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
+		msi->msi_remap_cookie = cookies[i];
+	}
+#endif
+
 	/* Assign IDT vectors and make these messages owned by 'dev'. */
 	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	for (i = 0; i < count; i++) {
@@ -430,7 +481,6 @@
 		bcopy(irqs, mirqs, count * sizeof(*mirqs));
 	fsrc->msi_irqs = mirqs;
 	mtx_unlock(&msi_lock);
-
 	return (0);
 }
 
@@ -474,6 +524,9 @@
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		KASSERT(msi->msi_first == first, ("message not in group"));
 		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
+#ifdef ACPI_DMAR
+		iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
+#endif
 		msi->msi_first = NULL;
 		msi->msi_dev = NULL;
 		apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
@@ -481,6 +534,11 @@
 	}
 
 	/* Clear out the first message. */
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
+	mtx_lock(&msi_lock);
+#endif
 	first->msi_first = NULL;
 	first->msi_dev = NULL;
 	apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
@@ -498,6 +556,11 @@
 msi_map(int irq, uint64_t *addr, uint32_t *data)
 {
 	struct msi_intsrc *msi;
+	int error;
+#ifdef ACPI_DMAR
+	struct msi_intsrc *msi1;
+	int i, k;
+#endif
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
@@ -525,10 +588,36 @@
 		msi = msi->msi_first;
 	}
 
-	*addr = INTEL_ADDR(msi);
-	*data = INTEL_DATA(msi);
+#ifdef ACPI_DMAR
+	if (!msi->msi_msix) {
+		for (k = msi->msi_count - 1, i = first_msi_irq; k > 0 &&
+		    i < first_msi_irq + num_msi_irqs; i++) {
+			if (i == msi->msi_irq)
+				continue;
+			msi1 = (struct msi_intsrc *)intr_lookup_source(i);
+			if (!msi1->msi_msix && msi1->msi_first == msi) {
+				mtx_unlock(&msi_lock);
+				iommu_map_msi_intr(msi1->msi_dev,
+				    msi1->msi_cpu, msi1->msi_vector,
+				    msi1->msi_remap_cookie, NULL, NULL);
+				k--;
+				mtx_lock(&msi_lock);
+			}
+		}
+	}
 	mtx_unlock(&msi_lock);
-	return (0);
+	error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
+	    msi->msi_vector, msi->msi_remap_cookie, addr, data);
+#else
+	mtx_unlock(&msi_lock);
+	error = EOPNOTSUPP;
+#endif
+	if (error == EOPNOTSUPP) {
+		*addr = INTEL_ADDR(msi);
+		*data = INTEL_DATA(msi);
+		error = 0;
+	}
+	return (error);
 }
 
 int
@@ -537,6 +626,10 @@
 	struct msi_intsrc *msi;
 	u_int cpu;
 	int i, vector;
+#ifdef ACPI_DMAR
+	u_int cookie;
+	int error;
+#endif
 
 	if (!msi_enabled)
 		return (ENXIO);
@@ -545,7 +638,7 @@
 	mtx_lock(&msi_lock);
 
 	/* Find a free IRQ. */
-	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+	for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
@@ -558,7 +651,7 @@
 	}
 
 	/* Are all IRQs in use? */
-	if (i == FIRST_MSI_INT + NUM_MSI_INTS) {
+	if (i == first_msi_irq + num_msi_irqs) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
@@ -579,6 +672,22 @@
 		mtx_unlock(&msi_lock);
 		return (ENOSPC);
 	}
+
+	msi->msi_dev = dev;
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	error = iommu_alloc_msi_intr(dev, &cookie, 1);
+	mtx_lock(&msi_lock);
+	if (error == EOPNOTSUPP)
+		error = 0;
+	if (error != 0) {
+		msi->msi_dev = NULL;
+		apic_free_vector(cpu, vector, i);
+		return (error);
+	}
+	msi->msi_remap_cookie = cookie;
+#endif
+
 	if (bootverbose)
 		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_irq, cpu, vector);
@@ -585,7 +694,6 @@
 
 	/* Setup source. */
 	msi->msi_cpu = cpu;
-	msi->msi_dev = dev;
 	msi->msi_first = msi;
 	msi->msi_vector = vector;
 	msi->msi_msix = 1;
@@ -621,6 +729,11 @@
 	KASSERT(msi->msi_dev != NULL, ("unowned message"));
 
 	/* Clear out the message. */
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
+	mtx_lock(&msi_lock);
+#endif
 	msi->msi_first = NULL;
 	msi->msi_dev = NULL;
 	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);

Modified: trunk/sys/x86/x86/nexus.c
===================================================================
--- trunk/sys/x86/x86/nexus.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/nexus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/nexus.c 221324 2011-05-02 14:13:12Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/nexus.c 340016 2018-11-01 18:34:26Z jhb $");
 
 /*
  * This code implements a `root nexus' for Intel Architecture
@@ -64,7 +64,6 @@
 #include <machine/vmparam.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
-#include <machine/pmap.h>
 
 #include <machine/metadata.h>
 #include <machine/nexusvar.h>
@@ -80,7 +79,7 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #endif
 #include <sys/rtprio.h>
@@ -100,9 +99,10 @@
 static device_t nexus_add_child(device_t bus, u_int order, const char *name,
 				int unit);
 static	struct resource *nexus_alloc_resource(device_t, device_t, int, int *,
-					      u_long, u_long, u_long, u_int);
+					      rman_res_t, rman_res_t, rman_res_t,
+					      u_int);
 static	int nexus_adjust_resource(device_t, device_t, int, struct resource *,
-				  u_long, u_long);
+				  rman_res_t, rman_res_t);
 #ifdef SMP
 static	int nexus_bind_intr(device_t, device_t, struct resource *, int);
 #endif
@@ -115,6 +115,12 @@
 				    struct resource *);
 static	int nexus_deactivate_resource(device_t, device_t, int, int,
 				      struct resource *);
+static	int nexus_map_resource(device_t bus, device_t child, int type,
+    			       struct resource *r,
+			       struct resource_map_request *argsp,
+			       struct resource_map *map);
+static	int nexus_unmap_resource(device_t bus, device_t child, int type,
+				 struct resource *r, struct resource_map *map);
 static	int nexus_release_resource(device_t, device_t, int, int,
 				   struct resource *);
 static	int nexus_setup_intr(device_t, device_t, struct resource *, int flags,
@@ -123,9 +129,13 @@
 static	int nexus_teardown_intr(device_t, device_t, struct resource *,
 				void *);
 static struct resource_list *nexus_get_reslist(device_t dev, device_t child);
-static	int nexus_set_resource(device_t, device_t, int, int, u_long, u_long);
-static	int nexus_get_resource(device_t, device_t, int, int, u_long *, u_long *);
+static	int nexus_set_resource(device_t, device_t, int, int,
+			       rman_res_t, rman_res_t);
+static	int nexus_get_resource(device_t, device_t, int, int,
+			       rman_res_t *, rman_res_t *);
 static void nexus_delete_resource(device_t, device_t, int, int);
+static	int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t,
+			   cpuset_t *);
 #ifdef DEV_APIC
 static	int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
 static	int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
@@ -151,6 +161,8 @@
 	DEVMETHOD(bus_release_resource,	nexus_release_resource),
 	DEVMETHOD(bus_activate_resource, nexus_activate_resource),
 	DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource),
+	DEVMETHOD(bus_map_resource,	nexus_map_resource),
+	DEVMETHOD(bus_unmap_resource,	nexus_unmap_resource),
 	DEVMETHOD(bus_setup_intr,	nexus_setup_intr),
 	DEVMETHOD(bus_teardown_intr,	nexus_teardown_intr),
 #ifdef SMP
@@ -162,6 +174,7 @@
 	DEVMETHOD(bus_set_resource,	nexus_set_resource),
 	DEVMETHOD(bus_get_resource,	nexus_get_resource),
 	DEVMETHOD(bus_delete_resource,	nexus_delete_resource),
+	DEVMETHOD(bus_get_cpus,		nexus_get_cpus),
 
 	/* pcib interface */
 #ifdef DEV_APIC
@@ -214,7 +227,7 @@
 	irq_rman.rm_start = 0;
 	irq_rman.rm_type = RMAN_ARRAY;
 	irq_rman.rm_descr = "Interrupt request lines";
-	irq_rman.rm_end = NUM_IO_INTS - 1;
+	irq_rman.rm_end = num_io_irqs - 1;
 	if (rman_init(&irq_rman))
 		panic("nexus_init_resources irq_rman");
 
@@ -222,7 +235,7 @@
 	 * We search for regions of existing IRQs and add those to the IRQ
 	 * resource manager.
 	 */
-	for (irq = 0; irq < NUM_IO_INTS; irq++)
+	for (irq = 0; irq < num_io_irqs; irq++)
 		if (intr_lookup_source(irq) != NULL)
 			if (rman_manage_region(&irq_rman, irq, irq) != 0)
 				panic("nexus_init_resources irq_rman add");
@@ -260,11 +273,15 @@
 		panic("nexus_init_resources port_rman");
 
 	mem_rman.rm_start = 0;
-	mem_rman.rm_end = ~0ul;
+#ifndef PAE
+	mem_rman.rm_end = BUS_SPACE_MAXADDR;
+#else
+	mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1);
+#endif
 	mem_rman.rm_type = RMAN_ARRAY;
 	mem_rman.rm_descr = "I/O memory addresses";
 	if (rman_init(&mem_rman)
-	    || rman_manage_region(&mem_rman, 0, ~0))
+	    || rman_manage_region(&mem_rman, 0, mem_rman.rm_end))
 		panic("nexus_init_resources mem_rman");
 }
 
@@ -296,9 +313,9 @@
 	if (STAILQ_FIRST(rl))
 		retval += printf(" at");
 
-	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
-	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx");
-	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
+	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
+	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx");
+	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
 
 	return retval;
 }
@@ -360,7 +377,8 @@
  */
 static struct resource *
 nexus_alloc_resource(device_t bus, device_t child, int type, int *rid,
-		     u_long start, u_long end, u_long count, u_int flags)
+		     rman_res_t start, rman_res_t end, rman_res_t count,
+		     u_int flags)
 {
 	struct nexus_device *ndev = DEVTONX(child);
 	struct	resource *rv;
@@ -369,12 +387,13 @@
 	int needactivate = flags & RF_ACTIVE;
 
 	/*
-	 * If this is an allocation of the "default" range for a given RID, and
-	 * we know what the resources for this device are (ie. they aren't maintained
-	 * by a child bus), then work out the start/end values.
+	 * If this is an allocation of the "default" range for a given
+	 * RID, and we know what the resources for this device are
+	 * (ie. they aren't maintained by a child bus), then work out
+	 * the start/end values.
 	 */
-	if ((start == 0UL) && (end == ~0UL) && (count == 1)) {
-		if (ndev == NULL)
+	if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) {
+		if (device_get_parent(child) != bus || ndev == NULL)
 			return(NULL);
 		rle = resource_list_find(&ndev->nx_resources, type, *rid);
 		if (rle == NULL)
@@ -390,7 +409,7 @@
 		return (NULL);
 
 	rv = rman_reserve_resource(rm, start, end, count, flags, child);
-	if (rv == 0)
+	if (rv == NULL)
 		return 0;
 	rman_set_rid(rv, *rid);
 
@@ -406,7 +425,7 @@
 
 static int
 nexus_adjust_resource(device_t bus, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 	struct rman *rm;
 
@@ -422,12 +441,82 @@
 nexus_activate_resource(device_t bus, device_t child, int type, int rid,
 			struct resource *r)
 {
+	struct resource_map map;
+	int error;
+
+	error = rman_activate_resource(r);
+	if (error != 0)
+		return (error);
+
+	if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+	    (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+		error = nexus_map_resource(bus, child, type, r, NULL, &map);
+		if (error) {
+			rman_deactivate_resource(r);
+			return (error);
+		}
+
+		rman_set_mapping(r,&map);
+	}
+	return (0);
+}
+
+static int
+nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
+			  struct resource *r)
+{
+	struct resource_map map;
+	int error;
+
+	error = rman_deactivate_resource(r);
+	if (error)
+		return (error);
+
+	if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+	    (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+		rman_get_mapping(r, &map);
+		nexus_unmap_resource(bus, child, type, r, &map);
+	}
+	return (0);
+}
+
+static int
+nexus_map_resource(device_t bus, device_t child, int type, struct resource *r,
+    struct resource_map_request *argsp, struct resource_map *map)
+{
+	struct resource_map_request args;
+	rman_res_t end, length, start;
 #ifdef PC98
-	bus_space_handle_t bh;
 	int error;
 #endif
-	void *vaddr;
 
+	/* Resources must be active to be mapped. */
+	if (!(rman_get_flags(r) & RF_ACTIVE))
+		return (ENXIO);
+
+	/* Mappings are only supported on I/O and memory resources. */
+	switch (type) {
+	case SYS_RES_IOPORT:
+	case SYS_RES_MEMORY:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	resource_init_map_request(&args);
+	if (argsp != NULL)
+		bcopy(argsp, &args, imin(argsp->size, args.size));
+	start = rman_get_start(r) + args.offset;
+	if (args.length == 0)
+		length = rman_get_size(r);
+	else
+		length = args.length;
+	end = start + length - 1;
+	if (start > rman_get_end(r) || start < rman_get_start(r))
+		return (EINVAL);
+	if (end > rman_get_end(r) || end < start)
+		return (EINVAL);
+
 	/*
 	 * If this is a memory resource, map it into the kernel.
 	 */
@@ -435,58 +524,64 @@
 	case SYS_RES_IOPORT:
 #ifdef PC98
 		error = i386_bus_space_handle_alloc(X86_BUS_SPACE_IO,
-		    rman_get_start(r), rman_get_size(r), &bh);
+		    start, length, &map->r_bushandle);
 		if (error)
 			return (error);
-		rman_set_bushandle(r, bh);
 #else
-		rman_set_bushandle(r, rman_get_start(r));
+		map->r_bushandle = start;
 #endif
-		rman_set_bustag(r, X86_BUS_SPACE_IO);
+		map->r_bustag = X86_BUS_SPACE_IO;
+		map->r_size = length;
+		map->r_vaddr = NULL;
 		break;
 	case SYS_RES_MEMORY:
 #ifdef PC98
 		error = i386_bus_space_handle_alloc(X86_BUS_SPACE_MEM,
-		    rman_get_start(r), rman_get_size(r), &bh);
+		    start, length, &map->r_bushandle);
 		if (error)
 			return (error);
 #endif
-		vaddr = pmap_mapdev(rman_get_start(r), rman_get_size(r));
-		rman_set_virtual(r, vaddr);
-		rman_set_bustag(r, X86_BUS_SPACE_MEM);
+		map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr);
+		map->r_bustag = X86_BUS_SPACE_MEM;
+		map->r_size = length;
+
+		/*
+		 * PC-98 stores the virtual address as a member of the
+		 * structure in the handle.  On plain x86, the handle is
+		 * the virtual address.
+		 */
 #ifdef PC98
-		/* PC-98: the type of bus_space_handle_t is the structure. */
-		bh->bsh_base = (bus_addr_t) vaddr;
-		rman_set_bushandle(r, bh);
+		map->r_bushandle->bsh_base = (bus_addr_t)map->r_vaddr;
 #else
-		/* IBM-PC: the type of bus_space_handle_t is u_int */
-		rman_set_bushandle(r, (bus_space_handle_t) vaddr);
+		map->r_bushandle = (bus_space_handle_t)map->r_vaddr;
 #endif
+		break;
 	}
-	return (rman_activate_resource(r));
+	return (0);
 }
 
 static int
-nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
-			  struct resource *r)
+nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r,
+    struct resource_map *map)
 {
-
+	
 	/*
 	 * If this is a memory resource, unmap it.
 	 */
-	if (type == SYS_RES_MEMORY) {
-		pmap_unmapdev((vm_offset_t)rman_get_virtual(r),
-		    rman_get_size(r));
-	}
+	switch (type) {
+	case SYS_RES_MEMORY:
+		pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size);
+		/* FALLTHROUGH */
+	case SYS_RES_IOPORT:
 #ifdef PC98
-	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
-		bus_space_handle_t bh;
-
-		bh = rman_get_bushandle(r);
-		i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz);
+		i386_bus_space_handle_free(map->r_bustag, map->r_bushandle,
+		    map->r_bushandle->bsh_sz);
+#endif
+		break;
+	default:
+		return (EINVAL);
 	}
-#endif
-	return (rman_deactivate_resource(r));
+	return (0);
 }
 
 static int
@@ -493,6 +588,7 @@
 nexus_release_resource(device_t bus, device_t child, int type, int rid,
 		       struct resource *r)
 {
+
 	if (rman_get_flags(r) & RF_ACTIVE) {
 		int error = bus_deactivate_resource(child, type, rid, r);
 		if (error)
@@ -518,7 +614,7 @@
 	if (irq == NULL)
 		panic("nexus_setup_intr: NULL irq resource!");
 
-	*cookiep = 0;
+	*cookiep = NULL;
 	if ((rman_get_flags(irq) & RF_SHAREABLE) == 0)
 		flags |= INTR_EXCL;
 
@@ -573,7 +669,8 @@
 }
 
 static int
-nexus_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count)
+nexus_set_resource(device_t dev, device_t child, int type, int rid,
+    rman_res_t start, rman_res_t count)
 {
 	struct nexus_device	*ndev = DEVTONX(child);
 	struct resource_list	*rl = &ndev->nx_resources;
@@ -584,7 +681,8 @@
 }
 
 static int
-nexus_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp)
+nexus_get_resource(device_t dev, device_t child, int type, int rid,
+    rman_res_t *startp, rman_res_t *countp)
 {
 	struct nexus_device	*ndev = DEVTONX(child);
 	struct resource_list	*rl = &ndev->nx_resources;
@@ -609,6 +707,24 @@
 	resource_list_delete(rl, type, rid);
 }
 
+static int
+nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
+    cpuset_t *cpuset)
+{
+
+	switch (op) {
+#ifdef SMP
+	case INTR_CPUS:
+		if (setsize != sizeof(cpuset_t))
+			return (EINVAL);
+		*cpuset = intr_cpus;
+		return (0);
+#endif
+	default:
+		return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
+	}
+}
+
 /* Called from the MSI code to add new IRQs to the IRQ rman. */
 void
 nexus_add_irq(u_long irq)
@@ -689,11 +805,8 @@
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type(ELF_KERN_STR);  
-	if (kmdp != NULL)
-		smapbase = (struct bios_smap *)preload_search_info(kmdp,
-		    MODINFO_METADATA | MODINFOMD_SMAP);
-	else
-		smapbase = NULL;
+	smapbase = (struct bios_smap *)preload_search_info(kmdp,
+	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase != NULL) {
 		smapsize = *((u_int32_t *)smapbase - 1);
 		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

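The offset/length validation performed by the new nexus_map_resource() above amounts to a
simple bounds check on the requested window.  A standalone sketch, not part of the patch,
with invented names and a 64-bit stand-in for rman_res_t:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t rman_res_t;		/* stand-in for the kernel type */

/*
 * Mirror of the checks in nexus_map_resource(): the requested window
 * (offset/length relative to the resource start) must lie entirely
 * inside the resource's range; a zero length means "the whole resource".
 */
static bool
map_range_ok(rman_res_t res_start, rman_res_t res_end,
    rman_res_t offset, rman_res_t length)
{
	rman_res_t start, end, len;

	start = res_start + offset;
	len = (length == 0) ? res_end - res_start + 1 : length;
	end = start + len - 1;

	if (start > res_end || start < res_start)
		return (false);
	if (end > res_end || end < start)
		return (false);
	return (true);
}

int
main(void)
{
	/* A 4 KB memory resource at 0xfebf0000. */
	printf("in range:  %d\n",
	    map_range_ok(0xfebf0000, 0xfebf0fff, 0x800, 0x100));	/* 1 */
	printf("too large: %d\n",
	    map_range_ok(0xfebf0000, 0xfebf0fff, 0x800, 0x1000));	/* 0 */
	return (0);
}
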
Added: trunk/sys/x86/x86/pvclock.c
===================================================================
--- trunk/sys/x86/x86/pvclock.c	                        (rev 0)
+++ trunk/sys/x86/x86/pvclock.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,204 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2009 Adrian Chadd
+ * Copyright (c) 2012 Spectra Logic Corporation
+ * Copyright (c) 2014 Bryan Venteicher
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/pvclock.c 278184 2015-02-04 08:33:04Z bryanv $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/atomic.h>
+#include <machine/pvclock.h>
+
+/*
+ * The last time returned; this guarantees a monotonically increasing
+ * clock when a stable TSC is not provided.
+ */
+static volatile uint64_t pvclock_last_cycles;
+
+void
+pvclock_resume(void)
+{
+
+	atomic_store_rel_64(&pvclock_last_cycles, 0);
+}
+
+uint64_t
+pvclock_get_last_cycles(void)
+{
+
+	return (atomic_load_acq_64(&pvclock_last_cycles));
+}
+
+uint64_t
+pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t freq;
+
+	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
+
+	if (ti->tsc_shift < 0)
+		freq <<= -ti->tsc_shift;
+	else
+		freq >>= ti->tsc_shift;
+
+	return (freq);
+}
+
+/*
+ * Scale a 64-bit delta: shift it and multiply by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#if defined(__i386__)
+	{
+		uint32_t tmp1, tmp2;
+
+		/**
+		 * For i386, the formula looks like:
+		 *
+		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
+		 *   upper = mul_frac * (delta >> 32)
+		 *   product = lower + upper
+		 */
+		__asm__ (
+			"mul  %5       ; "
+			"mov  %4,%%eax ; "
+			"mov  %%edx,%4 ; "
+			"mul  %5       ; "
+			"xor  %5,%5    ; "
+			"add  %4,%%eax ; "
+			"adc  %5,%%edx ; "
+			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+			  "2" (mul_frac) );
+	}
+#elif defined(__amd64__)
+	{
+		unsigned long tmp;
+
+		__asm__ (
+			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+			: [lo]"=a" (product), [hi]"=d" (tmp)
+			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+	}
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+
+	return (product);
+}
+
+static uint64_t
+pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t delta;
+
+	delta = rdtsc() - ti->tsc_timestamp;
+
+	return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
+	    ti->tsc_shift));
+}
+
+static void
+pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
+    uint64_t *cycles, uint8_t *flags)
+{
+	uint32_t version;
+
+	do {
+		version = ti->version;
+		rmb();
+		*cycles = ti->system_time + pvclock_get_nsec_offset(ti);
+		*flags = ti->flags;
+		rmb();
+	} while ((ti->version & 1) != 0 || ti->version != version);
+}
+
+static void
+pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec,
+    uint32_t *nsec)
+{
+	uint32_t version;
+
+	do {
+		version = wc->version;
+		rmb();
+		*sec = wc->sec;
+		*nsec = wc->nsec;
+		rmb();
+	} while ((wc->version & 1) != 0 || wc->version != version);
+}
+
+uint64_t
+pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t now, last;
+	uint8_t flags;
+
+	pvclock_read_time_info(ti, &now, &flags);
+
+	if (flags & PVCLOCK_FLAG_TSC_STABLE)
+		return (now);
+
+	/*
+	 * Enforce a monotonically increasing clock time across all VCPUs.
+	 * If our time is too old, use the last time and return. Otherwise,
+	 * try to update the last time.
+	 */
+	do {
+		last = atomic_load_acq_64(&pvclock_last_cycles);
+		if (last > now)
+			return (last);
+	} while (!atomic_cmpset_64(&pvclock_last_cycles, last, now));
+
+	return (now);
+}
+
+void
+pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
+{
+	uint32_t sec, nsec;
+
+	pvclock_read_wall_clock(wc, &sec, &nsec);
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}


Property changes on: trunk/sys/x86/x86/pvclock.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
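
The fixed-point arithmetic in pvclock_scale_delta() and pvclock_tsc_freq() above can be
expressed without the inline assembly by using a 128-bit intermediate.  A sketch, not part
of the patch; it assumes a compiler with unsigned __int128 (GCC/Clang on 64-bit targets),
and the scaling values are invented to model a 2.5 GHz TSC.

#include <stdint.h>
#include <stdio.h>

/* Portable equivalent of pvclock_scale_delta(): apply the shift to the
 * delta, then take (delta * mul_frac) >> 32 without losing high bits. */
static uint64_t
scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return ((uint64_t)(((unsigned __int128)delta * mul_frac) >> 32));
}

/* Same formula as pvclock_tsc_freq(): invert the ns-per-tick scaling. */
static uint64_t
tsc_freq_from_scale(uint32_t mul_frac, int shift)
{
	uint64_t freq = (1000000000ULL << 32) / mul_frac;

	if (shift < 0)
		freq <<= -shift;
	else
		freq >>= shift;
	return (freq);
}

int
main(void)
{
	/* Hypothetical scaling for a 2.5 GHz TSC: 0.4 ns per tick, i.e.
	 * mul_frac ~= 0.8 * 2^32 combined with tsc_shift = -1. */
	uint32_t mul_frac = 0xCCCCCCCDu;
	int shift = -1;

	printf("freq ~ %llu Hz\n",
	    (unsigned long long)tsc_freq_from_scale(mul_frac, shift));
	printf("2.5e9 ticks -> %llu ns\n",
	    (unsigned long long)scale_delta(2500000000ULL, mul_frac, shift));
	return (0);
}
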
Added: trunk/sys/x86/x86/stack_machdep.c
===================================================================
--- trunk/sys/x86/x86/stack_machdep.c	                        (rev 0)
+++ trunk/sys/x86/x86/stack_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,182 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 EMC Corporation
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/stack_machdep.c 337976 2018-08-17 16:04:59Z markj $");
+
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/stack.h>
+
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <x86/stack.h>
+
+#ifdef __i386__
+#define	PCB_FP(pcb)	((pcb)->pcb_ebp)
+#define	TF_FLAGS(tf)	((tf)->tf_eflags)
+#define	TF_FP(tf)	((tf)->tf_ebp)
+#define	TF_PC(tf)	((tf)->tf_eip)
+
+typedef struct i386_frame *x86_frame_t;
+#else
+#define	PCB_FP(pcb)	((pcb)->pcb_rbp)
+#define	TF_FLAGS(tf)	((tf)->tf_rflags)
+#define	TF_FP(tf)	((tf)->tf_rbp)
+#define	TF_PC(tf)	((tf)->tf_rip)
+
+typedef struct amd64_frame *x86_frame_t;
+#endif
+
+#ifdef STACK
+static struct stack *nmi_stack;
+static volatile struct thread *nmi_pending;
+
+#ifdef SMP
+static struct mtx nmi_lock;
+MTX_SYSINIT(nmi_lock, &nmi_lock, "stack_nmi", MTX_SPIN);
+#endif
+#endif
+
+static void
+stack_capture(struct thread *td, struct stack *st, register_t fp)
+{
+	x86_frame_t frame;
+	vm_offset_t callpc;
+
+	stack_zero(st);
+	frame = (x86_frame_t)fp;
+	while (1) {
+		if ((vm_offset_t)frame < td->td_kstack ||
+		    (vm_offset_t)frame >= td->td_kstack +
+		    td->td_kstack_pages * PAGE_SIZE)
+			break;
+		callpc = frame->f_retaddr;
+		if (!INKERNEL(callpc))
+			break;
+		if (stack_put(st, callpc) == -1)
+			break;
+		if (frame->f_frame <= frame)
+			break;
+		frame = frame->f_frame;
+	}
+}
+
+int
+stack_nmi_handler(struct trapframe *tf)
+{
+
+#ifdef STACK
+	/* Don't consume an NMI that wasn't meant for us. */
+	if (nmi_stack == NULL || curthread != nmi_pending)
+		return (0);
+
+	if (!TRAPF_USERMODE(tf) && (TF_FLAGS(tf) & PSL_I) != 0)
+		stack_capture(curthread, nmi_stack, TF_FP(tf));
+	else
+		/* We were running in usermode or had interrupts disabled. */
+		nmi_stack->depth = 0;
+
+	atomic_store_rel_ptr((long *)&nmi_pending, (long)NULL);
+	return (1);
+#else
+	return (0);
+#endif
+}
+
+void
+stack_save_td(struct stack *st, struct thread *td)
+{
+
+	if (TD_IS_SWAPPED(td))
+		panic("stack_save_td: swapped");
+	if (TD_IS_RUNNING(td))
+		panic("stack_save_td: running");
+
+	stack_capture(td, st, PCB_FP(td->td_pcb));
+}
+
+int
+stack_save_td_running(struct stack *st, struct thread *td)
+{
+
+#ifdef STACK
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	MPASS(TD_IS_RUNNING(td));
+
+	if (td == curthread) {
+		stack_save(st);
+		return (0);
+	}
+
+#ifdef SMP
+	mtx_lock_spin(&nmi_lock);
+
+	nmi_stack = st;
+	nmi_pending = td;
+	ipi_cpu(td->td_oncpu, IPI_TRACE);
+	while ((void *)atomic_load_acq_ptr((long *)&nmi_pending) != NULL)
+		cpu_spinwait();
+	nmi_stack = NULL;
+
+	mtx_unlock_spin(&nmi_lock);
+
+	if (st->depth == 0)
+		return (EAGAIN);
+#else /* !SMP */
+	KASSERT(0, ("curthread isn't running"));
+#endif /* SMP */
+	return (0);
+#else /* !STACK */
+	return (EOPNOTSUPP);
+#endif /* STACK */
+}
+
+void
+stack_save(struct stack *st)
+{
+	register_t fp;
+
+#ifdef __i386__
+	__asm __volatile("movl %%ebp,%0" : "=g" (fp));
+#else
+	__asm __volatile("movq %%rbp,%0" : "=g" (fp));
+#endif
+	stack_capture(curthread, st, fp);
+}


Property changes on: trunk/sys/x86/x86/stack_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
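
The core of stack_capture() above is an ordinary frame-pointer walk.  A userland sketch of
the same idea, not part of the patch: it assumes an x86 build that keeps frame pointers
(-O0 or -fno-omit-frame-pointer) and GCC/Clang builtins, and every name here is invented
for the example.

#include <stdio.h>

struct frame {
	struct frame	*f_frame;	/* saved frame pointer */
	void		*f_retaddr;	/* return address */
};

/* Walk the chain of saved frame pointers, like stack_capture(). */
static void
backtrace_fp(void)
{
	struct frame *frame;
	int depth;

	frame = __builtin_frame_address(0);
	for (depth = 0; frame != NULL && depth < 16; depth++) {
		printf("frame %d: return address %p\n", depth,
		    frame->f_retaddr);
		/* Same sanity check as the kernel: stop unless the chain
		 * moves strictly toward the stack base. */
		if (frame->f_frame <= frame)
			break;
		frame = frame->f_frame;
	}
}

static void __attribute__((noinline)) c(void) { backtrace_fp(); }
static void __attribute__((noinline)) b(void) { c(); }
static void __attribute__((noinline)) a(void) { b(); }

int
main(void)
{
	a();
	return (0);
}
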
Modified: trunk/sys/x86/x86/tsc.c
===================================================================
--- trunk/sys/x86/x86/tsc.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/tsc.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/tsc.c 280973 2015-04-02 01:02:42Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/tsc.c 353007 2019-10-02 13:46:40Z kib $");
 
 #include "opt_compat.h"
 #include "opt_clock.h"
@@ -49,6 +49,7 @@
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/vmware.h>
+#include <dev/acpica/acpi_hpet.h>
 
 #include "cpufreq_if.h"
 
@@ -60,34 +61,28 @@
 
 SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
     &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
-TUNABLE_INT("kern.timecounter.invariant_tsc", &tsc_is_invariant);
 
 #ifdef SMP
 int	smp_tsc;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
     "Indicates whether the TSC is safe to use in SMP mode");
-TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc);
 
 int	smp_tsc_adjust = 0;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
     &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
-TUNABLE_INT("kern.timecounter.smp_tsc_adjust", &smp_tsc_adjust);
 #endif
 
 static int	tsc_shift = 1;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
     &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
-TUNABLE_INT("kern.timecounter.tsc_shift", &tsc_shift);
 
 static int	tsc_disabled;
 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
     "Disable x86 Time Stamp Counter");
-TUNABLE_INT("machdep.disable_tsc", &tsc_disabled);
 
 static int	tsc_skip_calibration;
 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
     &tsc_skip_calibration, 0, "Disable TSC frequency calibration");
-TUNABLE_INT("machdep.disable_tsc_calibration", &tsc_skip_calibration);
 
 static void tsc_freq_changed(void *arg, const struct cf_level *level,
     int status);
@@ -100,14 +95,22 @@
 static unsigned tsc_get_timecount_mfence(struct timecounter *tc);
 static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc);
 static void tsc_levels_changed(void *arg, int unit);
+static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+    struct timecounter *tc);
+#endif
 
 static struct timecounter tsc_timecounter = {
-	tsc_get_timecount,	/* get_timecount */
-	0,			/* no poll_pps */
-	~0u,			/* counter_mask */
-	0,			/* frequency */
-	"TSC",			/* name */
-	800,			/* quality (adjusted in code) */
+	.tc_get_timecount =		tsc_get_timecount,
+	.tc_counter_mask =		~0u,
+	.tc_name =			"TSC",
+	.tc_quality =			800,	/* adjusted in code */
+	.tc_fill_vdso_timehands = 	x86_tsc_vdso_timehands,
+#ifdef COMPAT_FREEBSD32
+	.tc_fill_vdso_timehands32 = 	x86_tsc_vdso_timehands32,
+#endif
 };
 
 static void
@@ -126,6 +129,40 @@
 	tsc_is_invariant = 1;
 }
 
+/*
+ * Calculate TSC frequency using information from the CPUID leaf 0x15
+ * 'Time Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15
+ * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor
+ * Frequency Information'.  Leaf 0x16 is described in the SDM as
+ * informational only, but if 0x15 did not work and TSC calibration
+ * is disabled, it is the best estimate we can get.  It should still be
+ * an improvement over parsing the CPU model name in tsc_freq_intel(),
+ * when that is available.
+ */
+static bool
+tsc_freq_cpuid(void)
+{
+	u_int regs[4];
+
+	if (cpu_high < 0x15)
+		return (false);
+	do_cpuid(0x15, regs);
+	if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
+		tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0];
+		return (true);
+	}
+
+	if (cpu_high < 0x16)
+		return (false);
+	do_cpuid(0x16, regs);
+	if (regs[0] != 0) {
+		tsc_freq = (uint64_t)regs[0] * 1000000;
+		return (true);
+	}
+
+	return (false);
+}
+
 static void
 tsc_freq_intel(void)
 {
@@ -250,18 +287,19 @@
 	}
 
 	if (tsc_skip_calibration) {
-		if (cpu_vendor_id == CPU_VENDOR_INTEL)
+		if (tsc_freq_cpuid())
+			;
+		else if (cpu_vendor_id == CPU_VENDOR_INTEL)
 			tsc_freq_intel();
-		return;
+	} else {
+		if (bootverbose)
+			printf("Calibrating TSC clock ... ");
+		tsc1 = rdtsc();
+		DELAY(1000000);
+		tsc2 = rdtsc();
+		tsc_freq = tsc2 - tsc1;
 	}
-
 	if (bootverbose)
-	        printf("Calibrating TSC clock ... ");
-	tsc1 = rdtsc();
-	DELAY(1000000);
-	tsc2 = rdtsc();
-	tsc_freq = tsc2 - tsc1;
-	if (bootverbose)
 		printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
 }
 
@@ -427,7 +465,7 @@
 }
 
 static int
-test_tsc(void)
+test_tsc(int adj_max_count)
 {
 	uint64_t *data, *tsc;
 	u_int i, size, adj;
@@ -441,12 +479,12 @@
 	for (i = 0, tsc = data; i < N; i++, tsc += size)
 		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
 	smp_tsc = 1;	/* XXX */
-	smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc,
-	    smp_no_rendevous_barrier, data);
-	if (!smp_tsc && adj < smp_tsc_adjust) {
+	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
+	    smp_no_rendezvous_barrier, data);
+	if (!smp_tsc && adj < adj_max_count) {
 		adj++;
-		smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc,
-		    smp_no_rendevous_barrier, data);
+		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
+		    smp_no_rendezvous_barrier, data);
 		goto retry;
 	}
 	free(data, M_TEMP);
@@ -481,19 +519,6 @@
 
 #undef N
 
-#else
-
-/*
- * The function is not called, it is provided to avoid linking failure
- * on uniprocessor kernel.
- */
-static int
-test_tsc(void)
-{
-
-	return (0);
-}
-
 #endif /* SMP */
 
 static void
@@ -529,17 +554,22 @@
 	}
 
 	/*
-	 * We cannot use the TSC if it stops incrementing while idle.
 	 * Intel CPUs without a C-state invariant TSC can stop the TSC
-	 * in either C2 or C3.
+	 * in either C2 or C3.  Disable use of C2 and C3 while using
+	 * the TSC as the timecounter.  The timecounter can be changed
+	 * to enable C2 and C3.
+	 *
+	 * Note that the TSC is used as the cputicker for computing
+	 * thread runtime regardless of the timecounter setting, so
+	 * using an alternate timecounter and enabling C2 or C3 can
+	 * result in incorrect runtimes for kernel idle threads (but not
+	 * for any non-idle threads).
 	 */
-	if (cpu_deepest_sleep >= 2 && cpu_vendor_id == CPU_VENDOR_INTEL &&
+	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
-		tsc_timecounter.tc_quality = -1000;
 		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
 		if (bootverbose)
-			printf("TSC timecounter disabled: C2/C3 may halt it.\n");
-		goto init;
+			printf("TSC timecounter disables C2 and C3.\n");
 	}
 
 	/*
@@ -549,9 +579,12 @@
 	 * non-zero value.  The TSC seems unreliable in virtualized SMP
 	 * environments, so it is set to a negative quality in those cases.
 	 */
+#ifdef SMP
 	if (mp_ncpus > 1)
-		tsc_timecounter.tc_quality = test_tsc();
-	else if (tsc_is_invariant)
+		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
+	else
+#endif /* SMP */
+	if (tsc_is_invariant)
 		tsc_timecounter.tc_quality = 1000;
 	max_freq >>= tsc_shift;
 
@@ -586,6 +619,32 @@
 }
 SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
 
+void
+resume_TSC(void)
+{
+#ifdef SMP
+	int quality;
+
+	/* If TSC was not good on boot, it is unlikely to become good now. */
+	if (tsc_timecounter.tc_quality < 0)
+		return;
+	/* Nothing to do with UP. */
+	if (mp_ncpus < 2)
+		return;
+
+	/*
+	 * If TSC was good, a single synchronization should be enough,
+	 * but honour smp_tsc_adjust if it's set.
+	 */
+	quality = test_tsc(MAX(smp_tsc_adjust, 1));
+	if (quality != tsc_timecounter.tc_quality) {
+		printf("TSC timecounter quality changed: %d -> %d\n",
+		    tsc_timecounter.tc_quality, quality);
+		tsc_timecounter.tc_quality = quality;
+	}
+#endif /* SMP */
+}
+
 /*
  * When cpufreq levels change, find out about the (new) max frequency.  We
  * use this to update CPU accounting in case it got a lower estimate at boot.
@@ -726,22 +785,27 @@
 	return (tsc_get_timecount_low(tc));
 }
 
-uint32_t
-cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+static uint32_t
+x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
 {
 
-	vdso_th->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
+	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+	vdso_th->th_x86_hpet_idx = 0xffffffff;
 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
-	return (timecounter == &tsc_timecounter);
+	return (1);
 }
 
 #ifdef COMPAT_FREEBSD32
-uint32_t
-cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+static uint32_t
+x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+    struct timecounter *tc)
 {
 
-	vdso_th32->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
+	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+	vdso_th32->th_x86_hpet_idx = 0xffffffff;
 	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
-	return (timecounter == &tsc_timecounter);
+	return (1);
 }
 #endif

Added: trunk/sys/x86/x86/ucode.c
===================================================================
--- trunk/sys/x86/x86/ucode.c	                        (rev 0)
+++ trunk/sys/x86/x86/ucode.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,402 @@
+/* $MidnightBSD$ */
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/ucode.c 347700 2019-05-16 14:42:16Z markj $");
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <x86/specialreg.h>
+#include <machine/stdarg.h>
+#include <x86/ucode.h>
+#include <x86/x86_smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+
+static void	*ucode_intel_match(uint8_t *data, size_t *len);
+static int	ucode_intel_verify(struct ucode_intel_header *hdr,
+		    size_t resid);
+
+static struct ucode_ops {
+	const char *vendor;
+	int (*load)(void *, bool, uint64_t *, uint64_t *);
+	void *(*match)(uint8_t *, size_t *);
+} loaders[] = {
+	{
+		.vendor = INTEL_VENDOR_ID,
+		.load = ucode_intel_load,
+		.match = ucode_intel_match,
+	},
+};
+
+/* Selected microcode update data. */
+static void *early_ucode_data;
+static void *ucode_data;
+static struct ucode_ops *ucode_loader;
+
+/* Variables used for reporting success or failure. */
+enum {
+	NO_ERROR,
+	NO_MATCH,
+	VERIFICATION_FAILED,
+} ucode_error = NO_ERROR;
+static uint64_t ucode_nrev, ucode_orev;
+
+static void
+log_msg(void *arg __unused)
+{
+
+	if (ucode_nrev != 0) {
+		printf("CPU microcode: updated from %#jx to %#jx\n",
+		    (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev);
+		return;
+	}
+
+	switch (ucode_error) {
+	case NO_MATCH:
+		printf("CPU microcode: no matching update found\n");
+		break;
+	case VERIFICATION_FAILED:
+		printf("CPU microcode: microcode verification failed\n");
+		break;
+	default:
+		break;
+	}
+}
+SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL);
+
+int
+ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp)
+{
+	uint64_t nrev, orev;
+	uint32_t cpuid[4];
+
+	orev = rdmsr(MSR_BIOS_SIGN) >> 32;
+
+	/*
+	 * Perform update.  Flush caches first to work around seemingly
+	 * undocumented errata applying to some Broadwell CPUs.
+	 */
+	wbinvd();
+	if (unsafe)
+		wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+	else
+		wrmsr(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+	wrmsr(MSR_BIOS_SIGN, 0);
+
+	/*
+	 * Serialize instruction flow.
+	 */
+	do_cpuid(0, cpuid);
+
+	/*
+	 * Verify that the microcode revision changed.
+	 */
+	nrev = rdmsr(MSR_BIOS_SIGN) >> 32;
+	if (nrevp != NULL)
+		*nrevp = nrev;
+	if (orevp != NULL)
+		*orevp = orev;
+	if (nrev <= orev)
+		return (EEXIST);
+	return (0);
+}
+
+static int
+ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid)
+{
+	uint32_t cksum, *data, size;
+	int i;
+
+	if (resid < sizeof(struct ucode_intel_header))
+		return (1);
+	size = hdr->total_size;
+	if (size == 0)
+		size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+		    sizeof(struct ucode_intel_header);
+
+	if (hdr->header_version != 1)
+		return (1);
+	if (size % 16 != 0)
+		return (1);
+	if (resid < size)
+		return (1);
+
+	cksum = 0;
+	data = (uint32_t *)hdr;
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		cksum += data[i];
+	if (cksum != 0)
+		return (1);
+	return (0);
+}
+
+static void *
+ucode_intel_match(uint8_t *data, size_t *len)
+{
+	struct ucode_intel_header *hdr;
+	struct ucode_intel_extsig_table *table;
+	struct ucode_intel_extsig *entry;
+	uint64_t platformid;
+	size_t resid;
+	uint32_t data_size, flags, regs[4], sig, total_size;
+	int i;
+
+	do_cpuid(1, regs);
+	sig = regs[0];
+
+	platformid = rdmsr(MSR_IA32_PLATFORM_ID);
+	flags = 1 << ((platformid >> 50) & 0x7);
+
+	for (resid = *len; resid > 0; data += total_size, resid -= total_size) {
+		hdr = (struct ucode_intel_header *)data;
+		if (ucode_intel_verify(hdr, resid) != 0) {
+			ucode_error = VERIFICATION_FAILED;
+			break;
+		}
+
+		data_size = hdr->data_size;
+		total_size = hdr->total_size;
+		if (data_size == 0)
+			data_size = UCODE_INTEL_DEFAULT_DATA_SIZE;
+		if (total_size == 0)
+			total_size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+			    sizeof(struct ucode_intel_header);
+		if (data_size > total_size + sizeof(struct ucode_intel_header))
+			table = (struct ucode_intel_extsig_table *)
+			    ((uint8_t *)(hdr + 1) + data_size);
+		else
+			table = NULL;
+
+		if (hdr->processor_signature == sig) {
+			if ((hdr->processor_flags & flags) != 0) {
+				*len = data_size;
+				return (hdr + 1);
+			}
+		} else if (table != NULL) {
+			for (i = 0; i < table->signature_count; i++) {
+				entry = &table->entries[i];
+				if (entry->processor_signature == sig &&
+				    (entry->processor_flags & flags) != 0) {
+					*len = data_size;
+					return (hdr + 1);
+				}
+			}
+		}
+	}
+	return (NULL);
+}
+
+/*
+ * Release any memory backing unused microcode blobs back to the system.
+ * We copy the selected update and free the entire microcode file.
+ */
+static void
+ucode_release(void *arg __unused)
+{
+	char *name, *type;
+	caddr_t file;
+	int release;
+
+	if (early_ucode_data == NULL)
+		return;
+	release = 1;
+	TUNABLE_INT_FETCH("debug.ucode.release", &release);
+	if (!release)
+		return;
+
+restart:
+	file = 0;
+	for (;;) {
+		file = preload_search_next_name(file);
+		if (file == 0)
+			break;
+		type = (char *)preload_search_info(file, MODINFO_TYPE);
+		if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+			continue;
+
+		name = preload_search_info(file, MODINFO_NAME);
+		preload_delete_name(name);
+		goto restart;
+	}
+}
+SYSINIT(ucode_release, SI_SUB_KMEM + 1, SI_ORDER_ANY, ucode_release, NULL);
+
+void
+ucode_load_ap(int cpu)
+{
+#ifdef SMP
+	KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present,
+	    ("cpu %d not present", cpu));
+
+	if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread)
+		return;
+#endif
+
+	if (ucode_data != NULL)
+		(void)ucode_loader->load(ucode_data, false, NULL, NULL);
+}
+
+static void *
+map_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+	uintptr_t va;
+
+	for (va = free; va < free + len; va += PAGE_SIZE)
+		pmap_kenter(va, (vm_paddr_t)va);
+#else
+	(void)len;
+#endif
+	return ((void *)free);
+}
+
+static void
+unmap_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+	uintptr_t va;
+
+	for (va = free; va < free + len; va += PAGE_SIZE)
+		pmap_kremove(va);
+#else
+	(void)free;
+	(void)len;
+#endif
+}
+
+/*
+ * Search for an applicable microcode update, and load it.  APs will load the
+ * selected update once they come online.
+ *
+ * "free" is the address of the next free physical page.  If a microcode update
+ * is selected, it will be copied to this region prior to loading in order to
+ * satisfy alignment requirements.
+ */
+size_t
+ucode_load_bsp(uintptr_t free)
+{
+	union {
+		uint32_t regs[4];
+		char vendor[13];
+	} cpuid;
+	uint8_t *addr, *fileaddr, *match;
+	char *type;
+	uint64_t nrev, orev;
+	caddr_t file;
+	size_t i, len;
+	int error;
+
+	KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free));
+
+	do_cpuid(0, cpuid.regs);
+	cpuid.regs[0] = cpuid.regs[1];
+	cpuid.regs[1] = cpuid.regs[3];
+	cpuid.vendor[12] = '\0';
+	for (i = 0; i < nitems(loaders); i++)
+		if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) {
+			ucode_loader = &loaders[i];
+			break;
+		}
+	if (ucode_loader == NULL)
+		return (0);
+
+	file = 0;
+	fileaddr = match = NULL;
+	for (;;) {
+		file = preload_search_next_name(file);
+		if (file == 0)
+			break;
+		type = (char *)preload_search_info(file, MODINFO_TYPE);
+		if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+			continue;
+
+		fileaddr = preload_fetch_addr(file);
+		len = preload_fetch_size(file);
+		match = ucode_loader->match(fileaddr, &len);
+		if (match != NULL) {
+			addr = map_ucode(free, len);
+			/* We can't use memcpy() before ifunc resolution. */
+			for (i = 0; i < len; i++)
+				addr[i] = ((volatile uint8_t *)match)[i];
+			match = addr;
+
+			error = ucode_loader->load(match, false, &nrev, &orev);
+			if (error == 0) {
+				ucode_data = early_ucode_data = match;
+				ucode_nrev = nrev;
+				ucode_orev = orev;
+				return (len);
+			}
+			unmap_ucode(free, len);
+		}
+	}
+	if (fileaddr != NULL && ucode_error == NO_ERROR)
+		ucode_error = NO_MATCH;
+	return (0);
+}
+
+/*
+ * Reload microcode following an ACPI resume.
+ */
+void
+ucode_reload(void)
+{
+
+	ucode_load_ap(PCPU_GET(cpuid));
+}
+
+/*
+ * Replace an existing microcode update.
+ */
+void *
+ucode_update(void *newdata)
+{
+
+	newdata = (void *)atomic_swap_ptr((void *)&ucode_data,
+	    (uintptr_t)newdata);
+	if (newdata == early_ucode_data)
+		newdata = NULL;
+	return (newdata);
+}


Property changes on: trunk/sys/x86/x86/ucode.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
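
For reference, the verification rule used by ucode_intel_verify() above is
simply that all 32-bit words of the update image (header included) sum to
zero modulo 2^32 and that the image size is a multiple of 16 bytes.  A
minimal userspace sketch of that rule follows; the 16-byte buffer is a
made-up stand-in for a real update image.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Returns 1 when the words of the image sum to zero, as in ucode_intel_verify(). */
static int
intel_ucode_cksum_ok(const void *buf, size_t size)
{
	const uint32_t *w = buf;
	uint32_t sum = 0;
	size_t i;

	if (size % 16 != 0)		/* update images are 16-byte multiples */
		return (0);
	for (i = 0; i < size / sizeof(uint32_t); i++)
		sum += w[i];
	return (sum == 0);
}

int
main(void)
{
	/* Toy "image": the last word cancels the sum of the first three. */
	uint32_t img[4] = { 0x00000001, 0x12345678, 0x00000100, 0 };

	img[3] = -(img[0] + img[1] + img[2]);
	printf("checksum ok: %d\n", intel_ucode_cksum_ok(img, sizeof(img)));
	return (0);
}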
Added: trunk/sys/x86/x86/x86_mem.c
===================================================================
--- trunk/sys/x86/x86/x86_mem.c	                        (rev 0)
+++ trunk/sys/x86/x86/x86_mem.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,729 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
+ * Copyright (c) 2017 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/x86_mem.c 314591 2017-03-03 10:30:30Z kib $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+/*
+ * Pentium Pro+ memory range operations
+ *
+ * This code will probably be impenetrable without reference to the
+ * Intel Pentium Pro documentation or x86-64 programmers manual vol 2.
+ */
+
+static char *mem_owner_bios = "BIOS";
+
+#define	MR686_FIXMTRR	(1<<0)
+
+#define	mrwithin(mr, a)							\
+	(((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len)))
+#define	mroverlap(mra, mrb)						\
+	(mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base))
+
+#define	mrvalid(base, len) 						\
+	((!(base & ((1 << 12) - 1))) &&	/* base is multiple of 4k */	\
+	    ((len) >= (1 << 12)) &&	/* length is >= 4k */		\
+	    powerof2((len)) &&		/* ... and power of two */	\
+	    !((base) & ((len) - 1)))	/* range is not discontinuous */
+
+#define	mrcopyflags(curr, new)						\
+	(((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK))
+
+static int mtrrs_disabled;
+SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN,
+    &mtrrs_disabled, 0,
+    "Disable MTRRs.");
+
+static void	x86_mrinit(struct mem_range_softc *sc);
+static int	x86_mrset(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+static void	x86_mrAPinit(struct mem_range_softc *sc);
+static void	x86_mrreinit(struct mem_range_softc *sc);
+
+static struct mem_range_ops x86_mrops = {
+	x86_mrinit,
+	x86_mrset,
+	x86_mrAPinit,
+	x86_mrreinit
+};
+
+/* XXX for AP startup hook */
+static u_int64_t mtrrcap, mtrrdef;
+
+/* The bitmask for the PhysBase and PhysMask fields of the variable MTRRs. */
+static u_int64_t mtrr_physmask;
+
+static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd);
+static void	x86_mrfetch(struct mem_range_softc *sc);
+static int	x86_mtrrtype(int flags);
+static int	x86_mrt2mtrr(int flags, int oldval);
+static int	x86_mtrrconflict(int flag1, int flag2);
+static void	x86_mrstore(struct mem_range_softc *sc);
+static void	x86_mrstoreone(void *arg);
+static struct mem_range_desc *x86_mtrrfixsearch(struct mem_range_softc *sc,
+		    u_int64_t addr);
+static int	x86_mrsetlow(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+static int	x86_mrsetvariable(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+
+/* ia32 MTRR type to memory range type conversion */
+static int x86_mtrrtomrt[] = {
+	MDF_UNCACHEABLE,
+	MDF_WRITECOMBINE,
+	MDF_UNKNOWN,
+	MDF_UNKNOWN,
+	MDF_WRITETHROUGH,
+	MDF_WRITEPROTECT,
+	MDF_WRITEBACK
+};
+
+#define	MTRRTOMRTLEN nitems(x86_mtrrtomrt)
+
+static int
+x86_mtrr2mrt(int val)
+{
+
+	if (val < 0 || val >= MTRRTOMRTLEN)
+		return (MDF_UNKNOWN);
+	return (x86_mtrrtomrt[val]);
+}
+
+/*
+ * x86 MTRR conflicts. Writeback and uncachable may overlap.
+ */
+static int
+x86_mtrrconflict(int flag1, int flag2)
+{
+
+	flag1 &= MDF_ATTRMASK;
+	flag2 &= MDF_ATTRMASK;
+	if ((flag1 & MDF_UNKNOWN) || (flag2 & MDF_UNKNOWN))
+		return (1);
+	if (flag1 == flag2 ||
+	    (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) ||
+	    (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE))
+		return (0);
+	return (1);
+}
+
+/*
+ * Look for an exactly-matching range.
+ */
+static struct mem_range_desc *
+mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd)
+{
+	struct mem_range_desc *cand;
+	int i;
+
+	for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++)
+		if ((cand->mr_base == mrd->mr_base) &&
+		    (cand->mr_len == mrd->mr_len))
+			return (cand);
+	return (NULL);
+}
+
+/*
+ * Ensure that the direct map region does not contain any mappings
+ * that span MTRRs of different types.  However, the fixed MTRRs can
+ * be ignored, because a large page mapping the first 1 MB of physical
+ * memory is a special case that the processor handles.  Invalidate
+ * any old TLB entries that might hold inconsistent memory type
+ * information. 
+ */
+static void
+x86_mr_split_dmap(struct mem_range_softc *sc __unused)
+{
+#ifdef __amd64__
+	struct mem_range_desc *mrd;
+	int i;
+
+	i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+	mrd = sc->mr_desc + i;
+	for (; i < sc->mr_ndesc; i++, mrd++) {
+		if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE)
+			pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, TRUE);
+	}
+#endif
+}
+
+/*
+ * Fetch the current mtrr settings from the current CPU (assumed to
+ * all be in sync in the SMP case).  Note that if we are here, we
+ * assume that MTRRs are enabled, and we may or may not have fixed
+ * MTRRs.
+ */
+static void
+x86_mrfetch(struct mem_range_softc *sc)
+{
+	struct mem_range_desc *mrd;
+	u_int64_t msrv;
+	int i, j, msr;
+
+	mrd = sc->mr_desc;
+
+	/* Get fixed-range MTRRs. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		msr = MSR_MTRR64kBase;
+		for (i = 0; i < (MTRR_N64K / 8); i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+		msr = MSR_MTRR16kBase;
+		for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+		msr = MSR_MTRR4kBase;
+		for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+	}
+
+	/* Get remainder which must be variable MTRRs. */
+	msr = MSR_MTRRVarBase;
+	for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+		msrv = rdmsr(msr);
+		mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) |
+		    x86_mtrr2mrt(msrv & MTRR_PHYSBASE_TYPE);
+		mrd->mr_base = msrv & mtrr_physmask;
+		msrv = rdmsr(msr + 1);
+		mrd->mr_flags = (msrv & MTRR_PHYSMASK_VALID) ?
+		    (mrd->mr_flags | MDF_ACTIVE) :
+		    (mrd->mr_flags & ~MDF_ACTIVE);
+
+		/* Compute the range from the mask. Ick. */
+		mrd->mr_len = (~(msrv & mtrr_physmask) &
+		    (mtrr_physmask | 0xfff)) + 1;
+		if (!mrvalid(mrd->mr_base, mrd->mr_len))
+			mrd->mr_flags |= MDF_BOGUS;
+
+		/* If unclaimed and active, must be the BIOS. */
+		if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0))
+			strcpy(mrd->mr_owner, mem_owner_bios);
+	}
+}
+
+/*
+ * Return the MTRR memory type matching a region's flags
+ */
+static int
+x86_mtrrtype(int flags)
+{
+	int i;
+
+	flags &= MDF_ATTRMASK;
+
+	for (i = 0; i < MTRRTOMRTLEN; i++) {
+		if (x86_mtrrtomrt[i] == MDF_UNKNOWN)
+			continue;
+		if (flags == x86_mtrrtomrt[i])
+			return (i);
+	}
+	return (-1);
+}
+
+static int
+x86_mrt2mtrr(int flags, int oldval)
+{
+	int val;
+
+	if ((val = x86_mtrrtype(flags)) == -1)
+		return (oldval & 0xff);
+	return (val & 0xff);
+}
+
+/*
+ * Update running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrstore(struct mem_range_softc *sc)
+{
+
+	smp_rendezvous(NULL, x86_mrstoreone, NULL, sc);
+}
+
+/*
+ * Update the current CPU's MTRRs with those represented in the
+ * descriptor list.  Note that we do this wholesale rather than just
+ * stuffing one entry; this is simpler (but slower, of course).
+ */
+static void
+x86_mrstoreone(void *arg)
+{
+	struct mem_range_softc *sc = arg;
+	struct mem_range_desc *mrd;
+	u_int64_t omsrv, msrv;
+	int i, j, msr;
+	u_long cr0, cr4;
+
+	mrd = sc->mr_desc;
+
+	critical_enter();
+
+	/* Disable PGE. */
+	cr4 = rcr4();
+	load_cr4(cr4 & ~CR4_PGE);
+
+	/* Disable caches (CD = 1, NW = 0). */
+	cr0 = rcr0();
+	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
+
+	/* Flushes caches and TLBs. */
+	wbinvd();
+	invltlb();
+
+	/* Disable MTRRs (E = 0). */
+	wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~MTRR_DEF_ENABLE);
+
+	/* Set fixed-range MTRRs. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		msr = MSR_MTRR64kBase;
+		for (i = 0; i < MTRR_N64K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+		msr = MSR_MTRR16kBase;
+		for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+		msr = MSR_MTRR4kBase;
+		for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+	}
+
+	/* Set remainder which must be variable MTRRs. */
+	msr = MSR_MTRRVarBase;
+	for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+		/* base/type register */
+		omsrv = rdmsr(msr);
+		if (mrd->mr_flags & MDF_ACTIVE) {
+			msrv = mrd->mr_base & mtrr_physmask;
+			msrv |= x86_mrt2mtrr(mrd->mr_flags, omsrv);
+		} else {
+			msrv = 0;
+		}
+		wrmsr(msr, msrv);
+
+		/* mask/active register */
+		if (mrd->mr_flags & MDF_ACTIVE) {
+			msrv = MTRR_PHYSMASK_VALID |
+			    rounddown2(mtrr_physmask, mrd->mr_len);
+		} else {
+			msrv = 0;
+		}
+		wrmsr(msr + 1, msrv);
+	}
+
+	/* Flush caches and TLBs. */
+	wbinvd();
+	invltlb();
+
+	/* Enable MTRRs. */
+	wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) | MTRR_DEF_ENABLE);
+
+	/* Restore caches and PGE. */
+	load_cr0(cr0);
+	load_cr4(cr4);
+
+	critical_exit();
+}
+
+/*
+ * Hunt for the fixed MTRR referencing (addr)
+ */
+static struct mem_range_desc *
+x86_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr)
+{
+	struct mem_range_desc *mrd;
+	int i;
+
+	for (i = 0, mrd = sc->mr_desc; i < MTRR_N64K + MTRR_N16K + MTRR_N4K;
+	     i++, mrd++)
+		if (addr >= mrd->mr_base &&
+		    addr < mrd->mr_base + mrd->mr_len)
+			return (mrd);
+	return (NULL);
+}
+
+/*
+ * Try to satisfy the given range request by manipulating the fixed
+ * MTRRs that cover low memory.
+ *
+ * Note that we try to be generous here; we'll bloat the range out to
+ * the next higher/lower boundary to avoid the consumer having to know
+ * too much about the mechanisms here.
+ *
+ * XXX note that this will have to be updated when we start supporting
+ * "busy" ranges.
+ */
+static int
+x86_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+	struct mem_range_desc *first_md, *last_md, *curr_md;
+
+	/* Range check. */
+	if ((first_md = x86_mtrrfixsearch(sc, mrd->mr_base)) == NULL ||
+	    (last_md = x86_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1))
+	    == NULL)
+		return (EINVAL);
+
+	/* Check that we aren't doing something risky. */
+	if ((mrd->mr_flags & MDF_FORCE) == 0) {
+		for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+			if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN)
+				return (EACCES);
+		}
+	}
+
+	/* Set flags, clear set-by-firmware flag. */
+	for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+		curr_md->mr_flags = mrcopyflags(curr_md->mr_flags &
+		    ~MDF_FIRMWARE, mrd->mr_flags);
+		bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner));
+	}
+
+	return (0);
+}
+
+/*
+ * Modify/add a variable MTRR to satisfy the request.
+ *
+ * XXX needs to be updated to properly support "busy" ranges.
+ */
+static int
+x86_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd,
+    int *arg)
+{
+	struct mem_range_desc *curr_md, *free_md;
+	int i;
+
+	/*
+	 * Scan the currently active variable descriptors, look for
+	 * one we exactly match (straight takeover) and for possible
+	 * accidental overlaps.
+	 *
+	 * Keep track of the first empty variable descriptor in case
+	 * we can't perform a takeover.
+	 */
+	i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+	curr_md = sc->mr_desc + i;
+	free_md = NULL;
+	for (; i < sc->mr_ndesc; i++, curr_md++) {
+		if (curr_md->mr_flags & MDF_ACTIVE) {
+			/* Exact match? */
+			if (curr_md->mr_base == mrd->mr_base &&
+			    curr_md->mr_len == mrd->mr_len) {
+
+				/* Whoops, owned by someone. */
+				if (curr_md->mr_flags & MDF_BUSY)
+					return (EBUSY);
+
+				/* Check that we aren't doing something risky */
+				if (!(mrd->mr_flags & MDF_FORCE) &&
+				    (curr_md->mr_flags & MDF_ATTRMASK) ==
+				    MDF_UNKNOWN)
+					return (EACCES);
+
+				/* Ok, just hijack this entry. */
+				free_md = curr_md;
+				break;
+			}
+
+			/* Non-exact overlap? */
+			if (mroverlap(curr_md, mrd)) {
+				/* Between conflicting region types? */
+				if (x86_mtrrconflict(curr_md->mr_flags,
+				    mrd->mr_flags))
+					return (EINVAL);
+			}
+		} else if (free_md == NULL) {
+			free_md = curr_md;
+		}
+	}
+
+	/* Got somewhere to put it? */
+	if (free_md == NULL)
+		return (ENOSPC);
+
+	/* Set up new descriptor. */
+	free_md->mr_base = mrd->mr_base;
+	free_md->mr_len = mrd->mr_len;
+	free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags);
+	bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner));
+	return (0);
+}
+
+/*
+ * Handle requests to set memory range attributes by manipulating MTRRs.
+ */
+static int
+x86_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+	struct mem_range_desc *targ;
+	int error;
+
+	switch (*arg) {
+	case MEMRANGE_SET_UPDATE:
+		/*
+		 * Make sure that what's being asked for is even
+		 * possible at all.
+		 */
+		if (!mrvalid(mrd->mr_base, mrd->mr_len) ||
+		    x86_mtrrtype(mrd->mr_flags) == -1)
+			return (EINVAL);
+
+#define	FIXTOP	\
+    ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000))
+
+		/* Are the "low memory" conditions applicable? */
+		if ((sc->mr_cap & MR686_FIXMTRR) != 0 &&
+		    mrd->mr_base + mrd->mr_len <= FIXTOP) {
+			if ((error = x86_mrsetlow(sc, mrd, arg)) != 0)
+				return (error);
+		} else {
+			/* It's time to play with variable MTRRs. */
+			if ((error = x86_mrsetvariable(sc, mrd, arg)) != 0)
+				return (error);
+		}
+		break;
+
+	case MEMRANGE_SET_REMOVE:
+		if ((targ = mem_range_match(sc, mrd)) == NULL)
+			return (ENOENT);
+		if (targ->mr_flags & MDF_FIXACTIVE)
+			return (EPERM);
+		if (targ->mr_flags & MDF_BUSY)
+			return (EBUSY);
+		targ->mr_flags &= ~MDF_ACTIVE;
+		targ->mr_owner[0] = 0;
+		break;
+
+	default:
+		return (EOPNOTSUPP);
+	}
+
+	x86_mr_split_dmap(sc);
+
+	/* Update the hardware. */
+	x86_mrstore(sc);
+
+	/* Refetch to see where we're at. */
+	x86_mrfetch(sc);
+	return (0);
+}
+
+/*
+ * Work out how many ranges we support, initialise storage for them,
+ * and fetch the initial settings.
+ */
+static void
+x86_mrinit(struct mem_range_softc *sc)
+{
+	struct mem_range_desc *mrd;
+	int i, nmdesc;
+
+	if (sc->mr_desc != NULL)
+		/* Already initialized. */
+		return;
+
+	nmdesc = 0;
+	mtrrcap = rdmsr(MSR_MTRRcap);
+	mtrrdef = rdmsr(MSR_MTRRdefType);
+
+	/* For now, bail out if MTRRs are not enabled. */
+	if (!(mtrrdef & MTRR_DEF_ENABLE)) {
+		if (bootverbose)
+			printf("CPU supports MTRRs but not enabled\n");
+		return;
+	}
+	nmdesc = mtrrcap & MTRR_CAP_VCNT;
+	if (bootverbose)
+		printf("Pentium Pro MTRR support enabled\n");
+
+	/*
+	 * Determine the size of the PhysMask and PhysBase fields in
+	 * the variable range MTRRs.
+	 */
+	mtrr_physmask = (((uint64_t)1 << cpu_maxphyaddr) - 1) &
+	    ~(uint64_t)0xfff;
+
+	/* If fixed MTRRs supported and enabled. */
+	if ((mtrrcap & MTRR_CAP_FIXED) && (mtrrdef & MTRR_DEF_FIXED_ENABLE)) {
+		sc->mr_cap = MR686_FIXMTRR;
+		nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K;
+	}
+
+	sc->mr_desc = malloc(nmdesc * sizeof(struct mem_range_desc), M_MEMDESC,
+	    M_WAITOK | M_ZERO);
+	sc->mr_ndesc = nmdesc;
+
+	mrd = sc->mr_desc;
+
+	/* Populate the fixed MTRR entries' base/length. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		for (i = 0; i < MTRR_N64K; i++, mrd++) {
+			mrd->mr_base = i * 0x10000;
+			mrd->mr_len = 0x10000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+		for (i = 0; i < MTRR_N16K; i++, mrd++) {
+			mrd->mr_base = i * 0x4000 + 0x80000;
+			mrd->mr_len = 0x4000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+		for (i = 0; i < MTRR_N4K; i++, mrd++) {
+			mrd->mr_base = i * 0x1000 + 0xc0000;
+			mrd->mr_len = 0x1000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+	}
+
+	/*
+	 * Get current settings, anything set now is considered to
+	 * have been set by the firmware. (XXX has something already
+	 * played here?)
+	 */
+	x86_mrfetch(sc);
+	mrd = sc->mr_desc;
+	for (i = 0; i < sc->mr_ndesc; i++, mrd++) {
+		if (mrd->mr_flags & MDF_ACTIVE)
+			mrd->mr_flags |= MDF_FIRMWARE;
+	}
+
+	x86_mr_split_dmap(sc);
+}
+
+/*
+ * Initialise MTRRs on an AP after the BSP has run the init code.
+ */
+static void
+x86_mrAPinit(struct mem_range_softc *sc)
+{
+
+	x86_mrstoreone(sc);
+	wrmsr(MSR_MTRRdefType, mtrrdef);
+}
+
+/*
+ * Re-initialise running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrreinit(struct mem_range_softc *sc)
+{
+
+	smp_rendezvous(NULL, (void (*)(void *))x86_mrAPinit, NULL, sc);
+}
+
+static void
+x86_mem_drvinit(void *unused)
+{
+
+	if (mtrrs_disabled)
+		return;
+	if (!(cpu_feature & CPUID_MTRR))
+		return;
+	mem_range_softc.mr_op = &x86_mrops;
+	x86_mrinit(&mem_range_softc);
+}
+SYSINIT(x86memdev, SI_SUB_CPU, SI_ORDER_ANY, x86_mem_drvinit, NULL);


Property changes on: trunk/sys/x86/x86/x86_mem.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
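
The mask-to-length recovery in x86_mrfetch() above ("Compute the range from
the mask. Ick.") can be checked in isolation.  A small standalone sketch,
assuming a 36-bit physical address width and a hypothetical 256 MB variable
range; both values are assumptions chosen for the example.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* PhysMask field bits for a 36-bit physical address space. */
	uint64_t physmask = ((UINT64_C(1) << 36) - 1) & ~UINT64_C(0xfff);
	uint64_t region = UINT64_C(256) << 20;		/* a 256 MB range */
	uint64_t mask_msr = physmask & ~(region - 1);	/* PhysMask MSR contents */
	uint64_t len;

	/* Same recovery as x86_mrfetch(): invert the mask within the usable bits. */
	len = (~(mask_msr & physmask) & (physmask | 0xfff)) + 1;
	printf("PhysMask 0x%jx -> length 0x%jx (%ju MB)\n",
	    (uintmax_t)mask_msr, (uintmax_t)len, (uintmax_t)(len >> 20));
	return (0);
}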
Modified: trunk/sys/x86/xen/hvm.c
===================================================================
--- trunk/sys/x86/xen/hvm.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/hvm.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -59,34 +59,8 @@
 #include <xen/interface/vcpu.h>
 
 /*--------------------------- Forward Declarations ---------------------------*/
-#ifdef SMP
-static driver_filter_t xen_smp_rendezvous_action;
-static driver_filter_t xen_invltlb;
-static driver_filter_t xen_invlpg;
-static driver_filter_t xen_invlrng;
-static driver_filter_t xen_invlcache;
-#ifdef __i386__
-static driver_filter_t xen_lazypmap;
-#endif
-static driver_filter_t xen_ipi_bitmap_handler;
-static driver_filter_t xen_cpustop_handler;
-static driver_filter_t xen_cpususpend_handler;
-static driver_filter_t xen_cpustophard_handler;
-static void xen_ipi_vectored(u_int vector, int dest);
-#endif
 static void xen_hvm_cpu_init(void);
 
-/*---------------------------- Extern Declarations ---------------------------*/
-#ifdef __i386__
-extern void pmap_lazyfix_action(void);
-#endif
-#ifdef __amd64__
-extern int pmap_pcid_enabled;
-#endif
-
-/*---------------------------------- Macros ----------------------------------*/
-#define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
-
 /*-------------------------------- Local Types -------------------------------*/
 enum xen_hvm_init_type {
 	XEN_HVM_INIT_COLD,
@@ -94,18 +68,11 @@
 	XEN_HVM_INIT_RESUME
 };
 
-struct xen_ipi_handler
-{
-	driver_filter_t	*filter;
-	const char	*description;
-};
-
 /*-------------------------------- Global Data -------------------------------*/
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 
 #ifdef SMP
 struct cpu_ops xen_hvm_cpu_ops = {
-	.ipi_vectored	= lapic_ipi_vectored,
 	.cpu_init	= xen_hvm_cpu_init,
 	.cpu_resume	= xen_hvm_cpu_init
 };
@@ -113,24 +80,6 @@
 
 static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support");
 
-#ifdef SMP
-static struct xen_ipi_handler xen_ipis[] = 
-{
-	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
-	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
-	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
-	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
-	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
-#ifdef __i386__
-	[IPI_TO_IDX(IPI_LAZYPMAP)]	= { xen_lazypmap,		"lp"  },
-#endif
-	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
-	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
-	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
-	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
-};
-#endif
-
 /**
  * If non-zero, the hypervisor has been configured to use a direct
  * IDT event callback for interrupt injection.
@@ -140,14 +89,10 @@
 /*------------------------------- Per-CPU Data -------------------------------*/
 DPCPU_DEFINE(struct vcpu_info, vcpu_local_info);
 DPCPU_DEFINE(struct vcpu_info *, vcpu_info);
-#ifdef SMP
-DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
-#endif
 
 /*------------------ Hypervisor Access Shared Memory Regions -----------------*/
-/** Hypercall table accessed via HYPERVISOR_*_op() methods. */
-char *hypercall_stubs;
 shared_info_t *HYPERVISOR_shared_info;
+start_info_t *HYPERVISOR_start_info;
 
 
 /*------------------------------ Sysctl tunables -----------------------------*/
@@ -156,207 +101,6 @@
 TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks);
 TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics);
 
-#ifdef SMP
-/*---------------------------- XEN PV IPI Handlers ---------------------------*/
-/*
- * This are C clones of the ASM functions found in apic_vector.s
- */
-static int
-xen_ipi_bitmap_handler(void *arg)
-{
-	struct trapframe *frame;
-
-	frame = arg;
-	ipi_bitmap_handler(*frame);
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_smp_rendezvous_action(void *arg)
-{
-#ifdef COUNT_IPIS
-	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	smp_rendezvous_action();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invltlb(void *arg)
-{
-
-	invltlb_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invltlb_pcid(void *arg)
-{
-
-	invltlb_pcid_handler();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlpg(void *arg)
-{
-
-	invlpg_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invlpg_pcid(void *arg)
-{
-
-	invlpg_pcid_handler();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlrng(void *arg)
-{
-
-	invlrng_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlcache(void *arg)
-{
-
-	invlcache_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __i386__
-static int
-xen_lazypmap(void *arg)
-{
-
-	pmap_lazyfix_action();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_cpustop_handler(void *arg)
-{
-
-	cpustop_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_cpususpend_handler(void *arg)
-{
-
-	cpususpend_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_cpustophard_handler(void *arg)
-{
-
-	ipi_nmi_handler();
-	return (FILTER_HANDLED);
-}
-
-/* Xen PV IPI sender */
-static void
-xen_ipi_vectored(u_int vector, int dest)
-{
-	xen_intr_handle_t *ipi_handle;
-	int ipi_idx, to_cpu, self;
-
-	ipi_idx = IPI_TO_IDX(vector);
-	if (ipi_idx > nitems(xen_ipis))
-		panic("IPI out of range");
-
-	switch(dest) {
-	case APIC_IPI_DEST_SELF:
-		ipi_handle = DPCPU_GET(ipi_handle);
-		xen_intr_signal(ipi_handle[ipi_idx]);
-		break;
-	case APIC_IPI_DEST_ALL:
-		CPU_FOREACH(to_cpu) {
-			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-			xen_intr_signal(ipi_handle[ipi_idx]);
-		}
-		break;
-	case APIC_IPI_DEST_OTHERS:
-		self = PCPU_GET(cpuid);
-		CPU_FOREACH(to_cpu) {
-			if (to_cpu != self) {
-				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-				xen_intr_signal(ipi_handle[ipi_idx]);
-			}
-		}
-		break;
-	default:
-		to_cpu = apic_cpuid(dest);
-		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-		xen_intr_signal(ipi_handle[ipi_idx]);
-		break;
-	}
-}
-
-/*---------------------- XEN diverged cpu operations -------------------------*/
-static void
-xen_cpu_ipi_init(int cpu)
-{
-	xen_intr_handle_t *ipi_handle;
-	const struct xen_ipi_handler *ipi;
-	device_t dev;
-	int idx, rc;
-
-	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
-	dev = pcpu_find(cpu)->pc_device;
-	KASSERT((dev != NULL), ("NULL pcpu device_t"));
-
-	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
-
-		if (ipi->filter == NULL) {
-			ipi_handle[idx] = NULL;
-			continue;
-		}
-
-		rc = xen_intr_alloc_and_bind_ipi(dev, cpu, ipi->filter,
-		    INTR_TYPE_TTY, &ipi_handle[idx]);
-		if (rc != 0)
-			panic("Unable to allocate a XEN IPI port");
-		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
-	}
-}
-
-static void
-xen_setup_cpus(void)
-{
-	int i;
-
-	if (!xen_hvm_domain() || !xen_vector_callback_enabled)
-		return;
-
-#ifdef __amd64__
-	if (pmap_pcid_enabled) {
-		xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = xen_invltlb_pcid;
-		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = xen_invlpg_pcid;
-	}
-#endif
-	CPU_FOREACH(i)
-		xen_cpu_ipi_init(i);
-
-	/* Set the xen pv ipi ops to replace the native ones */
-	cpu_ops.ipi_vectored = xen_ipi_vectored;
-}
-#endif
-
 /*---------------------- XEN Hypervisor Probe and Setup ----------------------*/
 static uint32_t
 xen_hvm_cpuid_base(void)
@@ -376,16 +120,21 @@
  * Allocate and fill in the hypcall page.
  */
 static int
-xen_hvm_init_hypercall_stubs(void)
+xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type)
 {
 	uint32_t base, regs[4];
 	int i;
 
+	if (xen_pv_domain()) {
+		/* hypercall page is already set in the PV case */
+		return (0);
+	}
+
 	base = xen_hvm_cpuid_base();
 	if (base == 0)
 		return (ENXIO);
 
-	if (hypercall_stubs == NULL) {
+	if (init_type == XEN_HVM_INIT_COLD) {
 		int major, minor;
 
 		do_cpuid(base + 1, regs);
@@ -417,18 +166,9 @@
 	 * Find the hypercall pages.
 	 */
 	do_cpuid(base + 2, regs);
-	
-	if (hypercall_stubs == NULL) {
-		size_t call_region_size;
 
-		call_region_size = regs[0] * PAGE_SIZE;
-		hypercall_stubs = malloc(call_region_size, M_XENHVM, M_NOWAIT);
-		if (hypercall_stubs == NULL)
-			panic("Unable to allocate Xen hypercall region");
-	}
-
 	for (i = 0; i < regs[0]; i++)
-		wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i);
+		wrmsr(regs[1], vtophys(&hypercall_page + i * PAGE_SIZE) + i);
 
 	return (0);
 }
@@ -438,6 +178,14 @@
 {
 	struct xen_add_to_physmap xatp;
 
+	if (xen_pv_domain()) {
+		/*
+		 * Already setup in the PV case, shared_info is passed inside
+		 * of the start_info struct at start of day.
+		 */
+		return;
+	}
+
 	if (HYPERVISOR_shared_info == NULL) {
 		HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT);
 		if (HYPERVISOR_shared_info == NULL)
@@ -516,6 +264,16 @@
 {
 	u_short disable_devs = 0;
 
+	if (xen_pv_domain()) {
+		/*
+		 * No emulated devices in the PV case, so no need to unplug
+		 * anything.
+		 */
+		if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0)
+			printf("PV devices cannot be disabled in PV guests\n");
+		return;
+	}
+
 	if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC)
 		return;
 
@@ -543,7 +301,7 @@
 	if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND)
 		return;
 
-	error = xen_hvm_init_hypercall_stubs();
+	error = xen_hvm_init_hypercall_stubs(init_type);
 
 	switch (init_type) {
 	case XEN_HVM_INIT_COLD:
@@ -550,11 +308,21 @@
 		if (error != 0)
 			return;
 
+		/*
+		 * If xen_domain_type is not set at this point
+		 * it means we are inside a (PV)HVM guest, because
+		 * for PVH the guest type is set much earlier
+		 * (see hammer_time_xen).
+		 */
+		if (!xen_domain()) {
+			xen_domain_type = XEN_HVM_DOMAIN;
+			vm_guest = VM_GUEST_XEN;
+		}
+
 		setup_xen_features();
 #ifdef SMP
 		cpu_ops = xen_hvm_cpu_ops;
 #endif
- 		vm_guest = VM_GUEST_XEN;
 		break;
 	case XEN_HVM_INIT_RESUME:
 		if (error != 0)
@@ -569,9 +337,15 @@
 	}
 
 	xen_vector_callback_enabled = 0;
-	xen_domain_type = XEN_HVM_DOMAIN;
+	xen_hvm_set_callback(NULL);
+
+	/*
+	 * On (PV)HVM domains we need to request the hypervisor to
+	 * fill the shared info page; for PVH guests the shared_info page
+	 * is passed inside the start_info struct and is already set, so
+	 * these functions are no-ops.
+	 */
 	xen_hvm_init_shared_info_page();
-	xen_hvm_set_callback(NULL);
 	xen_hvm_disable_emulated_devices();
 } 
 
@@ -603,6 +377,9 @@
 	struct pcpu *pc;
 	int i;
 
+	if (!xen_hvm_domain())
+		return;
+
 	/* Set vcpu_id to acpi_id */
 	CPU_FOREACH(i) {
 		pc = pcpu_find(i);
@@ -645,8 +422,5 @@
 }
 
 SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
-#ifdef SMP
-SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL);
-#endif
 SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL);
 SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);
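The hypercall setup above relies on xen_hvm_cpuid_base() locating Xen's CPUID
leaves by their "XenVMMXenVMM" signature.  A standalone sketch of that style
of probe is below; the 0x100 stride and the 0x40010000 upper bound follow the
usual Xen convention and should be treated as assumptions rather than a copy
of the kernel routine.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Issue CPUID for a given leaf. */
static void
cpuid_sketch(uint32_t leaf, uint32_t regs[4])
{
	__asm__ __volatile__("cpuid"
	    : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
	    : "a" (leaf), "c" (0));
}

int
main(void)
{
	uint32_t base, regs[4];
	char sig[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cpuid_sketch(base, regs);
		/* Signature is returned in EBX, ECX, EDX. */
		memcpy(sig, &regs[1], 4);
		memcpy(sig + 4, &regs[2], 4);
		memcpy(sig + 8, &regs[3], 4);
		sig[12] = '\0';
		if (strcmp(sig, "XenVMMXenVMM") == 0) {
			printf("Xen leaves at 0x%x, max leaf 0x%x\n",
			    base, regs[0]);
			return (0);
		}
	}
	printf("no Xen CPUID signature found\n");
	return (1);
}
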

Added: trunk/sys/x86/xen/pv.c
===================================================================
--- trunk/sys/x86/xen/pv.c	                        (rev 0)
+++ trunk/sys/x86/xen/pv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,428 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pv.c 344378 2019-02-20 19:19:24Z kevans $");
+
+#include "opt_ddb.h"
+#include "opt_kstack_pages.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/reboot.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/boot.h>
+#include <sys/ctype.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <x86/init.h>
+#include <machine/pc/bios.h>
+#include <machine/smp.h>
+#include <machine/intr_machdep.h>
+#include <machine/metadata.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xen_pv.h>
+#include <xen/xen_msi.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <dev/xen/timer/timer.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/* Native initial function */
+extern u_int64_t hammer_time(u_int64_t, u_int64_t);
+/* Xen initial function */
+uint64_t hammer_time_xen(start_info_t *, uint64_t);
+
+#define MAX_E820_ENTRIES	128
+
+/*--------------------------- Forward Declarations ---------------------------*/
+static caddr_t xen_pv_parse_preload_data(u_int64_t);
+static void xen_pv_parse_memmap(caddr_t, vm_paddr_t *, int *);
+
+#ifdef SMP
+static int xen_pv_start_all_aps(void);
+#endif
+
+/*---------------------------- Extern Declarations ---------------------------*/
+#ifdef SMP
+/* Variables used by amd64 mp_machdep to start APs */
+extern char *doublefault_stack;
+extern char *mce_stack;
+extern char *nmi_stack;
+#endif
+
+/*
+ * Placed by the linker at the end of the bss section, which is the last
+ * section loaded by Xen before loading the symtab and strtab.
+ */
+extern uint32_t end;
+
+/*-------------------------------- Global Data -------------------------------*/
+/* Xen init_ops implementation. */
+struct init_ops xen_init_ops = {
+	.parse_preload_data		= xen_pv_parse_preload_data,
+	.early_clock_source_init	= xen_clock_init,
+	.early_delay			= xen_delay,
+	.parse_memmap			= xen_pv_parse_memmap,
+#ifdef SMP
+	.start_all_aps			= xen_pv_start_all_aps,
+#endif
+	.msi_init =			xen_msi_init,
+};
+
+static struct bios_smap xen_smap[MAX_E820_ENTRIES];
+
+/*-------------------------------- Xen PV init -------------------------------*/
+/*
+ * First function called by the Xen PVH boot sequence.
+ *
+ * Set some Xen global variables and prepare the environment so it is
+ * as similar as possible to what native FreeBSD init function expects.
+ */
+uint64_t
+hammer_time_xen(start_info_t *si, uint64_t xenstack)
+{
+	uint64_t physfree;
+	uint64_t *PT4 = (u_int64_t *)xenstack;
+	uint64_t *PT3 = (u_int64_t *)(xenstack + PAGE_SIZE);
+	uint64_t *PT2 = (u_int64_t *)(xenstack + 2 * PAGE_SIZE);
+	int i;
+
+	xen_domain_type = XEN_PV_DOMAIN;
+	vm_guest = VM_GUEST_XEN;
+
+	if ((si == NULL) || (xenstack == 0)) {
+		xc_printf("ERROR: invalid start_info or xen stack, halting\n");
+		HYPERVISOR_shutdown(SHUTDOWN_crash);
+	}
+
+	xc_printf("FreeBSD PVH running on %s\n", si->magic);
+
+	/* We use 3 pages of xen stack for the boot pagetables */
+	physfree = xenstack + 3 * PAGE_SIZE - KERNBASE;
+
+	/* Setup Xen global variables */
+	HYPERVISOR_start_info = si;
+	HYPERVISOR_shared_info =
+	    (shared_info_t *)(si->shared_info + KERNBASE);
+
+	/*
+	 * Setup some misc global variables for Xen devices
+	 *
+	 * XXX: Devices that need these specific variables should
+	 *      be rewritten to fetch this info by themselves from the
+	 *      start_info page.
+	 */
+	xen_store = (struct xenstore_domain_interface *)
+	    (ptoa(si->store_mfn) + KERNBASE);
+	console_page = (char *)(ptoa(si->console.domU.mfn) + KERNBASE);
+
+	/*
+	 * Use the stack Xen gives us to build the page tables
+	 * as native FreeBSD expects to find them (created
+	 * by the boot trampoline).
+	 */
+	for (i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); i++) {
+		/*
+		 * Each slot of the level 4 pages points
+		 * to the same level 3 page
+		 */
+		PT4[i] = ((uint64_t)&PT3[0]) - KERNBASE;
+		PT4[i] |= PG_V | PG_RW | PG_U;
+
+		/*
+		 * Each slot of the level 3 pages points
+		 * to the same level 2 page
+		 */
+		PT3[i] = ((uint64_t)&PT2[0]) - KERNBASE;
+		PT3[i] |= PG_V | PG_RW | PG_U;
+
+		/*
+		 * The level 2 page slots are mapped with
+		 * 2MB pages for 1GB.
+		 */
+		PT2[i] = i * (2 * 1024 * 1024);
+		PT2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	}
+	load_cr3(((uint64_t)&PT4[0]) - KERNBASE);
+
+	/* Set the hooks for early functions that diverge from bare metal */
+	init_ops = xen_init_ops;
+	apic_ops = xen_apic_ops;
+
+	/* Now we can jump into the native init function */
+	return (hammer_time(0, physfree));
+}
+
+/*-------------------------------- PV specific -------------------------------*/
+#ifdef SMP
+static bool
+start_xen_ap(int cpu)
+{
+	struct vcpu_guest_context *ctxt;
+	int ms, cpus = mp_naps;
+	const size_t stacksize = kstack_pages * PAGE_SIZE;
+
+	/* allocate and set up an idle stack data page */
+	bootstacks[cpu] =
+	    (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
+	doublefault_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	mce_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	nmi_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	dpcpu =
+	    (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO);
+
+	bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8;
+	bootAP = cpu;
+
+	ctxt = malloc(sizeof(*ctxt), M_TEMP, M_WAITOK | M_ZERO);
+
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.rip = (unsigned long) init_secondary;
+	ctxt->user_regs.rsp = (unsigned long) bootSTK;
+
+	/* Set the AP to use the same page tables */
+	ctxt->ctrlreg[3] = KPML4phys;
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+		panic("unable to initialize AP#%d", cpu);
+
+	free(ctxt, M_TEMP);
+
+	/* Launch the vCPU */
+	if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+		panic("unable to start AP#%d", cpu);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return (true);
+		DELAY(1000);
+	}
+
+	return (false);
+}
+
+static int
+xen_pv_start_all_aps(void)
+{
+	int cpu;
+
+	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+
+	for (cpu = 1; cpu < mp_ncpus; cpu++) {
+
+		/* attempt to start the Application Processor */
+		if (!start_xen_ap(cpu))
+			panic("AP #%d failed to start!", cpu);
+
+		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
+	}
+
+	return (mp_naps);
+}
+#endif /* SMP */
+
+/*
+ * Functions to convert the "extra" parameters passed by Xen
+ * into FreeBSD boot options.
+ */
+static void
+xen_pv_set_env(void)
+{
+	char *cmd_line_next, *cmd_line;
+	size_t env_size;
+
+	cmd_line = HYPERVISOR_start_info->cmd_line;
+	env_size = sizeof(HYPERVISOR_start_info->cmd_line);
+
+	/* Skip leading spaces */
+	for (; isspace(*cmd_line) && (env_size != 0); cmd_line++)
+		env_size--;
+
+	/* Replace ',' with '\0' */
+	for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;)
+		;
+
+	init_static_kenv(cmd_line, 0);
+}
+
+#ifdef DDB
+/*
+ * The way Xen loads the symtab is different from the native boot loader,
+ * because it's tailored for NetBSD. So we have to adapt and use the same
+ * method as NetBSD. Portions of the code below have been picked from NetBSD:
+ * sys/kern/kern_ksyms.c CVS Revision 1.71.
+ */
+static void
+xen_pv_parse_symtab(void)
+{
+	Elf_Ehdr *ehdr;
+	Elf_Shdr *shdr;
+	vm_offset_t sym_end;
+	uint32_t size;
+	int i, j;
+
+	size = end;
+	sym_end = HYPERVISOR_start_info->mod_start != 0 ?
+	    HYPERVISOR_start_info->mod_start :
+	    HYPERVISOR_start_info->mfn_list;
+
+	/*
+	 * Sanity-check the size; sym_end is only an upper
+	 * boundary, but it at least lets us fail earlier.
+	 */
+	if ((vm_offset_t)&end + size > sym_end) {
+		xc_printf("Unable to load ELF symtab: size mismatch\n");
+		return;
+	}
+
+	ehdr = (Elf_Ehdr *)(&end + 1);
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) ||
+	    ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+	    ehdr->e_version > 1) {
+		xc_printf("Unable to load ELF symtab: invalid symbol table\n");
+		return;
+	}
+
+	shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff);
+	/* Find the symbol table and the corresponding string table. */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (shdr[i].sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr[i].sh_offset == 0)
+			continue;
+		ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset);
+		ksymtab_size = shdr[i].sh_size;
+		j = shdr[i].sh_link;
+		if (shdr[j].sh_offset == 0)
+			continue; /* Can this happen? */
+		kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset);
+		break;
+	}
+
+	if (ksymtab == 0 || kstrtab == 0) {
+		xc_printf(
+    "Unable to load ELF symtab: could not find symtab or strtab\n");
+		return;
+	}
+}
+#endif
+
+static caddr_t
+xen_pv_parse_preload_data(u_int64_t modulep)
+{
+	caddr_t		 kmdp;
+	vm_ooffset_t	 off;
+	vm_paddr_t	 metadata;
+	char             *envp;
+
+	if (HYPERVISOR_start_info->mod_start != 0) {
+		preload_metadata = (caddr_t)(HYPERVISOR_start_info->mod_start);
+
+		kmdp = preload_search_by_type("elf kernel");
+		if (kmdp == NULL)
+			kmdp = preload_search_by_type("elf64 kernel");
+		KASSERT(kmdp != NULL, ("unable to find kernel"));
+
+		/*
+		 * Xen has relocated the metadata and the modules,
+		 * so we need to recalculate its position. This is
+		 * done by saving the original modulep address and
+		 * then calculating the offset with mod_start,
+		 * which contains the relocated modulep address.
+		 */
+		metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t);
+		off = HYPERVISOR_start_info->mod_start - metadata;
+
+		preload_bootstrap_relocate(off);
+
+		boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
+		envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
+		if (envp != NULL)
+			envp += off;
+		init_static_kenv(envp, 0);
+	} else {
+		/* Parse the extra boot information given by Xen */
+		xen_pv_set_env();
+		boothowto |= boot_env_to_howto();
+		kmdp = NULL;
+	}
+
+#ifdef DDB
+	xen_pv_parse_symtab();
+#endif
+	return (kmdp);
+}
+
+static void
+xen_pv_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
+{
+	struct xen_memory_map memmap;
+	u_int32_t size;
+	int rc;
+
+	/* Fetch the E820 map from Xen */
+	memmap.nr_entries = MAX_E820_ENTRIES;
+	set_xen_guest_handle(memmap.buffer, xen_smap);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc)
+		panic("unable to fetch Xen E820 memory map");
+	size = memmap.nr_entries * sizeof(xen_smap[0]);
+
+	bios_add_smap_entries(xen_smap, size, physmap, physmap_idx);
+}


Property changes on: trunk/sys/x86/xen/pv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
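
xen_pv_set_env() in pv.c above turns Xen's comma-separated cmd_line "extra"
options into the NUL-separated list that init_static_kenv() expects.  A
minimal userspace sketch of the same strsep() trick follows; the option
strings are made-up examples, and the sketch assumes a BSD-style libc where
strsep() is declared in <string.h>.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Xen passes the extra options as one comma-separated string. */
	char cmd_line[] = "boot_verbose=YES,vfs.root.mountfrom=ufs:/dev/ada0p2";
	char *next, *opt;

	/* strsep() replaces each ',' with '\0', yielding kenv-style entries. */
	for (next = cmd_line; (opt = strsep(&next, ",")) != NULL;)
		printf("kenv entry: %s\n", opt);
	return (0);
}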
Added: trunk/sys/x86/xen/pvcpu_enum.c
===================================================================
--- trunk/sys/x86/xen/pvcpu_enum.c	                        (rev 0)
+++ trunk/sys/x86/xen/pvcpu_enum.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,267 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org>
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pvcpu_enum.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/smp.h>
+#include <sys/pcpu.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/hypervisor.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/aclocal.h>
+#include <contrib/dev/acpica/include/actables.h>
+
+#include <dev/acpica/acpivar.h>
+
+static int xenpv_probe(void);
+static int xenpv_probe_cpus(void);
+static int xenpv_setup_local(void);
+static int xenpv_setup_io(void);
+
+static ACPI_TABLE_MADT *madt;
+static vm_paddr_t madt_physaddr;
+static vm_offset_t madt_length;
+
+static struct apic_enumerator xenpv_enumerator = {
+	"Xen PV",
+	xenpv_probe,
+	xenpv_probe_cpus,
+	xenpv_setup_local,
+	xenpv_setup_io
+};
+
+/*--------------------- Helper functions to parse MADT -----------------------*/
+
+/*
+ * Parse an interrupt source override for an ISA interrupt.
+ */
+static void
+madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
+{
+	enum intr_trigger trig;
+	enum intr_polarity pol;
+	int ret;
+
+	if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
+	    intr->GlobalIrq == 2) {
+		if (bootverbose)
+			printf("MADT: Skipping timer override\n");
+		return;
+	}
+
+	madt_parse_interrupt_values(intr, &trig, &pol);
+
+	/* Remap the IRQ if it is mapped to a different interrupt vector. */
+	if (intr->SourceIrq != intr->GlobalIrq && intr->GlobalIrq > 15 &&
+	    intr->SourceIrq == AcpiGbl_FADT.SciInterrupt)
+		/*
+		 * If the SCI is remapped to a non-ISA global interrupt,
+		 * then override the vector we use to setup.
+		 */
+		acpi_OverrideInterruptLevel(intr->GlobalIrq);
+
+	/* Register the IRQ with the polarity and trigger mode found. */
+	ret = xen_register_pirq(intr->GlobalIrq, trig, pol);
+	if (ret != 0)
+		panic("Unable to register interrupt override");
+}
+
+/*
+ * Call the handler routine for each entry in the MADT table.
+ */
+static void
+madt_walk_table(acpi_subtable_handler *handler, void *arg)
+{
+
+	acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length,
+	    handler, arg);
+}
+
+/*
+ * Parse interrupt entries.
+ */
+static void
+madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused)
+{
+
+	if (entry->Type == ACPI_MADT_TYPE_INTERRUPT_OVERRIDE)
+		madt_parse_interrupt_override(
+		    (ACPI_MADT_INTERRUPT_OVERRIDE *)entry);
+}
+
+/*---------------------------- Xen PV enumerator -----------------------------*/
+
+/*
+ * This enumerator will only be registered on PVH
+ */
+static int
+xenpv_probe(void)
+{
+	return (0);
+}
+
+/*
+ * Test each possible vCPU in order to find the number of vCPUs
+ */
+static int
+xenpv_probe_cpus(void)
+{
+#ifdef SMP
+	int i, ret;
+
+	for (i = 0; i < MAXCPU; i++) {
+		ret = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+		if (ret >= 0)
+			lapic_create((i * 2), (i == 0));
+	}
+#endif
+	return (0);
+}
+
+/*
+ * Initialize the vCPU id of the BSP
+ */
+static int
+xenpv_setup_local(void)
+{
+	PCPU_SET(vcpu_id, 0);
+	lapic_init(0);
+	return (0);
+}
+
+/*
+ * On PVH guests there's no IO APIC
+ */
+static int
+xenpv_setup_io(void)
+{
+
+	if (xen_initial_domain()) {
+		/*
+		 * NB: we could iterate over the MADT IOAPIC entries in order
+		 * to figure out the exact number of IOAPIC interrupts, but
+		 * this is legacy code so just keep using the previous
+		 * behaviour and assume a maximum of 256 interrupts.
+		 */
+		num_io_irqs = max(MINIMUM_MSI_INT - 1, num_io_irqs);
+
+		acpi_SetDefaultIntrModel(ACPI_INTR_APIC);
+	}
+	return (0);
+}
+
+void
+xenpv_register_pirqs(struct pic *pic __unused)
+{
+	unsigned int i;
+	int ret;
+
+	/* Map MADT */
+	madt_physaddr = acpi_find_table(ACPI_SIG_MADT);
+	madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT);
+	madt_length = madt->Header.Length;
+
+	/* Try to initialize ACPI so that we can access the FADT. */
+	ret = acpi_Startup();
+	if (ACPI_FAILURE(ret)) {
+		printf("MADT: ACPI Startup failed with %s\n",
+		    AcpiFormatException(ret));
+		printf("Try disabling either ACPI or apic support.\n");
+		panic("Using MADT but ACPI doesn't work");
+	}
+
+	/* Run through the table to see if there are any overrides. */
+	madt_walk_table(madt_parse_ints, NULL);
+
+	/*
+	 * If there was not an explicit override entry for the SCI,
+	 * force it to use level trigger and active-low polarity.
+	 */
+	if (!madt_found_sci_override) {
+		printf(
+"MADT: Forcing active-low polarity and level trigger for SCI\n");
+		ret = xen_register_pirq(AcpiGbl_FADT.SciInterrupt,
+		    INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW);
+		if (ret != 0)
+			panic("Unable to register SCI IRQ");
+	}
+
+	/* Register legacy ISA IRQs */
+	for (i = 1; i < 16; i++) {
+		if (intr_lookup_source(i) != NULL)
+			continue;
+		ret = xen_register_pirq(i, INTR_TRIGGER_EDGE,
+		    INTR_POLARITY_LOW);
+		if (ret != 0 && bootverbose)
+			printf("Unable to register legacy IRQ#%u: %d\n", i,
+			    ret);
+	}
+}
+
+static void
+xenpv_register(void *dummy __unused)
+{
+	if (xen_pv_domain()) {
+		apic_register_enumerator(&xenpv_enumerator);
+	}
+}
+SYSINIT(xenpv_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, xenpv_register, NULL);
+
+/*
+ * Setup per-CPU vCPU IDs
+ */
+static void
+xenpv_set_ids(void *dummy)
+{
+	struct pcpu *pc;
+	int i;
+
+	CPU_FOREACH(i) {
+		pc = pcpu_find(i);
+		pc->pc_vcpu_id = i;
+	}
+}
+SYSINIT(xenpv_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, xenpv_set_ids, NULL);


Property changes on: trunk/sys/x86/xen/pvcpu_enum.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
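
For reference, xenpv_probe_cpus() above assumes a fixed mapping from Xen vCPU
number to local APIC ID (APIC ID = vCPU ID * 2), which is what later lets
xen_pv_apic_cpuid() in xen_apic.c recover the CPU through apic_cpuids[].  A
minimal sketch of that assumed mapping (the helper name is hypothetical and not
part of the commit):

	static u_int
	example_vcpu_to_apic_id(u_int vcpu_id)
	{

		/* Mirrors lapic_create((i * 2), (i == 0)) in xenpv_probe_cpus(). */
		return (vcpu_id * 2);
	}
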
Added: trunk/sys/x86/xen/xen_apic.c
===================================================================
--- trunk/sys/x86/xen/xen_apic.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,598 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_apic.c 334047 2018-05-22 14:36:46Z kib $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+
+#include <xen/xen-os.h>
+#include <xen/features.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/hvm.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/vcpu.h>
+
+/*--------------------------------- Macros -----------------------------------*/
+
+#define XEN_APIC_UNSUPPORTED \
+	panic("%s: not available in Xen PV port.", __func__)
+
+
+/*--------------------------- Forward Declarations ---------------------------*/
+#ifdef SMP
+static driver_filter_t xen_smp_rendezvous_action;
+static driver_filter_t xen_invltlb;
+static driver_filter_t xen_invlpg;
+static driver_filter_t xen_invlrng;
+static driver_filter_t xen_invlcache;
+static driver_filter_t xen_ipi_bitmap_handler;
+static driver_filter_t xen_cpustop_handler;
+static driver_filter_t xen_cpususpend_handler;
+static driver_filter_t xen_cpustophard_handler;
+#endif
+
+/*---------------------------------- Macros ----------------------------------*/
+#define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
+
+/*--------------------------------- Xen IPIs ---------------------------------*/
+#ifdef SMP
+struct xen_ipi_handler
+{
+	driver_filter_t	*filter;
+	const char	*description;
+};
+
+static struct xen_ipi_handler xen_ipis[] = 
+{
+	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
+	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
+	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
+	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
+	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
+	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
+	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
+	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
+	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
+};
+#endif
+
+/*------------------------------- Per-CPU Data -------------------------------*/
+#ifdef SMP
+DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
+#endif
+
+/*------------------------------- Xen PV APIC --------------------------------*/
+
+static void
+xen_pv_lapic_create(u_int apic_id, int boot_cpu)
+{
+#ifdef SMP
+	cpu_add(apic_id, boot_cpu);
+#endif
+}
+
+static void
+xen_pv_lapic_init(vm_paddr_t addr)
+{
+
+}
+
+static void
+xen_pv_lapic_setup(int boot)
+{
+
+}
+
+static void
+xen_pv_lapic_dump(const char *str)
+{
+
+	printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
+}
+
+static void
+xen_pv_lapic_disable(void)
+{
+
+}
+
+static bool
+xen_pv_lapic_is_x2apic(void)
+{
+
+	return (false);
+}
+
+static void
+xen_pv_lapic_eoi(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_id(void)
+{
+
+	return (PCPU_GET(apic_id));
+}
+
+static int
+xen_pv_lapic_intr_pending(u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static u_int
+xen_pv_apic_cpuid(u_int apic_id)
+{
+#ifdef SMP
+	return (apic_cpuids[apic_id]);
+#else
+	return (0);
+#endif
+}
+
+static u_int
+xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static u_int
+xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static void
+xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_enable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static void
+xen_pv_lapic_disable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_reenable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_enable_cmc(void)
+{
+
+}
+
+#ifdef SMP
+static void
+xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_ipi_vectored(u_int vector, int dest)
+{
+	xen_intr_handle_t *ipi_handle;
+	int ipi_idx, to_cpu, self;
+
+	ipi_idx = IPI_TO_IDX(vector);
+	if (ipi_idx >= nitems(xen_ipis))
+		panic("IPI out of range");
+
+	switch(dest) {
+	case APIC_IPI_DEST_SELF:
+		ipi_handle = DPCPU_GET(ipi_handle);
+		xen_intr_signal(ipi_handle[ipi_idx]);
+		break;
+	case APIC_IPI_DEST_ALL:
+		CPU_FOREACH(to_cpu) {
+			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+			xen_intr_signal(ipi_handle[ipi_idx]);
+		}
+		break;
+	case APIC_IPI_DEST_OTHERS:
+		self = PCPU_GET(cpuid);
+		CPU_FOREACH(to_cpu) {
+			if (to_cpu != self) {
+				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+				xen_intr_signal(ipi_handle[ipi_idx]);
+			}
+		}
+		break;
+	default:
+		to_cpu = apic_cpuid(dest);
+		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+		xen_intr_signal(ipi_handle[ipi_idx]);
+		break;
+	}
+}
+
+static int
+xen_pv_lapic_ipi_wait(int delay)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+#endif	/* SMP */
+
+static int
+xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (-1);
+}
+
+static void
+xen_pv_lapic_ipi_free(int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+    enum intr_trigger trigger)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+/* Xen apic_ops implementation */
+struct apic_ops xen_apic_ops = {
+	.create			= xen_pv_lapic_create,
+	.init			= xen_pv_lapic_init,
+	.xapic_mode		= xen_pv_lapic_disable,
+	.is_x2apic		= xen_pv_lapic_is_x2apic,
+	.setup			= xen_pv_lapic_setup,
+	.dump			= xen_pv_lapic_dump,
+	.disable		= xen_pv_lapic_disable,
+	.eoi			= xen_pv_lapic_eoi,
+	.id			= xen_pv_lapic_id,
+	.intr_pending		= xen_pv_lapic_intr_pending,
+	.set_logical_id		= xen_pv_lapic_set_logical_id,
+	.cpuid			= xen_pv_apic_cpuid,
+	.alloc_vector		= xen_pv_apic_alloc_vector,
+	.alloc_vectors		= xen_pv_apic_alloc_vectors,
+	.enable_vector		= xen_pv_apic_enable_vector,
+	.disable_vector		= xen_pv_apic_disable_vector,
+	.free_vector		= xen_pv_apic_free_vector,
+	.enable_pmc		= xen_pv_lapic_enable_pmc,
+	.disable_pmc		= xen_pv_lapic_disable_pmc,
+	.reenable_pmc		= xen_pv_lapic_reenable_pmc,
+	.enable_cmc		= xen_pv_lapic_enable_cmc,
+#ifdef SMP
+	.ipi_raw		= xen_pv_lapic_ipi_raw,
+	.ipi_vectored		= xen_pv_lapic_ipi_vectored,
+	.ipi_wait		= xen_pv_lapic_ipi_wait,
+#endif
+	.ipi_alloc		= xen_pv_lapic_ipi_alloc,
+	.ipi_free		= xen_pv_lapic_ipi_free,
+	.set_lvt_mask		= xen_pv_lapic_set_lvt_mask,
+	.set_lvt_mode		= xen_pv_lapic_set_lvt_mode,
+	.set_lvt_polarity	= xen_pv_lapic_set_lvt_polarity,
+	.set_lvt_triggermode	= xen_pv_lapic_set_lvt_triggermode,
+};
+
+#ifdef SMP
+/*---------------------------- XEN PV IPI Handlers ---------------------------*/
+/*
+ * These are C clones of the ASM functions found in apic_vector.
+ */
+static int
+xen_ipi_bitmap_handler(void *arg)
+{
+	struct trapframe *frame;
+
+	frame = arg;
+	ipi_bitmap_handler(*frame);
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_smp_rendezvous_action(void *arg)
+{
+#ifdef COUNT_IPIS
+	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	smp_rendezvous_action();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb(void *arg)
+{
+
+	invltlb_handler();
+	return (FILTER_HANDLED);
+}
+
+#ifdef __amd64__
+static int
+xen_invltlb_invpcid(void *arg)
+{
+
+	invltlb_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_pcid(void *arg)
+{
+
+	invltlb_pcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_invpcid_pti(void *arg)
+{
+
+	invltlb_invpcid_pti_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_invpcid_handler(void *arg)
+{
+
+	invlpg_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_pcid_handler(void *arg)
+{
+
+	invlpg_pcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_invpcid_handler(void *arg)
+{
+
+	invlrng_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_pcid_handler(void *arg)
+{
+
+	invlrng_pcid_handler();
+	return (FILTER_HANDLED);
+}
+#endif
+
+static int
+xen_invlpg(void *arg)
+{
+
+	invlpg_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng(void *arg)
+{
+
+	invlrng_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlcache(void *arg)
+{
+
+	invlcache_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustop_handler(void *arg)
+{
+
+	cpustop_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpususpend_handler(void *arg)
+{
+
+	cpususpend_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustophard_handler(void *arg)
+{
+
+	ipi_nmi_handler();
+	return (FILTER_HANDLED);
+}
+
+/*----------------------------- XEN PV IPI setup -----------------------------*/
+/*
+ * These functions are provided outside of the Xen PV APIC implementation
+ * so that PVHVM guests can also use PV IPIs without an actual Xen PV APIC,
+ * because on PVHVM there is an emulated LAPIC provided by Xen.
+ */
+static void
+xen_cpu_ipi_init(int cpu)
+{
+	xen_intr_handle_t *ipi_handle;
+	const struct xen_ipi_handler *ipi;
+	int idx, rc;
+
+	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
+
+	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
+
+		if (ipi->filter == NULL) {
+			ipi_handle[idx] = NULL;
+			continue;
+		}
+
+		rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
+		    INTR_TYPE_TTY, &ipi_handle[idx]);
+		if (rc != 0)
+			panic("Unable to allocate a XEN IPI port");
+		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
+	}
+}
+
+static void
+xen_setup_cpus(void)
+{
+	int i;
+
+	if (!xen_vector_callback_enabled)
+		return;
+
+#ifdef __amd64__
+	if (pmap_pcid_enabled) {
+		if (pti)
+			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+			    invpcid_works ? xen_invltlb_invpcid_pti :
+			    xen_invltlb_pcid;
+		else
+			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+			    invpcid_works ? xen_invltlb_invpcid :
+			    xen_invltlb_pcid;
+		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ?
+		    xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler;
+		xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ?
+		    xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler;
+	}
+#endif
+	CPU_FOREACH(i)
+		xen_cpu_ipi_init(i);
+
+	/* Set the xen pv ipi ops to replace the native ones */
+	if (xen_hvm_domain())
+		apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
+}
+
+/* We need to set up IPIs before APs are started */
+SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL);
+#endif /* SMP */


Property changes on: trunk/sys/x86/xen/xen_apic.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
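
For reference, xen_pv_lapic_ipi_vectored() above replaces a hardware IPI with a
signal on the event channel bound to that IPI slot on the destination CPU.  A
minimal sketch of the single-CPU path, assuming the per-CPU ipi_handle arrays
were populated by xen_cpu_ipi_init() (the function name is hypothetical, not
part of the commit):

	static void
	example_send_ipi_to_cpu(u_int vector, u_int cpu)
	{
		xen_intr_handle_t *ipi_handle;

		/* Fetch the destination CPU's table of bound IPI event channels. */
		ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);

		/* Signal the event channel standing in for this IPI vector. */
		xen_intr_signal(ipi_handle[IPI_TO_IDX(vector)]);
	}
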
Modified: trunk/sys/x86/xen/xen_intr.c
===================================================================
--- trunk/sys/x86/xen/xen_intr.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/xen_intr.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -2,7 +2,7 @@
 /******************************************************************************
  * xen_intr.c
  *
- * Xen event and interrupt services for x86 PV and HVM guests.
+ * Xen event and interrupt services for x86 HVM guests.
  *
  * Copyright (c) 2002-2005, K A Fraser
  * Copyright (c) 2005, Intel Corporation <xiaofeng.ling at intel.com>
@@ -31,8 +31,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/xen_intr.c 291647 2015-12-02 12:58:20Z royger $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_intr.c 342656 2018-12-31 22:09:08Z jhb $");
 
+#include "opt_ddb.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
@@ -49,22 +51,30 @@
 #include <vm/pmap.h>
 
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/apicreg.h>
 #include <machine/smp.h>
 #include <machine/stdarg.h>
 
 #include <machine/xen/synch_bitops.h>
 #include <machine/xen/xen-os.h>
-#include <machine/xen/xenvar.h>
 
+#include <xen/xen-os.h>
 #include <xen/hypervisor.h>
 #include <xen/xen_intr.h>
 #include <xen/evtchn/evtchnvar.h>
 
 #include <dev/xen/xenpci/xenpcivar.h>
+#include <dev/pci/pcivar.h>
 
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
 static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services");
 
+static u_int first_evtchn_irq;
+
 /**
  * Per-cpu event channel processing state.
  */
@@ -96,7 +106,7 @@
  * Start the scan at port 0 by initializing the last scanned
  * location as the highest numbered event channel port.
  */
-DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
+static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
 	.last_processed_l1i = LONG_BIT - 1,
 	.last_processed_l2i = LONG_BIT - 1
 };
@@ -103,8 +113,12 @@
 
 DPCPU_DECLARE(struct vcpu_info *, vcpu_info);
 
-#define is_valid_evtchn(x)	((x) != 0)
+#define	XEN_EEXIST		17 /* Xen "already exists" error */
+#define	XEN_ALLOCATE_VECTOR	0 /* Allocate a vector for this event channel */
+#define	XEN_INVALID_EVTCHN	0 /* Invalid event channel */
 
+#define	is_valid_evtchn(x)	((x) != XEN_INVALID_EVTCHN)
+
 struct xenisrc {
 	struct intsrc	xi_intsrc;
 	enum evtchn_type xi_type;
@@ -113,13 +127,13 @@
 	evtchn_port_t	xi_port;
 	int		xi_pirq;
 	int		xi_virq;
+	void		*xi_cookie;
 	u_int		xi_close:1;	/* close on unbind? */
-	u_int		xi_needs_eoi:1;
-	u_int		xi_shared:1;	/* Shared with other domains. */
+	u_int		xi_activehi:1;
+	u_int		xi_edgetrigger:1;
+	u_int		xi_masked:1;
 };
 
-#define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
-
 static void	xen_intr_suspend(struct pic *);
 static void	xen_intr_resume(struct pic *, bool suspend_cancelled);
 static void	xen_intr_enable_source(struct intsrc *isrc);
@@ -137,6 +151,9 @@
 static void	xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi);
 static void	xen_intr_pirq_eoi_source(struct intsrc *isrc);
 static void	xen_intr_pirq_enable_intr(struct intsrc *isrc);
+static void	xen_intr_pirq_disable_intr(struct intsrc *isrc);
+static int	xen_intr_pirq_config_intr(struct intsrc *isrc,
+		     enum intr_trigger trig, enum intr_polarity pol);
 
 /**
  * PIC interface for all event channel port types except physical IRQs.
@@ -160,22 +177,25 @@
  * physical interrupt sources.
  */
 struct pic xen_intr_pirq_pic = {
+#ifdef __amd64__
+	.pic_register_sources = xenpv_register_pirqs,
+#endif
 	.pic_enable_source  = xen_intr_pirq_enable_source,
 	.pic_disable_source = xen_intr_pirq_disable_source,
 	.pic_eoi_source     = xen_intr_pirq_eoi_source,
 	.pic_enable_intr    = xen_intr_pirq_enable_intr,
-	.pic_disable_intr   = xen_intr_disable_intr,
+	.pic_disable_intr   = xen_intr_pirq_disable_intr,
 	.pic_vector         = xen_intr_vector,
 	.pic_source_pending = xen_intr_source_pending,
-	.pic_suspend        = xen_intr_suspend,
-	.pic_resume         = xen_intr_resume,
-	.pic_config_intr    = xen_intr_config_intr,
+	.pic_config_intr    = xen_intr_pirq_config_intr,
 	.pic_assign_cpu     = xen_intr_assign_cpu
 };
 
-static struct mtx	xen_intr_isrc_lock;
-static int		xen_intr_isrc_count;
-static struct xenisrc  *xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static struct mtx	 xen_intr_isrc_lock;
+static u_int		 xen_intr_auto_vector_count;
+static struct xenisrc	*xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static u_long		*xen_intr_pirq_eoi_map;
+static boolean_t	 xen_intr_pirq_eoi_map_enabled;
 
 /*------------------------- Private Functions --------------------------------*/
 /**
@@ -197,7 +217,7 @@
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
-	clear_bit(port, pcpu->evtchn_enabled);
+	xen_clear_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
@@ -219,7 +239,7 @@
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
-	set_bit(port, pcpu->evtchn_enabled);
+	xen_set_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
@@ -257,11 +277,11 @@
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held"));
 
-	for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx ++) {
+	for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) {
 		struct xenisrc *isrc;
 		u_int vector;
 
-		vector = FIRST_EVTCHN_INT + isrc_idx;
+		vector = first_evtchn_irq + isrc_idx;
 		isrc = (struct xenisrc *)intr_lookup_source(vector);
 		if (isrc != NULL
 		 && isrc->xi_type == EVTCHN_TYPE_UNBOUND) {
@@ -283,15 +303,14 @@
  *          object or NULL.
  */
 static struct xenisrc *
-xen_intr_alloc_isrc(enum evtchn_type type)
+xen_intr_alloc_isrc(enum evtchn_type type, int vector)
 {
 	static int warned;
 	struct xenisrc *isrc;
-	int vector;
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held"));
 
-	if (xen_intr_isrc_count > NR_EVENT_CHANNELS) {
+	if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) {
 		if (!warned) {
 			warned = 1;
 			printf("xen_intr_alloc: Event channels exhausted.\n");
@@ -298,12 +317,19 @@
 		}
 		return (NULL);
 	}
-	vector = FIRST_EVTCHN_INT + xen_intr_isrc_count;
-	xen_intr_isrc_count++;
 
+	if (type != EVTCHN_TYPE_PIRQ) {
+		vector = first_evtchn_irq + xen_intr_auto_vector_count;
+		xen_intr_auto_vector_count++;
+	}
+
+	KASSERT((intr_lookup_source(vector) == NULL),
+	    ("Trying to use an already allocated vector"));
+
 	mtx_unlock(&xen_intr_isrc_lock);
 	isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO);
-	isrc->xi_intsrc.is_pic = &xen_intr_pic;
+	isrc->xi_intsrc.is_pic =
+	    (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic;
 	isrc->xi_vector = vector;
 	isrc->xi_type = type;
 	intr_register_source(&isrc->xi_intsrc);
@@ -345,6 +371,7 @@
 	isrc->xi_cpu = 0;
 	isrc->xi_type = EVTCHN_TYPE_UNBOUND;
 	isrc->xi_port = 0;
+	isrc->xi_cookie = NULL;
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 }
@@ -372,7 +399,7 @@
  */
 static int
 xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port,
-    enum evtchn_type type, device_t intr_owner, driver_filter_t filter,
+    enum evtchn_type type, const char *intr_owner, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags,
     xen_intr_handle_t *port_handlep)
 {
@@ -381,8 +408,8 @@
 
 	*isrcp = NULL;
 	if (port_handlep == NULL) {
-		device_printf(intr_owner,
-			      "xen_intr_bind_isrc: Bad event handle\n");
+		printf("%s: xen_intr_bind_isrc: Bad event handle\n",
+		    intr_owner);
 		return (EINVAL);
 	}
 
@@ -389,7 +416,7 @@
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = xen_intr_find_unused_isrc(type);
 	if (isrc == NULL) {
-		isrc = xen_intr_alloc_isrc(type);
+		isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR);
 		if (isrc == NULL) {
 			mtx_unlock(&xen_intr_isrc_lock);
 			return (ENOSPC);
@@ -399,17 +426,37 @@
 	xen_intr_port_to_isrc[local_port] = isrc;
 	mtx_unlock(&xen_intr_isrc_lock);
 
-	error = intr_add_handler(device_get_nameunit(intr_owner),
-				 isrc->xi_vector, filter, handler, arg,
-				 flags|INTR_EXCL, port_handlep);
+	/* Assign the opaque handler (the event channel port) */
+	*port_handlep = &isrc->xi_vector;
+
+#ifdef SMP
+	if (type == EVTCHN_TYPE_PORT) {
+		/*
+		 * By default all interrupts are assigned to vCPU#0
+		 * unless specified otherwise, so shuffle them to balance
+		 * the interrupt load.
+		 */
+		xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu());
+	}
+#endif
+
+	if (filter == NULL && handler == NULL) {
+		/*
+		 * No filter/handler provided, leave the event channel
+		 * masked and without a valid handler, the caller is
+		 * in charge of setting that up.
+		 */
+		*isrcp = isrc;
+		return (0);
+	}
+
+	error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags,
+	    *port_handlep);
 	if (error != 0) {
-		device_printf(intr_owner,
-			      "xen_intr_bind_irq: intr_add_handler failed\n");
 		xen_intr_release_isrc(isrc);
 		return (error);
 	}
 	*isrcp = isrc;
-	evtchn_unmask_port(local_port);
 	return (0);
 }
 
@@ -426,13 +473,17 @@
 static struct xenisrc *
 xen_intr_isrc(xen_intr_handle_t handle)
 {
-	struct intr_handler *ih;
+	int vector;
 
-	ih = handle;
-	if (ih == NULL || ih->ih_event == NULL)
+	if (handle == NULL)
 		return (NULL);
 
-	return (ih->ih_event->ie_source);
+	vector = *(int *)handle;
+	KASSERT(vector >= first_evtchn_irq &&
+	    vector < (first_evtchn_irq + xen_intr_auto_vector_count),
+	    ("Xen interrupt vector is out of range"));
+
+	return ((struct xenisrc *)intr_lookup_source(vector));
 }
 
 /**
@@ -451,6 +502,11 @@
 xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh,
     u_int idx)
 {
+
+	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0]));
+	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0]));
+	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending));
+	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled));
 	return (sh->evtchn_pending[idx]
 	      & ~sh->evtchn_mask[idx]
 	      & pcpu->evtchn_enabled[idx]);
@@ -570,8 +626,10 @@
 static int
 xen_intr_init(void *dummy __unused)
 {
+	shared_info_t *s = HYPERVISOR_shared_info;
 	struct xen_intr_pcpu_data *pcpu;
-	int i;
+	struct physdev_pirq_eoi_gmfn eoi_gmfn;
+	int i, rc;
 
 	if (!xen_domain())
 		return (0);
@@ -579,25 +637,65 @@
 	mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF);
 
 	/*
-	 * Register interrupt count manually as we aren't
-	 * guaranteed to see a call to xen_intr_assign_cpu()
-	 * before our first interrupt. Also set the per-cpu
-	 * mask of CPU#0 to enable all, since by default
-	 * all event channels are bound to CPU#0.
+	 * Set the per-cpu mask of CPU#0 to enable all, since by default all
+	 * event channels are bound to CPU#0.
 	 */
 	CPU_FOREACH(i) {
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
 		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
-		       sizeof(pcpu->evtchn_enabled));
-		xen_intr_intrcnt_add(i);
+		    sizeof(pcpu->evtchn_enabled));
 	}
 
+	for (i = 0; i < nitems(s->evtchn_mask); i++)
+		atomic_store_rel_long(&s->evtchn_mask[i], ~0);
+
+	/* Try to register PIRQ EOI map */
+	xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO);
+	eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map));
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
+	if (rc != 0 && bootverbose)
+		printf("Xen interrupts: unable to register PIRQ EOI map\n");
+	else
+		xen_intr_pirq_eoi_map_enabled = true;
+
 	intr_register_pic(&xen_intr_pic);
+	if (xen_pv_domain() && xen_initial_domain())
+		intr_register_pic(&xen_intr_pirq_pic);
 
+	if (bootverbose)
+		printf("Xen interrupt system initialized\n");
+
 	return (0);
 }
-SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intr_init, NULL);
+SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
 
+static void
+xen_intrcnt_init(void *dummy __unused)
+{
+	unsigned int i;
+
+	if (!xen_domain())
+		return;
+
+	/*
+	 * Register interrupt count manually as we aren't guaranteed to see a
+	 * call to xen_intr_assign_cpu() before our first interrupt.
+	 */
+	CPU_FOREACH(i)
+		xen_intr_intrcnt_add(i);
+}
+SYSINIT(xen_intrcnt_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intrcnt_init, NULL);
+
+void
+xen_intr_alloc_irqs(void)
+{
+
+	if (num_io_irqs > UINT_MAX - NR_EVENT_CHANNELS)
+		panic("IRQ allocation overflow (num_msi_irqs too high?)");
+	first_evtchn_irq = num_io_irqs;
+	num_io_irqs += NR_EVENT_CHANNELS;
+}
+
 /*--------------------------- Common PIC Functions ---------------------------*/
 /**
  * Prepare this PIC for system suspension.
@@ -685,8 +783,8 @@
 		struct xen_intr_pcpu_data *pcpu;
 
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
-		memset(pcpu->evtchn_enabled,
-		       i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled));
+		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
+		    sizeof(pcpu->evtchn_enabled));
 	}
 
 	/* Mask all event channels. */
@@ -697,10 +795,10 @@
 	memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc));
 
 	/* Free unused isrcs and rebind VIRQs and IPIs */
-	for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx++) {
+	for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) {
 		u_int vector;
 
-		vector = FIRST_EVTCHN_INT + isrc_idx;
+		vector = first_evtchn_irq + isrc_idx;
 		isrc = (struct xenisrc *)intr_lookup_source(vector);
 		if (isrc != NULL) {
 			isrc->xi_port = 0;
@@ -712,7 +810,6 @@
 				xen_rebind_virq(isrc);
 				break;
 			default:
-				isrc->xi_cpu = 0;
 				break;
 			}
 		}
@@ -798,16 +895,13 @@
 	struct evtchn_bind_vcpu bind_vcpu;
 	struct xenisrc *isrc;
 	u_int to_cpu, vcpu_id;
-	int error;
+	int error, masked;
 
-#ifdef XENHVM
 	if (xen_vector_callback_enabled == 0)
 		return (EOPNOTSUPP);
-#endif
 
 	to_cpu = apic_cpuid(apic_id);
 	vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
-	xen_intr_intrcnt_add(to_cpu);
 
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = (struct xenisrc *)base_isrc;
@@ -816,6 +910,11 @@
 		return (EINVAL);
 	}
 
+	/*
+	 * Mask the event channel while binding it to prevent interrupt
+	 * delivery with an inconsistent state in isrc->xi_cpu.
+	 */
+	masked = evtchn_test_and_set_mask(isrc->xi_port);
 	if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) ||
 		(isrc->xi_type == EVTCHN_TYPE_IPI)) {
 		/*
@@ -826,18 +925,12 @@
 		evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 		isrc->xi_cpu = to_cpu;
 		evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
-		mtx_unlock(&xen_intr_isrc_lock);
-		return (0);
+		goto out;
 	}
 
 	bind_vcpu.port = isrc->xi_port;
 	bind_vcpu.vcpu = vcpu_id;
 
-	/*
-	 * Allow interrupts to be fielded on the new VCPU before
-	 * we ask the hypervisor to deliver them there.
-	 */
-	evtchn_cpu_unmask_port(to_cpu, isrc->xi_port);
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
 	if (isrc->xi_cpu != to_cpu) {
 		if (error == 0) {
@@ -844,11 +937,13 @@
 			/* Commit to new binding by removing the old one. */
 			evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 			isrc->xi_cpu = to_cpu;
-		} else {
-			/* Roll-back to previous binding. */
-			evtchn_cpu_mask_port(to_cpu, isrc->xi_port);
+			evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
 		}
 	}
+
+out:
+	if (masked == 0)
+		evtchn_unmask_port(isrc->xi_port);
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 #else
@@ -865,8 +960,21 @@
  *              acknowledgements.
  */
 static void
-xen_intr_disable_source(struct intsrc *isrc, int eoi)
+xen_intr_disable_source(struct intsrc *base_isrc, int eoi)
 {
+	struct xenisrc *isrc;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	/*
+	 * NB: checking if the event channel is already masked is
+	 * needed because the event channel user-space device
+	 * masks event channels on it's filter as part of it's
+	 * normal operation, and those shouldn't be automatically
+	 * unmasked by the generic interrupt code. The event channel
+	 * device will unmask them when needed.
+	 */
+	isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port);
 }
 
 /*
@@ -875,8 +983,14 @@
  * \param isrc  The interrupt source to unmask (if necessary).
  */
 static void
-xen_intr_enable_source(struct intsrc *isrc)
+xen_intr_enable_source(struct intsrc *base_isrc)
 {
+	struct xenisrc *isrc;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	if (isrc->xi_masked == 0)
+		evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
@@ -885,7 +999,7 @@
  * \param isrc  The interrupt source to EOI.
  */
 static void
-xen_intr_eoi_source(struct intsrc *isrc)
+xen_intr_eoi_source(struct intsrc *base_isrc)
 {
 }
 
@@ -916,7 +1030,11 @@
 	struct xenisrc *isrc;
 
 	isrc = (struct xenisrc *)base_isrc;
-	evtchn_mask_port(isrc->xi_port);
+
+	if (isrc->xi_edgetrigger == 0)
+		evtchn_mask_port(isrc->xi_port);
+	if (eoi == PIC_EOI)
+		xen_intr_pirq_eoi_source(base_isrc);
 }
 
 /*
@@ -930,7 +1048,9 @@
 	struct xenisrc *isrc;
 
 	isrc = (struct xenisrc *)base_isrc;
-	evtchn_unmask_port(isrc->xi_port);
+
+	if (isrc->xi_edgetrigger == 0)
+		evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
@@ -942,13 +1062,17 @@
 xen_intr_pirq_eoi_source(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc;
+	int error;
 
-	/* XXX Use shared page of flags for this. */
 	isrc = (struct xenisrc *)base_isrc;
-	if (isrc->xi_needs_eoi != 0) {
+
+	if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) {
 		struct physdev_eoi eoi = { .irq = isrc->xi_pirq };
 
-		(void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		if (error != 0)
+			panic("Unable to EOI PIRQ#%d: %d\n",
+			    isrc->xi_pirq, error);
 	}
 }
 
@@ -958,10 +1082,118 @@
  * \param isrc  The interrupt source to enable.
  */
 static void
-xen_intr_pirq_enable_intr(struct intsrc *isrc)
+xen_intr_pirq_enable_intr(struct intsrc *base_isrc)
 {
+	struct xenisrc *isrc;
+	struct evtchn_bind_pirq bind_pirq;
+	struct physdev_irq_status_query irq_status;
+	int error;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	if (!xen_intr_pirq_eoi_map_enabled) {
+		irq_status.irq = isrc->xi_pirq;
+		error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query,
+		    &irq_status);
+		if (error)
+			panic("unable to get status of IRQ#%d", isrc->xi_pirq);
+
+		if (irq_status.flags & XENIRQSTAT_needs_eoi) {
+			/*
+			 * Since the dynamic PIRQ EOI map is not available
+			 * mark the PIRQ as needing EOI unconditionally.
+			 */
+			xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map);
+		}
+	}
+
+	bind_pirq.pirq = isrc->xi_pirq;
+	bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE;
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+	if (error)
+		panic("unable to bind IRQ#%d", isrc->xi_pirq);
+
+	isrc->xi_port = bind_pirq.port;
+
+	mtx_lock(&xen_intr_isrc_lock);
+	KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL),
+	    ("trying to override an already setup event channel port"));
+	xen_intr_port_to_isrc[bind_pirq.port] = isrc;
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	evtchn_unmask_port(isrc->xi_port);
 }
 
+/*
+ * Disable an interrupt source.
+ *
+ * \param isrc  The interrupt source to disable.
+ */
+static void
+xen_intr_pirq_disable_intr(struct intsrc *base_isrc)
+{
+	struct xenisrc *isrc;
+	struct evtchn_close close;
+	int error;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	evtchn_mask_port(isrc->xi_port);
+
+	close.port = isrc->xi_port;
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+	if (error)
+		panic("unable to close event channel %d IRQ#%d",
+		    isrc->xi_port, isrc->xi_pirq);
+
+	mtx_lock(&xen_intr_isrc_lock);
+	xen_intr_port_to_isrc[isrc->xi_port] = NULL;
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	isrc->xi_port = 0;
+}
+
+/**
+ * Perform configuration of an interrupt source.
+ *
+ * \param isrc  The interrupt source to configure.
+ * \param trig  Edge or level.
+ * \param pol   Active high or low.
+ *
+ * \returns  0 on success; the current implementation panics on failure.
+ */
+static int
+xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig,
+    enum intr_polarity pol)
+{
+	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
+	struct physdev_setup_gsi setup_gsi;
+	int error;
+
+	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
+	    ("%s: Conforming trigger or polarity\n", __func__));
+
+	setup_gsi.gsi = isrc->xi_pirq;
+	setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1;
+	setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1;
+
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+	if (error == -XEN_EEXIST) {
+		if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) ||
+		    (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH)))
+			panic("unable to reconfigure interrupt IRQ#%d",
+			    isrc->xi_pirq);
+		error = 0;
+	}
+	if (error)
+		panic("unable to configure IRQ#%d\n", isrc->xi_pirq);
+
+	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+	return (0);
+}
+
 /*--------------------------- Public Functions -------------------------------*/
 /*------- API comments for these methods can be found in xen/xenintr.h -------*/
 int
@@ -972,8 +1204,9 @@
 	struct xenisrc *isrc;
 	int error;
 
-	error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, dev,
-		    filter, handler, arg, flags, port_handlep);
+	error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT,
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 	if (error != 0)
 		return (error);
 
@@ -1007,8 +1240,8 @@
 	}
 
 	error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT,
-				 dev, filter, handler, arg, flags,
-				 port_handlep);
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 	if (error != 0) {
 		evtchn_close_t close = { .port = alloc_unbound.port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1042,8 +1275,8 @@
 	}
 
 	error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port,
-				 EVTCHN_TYPE_PORT, dev, filter, handler,
-				 arg, flags, port_handlep);
+	    EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg,
+	    flags, port_handlep);
 	if (error) {
 		evtchn_close_t close = { .port = bind_interdomain.local_port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1069,9 +1302,6 @@
 	struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
 	int error;
 
-	/* Ensure the target CPU is ready to handle evtchn interrupts. */
-	xen_intr_intrcnt_add(cpu);
-
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
 	if (error != 0) {
@@ -1082,8 +1312,9 @@
 		return (-error);
 	}
 
-	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, dev,
-				 filter, handler, arg, flags, port_handlep);
+	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 
 #ifdef SMP
 	if (error == 0)
@@ -1122,19 +1353,17 @@
 }
 
 int
-xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
-    driver_filter_t filter, enum intr_type flags,
-    xen_intr_handle_t *port_handlep)
+xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter,
+    enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 #ifdef SMP
 	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
 	struct xenisrc *isrc;
 	struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
+	/* Same size as the one used by intr_handler->ih_name. */
+	char name[MAXCOMLEN + 1];
 	int error;
 
-	/* Ensure the target CPU is ready to handle evtchn interrupts. */
-	xen_intr_intrcnt_add(cpu);
-
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
 	if (error != 0) {
@@ -1145,12 +1374,10 @@
 		return (-error);
 	}
 
+	snprintf(name, sizeof(name), "cpu%u", cpu);
+
 	error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI,
-	                           dev, filter, NULL, NULL, flags,
-	                           port_handlep);
-	if (error == 0)
-		error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
-
+	    name, filter, NULL, NULL, flags, port_handlep);
 	if (error != 0) {
 		evtchn_close_t close = { .port = bind_ipi.port };
 
@@ -1182,6 +1409,101 @@
 }
 
 int
+xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol)
+{
+	struct physdev_map_pirq map_pirq;
+	struct xenisrc *isrc;
+	int error;
+
+	if (vector == 0)
+		return (EINVAL);
+
+	if (bootverbose)
+		printf("xen: register IRQ#%d\n", vector);
+
+	map_pirq.domid = DOMID_SELF;
+	map_pirq.type = MAP_PIRQ_TYPE_GSI;
+	map_pirq.index = vector;
+	map_pirq.pirq = vector;
+
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq);
+	if (error) {
+		printf("xen: unable to map IRQ#%d\n", vector);
+		return (error);
+	}
+
+	mtx_lock(&xen_intr_isrc_lock);
+	isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector);
+	mtx_unlock(&xen_intr_isrc_lock);
+	KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt"));
+	isrc->xi_pirq = vector;
+	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+	return (0);
+}
+
+int
+xen_register_msi(device_t dev, int vector, int count)
+{
+	struct physdev_map_pirq msi_irq;
+	struct xenisrc *isrc;
+	int ret;
+
+	memset(&msi_irq, 0, sizeof(msi_irq));
+	msi_irq.domid = DOMID_SELF;
+	msi_irq.type = count == 1 ?
+	    MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI;
+	msi_irq.index = -1;
+	msi_irq.pirq = -1;
+	msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16);
+	msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev);
+	msi_irq.entry_nr = count;
+
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq);
+	if (ret != 0)
+		return (ret);
+	if (count != msi_irq.entry_nr) {
+		panic("unable to setup all requested MSI vectors "
+		    "(expected %d got %d)", count, msi_irq.entry_nr);
+	}
+
+	mtx_lock(&xen_intr_isrc_lock);
+	for (int i = 0; i < count; i++) {
+		isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i);
+		KASSERT(isrc != NULL,
+		    ("xen: unable to allocate isrc for interrupt"));
+		isrc->xi_pirq = msi_irq.pirq + i;
+		/* MSI interrupts are always edge triggered */
+		isrc->xi_edgetrigger = 1;
+	}
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	return (0);
+}
+
+int
+xen_release_msi(int vector)
+{
+	struct physdev_unmap_pirq unmap;
+	struct xenisrc *isrc;
+	int ret;
+
+	isrc = (struct xenisrc *)intr_lookup_source(vector);
+	if (isrc == NULL)
+		return (ENXIO);
+
+	unmap.pirq = isrc->xi_pirq;
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
+	if (ret != 0)
+		return (ret);
+
+	xen_intr_release_isrc(isrc);
+
+	return (0);
+}
+
+int
 xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...)
 {
 	char descr[MAXCOMLEN + 1];
@@ -1195,22 +1517,24 @@
 	va_start(ap, fmt);
 	vsnprintf(descr, sizeof(descr), fmt, ap);
 	va_end(ap);
-	return (intr_describe(isrc->xi_vector, port_handle, descr));
+	return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr));
 }
 
 void
 xen_intr_unbind(xen_intr_handle_t *port_handlep)
 {
-	struct intr_handler *handler;
 	struct xenisrc *isrc;
 
-	handler = *port_handlep;
+	KASSERT(port_handlep != NULL,
+	    ("NULL xen_intr_handle_t passed to xen_intr_unbind"));
+
+	isrc = xen_intr_isrc(*port_handlep);
 	*port_handlep = NULL;
-	isrc = xen_intr_isrc(handler);
 	if (isrc == NULL)
 		return;
 
-	intr_remove_handler(handler);
+	if (isrc->xi_cookie != NULL)
+		intr_remove_handler(isrc->xi_cookie);
 	xen_intr_release_isrc(isrc);
 }
 
@@ -1240,3 +1564,96 @@
 	
 	return (isrc->xi_port);
 }
+
+int
+xen_intr_add_handler(const char *name, driver_filter_t filter,
+    driver_intr_t handler, void *arg, enum intr_type flags,
+    xen_intr_handle_t handle)
+{
+	struct xenisrc *isrc;
+	int error;
+
+	isrc = xen_intr_isrc(handle);
+	if (isrc == NULL || isrc->xi_cookie != NULL)
+		return (EINVAL);
+
+	error = intr_add_handler(name, isrc->xi_vector, filter, handler, arg,
+	    flags|INTR_EXCL, &isrc->xi_cookie);
+	if (error != 0) {
+		printf(
+		    "%s: xen_intr_add_handler: intr_add_handler failed: %d\n",
+		    name, error);
+	}
+
+	return (error);
+}
+
+#ifdef DDB
+static const char *
+xen_intr_print_type(enum evtchn_type type)
+{
+	static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = {
+		[EVTCHN_TYPE_UNBOUND]	= "UNBOUND",
+		[EVTCHN_TYPE_PIRQ]	= "PIRQ",
+		[EVTCHN_TYPE_VIRQ]	= "VIRQ",
+		[EVTCHN_TYPE_IPI]	= "IPI",
+		[EVTCHN_TYPE_PORT]	= "PORT",
+	};
+
+	if (type >= EVTCHN_TYPE_COUNT)
+		return ("UNKNOWN");
+
+	return (evtchn_type_to_string[type]);
+}
+
+static void
+xen_intr_dump_port(struct xenisrc *isrc)
+{
+	struct xen_intr_pcpu_data *pcpu;
+	shared_info_t *s = HYPERVISOR_shared_info;
+	int i;
+
+	db_printf("Port %d Type: %s\n",
+	    isrc->xi_port, xen_intr_print_type(isrc->xi_type));
+	if (isrc->xi_type == EVTCHN_TYPE_PIRQ) {
+		db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d "
+		    "NeedsEOI: %d\n",
+		    isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger,
+		    !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map));
+	}
+	if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
+		db_printf("\tVirq: %d\n", isrc->xi_virq);
+
+	db_printf("\tMasked: %d Pending: %d\n",
+	    !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]),
+	    !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0]));
+
+	db_printf("\tPer-CPU Masks: ");
+	CPU_FOREACH(i) {
+		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
+		db_printf("cpu#%d: %d ", i,
+		    !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled));
+	}
+	db_printf("\n");
+}
+
+DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn)
+{
+	int i;
+
+	if (!xen_domain()) {
+		db_printf("Only available on Xen guests\n");
+		return;
+	}
+
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		struct xenisrc *isrc;
+
+		isrc = xen_intr_port_to_isrc[i];
+		if (isrc == NULL)
+			continue;
+
+		xen_intr_dump_port(isrc);
+	}
+}
+#endif /* DDB */
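
For reference, the reworked handle scheme above stores a pointer to the isrc's
vector in the opaque xen_intr_handle_t and splits handler registration out into
xen_intr_add_handler(), so a port can be bound without a filter/handler and
wired up later.  A minimal sketch of that later step, assuming 'handle' was
filled in by one of the xen_intr_bind_* routines (names are hypothetical, not
part of the commit):

	static int
	example_filter(void *arg)
	{

		/* A real driver would inspect its state here. */
		return (FILTER_HANDLED);
	}

	static int
	example_attach_handler(xen_intr_handle_t handle)
	{
		int error;

		error = xen_intr_add_handler("example", example_filter, NULL,
		    NULL, INTR_TYPE_MISC, handle);
		if (error != 0)
			return (error);

		/* Give the source a readable name for vmstat -i. */
		return (xen_intr_describe(handle, "%s", "example"));
	}
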

Added: trunk/sys/x86/xen/xen_msi.c
===================================================================
--- trunk/sys/x86/xen/xen_msi.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_msi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,134 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_msi.c 344912 2019-03-08 01:04:19Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <x86/apicreg.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <machine/specialreg.h>
+#include <dev/pci/pcivar.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+static struct mtx msi_lock;
+static u_int msi_last_irq;
+
+void
+xen_msi_init(void)
+{
+
+	MPASS(num_io_irqs > 0);
+	first_msi_irq = min(MINIMUM_MSI_INT, num_io_irqs);
+	if (num_msi_irqs > UINT_MAX - first_msi_irq)
+		panic("num_msi_irqs too high");
+	num_io_irqs = first_msi_irq + num_msi_irqs;
+
+	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
+}
+
+/*
+ * Try to allocate 'count' interrupt sources with contiguous IDT values.
+ */
+int
+xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs)
+{
+	int i, ret = 0;
+
+	mtx_lock(&msi_lock);
+
+	/* If we would exceed the max, give up. */
+	if (msi_last_irq + count > num_msi_irqs) {
+		mtx_unlock(&msi_lock);
+		return (ENXIO);
+	}
+
+	/* Allocate MSI vectors */
+	for (i = 0; i < count; i++)
+		irqs[i] = first_msi_irq + msi_last_irq++;
+
+	mtx_unlock(&msi_lock);
+
+	ret = xen_register_msi(dev, irqs[0], count);
+	if (ret != 0)
+		return (ret);
+
+	for (i = 0; i < count; i++)
+		nexus_add_irq(irqs[i]);
+
+	return (0);
+}
+
+int
+xen_msi_release(int *irqs, int count)
+{
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		ret = xen_release_msi(irqs[i]);
+		if (ret != 0)
+			return (ret);
+	}
+
+	return (0);
+}
+
+int
+xen_msi_map(int irq, uint64_t *addr, uint32_t *data)
+{
+
+	return (0);
+}
+
+int
+xen_msix_alloc(device_t dev, int *irq)
+{
+
+	return (ENXIO);
+}
+
+int
+xen_msix_release(int irq)
+{
+
+	return (ENOENT);
+}


Property changes on: trunk/sys/x86/xen/xen_msi.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
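
For reference, xen_msi_alloc() above hands out consecutive IRQ numbers starting
at first_msi_irq and defers the actual PIRQ mapping to xen_register_msi(); the
address/data pair is not filled in by xen_msi_map().  A minimal sketch of the
caller-side flow, as wired up by the Xen nexus below (device, count and function
name are hypothetical, not part of the commit):

	static int
	example_setup_msi(device_t dev)
	{
		uint64_t addr;
		uint32_t data;
		int irqs[2];
		int error;

		/* Consecutive IRQs, e.g. first_msi_irq and first_msi_irq + 1. */
		error = xen_msi_alloc(dev, 2, 2, irqs);
		if (error != 0)
			return (error);

		/* Effectively a no-op in the implementation above. */
		return (xen_msi_map(irqs[0], &addr, &data));
	}
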
Added: trunk/sys/x86/xen/xen_nexus.c
===================================================================
--- trunk/sys/x86/xen/xen_nexus.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_nexus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,168 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_nexus.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include <x86/init.h>
+#include <machine/nexusvar.h>
+#include <machine/intr_machdep.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+#include "pcib_if.h"
+
+/*
+ * Xen nexus(4) driver.
+ */
+static int
+nexus_xen_probe(device_t dev)
+{
+
+	if (!xen_pv_domain())
+		return (ENXIO);
+
+	return (BUS_PROBE_SPECIFIC);
+}
+
+static int
+nexus_xen_attach(device_t dev)
+{
+	int error;
+	device_t acpi_dev = NULL;
+
+	nexus_init_resources();
+	bus_generic_probe(dev);
+
+	if (xen_initial_domain()) {
+		/* Disable some ACPI devices that are not usable by Dom0 */
+		acpi_cpu_disabled = true;
+		acpi_hpet_disabled = true;
+		acpi_timer_disabled = true;
+
+		acpi_dev = BUS_ADD_CHILD(dev, 10, "acpi", 0);
+		if (acpi_dev == NULL)
+			panic("Unable to add ACPI bus to Xen Dom0");
+	}
+
+	error = bus_generic_attach(dev);
+	if (xen_initial_domain() && (error == 0))
+		acpi_install_wakeup_handler(device_get_softc(acpi_dev));
+
+	return (error);
+}
+
+static int
+nexus_xen_config_intr(device_t dev, int irq, enum intr_trigger trig,
+    enum intr_polarity pol)
+{
+	int ret;
+
+	/*
+	 * ISA and PCI intline IRQs are not preregistered on Xen, so
+	 * intercept calls to configure those and register them on the fly.
+	 */
+	if ((irq < first_msi_irq) && (intr_lookup_source(irq) == NULL)) {
+		ret = xen_register_pirq(irq, trig, pol);
+		if (ret != 0)
+			return (ret);
+		nexus_add_irq(irq);
+	}
+	return (intr_config_intr(irq, trig, pol));
+}
+
+static int
+nexus_xen_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+
+	return (xen_msix_alloc(dev, irq));
+}
+
+static int
+nexus_xen_release_msix(device_t pcib, device_t dev, int irq)
+{
+
+	return (xen_msix_release(irq));
+}
+
+static int
+nexus_xen_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
+{
+
+	return (xen_msi_alloc(dev, count, maxcount, irqs));
+}
+
+static int
+nexus_xen_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+
+	return (xen_msi_release(irqs, count));
+}
+
+static int
+nexus_xen_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data)
+{
+
+	return (xen_msi_map(irq, addr, data));
+}
+
+static device_method_t nexus_xen_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		nexus_xen_probe),
+	DEVMETHOD(device_attach,	nexus_xen_attach),
+
+	/* INTR */
+	DEVMETHOD(bus_config_intr,	nexus_xen_config_intr),
+
+	/* MSI */
+	DEVMETHOD(pcib_alloc_msi,	nexus_xen_alloc_msi),
+	DEVMETHOD(pcib_release_msi,	nexus_xen_release_msi),
+	DEVMETHOD(pcib_alloc_msix,	nexus_xen_alloc_msix),
+	DEVMETHOD(pcib_release_msix,	nexus_xen_release_msix),
+	DEVMETHOD(pcib_map_msi,		nexus_xen_map_msi),
+
+	{ 0, 0 }
+};
+
+DEFINE_CLASS_1(nexus, nexus_xen_driver, nexus_xen_methods, 1, nexus_driver);
+static devclass_t nexus_devclass;
+
+DRIVER_MODULE(nexus_xen, root, nexus_xen_driver, nexus_devclass, 0, 0);


Property changes on: trunk/sys/x86/xen/xen_nexus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xen_pci_bus.c
===================================================================
--- trunk/sys/x86/xen/xen_pci_bus.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_pci_bus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,91 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_pci_bus.c 275649 2014-12-09 18:03:25Z royger $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_pci.h>
+
+#include "pcib_if.h"
+#include "pci_if.h"
+
+void
+xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
+     uint16_t data)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(child);
+	struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+	/* Enable MSI in the control register. */
+	msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE;
+	pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+	    msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_disable_msi_method(device_t dev, device_t child)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(child);
+	struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+	msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
+	pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+	    msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_child_added_method(device_t dev, device_t child)
+{
+	struct pci_devinfo *dinfo;
+	struct physdev_pci_device_add add_pci;
+	int error;
+
+	dinfo = device_get_ivars(child);
+	KASSERT((dinfo != NULL),
+	    ("xen_pci_child_added_method called with NULL dinfo"));
+
+	bzero(&add_pci, sizeof(add_pci));
+	add_pci.seg = dinfo->cfg.domain;
+	add_pci.bus = dinfo->cfg.bus;
+	add_pci.devfn = (dinfo->cfg.slot << 3) | dinfo->cfg.func;
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add_pci);
+	if (error)
+		panic("unable to add device bus %u devfn %u error: %d\n",
+		    add_pci.bus, add_pci.devfn, error);
+}


Property changes on: trunk/sys/x86/xen/xen_pci_bus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xenpv.c
===================================================================
--- trunk/sys/x86/xen/xenpv.c	                        (rev 0)
+++ trunk/sys/x86/xen/xenpv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,203 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xenpv.c 331017 2018-03-15 19:08:33Z kevans $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/pcpu.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+#include <sys/limits.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+
+#include <xen/xen-os.h>
+#include <xen/gnttab.h>
+
+#include "xenmem_if.h"
+
+/*
+ * Allocate unused physical memory above 4GB in order to map memory
+ * from foreign domains. We use memory starting at 4GB in order to
+ * prevent clashes with MMIO/ACPI regions.
+ *
+ * Since this is not possible on i386 just use any available memory
+ * chunk and hope we don't clash with anything else.
+ */
+#ifdef __amd64__
+#define LOW_MEM_LIMIT	0x100000000ul
+#else
+#define LOW_MEM_LIMIT	0
+#endif
+
+static devclass_t xenpv_devclass;
+
+static void
+xenpv_identify(driver_t *driver, device_t parent)
+{
+	if (!xen_domain())
+		return;
+
+	/* Make sure there's only one xenpv device. */
+	if (devclass_get_device(xenpv_devclass, 0))
+		return;
+
+	/*
+	 * The xenpv bus should be the last to attach in order
+	 * to properly detect if an ISA bus has already been added.
+	 */
+	if (BUS_ADD_CHILD(parent, UINT_MAX, "xenpv", 0) == NULL)
+		panic("Unable to attach xenpv bus.");
+}
+
+static int
+xenpv_probe(device_t dev)
+{
+
+	device_set_desc(dev, "Xen PV bus");
+	return (BUS_PROBE_NOWILDCARD);
+}
+
+static int
+xenpv_attach(device_t dev)
+{
+	device_t child;
+
+	/*
+	 * Let our child drivers identify any child devices that they
+	 * can find.  Once that is done attach any devices that we
+	 * found.
+	 */
+	bus_generic_probe(dev);
+	bus_generic_attach(dev);
+
+	if (!devclass_get_device(devclass_find("isa"), 0)) {
+		child = BUS_ADD_CHILD(dev, 0, "isa", 0);
+		if (child == NULL)
+			panic("Failed to attach ISA bus.");
+		device_probe_and_attach(child);
+	}
+
+	return (0);
+}
+
+static struct resource *
+xenpv_alloc_physmem(device_t dev, device_t child, int *res_id, size_t size)
+{
+	struct resource *res;
+	vm_paddr_t phys_addr;
+	int error;
+
+	res = bus_alloc_resource(child, SYS_RES_MEMORY, res_id, LOW_MEM_LIMIT,
+	    ~0, size, RF_ACTIVE);
+	if (res == NULL)
+		return (NULL);
+
+	phys_addr = rman_get_start(res);
+	error = vm_phys_fictitious_reg_range(phys_addr, phys_addr + size,
+	    VM_MEMATTR_DEFAULT);
+	if (error) {
+		bus_release_resource(child, SYS_RES_MEMORY, *res_id, res);
+		return (NULL);
+	}
+
+	return (res);
+}
+
+static int
+xenpv_free_physmem(device_t dev, device_t child, int res_id, struct resource *res)
+{
+	vm_paddr_t phys_addr;
+	size_t size;
+
+	phys_addr = rman_get_start(res);
+	size = rman_get_size(res);
+
+	vm_phys_fictitious_unreg_range(phys_addr, phys_addr + size);
+	return (bus_release_resource(child, SYS_RES_MEMORY, res_id, res));
+}
+
+static device_method_t xenpv_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_identify,		xenpv_identify),
+	DEVMETHOD(device_probe,			xenpv_probe),
+	DEVMETHOD(device_attach,		xenpv_attach),
+	DEVMETHOD(device_suspend,		bus_generic_suspend),
+	DEVMETHOD(device_resume,		bus_generic_resume),
+
+	/* Bus interface */
+	DEVMETHOD(bus_add_child,		bus_generic_add_child),
+	DEVMETHOD(bus_alloc_resource,		bus_generic_alloc_resource),
+	DEVMETHOD(bus_release_resource,		bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),
+
+	/* Interface to allocate memory for foreign mappings */
+	DEVMETHOD(xenmem_alloc,			xenpv_alloc_physmem),
+	DEVMETHOD(xenmem_free,			xenpv_free_physmem),
+
+	DEVMETHOD_END
+};
+
+static driver_t xenpv_driver = {
+	"xenpv",
+	xenpv_methods,
+	0,
+};
+
+DRIVER_MODULE(xenpv, nexus, xenpv_driver, xenpv_devclass, 0, 0);
+
+struct resource *
+xenmem_alloc(device_t dev, int *res_id, size_t size)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (parent == NULL)
+		return (NULL);
+	return (XENMEM_ALLOC(parent, dev, res_id, size));
+}
+
+int
+xenmem_free(device_t dev, int res_id, struct resource *res)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (parent == NULL)
+		return (ENXIO);
+	return (XENMEM_FREE(parent, dev, res_id, res));
+}


Property changes on: trunk/sys/x86/xen/xenpv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

