[Midnightbsd-cvs] src [12310] trunk/sys/x86: sync with FreeBSD 11-stable
laffer1 at midnightbsd.org
Sat Feb 8 14:32:42 EST 2020
Revision: 12310
http://svnweb.midnightbsd.org/src/?rev=12310
Author: laffer1
Date: 2020-02-08 14:32:41 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable
Modified Paths:
--------------
trunk/sys/x86/iommu/busdma_dmar.c
trunk/sys/x86/iommu/busdma_dmar.h
trunk/sys/x86/iommu/intel_ctx.c
trunk/sys/x86/iommu/intel_dmar.h
trunk/sys/x86/iommu/intel_drv.c
trunk/sys/x86/iommu/intel_fault.c
trunk/sys/x86/iommu/intel_gas.c
trunk/sys/x86/iommu/intel_idpgtbl.c
trunk/sys/x86/iommu/intel_qi.c
trunk/sys/x86/iommu/intel_quirks.c
trunk/sys/x86/iommu/intel_reg.h
trunk/sys/x86/iommu/intel_utils.c
trunk/sys/x86/isa/atpic.c
trunk/sys/x86/isa/atrtc.c
trunk/sys/x86/isa/clock.c
trunk/sys/x86/isa/elcr.c
trunk/sys/x86/isa/icu.h
trunk/sys/x86/isa/isa.c
trunk/sys/x86/isa/isa_dma.c
trunk/sys/x86/isa/nmi.c
trunk/sys/x86/isa/orm.c
trunk/sys/x86/pci/pci_bus.c
trunk/sys/x86/pci/qpi.c
trunk/sys/x86/x86/bus_machdep.c
trunk/sys/x86/x86/busdma_bounce.c
trunk/sys/x86/x86/busdma_machdep.c
trunk/sys/x86/x86/dump_machdep.c
trunk/sys/x86/x86/fdt_machdep.c
trunk/sys/x86/x86/identcpu.c
trunk/sys/x86/x86/intr_machdep.c
trunk/sys/x86/x86/io_apic.c
trunk/sys/x86/x86/legacy.c
trunk/sys/x86/x86/local_apic.c
trunk/sys/x86/x86/mca.c
trunk/sys/x86/x86/mptable.c
trunk/sys/x86/x86/mptable_pci.c
trunk/sys/x86/x86/msi.c
trunk/sys/x86/x86/nexus.c
trunk/sys/x86/x86/tsc.c
trunk/sys/x86/xen/hvm.c
trunk/sys/x86/xen/xen_intr.c
Added Paths:
-----------
trunk/sys/x86/iommu/intel_intrmap.c
trunk/sys/x86/iommu/iommu_intrmap.h
trunk/sys/x86/x86/autoconf.c
trunk/sys/x86/x86/cpu_machdep.c
trunk/sys/x86/x86/delay.c
trunk/sys/x86/x86/mp_watchdog.c
trunk/sys/x86/x86/mp_x86.c
trunk/sys/x86/x86/pvclock.c
trunk/sys/x86/x86/stack_machdep.c
trunk/sys/x86/x86/ucode.c
trunk/sys/x86/x86/x86_mem.c
trunk/sys/x86/xen/pv.c
trunk/sys/x86/xen/pvcpu_enum.c
trunk/sys/x86/xen/xen_apic.c
trunk/sys/x86/xen/xen_msi.c
trunk/sys/x86/xen/xen_nexus.c
trunk/sys/x86/xen/xen_pci_bus.c
trunk/sys/x86/xen/xenpv.c
Modified: trunk/sys/x86/iommu/busdma_dmar.c
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.c 284021 2015-06-05 08:36:25Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.c 316392 2017-04-02 07:11:15Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -48,6 +48,7 @@
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
+#include <sys/vmem.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
@@ -74,14 +75,34 @@
dmar_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
{
char str[128], *env;
+ int default_bounce;
+ bool ret;
+ static const char bounce_str[] = "bounce";
+ static const char dmar_str[] = "dmar";
- snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d.bounce",
+ default_bounce = 0;
+ env = kern_getenv("hw.busdma.default");
+ if (env != NULL) {
+ if (strcmp(env, bounce_str) == 0)
+ default_bounce = 1;
+ else if (strcmp(env, dmar_str) == 0)
+ default_bounce = 0;
+ freeenv(env);
+ }
+
+ snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
domain, bus, slot, func);
- env = getenv(str);
+ env = kern_getenv(str);
if (env == NULL)
- return (false);
+ return (default_bounce != 0);
+ if (strcmp(env, bounce_str) == 0)
+ ret = true;
+ else if (strcmp(env, dmar_str) == 0)
+ ret = false;
+ else
+ ret = default_bounce != 0;
freeenv(env);
- return (true);
+ return (ret);
}
/*
@@ -93,7 +114,7 @@
* domain, and must collectively be assigned to use either DMAR or
* bounce mapping.
*/
-static device_t
+device_t
dmar_get_requester(device_t dev, uint16_t *rid)
{
devclass_t pci_class;
@@ -225,7 +246,7 @@
disabled = dmar_bus_dma_is_dev_disabled(pci_get_domain(requester),
pci_get_bus(requester), pci_get_slot(requester),
pci_get_function(requester));
- ctx = dmar_get_ctx(dmar, requester, rid, disabled, rmrr);
+ ctx = dmar_get_ctx_for_dev(dmar, requester, rid, disabled, rmrr);
if (ctx == NULL)
return (NULL);
if (disabled) {
@@ -256,6 +277,8 @@
/* Not in scope of any DMAR ? */
if (dmar == NULL)
return (NULL);
+ if (!dmar->dma_enabled)
+ return (NULL);
dmar_quirks_pre_use(dmar);
dmar_instantiate_rmrr_ctxs(dmar);
@@ -369,16 +392,18 @@
{
struct bus_dma_tag_dmar *tag;
struct bus_dmamap_dmar *map;
+ struct dmar_domain *domain;
tag = (struct bus_dma_tag_dmar *)dmat;
map = (struct bus_dmamap_dmar *)map1;
if (map != NULL) {
- DMAR_CTX_LOCK(tag->ctx);
+ domain = tag->ctx->domain;
+ DMAR_DOMAIN_LOCK(domain);
if (!TAILQ_EMPTY(&map->map_entries)) {
- DMAR_CTX_UNLOCK(tag->ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
return (EBUSY);
}
- DMAR_CTX_UNLOCK(tag->ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
free(map, M_DMAR_DMAMAP);
}
tag->map_count--;
@@ -455,6 +480,7 @@
struct dmar_map_entries_tailq *unroll_list)
{
struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
struct dmar_map_entry *entry;
dmar_gaddr_t size;
bus_size_t buflen1;
@@ -464,6 +490,7 @@
if (segs == NULL)
segs = tag->segments;
ctx = tag->ctx;
+ domain = ctx->domain;
seg = *segp;
error = 0;
idx = 0;
@@ -485,7 +512,7 @@
if (seg + 1 < tag->common.nsegments)
gas_flags |= DMAR_GM_CANSPLIT;
- error = dmar_gas_map(ctx, &tag->common, size, offset,
+ error = dmar_gas_map(domain, &tag->common, size, offset,
DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
gas_flags, ma + idx, &entry);
if (error != 0)
@@ -532,10 +559,10 @@
(uintmax_t)entry->start, (uintmax_t)entry->end,
(uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));
- DMAR_CTX_LOCK(ctx);
+ DMAR_DOMAIN_LOCK(domain);
TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
entry->flags |= DMAR_MAP_ENTRY_MAP;
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link);
segs[seg].ds_addr = entry->start + offset;
@@ -557,11 +584,13 @@
int flags, bus_dma_segment_t *segs, int *segp)
{
struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
struct dmar_map_entry *entry, *entry1;
struct dmar_map_entries_tailq unroll_list;
int error;
ctx = tag->ctx;
+ domain = ctx->domain;
atomic_add_long(&ctx->loads, 1);
TAILQ_INIT(&unroll_list);
@@ -573,7 +602,7 @@
* partial buffer load, so unfortunately we have to
* revert all work done.
*/
- DMAR_CTX_LOCK(ctx);
+ DMAR_DOMAIN_LOCK(domain);
TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link,
entry1) {
/*
@@ -584,12 +613,12 @@
*/
TAILQ_REMOVE(&map->map_entries, entry, dmamap_link);
TAILQ_REMOVE(&unroll_list, entry, unroll_link);
- TAILQ_INSERT_TAIL(&ctx->unload_entries, entry,
+ TAILQ_INSERT_TAIL(&domain->unload_entries, entry,
dmamap_link);
}
- DMAR_CTX_UNLOCK(ctx);
- taskqueue_enqueue(ctx->dmar->delayed_taskqueue,
- &ctx->unload_task);
+ DMAR_DOMAIN_UNLOCK(domain);
+ taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+ &domain->unload_task);
}
if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
@@ -596,7 +625,7 @@
!map->cansleep)
error = EINPROGRESS;
if (error == EINPROGRESS)
- dmar_bus_schedule_dmamap(ctx->dmar, map);
+ dmar_bus_schedule_dmamap(domain->dmar, map);
return (error);
}
@@ -762,6 +791,7 @@
struct bus_dma_tag_dmar *tag;
struct bus_dmamap_dmar *map;
struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
#if defined(__amd64__)
struct dmar_map_entries_tailq entries;
#endif
@@ -769,20 +799,22 @@
tag = (struct bus_dma_tag_dmar *)dmat;
map = (struct bus_dmamap_dmar *)map1;
ctx = tag->ctx;
+ domain = ctx->domain;
atomic_add_long(&ctx->unloads, 1);
#if defined(__i386__)
- DMAR_CTX_LOCK(ctx);
- TAILQ_CONCAT(&ctx->unload_entries, &map->map_entries, dmamap_link);
- DMAR_CTX_UNLOCK(ctx);
- taskqueue_enqueue(ctx->dmar->delayed_taskqueue, &ctx->unload_task);
+ DMAR_DOMAIN_LOCK(domain);
+ TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link);
+ DMAR_DOMAIN_UNLOCK(domain);
+ taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+ &domain->unload_task);
#else /* defined(__amd64__) */
TAILQ_INIT(&entries);
- DMAR_CTX_LOCK(ctx);
+ DMAR_DOMAIN_LOCK(domain);
TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
THREAD_NO_SLEEPING();
- dmar_ctx_unload(ctx, &entries, false);
+ dmar_domain_unload(domain, &entries, false);
THREAD_SLEEPING_OK();
KASSERT(TAILQ_EMPTY(&entries), ("lazy dmar_ctx_unload %p", ctx));
#endif
@@ -855,6 +887,8 @@
dmar_init_busdma(struct dmar_unit *unit)
{
+ unit->dma_enabled = 1;
+ TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
TAILQ_INIT(&unit->delayed_maps);
TASK_INIT(&unit->dmamap_load_task, 0, dmar_bus_task_dmamap, unit);
unit->delayed_taskqueue = taskqueue_create("dmar", M_WAITOK,
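[Illustration, not part of the commit] The knobs introduced above -- "hw.busdma.default" and "hw.busdma.pci<domain>.<bus>.<slot>.<func>" taking the values "bounce" or "dmar", plus the per-unit "hw.dmar.dma" enable -- are ordinary kernel environment variables, normally set from /boot/loader.conf before device contexts are instantiated. The minimal userland sketch below shows how they can be inspected with kenv(2); the PCI coordinates are made up for the example.

#include <kenv.h>
#include <stdio.h>

int
main(void)
{
	char val[16];

	/* Per-device override for pci0:3:0:0 (made-up device);
	   value is "bounce" or "dmar". */
	if (kenv(KENV_GET, "hw.busdma.pci0.3.0.0", val, sizeof(val)) != -1)
		printf("pci0:3:0:0 busdma method: %s\n", val);
	/* System-wide default, consulted when no per-device entry exists. */
	if (kenv(KENV_GET, "hw.busdma.default", val, sizeof(val)) != -1)
		printf("default busdma method: %s\n", val);
	return (0);
}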
Modified: trunk/sys/x86/iommu/busdma_dmar.h
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.h 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.h 2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
*/
#ifndef __X86_IOMMU_BUSDMA_DMAR_H
Modified: trunk/sys/x86/iommu/intel_ctx.c
===================================================================
--- trunk/sys/x86/iommu/intel_ctx.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_ctx.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_ctx.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_ctx.c 320357 2017-06-26 12:30:39Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,6 +49,7 @@
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
@@ -68,8 +69,12 @@
#include <dev/pci/pcivar.h>
static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context");
+static MALLOC_DEFINE(M_DMAR_DOMAIN, "dmar_dom", "Intel DMAR Domain");
-static void dmar_ctx_unload_task(void *arg, int pending);
+static void dmar_domain_unload_task(void *arg, int pending);
+static void dmar_unref_domain_locked(struct dmar_unit *dmar,
+ struct dmar_domain *domain);
+static void dmar_domain_destroy(struct dmar_domain *domain);
static void
dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus)
@@ -108,8 +113,8 @@
{
dmar_ctx_entry_t *ctxp;
- ctxp = dmar_map_pgtbl(ctx->dmar->ctx_obj, 1 + PCI_RID2BUS(ctx->rid),
- DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
+ ctxp = dmar_map_pgtbl(ctx->domain->dmar->ctx_obj, 1 +
+ PCI_RID2BUS(ctx->rid), DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
ctxp += ctx->rid & 0xff;
return (ctxp);
}
@@ -119,7 +124,7 @@
{
bus_addr_t maxaddr;
- maxaddr = MIN(ctx->end, BUS_SPACE_MAXADDR);
+ maxaddr = MIN(ctx->domain->end, BUS_SPACE_MAXADDR);
ctx->ctx_tag.common.ref_count = 1; /* Prevent free */
ctx->ctx_tag.common.impl = &bus_dma_dmar_impl;
ctx->ctx_tag.common.boundary = PCI_DMA_BOUNDARY;
@@ -130,33 +135,42 @@
ctx->ctx_tag.common.maxsegsz = maxaddr;
ctx->ctx_tag.ctx = ctx;
ctx->ctx_tag.owner = dev;
- /* XXXKIB initialize tag further */
}
static void
-ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp)
+ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp, bool move)
{
struct dmar_unit *unit;
+ struct dmar_domain *domain;
vm_page_t ctx_root;
- unit = ctx->dmar;
- KASSERT(ctxp->ctx1 == 0 && ctxp->ctx2 == 0,
+ domain = ctx->domain;
+ unit = domain->dmar;
+ KASSERT(move || (ctxp->ctx1 == 0 && ctxp->ctx2 == 0),
("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx",
unit->unit, pci_get_bus(ctx->ctx_tag.owner),
pci_get_slot(ctx->ctx_tag.owner),
pci_get_function(ctx->ctx_tag.owner),
- ctxp->ctx1,
- ctxp->ctx2));
- ctxp->ctx2 = DMAR_CTX2_DID(ctx->domain);
- ctxp->ctx2 |= ctx->awlvl;
- if ((ctx->flags & DMAR_CTX_IDMAP) != 0 &&
+ ctxp->ctx1, ctxp->ctx2));
+ /*
+ * For update due to move, the store is not atomic. It is
+ * possible that DMAR read upper doubleword, while low
+ * doubleword is not yet updated. The domain id is stored in
+ * the upper doubleword, while the table pointer in the lower.
+ *
+ * There is no good solution, for the same reason it is wrong
+ * to clear P bit in the ctx entry for update.
+ */
+ dmar_pte_store1(&ctxp->ctx2, DMAR_CTX2_DID(domain->domain) |
+ domain->awlvl);
+ if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0 &&
(unit->hw_ecap & DMAR_ECAP_PT) != 0) {
- KASSERT(ctx->pgtbl_obj == NULL,
+ KASSERT(domain->pgtbl_obj == NULL,
("ctx %p non-null pgtbl_obj", ctx));
- dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
+ dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
} else {
- ctx_root = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
- dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
+ ctx_root = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
+ dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
(DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) |
DMAR_CTX1_P);
}
@@ -164,8 +178,32 @@
}
static int
-ctx_init_rmrr(struct dmar_ctx *ctx, device_t dev)
+dmar_flush_for_ctx_entry(struct dmar_unit *dmar, bool force)
{
+ int error;
+
+ /*
+ * If dmar declares Caching Mode as Set, follow 11.5 "Caching
+ * Mode Consideration" and do the (global) invalidation of the
+ * negative TLB entries.
+ */
+ if ((dmar->hw_cap & DMAR_CAP_CM) == 0 && !force)
+ return (0);
+ if (dmar->qi_enabled) {
+ dmar_qi_invalidate_ctx_glob_locked(dmar);
+ if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force)
+ dmar_qi_invalidate_iotlb_glob_locked(dmar);
+ return (0);
+ }
+ error = dmar_inv_ctx_glob(dmar);
+ if (error == 0 && ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force))
+ error = dmar_inv_iotlb_glob(dmar);
+ return (error);
+}
+
+static int
+domain_init_rmrr(struct dmar_domain *domain, device_t dev)
+{
struct dmar_map_entries_tailq rmrr_entries;
struct dmar_map_entry *entry, *entry1;
vm_page_t *ma;
@@ -175,7 +213,7 @@
error = 0;
TAILQ_INIT(&rmrr_entries);
- dmar_ctx_parse_rmrr(ctx, dev, &rmrr_entries);
+ dmar_dev_parse_rmrr(domain, dev, &rmrr_entries);
TAILQ_FOREACH_SAFE(entry, &rmrr_entries, unroll_link, entry1) {
/*
* VT-d specification requires that the start of an
@@ -195,7 +233,7 @@
if (bootverbose) {
device_printf(dev, "BIOS bug: dmar%d RMRR "
"region (%jx, %jx) corrected\n",
- ctx->dmar->unit, start, end);
+ domain->dmar->unit, start, end);
}
entry->end += DMAR_PAGE_SIZE * 0x20;
}
@@ -205,8 +243,9 @@
ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
VM_MEMATTR_DEFAULT);
}
- error1 = dmar_gas_map_region(ctx, entry, DMAR_MAP_ENTRY_READ |
- DMAR_MAP_ENTRY_WRITE, DMAR_GM_CANWAIT, ma);
+ error1 = dmar_gas_map_region(domain, entry,
+ DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
+ DMAR_GM_CANWAIT, ma);
/*
* Non-failed RMRR entries are owned by context rb
* tree. Get rid of the failed entry, but do not stop
@@ -214,18 +253,19 @@
* loaded and removed on the context destruction.
*/
if (error1 == 0 && entry->end != entry->start) {
- DMAR_LOCK(ctx->dmar);
- ctx->flags |= DMAR_CTX_RMRR;
- DMAR_UNLOCK(ctx->dmar);
+ DMAR_LOCK(domain->dmar);
+ domain->refs++; /* XXXKIB prevent free */
+ domain->flags |= DMAR_DOMAIN_RMRR;
+ DMAR_UNLOCK(domain->dmar);
} else {
if (error1 != 0) {
device_printf(dev,
"dmar%d failed to map RMRR region (%jx, %jx) %d\n",
- ctx->dmar->unit, start, end, error1);
+ domain->dmar->unit, start, end, error1);
error = error1;
}
TAILQ_REMOVE(&rmrr_entries, entry, unroll_link);
- dmar_gas_free_entry(ctx, entry);
+ dmar_gas_free_entry(domain, entry);
}
for (i = 0; i < size; i++)
vm_page_putfake(ma[i]);
@@ -234,47 +274,144 @@
return (error);
}
+static struct dmar_domain *
+dmar_domain_alloc(struct dmar_unit *dmar, bool id_mapped)
+{
+ struct dmar_domain *domain;
+ int error, id, mgaw;
+
+ id = alloc_unr(dmar->domids);
+ if (id == -1)
+ return (NULL);
+ domain = malloc(sizeof(*domain), M_DMAR_DOMAIN, M_WAITOK | M_ZERO);
+ domain->domain = id;
+ LIST_INIT(&domain->contexts);
+ RB_INIT(&domain->rb_root);
+ TAILQ_INIT(&domain->unload_entries);
+ TASK_INIT(&domain->unload_task, 0, dmar_domain_unload_task, domain);
+ mtx_init(&domain->lock, "dmardom", NULL, MTX_DEF);
+ domain->dmar = dmar;
+
+ /*
+ * For now, use the maximal usable physical address of the
+ * installed memory to calculate the mgaw on id_mapped domain.
+ * It is useful for the identity mapping, and less so for the
+ * virtualized bus address space.
+ */
+ domain->end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR;
+ mgaw = dmar_maxaddr2mgaw(dmar, domain->end, !id_mapped);
+ error = domain_set_agaw(domain, mgaw);
+ if (error != 0)
+ goto fail;
+ if (!id_mapped)
+ /* Use all supported address space for remapping. */
+ domain->end = 1ULL << (domain->agaw - 1);
+
+ dmar_gas_init_domain(domain);
+
+ if (id_mapped) {
+ if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
+ domain->pgtbl_obj = domain_get_idmap_pgtbl(domain,
+ domain->end);
+ }
+ domain->flags |= DMAR_DOMAIN_IDMAP;
+ } else {
+ error = domain_alloc_pgtbl(domain);
+ if (error != 0)
+ goto fail;
+ /* Disable local apic region access */
+ error = dmar_gas_reserve_region(domain, 0xfee00000,
+ 0xfeefffff + 1);
+ if (error != 0)
+ goto fail;
+ }
+ return (domain);
+
+fail:
+ dmar_domain_destroy(domain);
+ return (NULL);
+}
+
static struct dmar_ctx *
-dmar_get_ctx_alloc(struct dmar_unit *dmar, uint16_t rid)
+dmar_ctx_alloc(struct dmar_domain *domain, uint16_t rid)
{
struct dmar_ctx *ctx;
ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO);
- RB_INIT(&ctx->rb_root);
- TAILQ_INIT(&ctx->unload_entries);
- TASK_INIT(&ctx->unload_task, 0, dmar_ctx_unload_task, ctx);
- mtx_init(&ctx->lock, "dmarctx", NULL, MTX_DEF);
- ctx->dmar = dmar;
+ ctx->domain = domain;
ctx->rid = rid;
+ ctx->refs = 1;
return (ctx);
}
static void
-dmar_ctx_dtr(struct dmar_ctx *ctx, bool gas_inited, bool pgtbl_inited)
+dmar_ctx_link(struct dmar_ctx *ctx)
{
+ struct dmar_domain *domain;
- if (gas_inited) {
- DMAR_CTX_LOCK(ctx);
- dmar_gas_fini_ctx(ctx);
- DMAR_CTX_UNLOCK(ctx);
+ domain = ctx->domain;
+ DMAR_ASSERT_LOCKED(domain->dmar);
+ KASSERT(domain->refs >= domain->ctx_cnt,
+ ("dom %p ref underflow %d %d", domain, domain->refs,
+ domain->ctx_cnt));
+ domain->refs++;
+ domain->ctx_cnt++;
+ LIST_INSERT_HEAD(&domain->contexts, ctx, link);
+}
+
+static void
+dmar_ctx_unlink(struct dmar_ctx *ctx)
+{
+ struct dmar_domain *domain;
+
+ domain = ctx->domain;
+ DMAR_ASSERT_LOCKED(domain->dmar);
+ KASSERT(domain->refs > 0,
+ ("domain %p ctx dtr refs %d", domain, domain->refs));
+ KASSERT(domain->ctx_cnt >= domain->refs,
+ ("domain %p ctx dtr refs %d ctx_cnt %d", domain,
+ domain->refs, domain->ctx_cnt));
+ domain->refs--;
+ domain->ctx_cnt--;
+ LIST_REMOVE(ctx, link);
+}
+
+static void
+dmar_domain_destroy(struct dmar_domain *domain)
+{
+
+ KASSERT(TAILQ_EMPTY(&domain->unload_entries),
+ ("unfinished unloads %p", domain));
+ KASSERT(LIST_EMPTY(&domain->contexts),
+ ("destroying dom %p with contexts", domain));
+ KASSERT(domain->ctx_cnt == 0,
+ ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt));
+ KASSERT(domain->refs == 0,
+ ("destroying dom %p with refs %d", domain, domain->refs));
+ if ((domain->flags & DMAR_DOMAIN_GAS_INITED) != 0) {
+ DMAR_DOMAIN_LOCK(domain);
+ dmar_gas_fini_domain(domain);
+ DMAR_DOMAIN_UNLOCK(domain);
}
- if (pgtbl_inited) {
- if (ctx->pgtbl_obj != NULL)
- DMAR_CTX_PGLOCK(ctx);
- ctx_free_pgtbl(ctx);
+ if ((domain->flags & DMAR_DOMAIN_PGTBL_INITED) != 0) {
+ if (domain->pgtbl_obj != NULL)
+ DMAR_DOMAIN_PGLOCK(domain);
+ domain_free_pgtbl(domain);
}
- mtx_destroy(&ctx->lock);
- free(ctx, M_DMAR_CTX);
+ mtx_destroy(&domain->lock);
+ free_unr(domain->dmar->domids, domain->domain);
+ free(domain, M_DMAR_DOMAIN);
}
struct dmar_ctx *
-dmar_get_ctx(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped,
- bool rmrr_init)
+dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid,
+ bool id_mapped, bool rmrr_init)
{
+ struct dmar_domain *domain, *domain1;
struct dmar_ctx *ctx, *ctx1;
dmar_ctx_entry_t *ctxp;
struct sf_buf *sf;
- int bus, slot, func, error, mgaw;
+ int bus, slot, func, error;
bool enable;
bus = pci_get_bus(dev);
@@ -292,67 +429,20 @@
*/
DMAR_UNLOCK(dmar);
dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid));
- ctx1 = dmar_get_ctx_alloc(dmar, rid);
-
- if (id_mapped) {
- /*
- * For now, use the maximal usable physical
- * address of the installed memory to
- * calculate the mgaw. It is useful for the
- * identity mapping, and less so for the
- * virtualized bus address space.
- */
- ctx1->end = ptoa(Maxmem);
- mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, false);
- error = ctx_set_agaw(ctx1, mgaw);
- if (error != 0) {
- dmar_ctx_dtr(ctx1, false, false);
- TD_PINNED_ASSERT;
- return (NULL);
- }
- } else {
- ctx1->end = BUS_SPACE_MAXADDR;
- mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, true);
- error = ctx_set_agaw(ctx1, mgaw);
- if (error != 0) {
- dmar_ctx_dtr(ctx1, false, false);
- TD_PINNED_ASSERT;
- return (NULL);
- }
- /* Use all supported address space for remapping. */
- ctx1->end = 1ULL << (ctx1->agaw - 1);
+ domain1 = dmar_domain_alloc(dmar, id_mapped);
+ if (domain1 == NULL) {
+ TD_PINNED_ASSERT;
+ return (NULL);
}
-
-
- dmar_gas_init_ctx(ctx1);
- if (id_mapped) {
- if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
- ctx1->pgtbl_obj = ctx_get_idmap_pgtbl(ctx1,
- ctx1->end);
- }
- ctx1->flags |= DMAR_CTX_IDMAP;
- } else {
- error = ctx_alloc_pgtbl(ctx1);
+ if (!id_mapped) {
+ error = domain_init_rmrr(domain1, dev);
if (error != 0) {
- dmar_ctx_dtr(ctx1, true, false);
+ dmar_domain_destroy(domain1);
TD_PINNED_ASSERT;
return (NULL);
}
- /* Disable local apic region access */
- error = dmar_gas_reserve_region(ctx1, 0xfee00000,
- 0xfeefffff + 1);
- if (error != 0) {
- dmar_ctx_dtr(ctx1, true, true);
- TD_PINNED_ASSERT;
- return (NULL);
- }
- error = ctx_init_rmrr(ctx1, dev);
- if (error != 0) {
- dmar_ctx_dtr(ctx1, true, true);
- TD_PINNED_ASSERT;
- return (NULL);
- }
}
+ ctx1 = dmar_ctx_alloc(domain1, rid);
ctxp = dmar_map_ctx_entry(ctx1, &sf);
DMAR_LOCK(dmar);
@@ -362,16 +452,10 @@
*/
ctx = dmar_find_ctx_locked(dmar, rid);
if (ctx == NULL) {
+ domain = domain1;
ctx = ctx1;
+ dmar_ctx_link(ctx);
ctx->ctx_tag.owner = dev;
- ctx->domain = alloc_unrl(dmar->domids);
- if (ctx->domain == -1) {
- DMAR_UNLOCK(dmar);
- dmar_unmap_pgtbl(sf);
- dmar_ctx_dtr(ctx, true, true);
- TD_PINNED_ASSERT;
- return (NULL);
- }
ctx_tag_init(ctx, dev);
/*
@@ -379,46 +463,35 @@
* DMAR unit. Enable the translation after
* everything is set up.
*/
- if (LIST_EMPTY(&dmar->contexts))
+ if (LIST_EMPTY(&dmar->domains))
enable = true;
- LIST_INSERT_HEAD(&dmar->contexts, ctx, link);
- ctx_id_entry_init(ctx, ctxp);
+ LIST_INSERT_HEAD(&dmar->domains, domain, link);
+ ctx_id_entry_init(ctx, ctxp, false);
device_printf(dev,
"dmar%d pci%d:%d:%d:%d rid %x domain %d mgaw %d "
"agaw %d %s-mapped\n",
dmar->unit, dmar->segment, bus, slot,
- func, rid, ctx->domain, ctx->mgaw, ctx->agaw,
- id_mapped ? "id" : "re");
+ func, rid, domain->domain, domain->mgaw,
+ domain->agaw, id_mapped ? "id" : "re");
+ dmar_unmap_pgtbl(sf);
} else {
- dmar_ctx_dtr(ctx1, true, true);
+ dmar_unmap_pgtbl(sf);
+ dmar_domain_destroy(domain1);
+ /* Nothing needs to be done to destroy ctx1. */
+ free(ctx1, M_DMAR_CTX);
+ domain = ctx->domain;
+ ctx->refs++; /* tag referenced us */
}
- dmar_unmap_pgtbl(sf);
+ } else {
+ domain = ctx->domain;
+ ctx->refs++; /* tag referenced us */
}
- ctx->refs++;
- if ((ctx->flags & DMAR_CTX_RMRR) != 0)
- ctx->refs++; /* XXXKIB */
- /*
- * If dmar declares Caching Mode as Set, follow 11.5 "Caching
- * Mode Consideration" and do the (global) invalidation of the
- * negative TLB entries.
- */
- if ((dmar->hw_cap & DMAR_CAP_CM) != 0 || enable) {
- if (dmar->qi_enabled) {
- dmar_qi_invalidate_ctx_glob_locked(dmar);
- if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0)
- dmar_qi_invalidate_iotlb_glob_locked(dmar);
- } else {
- error = dmar_inv_ctx_glob(dmar);
- if (error == 0 &&
- (dmar->hw_ecap & DMAR_ECAP_DI) != 0)
- error = dmar_inv_iotlb_glob(dmar);
- if (error != 0) {
- dmar_free_ctx_locked(dmar, ctx);
- TD_PINNED_ASSERT;
- return (NULL);
- }
- }
+ error = dmar_flush_for_ctx_entry(dmar, enable);
+ if (error != 0) {
+ dmar_free_ctx_locked(dmar, ctx);
+ TD_PINNED_ASSERT;
+ return (NULL);
}
/*
@@ -439,11 +512,74 @@
return (ctx);
}
+int
+dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx)
+{
+ struct dmar_unit *dmar;
+ struct dmar_domain *old_domain;
+ dmar_ctx_entry_t *ctxp;
+ struct sf_buf *sf;
+ int error;
+
+ dmar = domain->dmar;
+ old_domain = ctx->domain;
+ if (domain == old_domain)
+ return (0);
+ KASSERT(old_domain->dmar == dmar,
+ ("domain %p %u moving between dmars %u %u", domain,
+ domain->domain, old_domain->dmar->unit, domain->dmar->unit));
+ TD_PREP_PINNED_ASSERT;
+
+ ctxp = dmar_map_ctx_entry(ctx, &sf);
+ DMAR_LOCK(dmar);
+ dmar_ctx_unlink(ctx);
+ ctx->domain = domain;
+ dmar_ctx_link(ctx);
+ ctx_id_entry_init(ctx, ctxp, true);
+ dmar_unmap_pgtbl(sf);
+ error = dmar_flush_for_ctx_entry(dmar, true);
+ /* If flush failed, rolling back would not work as well. */
+ printf("dmar%d rid %x domain %d->%d %s-mapped\n",
+ dmar->unit, ctx->rid, old_domain->domain, domain->domain,
+ (domain->flags & DMAR_DOMAIN_IDMAP) != 0 ? "id" : "re");
+ dmar_unref_domain_locked(dmar, old_domain);
+ TD_PINNED_ASSERT;
+ return (error);
+}
+
+static void
+dmar_unref_domain_locked(struct dmar_unit *dmar, struct dmar_domain *domain)
+{
+
+ DMAR_ASSERT_LOCKED(dmar);
+ KASSERT(domain->refs >= 1,
+ ("dmar %d domain %p refs %u", dmar->unit, domain, domain->refs));
+ KASSERT(domain->refs > domain->ctx_cnt,
+ ("dmar %d domain %p refs %d ctx_cnt %d", dmar->unit, domain,
+ domain->refs, domain->ctx_cnt));
+
+ if (domain->refs > 1) {
+ domain->refs--;
+ DMAR_UNLOCK(dmar);
+ return;
+ }
+
+ KASSERT((domain->flags & DMAR_DOMAIN_RMRR) == 0,
+ ("lost ref on RMRR domain %p", domain));
+
+ LIST_REMOVE(domain, link);
+ DMAR_UNLOCK(dmar);
+
+ taskqueue_drain(dmar->delayed_taskqueue, &domain->unload_task);
+ dmar_domain_destroy(domain);
+}
+
void
dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx)
{
struct sf_buf *sf;
dmar_ctx_entry_t *ctxp;
+ struct dmar_domain *domain;
DMAR_ASSERT_LOCKED(dmar);
KASSERT(ctx->refs >= 1,
@@ -459,8 +595,6 @@
return;
}
- KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
- ("lost ref on RMRR ctx %p", ctx));
KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
("lost ref on disabled ctx %p", ctx));
@@ -488,8 +622,6 @@
return;
}
- KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
- ("lost ref on RMRR ctx %p", ctx));
KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
("lost ref on disabled ctx %p", ctx));
@@ -507,19 +639,11 @@
else
dmar_inv_iotlb_glob(dmar);
}
- LIST_REMOVE(ctx, link);
- DMAR_UNLOCK(dmar);
-
- /*
- * The rest of the destruction is invisible for other users of
- * the dmar unit.
- */
- taskqueue_drain(dmar->delayed_taskqueue, &ctx->unload_task);
- KASSERT(TAILQ_EMPTY(&ctx->unload_entries),
- ("unfinished unloads %p", ctx));
dmar_unmap_pgtbl(sf);
- free_unr(dmar->domids, ctx->domain);
- dmar_ctx_dtr(ctx, true, true);
+ domain = ctx->domain;
+ dmar_ctx_unlink(ctx);
+ free(ctx, M_DMAR_CTX);
+ dmar_unref_domain_locked(dmar, domain);
TD_PINNED_ASSERT;
}
@@ -528,86 +652,101 @@
{
struct dmar_unit *dmar;
- dmar = ctx->dmar;
+ dmar = ctx->domain->dmar;
DMAR_LOCK(dmar);
dmar_free_ctx_locked(dmar, ctx);
}
+/*
+ * Returns with the domain locked.
+ */
struct dmar_ctx *
dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid)
{
+ struct dmar_domain *domain;
struct dmar_ctx *ctx;
DMAR_ASSERT_LOCKED(dmar);
- LIST_FOREACH(ctx, &dmar->contexts, link) {
- if (ctx->rid == rid)
- return (ctx);
+ LIST_FOREACH(domain, &dmar->domains, link) {
+ LIST_FOREACH(ctx, &domain->contexts, link) {
+ if (ctx->rid == rid)
+ return (ctx);
+ }
}
return (NULL);
}
void
-dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_free_entry(struct dmar_map_entry *entry, bool free)
{
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
- ctx = entry->ctx;
- DMAR_CTX_LOCK(ctx);
+ domain = entry->domain;
+ DMAR_DOMAIN_LOCK(domain);
if ((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0)
- dmar_gas_free_region(ctx, entry);
+ dmar_gas_free_region(domain, entry);
else
- dmar_gas_free_space(ctx, entry);
- DMAR_CTX_UNLOCK(ctx);
+ dmar_gas_free_space(domain, entry);
+ DMAR_DOMAIN_UNLOCK(domain);
if (free)
- dmar_gas_free_entry(ctx, entry);
+ dmar_gas_free_entry(domain, entry);
else
entry->flags = 0;
}
void
-dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free)
{
struct dmar_unit *unit;
- unit = entry->ctx->dmar;
+ unit = entry->domain->dmar;
if (unit->qi_enabled) {
DMAR_LOCK(unit);
- dmar_qi_invalidate_locked(entry->ctx, entry->start,
- entry->end - entry->start, &entry->gseq);
+ dmar_qi_invalidate_locked(entry->domain, entry->start,
+ entry->end - entry->start, &entry->gseq, true);
if (!free)
entry->flags |= DMAR_MAP_ENTRY_QI_NF;
TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
DMAR_UNLOCK(unit);
} else {
- ctx_flush_iotlb_sync(entry->ctx, entry->start, entry->end -
- entry->start);
- dmar_ctx_free_entry(entry, free);
+ domain_flush_iotlb_sync(entry->domain, entry->start,
+ entry->end - entry->start);
+ dmar_domain_free_entry(entry, free);
}
}
+static bool
+dmar_domain_unload_emit_wait(struct dmar_domain *domain,
+ struct dmar_map_entry *entry)
+{
+
+ if (TAILQ_NEXT(entry, dmamap_link) == NULL)
+ return (true);
+ return (domain->batch_no++ % dmar_batch_coalesce == 0);
+}
+
void
-dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
- bool cansleep)
+dmar_domain_unload(struct dmar_domain *domain,
+ struct dmar_map_entries_tailq *entries, bool cansleep)
{
struct dmar_unit *unit;
struct dmar_map_entry *entry, *entry1;
- struct dmar_qi_genseq gseq;
int error;
- unit = ctx->dmar;
+ unit = domain->dmar;
TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
KASSERT((entry->flags & DMAR_MAP_ENTRY_MAP) != 0,
- ("not mapped entry %p %p", ctx, entry));
- error = ctx_unmap_buf(ctx, entry->start, entry->end -
+ ("not mapped entry %p %p", domain, entry));
+ error = domain_unmap_buf(domain, entry->start, entry->end -
entry->start, cansleep ? DMAR_PGF_WAITOK : 0);
- KASSERT(error == 0, ("unmap %p error %d", ctx, error));
+ KASSERT(error == 0, ("unmap %p error %d", domain, error));
if (!unit->qi_enabled) {
- ctx_flush_iotlb_sync(ctx, entry->start,
+ domain_flush_iotlb_sync(domain, entry->start,
entry->end - entry->start);
TAILQ_REMOVE(entries, entry, dmamap_link);
- dmar_ctx_free_entry(entry, true);
+ dmar_domain_free_entry(entry, true);
}
}
if (TAILQ_EMPTY(entries))
@@ -616,36 +755,30 @@
KASSERT(unit->qi_enabled, ("loaded entry left"));
DMAR_LOCK(unit);
TAILQ_FOREACH(entry, entries, dmamap_link) {
- entry->gseq.gen = 0;
- entry->gseq.seq = 0;
- dmar_qi_invalidate_locked(ctx, entry->start, entry->end -
- entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ?
- &gseq : NULL);
+ dmar_qi_invalidate_locked(domain, entry->start, entry->end -
+ entry->start, &entry->gseq,
+ dmar_domain_unload_emit_wait(domain, entry));
}
- TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
- entry->gseq = gseq;
- TAILQ_REMOVE(entries, entry, dmamap_link);
- TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
- }
+ TAILQ_CONCAT(&unit->tlb_flush_entries, entries, dmamap_link);
DMAR_UNLOCK(unit);
}
static void
-dmar_ctx_unload_task(void *arg, int pending)
+dmar_domain_unload_task(void *arg, int pending)
{
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
struct dmar_map_entries_tailq entries;
- ctx = arg;
+ domain = arg;
TAILQ_INIT(&entries);
for (;;) {
- DMAR_CTX_LOCK(ctx);
- TAILQ_SWAP(&ctx->unload_entries, &entries, dmar_map_entry,
+ DMAR_DOMAIN_LOCK(domain);
+ TAILQ_SWAP(&domain->unload_entries, &entries, dmar_map_entry,
dmamap_link);
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
if (TAILQ_EMPTY(&entries))
break;
- dmar_ctx_unload(ctx, &entries, true);
+ dmar_domain_unload(domain, &entries, true);
}
}
Modified: trunk/sys/x86/iommu/intel_dmar.h
===================================================================
--- trunk/sys/x86/iommu/intel_dmar.h 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_dmar.h 2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
/* $MidnightBSD$ */
/*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -27,7 +27,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/x86/iommu/intel_dmar.h 281545 2015-04-15 06:56:51Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_dmar.h 320357 2017-06-26 12:30:39Z kib $
*/
#ifndef __X86_IOMMU_INTEL_DMAR_H
@@ -51,10 +51,10 @@
current R/B tree node */
u_int flags;
TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */
- RB_ENTRY(dmar_map_entry) rb_entry; /* Links for ctx entries */
+ RB_ENTRY(dmar_map_entry) rb_entry; /* Links for domain entries */
TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after
dmamap_load failure */
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
struct dmar_qi_genseq gseq;
};
@@ -74,51 +74,85 @@
#define DMAR_MAP_ENTRY_SNOOP 0x4000 /* Snoop */
#define DMAR_MAP_ENTRY_TM 0x8000 /* Transient */
+/*
+ * Locking annotations:
+ * (u) - Protected by dmar unit lock
+ * (d) - Protected by domain lock
+ * (c) - Immutable after initialization
+ */
+
+/*
+ * The domain abstraction. Most non-constant members of the domain
+ * are protected by owning dmar unit lock, not by the domain lock.
+ * Most important, the dmar lock protects the contexts list.
+ *
+ * The domain lock protects the address map for the domain, and list
+ * of unload entries delayed.
+ *
+ * Page tables pages and pages content is protected by the vm object
+ * lock pgtbl_obj, which contains the page tables pages.
+ */
+struct dmar_domain {
+ int domain; /* (c) DID, written in context entry */
+ int mgaw; /* (c) Real max address width */
+ int agaw; /* (c) Adjusted guest address width */
+ int pglvl; /* (c) The pagelevel */
+ int awlvl; /* (c) The pagelevel as the bitmask,
+ to set in context entry */
+ dmar_gaddr_t end; /* (c) Highest address + 1 in
+ the guest AS */
+ u_int ctx_cnt; /* (u) Number of contexts owned */
+ u_int refs; /* (u) Refs, including ctx */
+ struct dmar_unit *dmar; /* (c) */
+ struct mtx lock; /* (c) */
+ LIST_ENTRY(dmar_domain) link; /* (u) Member in the dmar list */
+ LIST_HEAD(, dmar_ctx) contexts; /* (u) */
+ vm_object_t pgtbl_obj; /* (c) Page table pages */
+ u_int flags; /* (u) */
+ u_int entries_cnt; /* (d) */
+ struct dmar_gas_entries_tree rb_root; /* (d) */
+ struct dmar_map_entries_tailq unload_entries; /* (d) Entries to
+ unload */
+ struct dmar_map_entry *first_place, *last_place; /* (d) */
+ struct task unload_task; /* (c) */
+ u_int batch_no;
+};
+
struct dmar_ctx {
- uint16_t rid; /* pci RID */
- int domain; /* DID */
- int mgaw; /* Real max address width */
- int agaw; /* Adjusted guest address width */
- int pglvl; /* The pagelevel */
- int awlvl; /* The pagelevel as the bitmask, to set in
- context entry */
- dmar_gaddr_t end;/* Highest address + 1 in the guest AS */
- u_int refs; /* References to the context, from tags */
- struct dmar_unit *dmar;
- struct bus_dma_tag_dmar ctx_tag; /* Root tag */
- struct mtx lock;
- LIST_ENTRY(dmar_ctx) link; /* Member in the dmar list */
- vm_object_t pgtbl_obj; /* Page table pages */
- u_int flags; /* Protected by dmar lock */
+ struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */
+ uint16_t rid; /* (c) pci RID */
uint64_t last_fault_rec[2]; /* Last fault reported */
- u_int entries_cnt;
- u_long loads;
- u_long unloads;
- struct dmar_gas_entries_tree rb_root;
- struct dmar_map_entries_tailq unload_entries; /* Entries to unload */
- struct dmar_map_entry *first_place, *last_place;
- struct task unload_task;
+ struct dmar_domain *domain; /* (c) */
+ LIST_ENTRY(dmar_ctx) link; /* (u) Member in the domain list */
+ u_int refs; /* (u) References from tags */
+ u_int flags; /* (u) */
+ u_long loads; /* atomic updates, for stat only */
+ u_long unloads; /* same */
};
+#define DMAR_DOMAIN_GAS_INITED 0x0001
+#define DMAR_DOMAIN_PGTBL_INITED 0x0002
+#define DMAR_DOMAIN_IDMAP 0x0010 /* Domain uses identity
+ page table */
+#define DMAR_DOMAIN_RMRR 0x0020 /* Domain contains RMRR entry,
+ cannot be turned off */
+
/* struct dmar_ctx flags */
#define DMAR_CTX_FAULTED 0x0001 /* Fault was reported,
last_fault_rec is valid */
-#define DMAR_CTX_IDMAP 0x0002 /* Context uses identity page table */
-#define DMAR_CTX_RMRR 0x0004 /* Context contains RMRR entry,
- cannot be turned off */
-#define DMAR_CTX_DISABLED 0x0008 /* Device is disabled, the
+#define DMAR_CTX_DISABLED 0x0002 /* Device is disabled, the
ephemeral reference is kept
to prevent context destruction */
-#define DMAR_CTX_PGLOCK(ctx) VM_OBJECT_WLOCK((ctx)->pgtbl_obj)
-#define DMAR_CTX_PGTRYLOCK(ctx) VM_OBJECT_TRYWLOCK((ctx)->pgtbl_obj)
-#define DMAR_CTX_PGUNLOCK(ctx) VM_OBJECT_WUNLOCK((ctx)->pgtbl_obj)
-#define DMAR_CTX_ASSERT_PGLOCKED(ctx) \
- VM_OBJECT_ASSERT_WLOCKED((ctx)->pgtbl_obj)
+#define DMAR_DOMAIN_PGLOCK(dom) VM_OBJECT_WLOCK((dom)->pgtbl_obj)
+#define DMAR_DOMAIN_PGTRYLOCK(dom) VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj)
+#define DMAR_DOMAIN_PGUNLOCK(dom) VM_OBJECT_WUNLOCK((dom)->pgtbl_obj)
+#define DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \
+ VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj)
-#define DMAR_CTX_LOCK(ctx) mtx_lock(&(ctx)->lock)
-#define DMAR_CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->lock)
-#define DMAR_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->lock, MA_OWNED)
+#define DMAR_DOMAIN_LOCK(dom) mtx_lock(&(dom)->lock)
+#define DMAR_DOMAIN_UNLOCK(dom) mtx_unlock(&(dom)->lock)
+#define DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED)
struct dmar_msi_data {
int irq;
@@ -158,7 +192,7 @@
/* Data for being a dmar */
struct mtx lock;
- LIST_HEAD(, dmar_ctx) contexts;
+ LIST_HEAD(, dmar_domain) domains;
struct unrhdr *domids;
vm_object_t ctx_obj;
u_int barrier_flags;
@@ -186,6 +220,13 @@
u_int inv_seq_waiters; /* count of waiters for seq */
u_int inv_queue_full; /* informational counter */
+ /* IR */
+ int ir_enabled;
+ vm_paddr_t irt_phys;
+ dmar_irte_t *irt;
+ u_int irte_cnt;
+ vmem_t *irtids;
+
/* Delayed freeing of map entries queue processing */
struct dmar_map_entries_tailq tlb_flush_entries;
struct task qi_task;
@@ -195,6 +236,8 @@
struct task dmamap_load_task;
TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps;
struct taskqueue *delayed_taskqueue;
+
+ int dma_enabled;
};
#define DMAR_LOCK(dmar) mtx_lock(&(dmar)->lock)
@@ -207,6 +250,8 @@
#define DMAR_IS_COHERENT(dmar) (((dmar)->hw_ecap & DMAR_ECAP_C) != 0)
#define DMAR_HAS_QI(dmar) (((dmar)->hw_ecap & DMAR_ECAP_QI) != 0)
+#define DMAR_X2APIC(dmar) \
+ (x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0)
/* Barrier ids */
#define DMAR_BARRIER_RMRR 0
@@ -213,16 +258,18 @@
#define DMAR_BARRIER_USEQ 1
struct dmar_unit *dmar_find(device_t dev);
+struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid);
+struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid);
u_int dmar_nd2mask(u_int nd);
bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl);
-int ctx_set_agaw(struct dmar_ctx *ctx, int mgaw);
-int dmar_maxaddr2mgaw(struct dmar_unit* unit, dmar_gaddr_t maxaddr,
+int domain_set_agaw(struct dmar_domain *domain, int mgaw);
+int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr,
bool allow_less);
vm_pindex_t pglvl_max_pages(int pglvl);
-int ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl);
+int domain_is_sp_lvl(struct dmar_domain *domain, int lvl);
dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl);
-dmar_gaddr_t ctx_page_size(struct dmar_ctx *ctx, int lvl);
+dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl);
int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
dmar_gaddr_t *isizep);
struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags);
@@ -239,8 +286,13 @@
void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst);
int dmar_enable_translation(struct dmar_unit *unit);
int dmar_disable_translation(struct dmar_unit *unit);
+int dmar_load_irt_ptr(struct dmar_unit *unit);
+int dmar_enable_ir(struct dmar_unit *unit);
+int dmar_disable_ir(struct dmar_unit *unit);
bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id);
void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id);
+uint64_t dmar_get_timeout(void);
+void dmar_update_timeout(uint64_t newval);
int dmar_fault_intr(void *arg);
void dmar_enable_fault_intr(struct dmar_unit *unit);
@@ -253,52 +305,61 @@
void dmar_disable_qi_intr(struct dmar_unit *unit);
int dmar_init_qi(struct dmar_unit *unit);
void dmar_fini_qi(struct dmar_unit *unit);
-void dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t start,
- dmar_gaddr_t size, struct dmar_qi_genseq *pseq);
+void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start,
+ dmar_gaddr_t size, struct dmar_qi_genseq *psec, bool emit_wait);
void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit);
void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt);
-vm_object_t ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr);
+vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain,
+ dmar_gaddr_t maxaddr);
void put_idmap_pgtbl(vm_object_t obj);
-int ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
- vm_page_t *ma, uint64_t pflags, int flags);
-int ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
- int flags);
-void ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base,
+int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags);
+int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size, int flags);
+void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
dmar_gaddr_t size);
-int ctx_alloc_pgtbl(struct dmar_ctx *ctx);
-void ctx_free_pgtbl(struct dmar_ctx *ctx);
+int domain_alloc_pgtbl(struct dmar_domain *domain);
+void domain_free_pgtbl(struct dmar_domain *domain);
struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev,
bool rmrr);
-struct dmar_ctx *dmar_get_ctx(struct dmar_unit *dmar, device_t dev,
+struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev,
uint16_t rid, bool id_mapped, bool rmrr_init);
+int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx);
void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx);
void dmar_free_ctx(struct dmar_ctx *ctx);
struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid);
-void dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free);
-void dmar_ctx_unload(struct dmar_ctx *ctx,
+void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_unload(struct dmar_domain *domain,
struct dmar_map_entries_tailq *entries, bool cansleep);
-void dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free);
int dmar_init_busdma(struct dmar_unit *unit);
void dmar_fini_busdma(struct dmar_unit *unit);
+device_t dmar_get_requester(device_t dev, uint16_t *rid);
-void dmar_gas_init_ctx(struct dmar_ctx *ctx);
-void dmar_gas_fini_ctx(struct dmar_ctx *ctx);
-struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags);
-void dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-void dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
- dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
- struct dmar_map_entry **res);
-void dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
- u_int eflags, u_int flags, vm_page_t *ma);
-int dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+void dmar_gas_init_domain(struct dmar_domain *domain);
+void dmar_gas_fini_domain(struct dmar_domain *domain);
+struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain,
+ u_int flags);
+void dmar_gas_free_entry(struct dmar_domain *domain,
+ struct dmar_map_entry *entry);
+void dmar_gas_free_space(struct dmar_domain *domain,
+ struct dmar_map_entry *entry);
+int dmar_gas_map(struct dmar_domain *domain,
+ const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+ u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res);
+void dmar_gas_free_region(struct dmar_domain *domain,
+ struct dmar_map_entry *entry);
+int dmar_gas_map_region(struct dmar_domain *domain,
+ struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma);
+int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
dmar_gaddr_t end);
-void dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
struct dmar_map_entries_tailq *rmrr_entries);
int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar);
@@ -305,6 +366,9 @@
void dmar_quirks_post_ident(struct dmar_unit *dmar);
void dmar_quirks_pre_use(struct dmar_unit *dmar);
+int dmar_init_irt(struct dmar_unit *unit);
+void dmar_fini_irt(struct dmar_unit *unit);
+
#define DMAR_GM_CANWAIT 0x0001
#define DMAR_GM_CANSPLIT 0x0002
@@ -318,6 +382,7 @@
extern int haw;
extern int dmar_tbl_pagecnt;
extern int dmar_match_verbose;
+extern int dmar_batch_coalesce;
extern int dmar_check_free;
static inline uint32_t
@@ -375,13 +440,16 @@
* containing the P or R and W bits, is set only after the high word
* is written. For clear, the P bit is cleared first, then the high
* word is cleared.
+ *
+ * dmar_pte_update updates the pte. For amd64, the update is atomic.
+ * For i386, it first disables the entry by clearing the word
+ * containing the P bit, and then defer to dmar_pte_store. The locked
+ * cmpxchg8b is probably available on any machine having DMAR support,
+ * but interrupt translation table may be mapped uncached.
*/
static inline void
-dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+dmar_pte_store1(volatile uint64_t *dst, uint64_t val)
{
-
- KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
- dst, (uintmax_t)*dst, (uintmax_t)val));
#ifdef __i386__
volatile uint32_t *p;
uint32_t hi, lo;
@@ -397,6 +465,28 @@
}
static inline void
+dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+{
+
+ KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
+ dst, (uintmax_t)*dst, (uintmax_t)val));
+ dmar_pte_store1(dst, val);
+}
+
+static inline void
+dmar_pte_update(volatile uint64_t *dst, uint64_t val)
+{
+
+#ifdef __i386__
+ volatile uint32_t *p;
+
+ p = (volatile uint32_t *)dst;
+ *p = 0;
+#endif
+ dmar_pte_store1(dst, val);
+}
+
+static inline void
dmar_pte_clear(volatile uint64_t *dst)
{
#ifdef __i386__
@@ -420,6 +510,36 @@
return (start + size <= ((start + boundary) & ~(boundary - 1)));
}
+extern struct timespec dmar_hw_timeout;
+
+#define DMAR_WAIT_UNTIL(cond) \
+{ \
+ struct timespec last, curr; \
+ bool forever; \
+ \
+ if (dmar_hw_timeout.tv_sec == 0 && \
+ dmar_hw_timeout.tv_nsec == 0) { \
+ forever = true; \
+ } else { \
+ forever = false; \
+ nanouptime(&curr); \
+ last = curr; \
+ timespecadd(&last, &dmar_hw_timeout); \
+ } \
+ for (;;) { \
+ if (cond) { \
+ error = 0; \
+ break; \
+ } \
+ nanouptime(&curr); \
+ if (!forever && timespeccmp(&last, &curr, <)) { \
+ error = ETIMEDOUT; \
+ break; \
+ } \
+ cpu_spinwait(); \
+ } \
+}
+
#ifdef INVARIANTS
#define TD_PREP_PINNED_ASSERT \
int old_td_pinned; \
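[Illustration, not part of the commit] The DMAR_WAIT_UNTIL() macro added in this header is intended for register-polling helpers elsewhere in the driver: it expects a local "error" variable and spins until the condition holds or the hw.dmar.timeout bound (when non-zero) expires. A hypothetical caller could look like the sketch below; DMAR_GSTS_TES and dmar_read4() are assumed from intel_reg.h / intel_dmar.h and are not shown in this diff excerpt.

/* Illustration only: poll the Translation Enable Status bit. */
static int
dmar_wait_tes_example(struct dmar_unit *unit)
{
	int error;

	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) &
	    DMAR_GSTS_TES) != 0));
	return (error);
}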
Modified: trunk/sys/x86/iommu/intel_drv.c
===================================================================
--- trunk/sys/x86/iommu/intel_drv.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_drv.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
/* $MidnightBSD$ */
/*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,10 +29,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_drv.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_drv.c 323921 2017-09-22 10:51:32Z kib $");
#include "opt_acpi.h"
-#if defined(__amd64__) /* || defined(__ia64__) */
+#if defined(__amd64__)
#define DEV_APIC
#else
#include "opt_apic.h"
@@ -51,6 +51,7 @@
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
+#include <sys/vmem.h>
#include <machine/bus.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
@@ -66,10 +67,14 @@
#include <x86/iommu/intel_reg.h>
#include <x86/iommu/busdma_dmar.h>
#include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#ifdef DEV_APIC
#include "pcib_if.h"
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
#endif
#define DMAR_FAULT_IRQ_RID 0
@@ -108,6 +113,7 @@
if (!iter(dmarh, arg))
break;
}
+ AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
}
struct find_iter_args {
@@ -183,6 +189,7 @@
(unsigned)dmartbl->Flags,
"\020\001INTR_REMAP\002X2APIC_OPT_OUT");
}
+ AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
dmar_iterate_tbl(dmar_count_iter, NULL);
if (dmar_devcnt == 0)
@@ -244,6 +251,7 @@
int i;
dmar_fini_busdma(unit);
+ dmar_fini_irt(unit);
dmar_fini_qi(unit);
dmar_fini_fault_log(unit);
for (i = 0; i < DMAR_INTR_TOTAL; i++)
@@ -304,7 +312,7 @@
dmd->name, error);
goto err4;
}
- bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, dmd->name);
+ bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, "%s", dmd->name);
error = PCIB_MAP_MSI(pcib, dev, dmd->irq, &msi_addr, &msi_data);
if (error != 0) {
device_printf(dev, "cannot map %s interrupt, %d\n",
@@ -398,6 +406,7 @@
{
struct dmar_unit *unit;
ACPI_DMAR_HARDWARE_UNIT *dmaru;
+ uint64_t timeout;
int i, error;
unit = device_get_softc(dev);
@@ -422,6 +431,10 @@
dmar_print_caps(dev, unit, dmaru);
dmar_quirks_post_ident(unit);
+ timeout = dmar_get_timeout();
+ TUNABLE_UINT64_FETCH("hw.dmar.timeout", &timeout);
+ dmar_update_timeout(timeout);
+
for (i = 0; i < DMAR_INTR_TOTAL; i++)
unit->intrs[i].irq = -1;
@@ -457,6 +470,7 @@
mtx_init(&unit->lock, "dmarhw", NULL, MTX_DEF);
unit->domids = new_unrhdr(0, dmar_nd2mask(DMAR_CAP_ND(unit->hw_cap)),
&unit->lock);
+ LIST_INIT(&unit->domains);
/*
* 9.2 "Context Entry":
@@ -510,6 +524,11 @@
dmar_release_resources(dev, unit);
return (error);
}
+ error = dmar_init_irt(unit);
+ if (error != 0) {
+ dmar_release_resources(dev, unit);
+ return (error);
+ }
error = dmar_init_busdma(unit);
if (error != 0) {
dmar_release_resources(dev, unit);
@@ -764,8 +783,87 @@
return (device_get_softc(dmar_dev));
}
+static struct dmar_unit *
+dmar_find_nonpci(u_int id, u_int entry_type, uint16_t *rid)
+{
+ device_t dmar_dev;
+ struct dmar_unit *unit;
+ ACPI_DMAR_HARDWARE_UNIT *dmarh;
+ ACPI_DMAR_DEVICE_SCOPE *devscope;
+ ACPI_DMAR_PCI_PATH *path;
+ char *ptr, *ptrend;
+#ifdef DEV_APIC
+ int error;
+#endif
+ int i;
+
+ for (i = 0; i < dmar_devcnt; i++) {
+ dmar_dev = dmar_devs[i];
+ if (dmar_dev == NULL)
+ continue;
+ unit = (struct dmar_unit *)device_get_softc(dmar_dev);
+ dmarh = dmar_find_by_index(i);
+ if (dmarh == NULL)
+ continue;
+ ptr = (char *)dmarh + sizeof(*dmarh);
+ ptrend = (char *)dmarh + dmarh->Header.Length;
+ for (;;) {
+ if (ptr >= ptrend)
+ break;
+ devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr;
+ ptr += devscope->Length;
+ if (devscope->EntryType != entry_type)
+ continue;
+ if (devscope->EnumerationId != id)
+ continue;
+#ifdef DEV_APIC
+ if (entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+ error = ioapic_get_rid(id, rid);
+ /*
+ * If our IOAPIC has PCI bindings then
+ * use the PCI device rid.
+ */
+ if (error == 0)
+ return (unit);
+ }
+#endif
+ if (devscope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE)
+ == 2) {
+ if (rid != NULL) {
+ path = (ACPI_DMAR_PCI_PATH *)
+ (devscope + 1);
+ *rid = PCI_RID(devscope->Bus,
+ path->Device, path->Function);
+ }
+ return (unit);
+ }
+ printf(
+ "dmar_find_nonpci: id %d type %d path length != 2\n",
+ id, entry_type);
+ break;
+ }
+ }
+ return (NULL);
+}
+
+
+struct dmar_unit *
+dmar_find_hpet(device_t dev, uint16_t *rid)
+{
+
+ return (dmar_find_nonpci(hpet_get_uid(dev), ACPI_DMAR_SCOPE_TYPE_HPET,
+ rid));
+}
+
+struct dmar_unit *
+dmar_find_ioapic(u_int apic_id, uint16_t *rid)
+{
+
+ return (dmar_find_nonpci(apic_id, ACPI_DMAR_SCOPE_TYPE_IOAPIC, rid));
+}
+
struct rmrr_iter_args {
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
device_t dev;
int dev_domain;
int dev_busno;
@@ -810,7 +908,8 @@
if (match == 1) {
if (dmar_match_verbose)
printf("matched\n");
- entry = dmar_gas_alloc_entry(ria->ctx, DMAR_PGF_WAITOK);
+ entry = dmar_gas_alloc_entry(ria->domain,
+ DMAR_PGF_WAITOK);
entry->start = resmem->BaseAddress;
/* The RMRR entry end address is inclusive. */
entry->end = resmem->EndAddress;
@@ -825,7 +924,7 @@
}
void
-dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
struct dmar_map_entries_tailq *rmrr_entries)
{
struct rmrr_iter_args ria;
@@ -841,7 +940,7 @@
dev_path);
}
- ria.ctx = ctx;
+ ria.domain = domain;
ria.dev = dev;
ria.dev_path = dev_path;
ria.rmrr_entries = rmrr_entries;
@@ -961,7 +1060,7 @@
printf("dmar%d: instantiating RMRR contexts\n", dmar->unit);
dmar_iterate_tbl(dmar_inst_rmrr_iter, &iria);
DMAR_LOCK(dmar);
- if (!LIST_EMPTY(&dmar->contexts)) {
+ if (!LIST_EMPTY(&dmar->domains)) {
KASSERT((dmar->hw_gcmd & DMAR_GCMD_TE) == 0,
("dmar%d: RMRR not handled but translation is already enabled",
dmar->unit));
@@ -976,7 +1075,7 @@
#include <ddb/db_lex.h>
static void
-dmar_print_ctx_entry(const struct dmar_map_entry *entry)
+dmar_print_domain_entry(const struct dmar_map_entry *entry)
{
struct dmar_map_entry *l, *r;
@@ -1000,24 +1099,39 @@
}
static void
-dmar_print_ctx(struct dmar_ctx *ctx, bool show_mappings)
+dmar_print_ctx(struct dmar_ctx *ctx)
{
- struct dmar_map_entry *entry;
db_printf(
- " @%p pci%d:%d:%d dom %d mgaw %d agaw %d pglvl %d end %jx\n"
- " refs %d flags %x pgobj %p map_ents %u loads %lu unloads %lu\n",
+ " @%p pci%d:%d:%d refs %d flags %x loads %lu unloads %lu\n",
ctx, pci_get_bus(ctx->ctx_tag.owner),
pci_get_slot(ctx->ctx_tag.owner),
- pci_get_function(ctx->ctx_tag.owner), ctx->domain, ctx->mgaw,
- ctx->agaw, ctx->pglvl, (uintmax_t)ctx->end, ctx->refs,
- ctx->flags, ctx->pgtbl_obj, ctx->entries_cnt, ctx->loads,
- ctx->unloads);
+ pci_get_function(ctx->ctx_tag.owner), ctx->refs, ctx->flags,
+ ctx->loads, ctx->unloads);
+}
+
+static void
+dmar_print_domain(struct dmar_domain *domain, bool show_mappings)
+{
+ struct dmar_map_entry *entry;
+ struct dmar_ctx *ctx;
+
+ db_printf(
+ " @%p dom %d mgaw %d agaw %d pglvl %d end %jx refs %d\n"
+ " ctx_cnt %d flags %x pgobj %p map_ents %u\n",
+ domain, domain->domain, domain->mgaw, domain->agaw, domain->pglvl,
+ (uintmax_t)domain->end, domain->refs, domain->ctx_cnt,
+ domain->flags, domain->pgtbl_obj, domain->entries_cnt);
+ if (!LIST_EMPTY(&domain->contexts)) {
+ db_printf(" Contexts:\n");
+ LIST_FOREACH(ctx, &domain->contexts, link)
+ dmar_print_ctx(ctx);
+ }
if (!show_mappings)
return;
db_printf(" mapped:\n");
- RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
- dmar_print_ctx_entry(entry);
+ RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+ dmar_print_domain_entry(entry);
if (db_pager_quit)
break;
}
@@ -1024,19 +1138,20 @@
if (db_pager_quit)
return;
db_printf(" unloading:\n");
- TAILQ_FOREACH(entry, &ctx->unload_entries, dmamap_link) {
- dmar_print_ctx_entry(entry);
+ TAILQ_FOREACH(entry, &domain->unload_entries, dmamap_link) {
+ dmar_print_domain_entry(entry);
if (db_pager_quit)
break;
}
}
-DB_FUNC(dmar_ctx, db_dmar_print_ctx, db_show_table, CS_OWN, NULL)
+DB_FUNC(dmar_domain, db_dmar_print_domain, db_show_table, CS_OWN, NULL)
{
struct dmar_unit *unit;
+ struct dmar_domain *domain;
struct dmar_ctx *ctx;
bool show_mappings, valid;
- int domain, bus, device, function, i, t;
+ int pci_domain, bus, device, function, i, t;
db_expr_t radix;
valid = false;
@@ -1057,7 +1172,7 @@
show_mappings = false;
}
if (t == tNUMBER) {
- domain = db_tok_number;
+ pci_domain = db_tok_number;
t = db_read_token();
if (t == tNUMBER) {
bus = db_tok_number;
@@ -1075,19 +1190,24 @@
db_radix = radix;
db_skip_to_eol();
if (!valid) {
- db_printf("usage: show dmar_ctx [/m] "
+ db_printf("usage: show dmar_domain [/m] "
"<domain> <bus> <device> <func>\n");
return;
}
for (i = 0; i < dmar_devcnt; i++) {
unit = device_get_softc(dmar_devs[i]);
- LIST_FOREACH(ctx, &unit->contexts, link) {
- if (domain == unit->segment &&
- bus == pci_get_bus(ctx->ctx_tag.owner) &&
- device == pci_get_slot(ctx->ctx_tag.owner) &&
- function == pci_get_function(ctx->ctx_tag.owner)) {
- dmar_print_ctx(ctx, show_mappings);
- goto out;
+ LIST_FOREACH(domain, &unit->domains, link) {
+ LIST_FOREACH(ctx, &domain->contexts, link) {
+ if (pci_domain == unit->segment &&
+ bus == pci_get_bus(ctx->ctx_tag.owner) &&
+ device ==
+ pci_get_slot(ctx->ctx_tag.owner) &&
+ function ==
+ pci_get_function(ctx->ctx_tag.owner)) {
+ dmar_print_domain(domain,
+ show_mappings);
+ goto out;
+ }
}
}
}
@@ -1095,10 +1215,10 @@
}
static void
-dmar_print_one(int idx, bool show_ctxs, bool show_mappings)
+dmar_print_one(int idx, bool show_domains, bool show_mappings)
{
struct dmar_unit *unit;
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
int i, frir;
unit = device_get_softc(dmar_devs[idx]);
@@ -1110,6 +1230,10 @@
dmar_read4(unit, DMAR_GSTS_REG),
dmar_read4(unit, DMAR_FSTS_REG),
dmar_read4(unit, DMAR_FECTL_REG));
+ if (unit->ir_enabled) {
+ db_printf("ir is enabled; IRT @%p phys 0x%jx maxcnt %d\n",
+ unit->irt, (uintmax_t)unit->irt_phys, unit->irte_cnt);
+ }
db_printf("fed 0x%x fea 0x%x feua 0x%x\n",
dmar_read4(unit, DMAR_FEDATA_REG),
dmar_read4(unit, DMAR_FEADDR_REG),
@@ -1148,10 +1272,10 @@
db_printf("qi is disabled\n");
}
}
- if (show_ctxs) {
- db_printf("contexts:\n");
- LIST_FOREACH(ctx, &unit->contexts, link) {
- dmar_print_ctx(ctx, show_mappings);
+ if (show_domains) {
+ db_printf("domains:\n");
+ LIST_FOREACH(domain, &unit->domains, link) {
+ dmar_print_domain(domain, show_mappings);
if (db_pager_quit)
break;
}
@@ -1160,27 +1284,27 @@
DB_SHOW_COMMAND(dmar, db_dmar_print)
{
- bool show_ctxs, show_mappings;
+ bool show_domains, show_mappings;
- show_ctxs = strchr(modif, 'c') != NULL;
+ show_domains = strchr(modif, 'd') != NULL;
show_mappings = strchr(modif, 'm') != NULL;
if (!have_addr) {
- db_printf("usage: show dmar [/c] [/m] index\n");
+ db_printf("usage: show dmar [/d] [/m] index\n");
return;
}
- dmar_print_one((int)addr, show_ctxs, show_mappings);
+ dmar_print_one((int)addr, show_domains, show_mappings);
}
DB_SHOW_ALL_COMMAND(dmars, db_show_all_dmars)
{
int i;
- bool show_ctxs, show_mappings;
+ bool show_domains, show_mappings;
- show_ctxs = strchr(modif, 'c') != NULL;
+ show_domains = strchr(modif, 'd') != NULL;
show_mappings = strchr(modif, 'm') != NULL;
for (i = 0; i < dmar_devcnt; i++) {
- dmar_print_one(i, show_ctxs, show_mappings);
+ dmar_print_one(i, show_domains, show_mappings);
if (db_pager_quit)
break;
}
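
The ddb hunks above replace the per-context debugger command with a per-domain one. Going by the usage strings added in this revision, the inspection commands now read as follows; the third line is only an extrapolation from DB_SHOW_ALL_COMMAND(dmars, ...) taking the same modifiers and is not spelled out in the patch:

    show dmar_domain [/m] <domain> <bus> <device> <func>
    show dmar [/d] [/m] <index>
    show all dmars [/d] [/m]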
Modified: trunk/sys/x86/iommu/intel_fault.c
===================================================================
--- trunk/sys/x86/iommu/intel_fault.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_fault.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_fault.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_fault.c 309882 2016-12-12 09:43:48Z kib $");
#include "opt_acpi.h"
@@ -42,6 +42,7 @@
#include <sys/rman.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
+#include <sys/vmem.h>
#include <machine/bus.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
@@ -179,7 +180,7 @@
}
if (enqueue) {
- taskqueue_enqueue_fast(unit->fault_taskqueue,
+ taskqueue_enqueue(unit->fault_taskqueue,
&unit->fault_task);
}
return (FILTER_HANDLED);
@@ -271,7 +272,7 @@
M_DEVBUF, M_WAITOK | M_ZERO);
TASK_INIT(&unit->fault_task, 0, dmar_fault_task, unit);
- unit->fault_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+ unit->fault_taskqueue = taskqueue_create_fast("dmarff", M_WAITOK,
taskqueue_thread_enqueue, &unit->fault_taskqueue);
taskqueue_start_threads(&unit->fault_taskqueue, 1, PI_AV,
"dmar%d fault taskq", unit->unit);
Modified: trunk/sys/x86/iommu/intel_gas.c
===================================================================
--- trunk/sys/x86/iommu/intel_gas.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_gas.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_gas.c 281545 2015-04-15 06:56:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_gas.c 329942 2018-02-25 00:32:42Z markj $");
#define RB_AUGMENT(entry) dmar_gas_augment_entry(entry)
@@ -50,6 +50,7 @@
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
+#include <sys/vmem.h>
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -79,12 +80,12 @@
dmar_map_entry_zone = uma_zcreate("DMAR_MAP_ENTRY",
sizeof(struct dmar_map_entry), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, 0);
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
}
SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
struct dmar_map_entry *
-dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags)
+dmar_gas_alloc_entry(struct dmar_domain *domain, u_int flags)
{
struct dmar_map_entry *res;
@@ -94,20 +95,20 @@
res = uma_zalloc(dmar_map_entry_zone, ((flags & DMAR_PGF_WAITOK) !=
0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
if (res != NULL) {
- res->ctx = ctx;
- atomic_add_int(&ctx->entries_cnt, 1);
+ res->domain = domain;
+ atomic_add_int(&domain->entries_cnt, 1);
}
return (res);
}
void
-dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_entry(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
- KASSERT(ctx == entry->ctx,
- ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
- entry, entry->ctx));
- atomic_subtract_int(&ctx->entries_cnt, 1);
+ KASSERT(domain == entry->domain,
+ ("mismatched free domain %p entry %p entry->domain %p", domain,
+ entry, entry->domain));
+ atomic_subtract_int(&domain->entries_cnt, 1);
uma_zfree(dmar_map_entry_zone, entry);
}
@@ -158,12 +159,12 @@
dmar_gas_cmp_entries);
static void
-dmar_gas_fix_free(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_fix_free(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
struct dmar_map_entry *next;
- next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
- entry->free_after = (next != NULL ? next->start : ctx->end) -
+ next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+ entry->free_after = (next != NULL ? next->start : domain->end) -
entry->end;
dmar_gas_augment_entry(entry);
}
@@ -170,18 +171,18 @@
#ifdef INVARIANTS
static void
-dmar_gas_check_free(struct dmar_ctx *ctx)
+dmar_gas_check_free(struct dmar_domain *domain)
{
struct dmar_map_entry *entry, *next, *l, *r;
dmar_gaddr_t v;
- RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
- KASSERT(ctx == entry->ctx,
- ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
- entry, entry->ctx));
- next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+ RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+ KASSERT(domain == entry->domain,
+ ("mismatched free domain %p entry %p entry->domain %p",
+ domain, entry, entry->domain));
+ next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
if (next == NULL) {
- MPASS(entry->free_after == ctx->end - entry->end);
+ MPASS(entry->free_after == domain->end - entry->end);
} else {
MPASS(entry->free_after = next->start - entry->end);
MPASS(entry->end <= next->start);
@@ -198,7 +199,7 @@
l->free_down));
} else {
v = MAX(entry->free_after, l->free_down);
- v = MAX(entry->free_down, r->free_down);
+ v = MAX(v, r->free_down);
MPASS(entry->free_down == v);
}
}
@@ -206,93 +207,95 @@
#endif
static bool
-dmar_gas_rb_insert(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_insert(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
struct dmar_map_entry *prev, *found;
- found = RB_INSERT(dmar_gas_entries_tree, &ctx->rb_root, entry);
- dmar_gas_fix_free(ctx, entry);
- prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
+ found = RB_INSERT(dmar_gas_entries_tree, &domain->rb_root, entry);
+ dmar_gas_fix_free(domain, entry);
+ prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
if (prev != NULL)
- dmar_gas_fix_free(ctx, prev);
+ dmar_gas_fix_free(domain, prev);
return (found == NULL);
}
static void
-dmar_gas_rb_remove(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_remove(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
struct dmar_map_entry *prev;
- prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
- RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
+ prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+ RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
if (prev != NULL)
- dmar_gas_fix_free(ctx, prev);
+ dmar_gas_fix_free(domain, prev);
}
void
-dmar_gas_init_ctx(struct dmar_ctx *ctx)
+dmar_gas_init_domain(struct dmar_domain *domain)
{
struct dmar_map_entry *begin, *end;
- begin = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
- end = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+ begin = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
+ end = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
- DMAR_CTX_LOCK(ctx);
- KASSERT(ctx->entries_cnt == 2, ("dirty ctx %p", ctx));
- KASSERT(RB_EMPTY(&ctx->rb_root), ("non-empty entries %p", ctx));
+ DMAR_DOMAIN_LOCK(domain);
+ KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
+ KASSERT(RB_EMPTY(&domain->rb_root), ("non-empty entries %p", domain));
begin->start = 0;
begin->end = DMAR_PAGE_SIZE;
- begin->free_after = ctx->end - begin->end;
+ begin->free_after = domain->end - begin->end;
begin->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
- dmar_gas_rb_insert(ctx, begin);
+ dmar_gas_rb_insert(domain, begin);
- end->start = ctx->end;
- end->end = ctx->end;
+ end->start = domain->end;
+ end->end = domain->end;
end->free_after = 0;
end->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
- dmar_gas_rb_insert(ctx, end);
+ dmar_gas_rb_insert(domain, end);
- ctx->first_place = begin;
- ctx->last_place = end;
- DMAR_CTX_UNLOCK(ctx);
+ domain->first_place = begin;
+ domain->last_place = end;
+ domain->flags |= DMAR_DOMAIN_GAS_INITED;
+ DMAR_DOMAIN_UNLOCK(domain);
}
void
-dmar_gas_fini_ctx(struct dmar_ctx *ctx)
+dmar_gas_fini_domain(struct dmar_domain *domain)
{
struct dmar_map_entry *entry, *entry1;
- DMAR_CTX_ASSERT_LOCKED(ctx);
- KASSERT(ctx->entries_cnt == 2, ("ctx still in use %p", ctx));
+ DMAR_DOMAIN_ASSERT_LOCKED(domain);
+ KASSERT(domain->entries_cnt == 2, ("domain still in use %p", domain));
- entry = RB_MIN(dmar_gas_entries_tree, &ctx->rb_root);
- KASSERT(entry->start == 0, ("start entry start %p", ctx));
- KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", ctx));
+ entry = RB_MIN(dmar_gas_entries_tree, &domain->rb_root);
+ KASSERT(entry->start == 0, ("start entry start %p", domain));
+ KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", domain));
KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
- ("start entry flags %p", ctx));
- RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
- dmar_gas_free_entry(ctx, entry);
+ ("start entry flags %p", domain));
+ RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+ dmar_gas_free_entry(domain, entry);
- entry = RB_MAX(dmar_gas_entries_tree, &ctx->rb_root);
- KASSERT(entry->start == ctx->end, ("end entry start %p", ctx));
- KASSERT(entry->end == ctx->end, ("end entry end %p", ctx));
- KASSERT(entry->free_after == 0, ("end entry free_after%p", ctx));
+ entry = RB_MAX(dmar_gas_entries_tree, &domain->rb_root);
+ KASSERT(entry->start == domain->end, ("end entry start %p", domain));
+ KASSERT(entry->end == domain->end, ("end entry end %p", domain));
+ KASSERT(entry->free_after == 0, ("end entry free_after %p", domain));
KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
- ("end entry flags %p", ctx));
- RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
- dmar_gas_free_entry(ctx, entry);
+ ("end entry flags %p", domain));
+ RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+ dmar_gas_free_entry(domain, entry);
- RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &ctx->rb_root, entry1) {
+ RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &domain->rb_root,
+ entry1) {
KASSERT((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0,
- ("non-RMRR entry left %p", ctx));
- RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
- dmar_gas_free_entry(ctx, entry);
+ ("non-RMRR entry left %p", domain));
+ RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+ dmar_gas_free_entry(domain, entry);
}
}
struct dmar_gas_match_args {
- struct dmar_ctx *ctx;
+ struct dmar_domain *domain;
dmar_gaddr_t size;
int offset;
const struct bus_dma_tag_common *common;
@@ -325,8 +328,8 @@
* the boundary. Check if there is enough space after the
* next boundary after the prev->end.
*/
- bs = (a->entry->start + a->offset + a->common->boundary) &
- ~(a->common->boundary - 1);
+ bs = rounddown2(a->entry->start + a->offset + a->common->boundary,
+ a->common->boundary);
start = roundup2(bs, a->common->alignment);
/* DMAR_PAGE_SIZE to create gap after new entry. */
if (start + a->offset + a->size + DMAR_PAGE_SIZE <=
@@ -371,12 +374,12 @@
*/
a->entry->end = a->entry->start + a->size;
- next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
+ next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, prev);
KASSERT(next->start >= a->entry->end &&
next->start - a->entry->start >= a->size &&
prev->end <= a->entry->end,
("dmar_gas_match_insert hole failed %p prev (%jx, %jx) "
- "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->ctx,
+ "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->domain,
(uintmax_t)prev->start, (uintmax_t)prev->end,
(uintmax_t)prev->free_after,
(uintmax_t)next->start, (uintmax_t)next->end,
@@ -385,19 +388,19 @@
prev->free_after = a->entry->start - prev->end;
a->entry->free_after = next->start - a->entry->end;
- found = dmar_gas_rb_insert(a->ctx, a->entry);
+ found = dmar_gas_rb_insert(a->domain, a->entry);
KASSERT(found, ("found dup %p start %jx size %jx",
- a->ctx, (uintmax_t)a->entry->start, (uintmax_t)a->size));
+ a->domain, (uintmax_t)a->entry->start, (uintmax_t)a->size));
a->entry->flags = DMAR_MAP_ENTRY_MAP;
- KASSERT(RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root,
+ KASSERT(RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root,
a->entry) == prev,
("entry %p prev %p inserted prev %p", a->entry, prev,
- RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
- KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root,
+ RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
+ KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
a->entry) == next,
("entry %p next %p inserted next %p", a->entry, next,
- RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
+ RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
}
static int
@@ -434,11 +437,12 @@
struct dmar_map_entry *next, *prev, find_entry;
find_entry.start = a->common->highaddr;
- next = RB_NFIND(dmar_gas_entries_tree, &a->ctx->rb_root, &find_entry);
+ next = RB_NFIND(dmar_gas_entries_tree, &a->domain->rb_root,
+ &find_entry);
if (next == NULL)
return (ENOMEM);
- prev = RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, next);
- KASSERT(prev != NULL, ("no prev %p %jx", a->ctx,
+ prev = RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, next);
+ KASSERT(prev != NULL, ("no prev %p %jx", a->domain,
(uintmax_t)find_entry.start));
for (;;) {
a->entry->start = prev->start + DMAR_PAGE_SIZE;
@@ -446,7 +450,7 @@
a->entry->start = a->common->highaddr;
a->entry->start = roundup2(a->entry->start,
a->common->alignment);
- if (dmar_gas_match_one(a, prev, a->ctx->end)) {
+ if (dmar_gas_match_one(a, prev, a->domain->end)) {
dmar_gas_match_insert(a, prev);
return (0);
}
@@ -459,16 +463,17 @@
* non-optimal way.
*/
prev = next;
- next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
- KASSERT(next != NULL, ("no next %p %jx", a->ctx,
+ next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
+ prev);
+ KASSERT(next != NULL, ("no next %p %jx", a->domain,
(uintmax_t)find_entry.start));
- if (next->end >= a->ctx->end)
+ if (next->end >= a->domain->end)
return (ENOMEM);
}
}
static int
-dmar_gas_find_space(struct dmar_ctx *ctx,
+dmar_gas_find_space(struct dmar_domain *domain,
const struct bus_dma_tag_common *common, dmar_gaddr_t size,
int offset, u_int flags, struct dmar_map_entry *entry)
{
@@ -475,11 +480,11 @@
struct dmar_gas_match_args a;
int error;
- DMAR_CTX_ASSERT_LOCKED(ctx);
- KASSERT(entry->flags == 0, ("dirty entry %p %p", ctx, entry));
+ DMAR_DOMAIN_ASSERT_LOCKED(domain);
+ KASSERT(entry->flags == 0, ("dirty entry %p %p", domain, entry));
KASSERT((size & DMAR_PAGE_MASK) == 0, ("size %jx", (uintmax_t)size));
- a.ctx = ctx;
+ a.domain = domain;
a.size = size;
a.offset = offset;
a.common = common;
@@ -488,7 +493,7 @@
/* Handle lower region. */
if (common->lowaddr > 0) {
- error = dmar_gas_lowermatch(&a, RB_ROOT(&ctx->rb_root));
+ error = dmar_gas_lowermatch(&a, RB_ROOT(&domain->rb_root));
if (error == 0)
return (0);
KASSERT(error == ENOMEM,
@@ -495,7 +500,7 @@
("error %d from dmar_gas_lowermatch", error));
}
/* Handle upper region. */
- if (common->highaddr >= ctx->end)
+ if (common->highaddr >= domain->end)
return (ENOMEM);
error = dmar_gas_uppermatch(&a);
KASSERT(error == ENOMEM,
@@ -504,13 +509,13 @@
}
static int
-dmar_gas_alloc_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_alloc_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
u_int flags)
{
struct dmar_map_entry *next, *prev;
bool found;
- DMAR_CTX_ASSERT_LOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_LOCKED(domain);
if ((entry->start & DMAR_PAGE_MASK) != 0 ||
(entry->end & DMAR_PAGE_MASK) != 0)
@@ -517,13 +522,13 @@
return (EINVAL);
if (entry->start >= entry->end)
return (EINVAL);
- if (entry->end >= ctx->end)
+ if (entry->end >= domain->end)
return (EINVAL);
- next = RB_NFIND(dmar_gas_entries_tree, &ctx->rb_root, entry);
- KASSERT(next != NULL, ("next must be non-null %p %jx", ctx,
+ next = RB_NFIND(dmar_gas_entries_tree, &domain->rb_root, entry);
+ KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
(uintmax_t)entry->start));
- prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, next);
+ prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, next);
/* prev could be NULL */
/*
@@ -551,23 +556,23 @@
if (prev != NULL && prev->end > entry->start) {
/* This assumes that prev is the placeholder entry. */
- dmar_gas_rb_remove(ctx, prev);
+ dmar_gas_rb_remove(domain, prev);
prev = NULL;
}
if (next != NULL && next->start < entry->end) {
- dmar_gas_rb_remove(ctx, next);
+ dmar_gas_rb_remove(domain, next);
next = NULL;
}
- found = dmar_gas_rb_insert(ctx, entry);
+ found = dmar_gas_rb_insert(domain, entry);
KASSERT(found, ("found RMRR dup %p start %jx end %jx",
- ctx, (uintmax_t)entry->start, (uintmax_t)entry->end));
+ domain, (uintmax_t)entry->start, (uintmax_t)entry->end));
entry->flags = DMAR_MAP_ENTRY_RMRR;
#ifdef INVARIANTS
struct dmar_map_entry *ip, *in;
- ip = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
- in = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+ ip = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+ in = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
KASSERT(prev == NULL || ip == prev,
("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
entry, entry->start, entry->end, prev,
@@ -584,47 +589,47 @@
}
void
-dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_space(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
- DMAR_CTX_ASSERT_LOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_LOCKED(domain);
KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_MAP,
- ("permanent entry %p %p", ctx, entry));
+ ("permanent entry %p %p", domain, entry));
- dmar_gas_rb_remove(ctx, entry);
+ dmar_gas_rb_remove(domain, entry);
entry->flags &= ~DMAR_MAP_ENTRY_MAP;
#ifdef INVARIANTS
if (dmar_check_free)
- dmar_gas_check_free(ctx);
+ dmar_gas_check_free(domain);
#endif
}
void
-dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_region(struct dmar_domain *domain, struct dmar_map_entry *entry)
{
struct dmar_map_entry *next, *prev;
- DMAR_CTX_ASSERT_LOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_LOCKED(domain);
KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_RMRR,
- ("non-RMRR entry %p %p", ctx, entry));
+ ("non-RMRR entry %p %p", domain, entry));
- prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
- next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
- dmar_gas_rb_remove(ctx, entry);
+ prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+ next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+ dmar_gas_rb_remove(domain, entry);
entry->flags &= ~DMAR_MAP_ENTRY_RMRR;
if (prev == NULL)
- dmar_gas_rb_insert(ctx, ctx->first_place);
+ dmar_gas_rb_insert(domain, domain->first_place);
if (next == NULL)
- dmar_gas_rb_insert(ctx, ctx->last_place);
+ dmar_gas_rb_insert(domain, domain->last_place);
}
int
-dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
- dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
- struct dmar_map_entry **res)
+dmar_gas_map(struct dmar_domain *domain,
+ const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+ u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res)
{
struct dmar_map_entry *entry;
int error;
@@ -632,29 +637,31 @@
KASSERT((flags & ~(DMAR_GM_CANWAIT | DMAR_GM_CANSPLIT)) == 0,
("invalid flags 0x%x", flags));
- entry = dmar_gas_alloc_entry(ctx, (flags & DMAR_GM_CANWAIT) != 0 ?
+ entry = dmar_gas_alloc_entry(domain, (flags & DMAR_GM_CANWAIT) != 0 ?
DMAR_PGF_WAITOK : 0);
if (entry == NULL)
return (ENOMEM);
- DMAR_CTX_LOCK(ctx);
- error = dmar_gas_find_space(ctx, common, size, offset, flags, entry);
+ DMAR_DOMAIN_LOCK(domain);
+ error = dmar_gas_find_space(domain, common, size, offset, flags,
+ entry);
if (error == ENOMEM) {
- DMAR_CTX_UNLOCK(ctx);
- dmar_gas_free_entry(ctx, entry);
+ DMAR_DOMAIN_UNLOCK(domain);
+ dmar_gas_free_entry(domain, entry);
return (error);
}
#ifdef INVARIANTS
if (dmar_check_free)
- dmar_gas_check_free(ctx);
+ dmar_gas_check_free(domain);
#endif
KASSERT(error == 0,
("unexpected error %d from dmar_gas_find_entry", error));
- KASSERT(entry->end < ctx->end, ("allocated GPA %jx, max GPA %jx",
- (uintmax_t)entry->end, (uintmax_t)ctx->end));
+ KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
+ (uintmax_t)entry->end, (uintmax_t)domain->end));
entry->flags |= eflags;
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
- error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, ma,
+ error = domain_map_buf(domain, entry->start, entry->end - entry->start,
+ ma,
((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) |
@@ -661,11 +668,11 @@
((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
(flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
if (error == ENOMEM) {
- dmar_ctx_unload_entry(entry, true);
+ dmar_domain_unload_entry(entry, true);
return (error);
}
KASSERT(error == 0,
- ("unexpected error %d from ctx_map_buf", error));
+ ("unexpected error %d from domain_map_buf", error));
*res = entry;
return (0);
@@ -672,30 +679,30 @@
}
int
-dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_map_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
u_int eflags, u_int flags, vm_page_t *ma)
{
dmar_gaddr_t start;
int error;
- KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", ctx,
+ KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
entry, entry->flags));
KASSERT((flags & ~(DMAR_GM_CANWAIT)) == 0,
("invalid flags 0x%x", flags));
start = entry->start;
- DMAR_CTX_LOCK(ctx);
- error = dmar_gas_alloc_region(ctx, entry, flags);
+ DMAR_DOMAIN_LOCK(domain);
+ error = dmar_gas_alloc_region(domain, entry, flags);
if (error != 0) {
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
return (error);
}
entry->flags |= eflags;
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
if (entry->end == entry->start)
return (0);
- error = ctx_map_buf(ctx, entry->start, entry->end - entry->start,
+ error = domain_map_buf(domain, entry->start, entry->end - entry->start,
ma + OFF_TO_IDX(start - entry->start),
((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
@@ -703,31 +710,31 @@
((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
(flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
if (error == ENOMEM) {
- dmar_ctx_unload_entry(entry, false);
+ dmar_domain_unload_entry(entry, false);
return (error);
}
KASSERT(error == 0,
- ("unexpected error %d from ctx_map_buf", error));
+ ("unexpected error %d from domain_map_buf", error));
return (0);
}
int
-dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
dmar_gaddr_t end)
{
struct dmar_map_entry *entry;
int error;
- entry = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+ entry = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
entry->start = start;
entry->end = end;
- DMAR_CTX_LOCK(ctx);
- error = dmar_gas_alloc_region(ctx, entry, DMAR_GM_CANWAIT);
+ DMAR_DOMAIN_LOCK(domain);
+ error = dmar_gas_alloc_region(domain, entry, DMAR_GM_CANWAIT);
if (error == 0)
entry->flags |= DMAR_MAP_ENTRY_UNMAPPED;
- DMAR_CTX_UNLOCK(ctx);
+ DMAR_DOMAIN_UNLOCK(domain);
if (error != 0)
- dmar_gas_free_entry(ctx, entry);
+ dmar_gas_free_entry(domain, entry);
return (error);
}
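
Besides the ctx-to-domain rename, two details in the intel_gas.c hunks above: the INVARIANTS check now folds both children into v before comparing against free_down, and the boundary arithmetic is expressed with the rounddown2()/roundup2() macros from <sys/param.h> instead of open-coded masking. A minimal sketch of that boundary computation, assuming (as the allocator does) power-of-two boundary and alignment; the function name is illustrative:

    #include <sys/types.h>
    #include <sys/param.h>

    /* First alignment-rounded start past the next 'boundary' crossing. */
    static uint64_t
    next_aligned_start(uint64_t start, uint64_t offset, uint64_t boundary,
        uint64_t alignment)
    {
            uint64_t bs;

            /* rounddown2(x, y) == (x & ~(y - 1)) for power-of-two y. */
            bs = rounddown2(start + offset + boundary, boundary);
            return (roundup2(bs, alignment));
    }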
Modified: trunk/sys/x86/iommu/intel_idpgtbl.c
===================================================================
--- trunk/sys/x86/iommu/intel_idpgtbl.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_idpgtbl.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_idpgtbl.c 286854 2015-08-17 18:36:16Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_idpgtbl.c 286777 2015-08-14 13:51:59Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,6 +49,7 @@
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/uio.h>
+#include <sys/vmem.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
@@ -66,8 +67,8 @@
#include <x86/iommu/busdma_dmar.h>
#include <x86/iommu/intel_dmar.h>
-static int ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
- dmar_gaddr_t size, int flags);
+static int domain_unmap_buf_locked(struct dmar_domain *domain,
+ dmar_gaddr_t base, dmar_gaddr_t size, int flags);
/*
* The cache of the identity mapping page tables for the DMARs. Using
@@ -105,7 +106,7 @@
* mapped by the page table page.
*/
static void
-ctx_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
+domain_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
dmar_gaddr_t addr)
{
vm_page_t m1;
@@ -124,7 +125,7 @@
pg_sz = pglvl_page_size(tbl->pglvl, lvl);
if (lvl != tbl->leaf) {
for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz)
- ctx_idmap_nextlvl(tbl, lvl + 1, base + i, f);
+ domain_idmap_nextlvl(tbl, lvl + 1, base + i, f);
}
VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
pte = dmar_map_pgtbl(tbl->pgtbl_obj, idx, DMAR_PGF_WAITOK, &sf);
@@ -146,7 +147,7 @@
VM_PAGE_TO_PHYS(m1)) | DMAR_PTE_R | DMAR_PTE_W;
}
}
- /* ctx_get_idmap_pgtbl flushes CPU cache if needed. */
+ /* domain_get_idmap_pgtbl flushes CPU cache if needed. */
dmar_unmap_pgtbl(sf);
VM_OBJECT_WLOCK(tbl->pgtbl_obj);
}
@@ -160,7 +161,7 @@
* maxaddr is typically mapped.
*/
vm_object_t
-ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr)
+domain_get_idmap_pgtbl(struct dmar_domain *domain, dmar_gaddr_t maxaddr)
{
struct dmar_unit *unit;
struct idpgtbl *tbl;
@@ -173,8 +174,8 @@
/*
* First, determine where to stop the paging structures.
*/
- for (i = 0; i < ctx->pglvl; i++) {
- if (i == ctx->pglvl - 1 || ctx_is_sp_lvl(ctx, i)) {
+ for (i = 0; i < domain->pglvl; i++) {
+ if (i == domain->pglvl - 1 || domain_is_sp_lvl(domain, i)) {
leaf = i;
break;
}
@@ -191,12 +192,12 @@
sx_slock(&idpgtbl_lock);
LIST_FOREACH(tbl, &idpgtbls, link) {
if (tbl->maxaddr >= maxaddr &&
- dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+ dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
tbl->leaf == leaf) {
res = tbl->pgtbl_obj;
vm_object_reference(res);
sx_sunlock(&idpgtbl_lock);
- ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+ domain->pglvl = tbl->pglvl; /* XXXKIB ? */
goto end;
}
}
@@ -210,12 +211,12 @@
sx_xlock(&idpgtbl_lock);
LIST_FOREACH(tbl, &idpgtbls, link) {
if (tbl->maxaddr >= maxaddr &&
- dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+ dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
tbl->leaf == leaf) {
res = tbl->pgtbl_obj;
vm_object_reference(res);
sx_xunlock(&idpgtbl_lock);
- ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+ domain->pglvl = tbl->pglvl; /* XXXKIB ? */
return (res);
}
}
@@ -224,13 +225,13 @@
* Still not found, create new page table.
*/
tbl = malloc(sizeof(*tbl), M_DMAR_IDPGTBL, M_WAITOK);
- tbl->pglvl = ctx->pglvl;
+ tbl->pglvl = domain->pglvl;
tbl->leaf = leaf;
tbl->maxaddr = maxaddr;
tbl->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
IDX_TO_OFF(pglvl_max_pages(tbl->pglvl)), 0, 0, NULL);
VM_OBJECT_WLOCK(tbl->pgtbl_obj);
- ctx_idmap_nextlvl(tbl, 0, 0, 0);
+ domain_idmap_nextlvl(tbl, 0, 0, 0);
VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
LIST_INSERT_HEAD(&idpgtbls, tbl, link);
res = tbl->pgtbl_obj;
@@ -251,7 +252,7 @@
* If DMAR cannot look into the chipset write buffer, flush it
* as well.
*/
- unit = ctx->dmar;
+ unit = domain->dmar;
if (!DMAR_IS_COHERENT(unit)) {
VM_OBJECT_WLOCK(res);
for (m = vm_page_lookup(res, 0); m != NULL;
@@ -320,10 +321,11 @@
* the level lvl.
*/
static int
-ctx_pgtbl_pte_off(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_pte_off(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
{
- base >>= DMAR_PAGE_SHIFT + (ctx->pglvl - lvl - 1) * DMAR_NPTEPGSHIFT;
+ base >>= DMAR_PAGE_SHIFT + (domain->pglvl - lvl - 1) *
+ DMAR_NPTEPGSHIFT;
return (base & DMAR_PTEMASK);
}
@@ -333,21 +335,24 @@
* lvl.
*/
static vm_pindex_t
-ctx_pgtbl_get_pindex(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_get_pindex(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
{
vm_pindex_t idx, pidx;
int i;
- KASSERT(lvl >= 0 && lvl < ctx->pglvl, ("wrong lvl %p %d", ctx, lvl));
+ KASSERT(lvl >= 0 && lvl < domain->pglvl,
+ ("wrong lvl %p %d", domain, lvl));
- for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx)
- idx = ctx_pgtbl_pte_off(ctx, base, i) + pidx * DMAR_NPTEPG + 1;
+ for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) {
+ idx = domain_pgtbl_pte_off(domain, base, i) +
+ pidx * DMAR_NPTEPG + 1;
+ }
return (idx);
}
static dmar_pte_t *
-ctx_pgtbl_map_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags,
- vm_pindex_t *idxp, struct sf_buf **sf)
+domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
+ int flags, vm_pindex_t *idxp, struct sf_buf **sf)
{
vm_page_t m;
struct sf_buf *sfp;
@@ -354,10 +359,10 @@
dmar_pte_t *pte, *ptep;
vm_pindex_t idx, idx1;
- DMAR_CTX_ASSERT_PGLOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
KASSERT((flags & DMAR_PGF_OBJL) != 0, ("lost PGF_OBJL"));
- idx = ctx_pgtbl_get_pindex(ctx, base, lvl);
+ idx = domain_pgtbl_get_pindex(domain, base, lvl);
if (*sf != NULL && idx == *idxp) {
pte = (dmar_pte_t *)sf_buf_kva(*sf);
} else {
@@ -365,15 +370,16 @@
dmar_unmap_pgtbl(*sf);
*idxp = idx;
retry:
- pte = dmar_map_pgtbl(ctx->pgtbl_obj, idx, flags, sf);
+ pte = dmar_map_pgtbl(domain->pgtbl_obj, idx, flags, sf);
if (pte == NULL) {
- KASSERT(lvl > 0, ("lost root page table page %p", ctx));
+ KASSERT(lvl > 0,
+ ("lost root page table page %p", domain));
/*
* Page table page does not exist, allocate
* it and create a pte in the preceeding page level
* to reference the allocated page table page.
*/
- m = dmar_pgalloc(ctx->pgtbl_obj, idx, flags |
+ m = dmar_pgalloc(domain->pgtbl_obj, idx, flags |
DMAR_PGF_ZERO);
if (m == NULL)
return (NULL);
@@ -381,25 +387,26 @@
/*
* Prevent potential free while pgtbl_obj is
* unlocked in the recursive call to
- * ctx_pgtbl_map_pte(), if other thread did
- * pte write and clean while the lock if
+ * domain_pgtbl_map_pte(), if other thread did
+ * pte write and clean while the lock is
* dropped.
*/
m->wire_count++;
sfp = NULL;
- ptep = ctx_pgtbl_map_pte(ctx, base, lvl - 1, flags,
- &idx1, &sfp);
+ ptep = domain_pgtbl_map_pte(domain, base, lvl - 1,
+ flags, &idx1, &sfp);
if (ptep == NULL) {
KASSERT(m->pindex != 0,
- ("loosing root page %p", ctx));
+ ("loosing root page %p", domain));
m->wire_count--;
- dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
+ dmar_pgfree(domain->pgtbl_obj, m->pindex,
+ flags);
return (NULL);
}
dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W |
VM_PAGE_TO_PHYS(m));
- dmar_flush_pte_to_ram(ctx->dmar, ptep);
+ dmar_flush_pte_to_ram(domain->dmar, ptep);
sf_buf_page(sfp)->wire_count += 1;
m->wire_count--;
dmar_unmap_pgtbl(sfp);
@@ -407,13 +414,13 @@
goto retry;
}
}
- pte += ctx_pgtbl_pte_off(ctx, base, lvl);
+ pte += domain_pgtbl_pte_off(domain, base, lvl);
return (pte);
}
static int
-ctx_map_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
- vm_page_t *ma, uint64_t pflags, int flags)
+domain_map_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags)
{
dmar_pte_t *pte;
struct sf_buf *sf;
@@ -422,7 +429,7 @@
int lvl;
bool superpage;
- DMAR_CTX_ASSERT_PGLOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
base1 = base;
size1 = size;
@@ -432,15 +439,15 @@
for (sf = NULL, pi = 0; size > 0; base += pg_sz, size -= pg_sz,
pi += run_sz) {
for (lvl = 0, c = 0, superpage = false;; lvl++) {
- pg_sz = ctx_page_size(ctx, lvl);
+ pg_sz = domain_page_size(domain, lvl);
run_sz = pg_sz >> DMAR_PAGE_SHIFT;
- if (lvl == ctx->pglvl - 1)
+ if (lvl == domain->pglvl - 1)
break;
/*
* Check if the current base suitable for the
* superpage mapping. First, verify the level.
*/
- if (!ctx_is_sp_lvl(ctx, lvl))
+ if (!domain_is_sp_lvl(domain, lvl))
continue;
/*
* Next, look at the size of the mapping and
@@ -464,22 +471,23 @@
}
}
KASSERT(size >= pg_sz,
- ("mapping loop overflow %p %jx %jx %jx", ctx,
+ ("mapping loop overflow %p %jx %jx %jx", domain,
(uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
KASSERT(pg_sz > 0, ("pg_sz 0 lvl %d", lvl));
- pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
+ pte = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
if (pte == NULL) {
KASSERT((flags & DMAR_PGF_WAITOK) == 0,
- ("failed waitable pte alloc %p", ctx));
+ ("failed waitable pte alloc %p", domain));
if (sf != NULL)
dmar_unmap_pgtbl(sf);
- ctx_unmap_buf_locked(ctx, base1, base - base1, flags);
+ domain_unmap_buf_locked(domain, base1, base - base1,
+ flags);
TD_PINNED_ASSERT;
return (ENOMEM);
}
dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags |
(superpage ? DMAR_PTE_SP : 0));
- dmar_flush_pte_to_ram(ctx->dmar, pte);
+ dmar_flush_pte_to_ram(domain->dmar, pte);
sf_buf_page(sf)->wire_count += 1;
}
if (sf != NULL)
@@ -489,32 +497,32 @@
}
int
-ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
+domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size,
vm_page_t *ma, uint64_t pflags, int flags)
{
struct dmar_unit *unit;
int error;
- unit = ctx->dmar;
+ unit = domain->dmar;
- KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
- ("modifying idmap pagetable ctx %p", ctx));
+ KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+ ("modifying idmap pagetable domain %p", domain));
KASSERT((base & DMAR_PAGE_MASK) == 0,
- ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+ ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
KASSERT((size & DMAR_PAGE_MASK) == 0,
- ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+ ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
- KASSERT(size > 0, ("zero size %p %jx %jx", ctx, (uintmax_t)base,
+ KASSERT(size > 0, ("zero size %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
- KASSERT(base < (1ULL << ctx->agaw),
- ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
- (uintmax_t)size, ctx->agaw));
- KASSERT(base + size < (1ULL << ctx->agaw),
- ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
- (uintmax_t)size, ctx->agaw));
+ KASSERT(base < (1ULL << domain->agaw),
+ ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+ (uintmax_t)size, domain->agaw));
+ KASSERT(base + size < (1ULL << domain->agaw),
+ ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+ (uintmax_t)size, domain->agaw));
KASSERT(base + size > base,
- ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+ ("size overflow %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
KASSERT((pflags & (DMAR_PTE_R | DMAR_PTE_W)) != 0,
("neither read nor write %jx", (uintmax_t)pflags));
@@ -524,21 +532,21 @@
KASSERT((pflags & DMAR_PTE_SNP) == 0 ||
(unit->hw_ecap & DMAR_ECAP_SC) != 0,
("PTE_SNP for dmar without snoop control %p %jx",
- ctx, (uintmax_t)pflags));
+ domain, (uintmax_t)pflags));
KASSERT((pflags & DMAR_PTE_TM) == 0 ||
(unit->hw_ecap & DMAR_ECAP_DI) != 0,
("PTE_TM for dmar without DIOTLB %p %jx",
- ctx, (uintmax_t)pflags));
+ domain, (uintmax_t)pflags));
KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
- DMAR_CTX_PGLOCK(ctx);
- error = ctx_map_buf_locked(ctx, base, size, ma, pflags, flags);
- DMAR_CTX_PGUNLOCK(ctx);
+ DMAR_DOMAIN_PGLOCK(domain);
+ error = domain_map_buf_locked(domain, base, size, ma, pflags, flags);
+ DMAR_DOMAIN_PGUNLOCK(domain);
if (error != 0)
return (error);
if ((unit->hw_cap & DMAR_CAP_CM) != 0)
- ctx_flush_iotlb_sync(ctx, base, size);
+ domain_flush_iotlb_sync(domain, base, size);
else if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) {
/* See 11.1 Write Buffer Flushing. */
DMAR_LOCK(unit);
@@ -548,11 +556,13 @@
return (0);
}
-static void ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base,
- int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_fs);
+static void domain_unmap_clear_pte(struct dmar_domain *domain,
+ dmar_gaddr_t base, int lvl, int flags, dmar_pte_t *pte,
+ struct sf_buf **sf, bool free_fs);
static void
-ctx_free_pgtbl_pde(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags)
+domain_free_pgtbl_pde(struct dmar_domain *domain, dmar_gaddr_t base,
+ int lvl, int flags)
{
struct sf_buf *sf;
dmar_pte_t *pde;
@@ -559,18 +569,18 @@
vm_pindex_t idx;
sf = NULL;
- pde = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
- ctx_unmap_clear_pte(ctx, base, lvl, flags, pde, &sf, true);
+ pde = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
+ domain_unmap_clear_pte(domain, base, lvl, flags, pde, &sf, true);
}
static void
-ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl,
+domain_unmap_clear_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_sf)
{
vm_page_t m;
dmar_pte_clear(&pte->pte);
- dmar_flush_pte_to_ram(ctx->dmar, pte);
+ dmar_flush_pte_to_ram(domain->dmar, pte);
m = sf_buf_page(*sf);
if (free_sf) {
dmar_unmap_pgtbl(*sf);
@@ -580,13 +590,13 @@
if (m->wire_count != 0)
return;
KASSERT(lvl != 0,
- ("lost reference (lvl) on root pg ctx %p base %jx lvl %d",
- ctx, (uintmax_t)base, lvl));
+ ("lost reference (lvl) on root pg domain %p base %jx lvl %d",
+ domain, (uintmax_t)base, lvl));
KASSERT(m->pindex != 0,
- ("lost reference (idx) on root pg ctx %p base %jx lvl %d",
- ctx, (uintmax_t)base, lvl));
- dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
- ctx_free_pgtbl_pde(ctx, base, lvl - 1, flags);
+ ("lost reference (idx) on root pg domain %p base %jx lvl %d",
+ domain, (uintmax_t)base, lvl));
+ dmar_pgfree(domain->pgtbl_obj, m->pindex, flags);
+ domain_free_pgtbl_pde(domain, base, lvl - 1, flags);
}
/*
@@ -593,7 +603,7 @@
* Assumes that the unmap is never partial.
*/
static int
-ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
+domain_unmap_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
dmar_gaddr_t size, int flags)
{
dmar_pte_t *pte;
@@ -602,26 +612,26 @@
dmar_gaddr_t pg_sz;
int lvl;
- DMAR_CTX_ASSERT_PGLOCKED(ctx);
+ DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
if (size == 0)
return (0);
- KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
- ("modifying idmap pagetable ctx %p", ctx));
+ KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+ ("modifying idmap pagetable domain %p", domain));
KASSERT((base & DMAR_PAGE_MASK) == 0,
- ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+ ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
KASSERT((size & DMAR_PAGE_MASK) == 0,
- ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+ ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
- KASSERT(base < (1ULL << ctx->agaw),
- ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
- (uintmax_t)size, ctx->agaw));
- KASSERT(base + size < (1ULL << ctx->agaw),
- ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
- (uintmax_t)size, ctx->agaw));
+ KASSERT(base < (1ULL << domain->agaw),
+ ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+ (uintmax_t)size, domain->agaw));
+ KASSERT(base + size < (1ULL << domain->agaw),
+ ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+ (uintmax_t)size, domain->agaw));
KASSERT(base + size > base,
- ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+ ("size overflow %p %jx %jx", domain, (uintmax_t)base,
(uintmax_t)size));
KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
@@ -630,26 +640,27 @@
TD_PREP_PINNED_ASSERT;
for (sf = NULL; size > 0; base += pg_sz, size -= pg_sz) {
- for (lvl = 0; lvl < ctx->pglvl; lvl++) {
- if (lvl != ctx->pglvl - 1 && !ctx_is_sp_lvl(ctx, lvl))
+ for (lvl = 0; lvl < domain->pglvl; lvl++) {
+ if (lvl != domain->pglvl - 1 &&
+ !domain_is_sp_lvl(domain, lvl))
continue;
- pg_sz = ctx_page_size(ctx, lvl);
+ pg_sz = domain_page_size(domain, lvl);
if (pg_sz > size)
continue;
- pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags,
+ pte = domain_pgtbl_map_pte(domain, base, lvl, flags,
&idx, &sf);
KASSERT(pte != NULL,
("sleeping or page missed %p %jx %d 0x%x",
- ctx, (uintmax_t)base, lvl, flags));
+ domain, (uintmax_t)base, lvl, flags));
if ((pte->pte & DMAR_PTE_SP) != 0 ||
- lvl == ctx->pglvl - 1) {
- ctx_unmap_clear_pte(ctx, base, lvl, flags,
- pte, &sf, false);
+ lvl == domain->pglvl - 1) {
+ domain_unmap_clear_pte(domain, base, lvl,
+ flags, pte, &sf, false);
break;
}
}
KASSERT(size >= pg_sz,
- ("unmapping loop overflow %p %jx %jx %jx", ctx,
+ ("unmapping loop overflow %p %jx %jx %jx", domain,
(uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
}
if (sf != NULL)
@@ -664,54 +675,58 @@
}
int
-ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
- int flags)
+domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size, int flags)
{
int error;
- DMAR_CTX_PGLOCK(ctx);
- error = ctx_unmap_buf_locked(ctx, base, size, flags);
- DMAR_CTX_PGUNLOCK(ctx);
+ DMAR_DOMAIN_PGLOCK(domain);
+ error = domain_unmap_buf_locked(domain, base, size, flags);
+ DMAR_DOMAIN_PGUNLOCK(domain);
return (error);
}
int
-ctx_alloc_pgtbl(struct dmar_ctx *ctx)
+domain_alloc_pgtbl(struct dmar_domain *domain)
{
vm_page_t m;
- KASSERT(ctx->pgtbl_obj == NULL, ("already initialized %p", ctx));
+ KASSERT(domain->pgtbl_obj == NULL,
+ ("already initialized %p", domain));
- ctx->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
- IDX_TO_OFF(pglvl_max_pages(ctx->pglvl)), 0, 0, NULL);
- DMAR_CTX_PGLOCK(ctx);
- m = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_WAITOK |
+ domain->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
+ IDX_TO_OFF(pglvl_max_pages(domain->pglvl)), 0, 0, NULL);
+ DMAR_DOMAIN_PGLOCK(domain);
+ m = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_WAITOK |
DMAR_PGF_ZERO | DMAR_PGF_OBJL);
/* No implicit free of the top level page table page. */
m->wire_count = 1;
- DMAR_CTX_PGUNLOCK(ctx);
+ DMAR_DOMAIN_PGUNLOCK(domain);
+ DMAR_LOCK(domain->dmar);
+ domain->flags |= DMAR_DOMAIN_PGTBL_INITED;
+ DMAR_UNLOCK(domain->dmar);
return (0);
}
void
-ctx_free_pgtbl(struct dmar_ctx *ctx)
+domain_free_pgtbl(struct dmar_domain *domain)
{
vm_object_t obj;
vm_page_t m;
- obj = ctx->pgtbl_obj;
+ obj = domain->pgtbl_obj;
if (obj == NULL) {
- KASSERT((ctx->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
- (ctx->flags & DMAR_CTX_IDMAP) != 0,
- ("lost pagetable object ctx %p", ctx));
+ KASSERT((domain->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
+ (domain->flags & DMAR_DOMAIN_IDMAP) != 0,
+ ("lost pagetable object domain %p", domain));
return;
}
- DMAR_CTX_ASSERT_PGLOCKED(ctx);
- ctx->pgtbl_obj = NULL;
+ DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
+ domain->pgtbl_obj = NULL;
- if ((ctx->flags & DMAR_CTX_IDMAP) != 0) {
+ if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0) {
put_idmap_pgtbl(obj);
- ctx->flags &= ~DMAR_CTX_IDMAP;
+ domain->flags &= ~DMAR_DOMAIN_IDMAP;
return;
}
@@ -724,7 +739,7 @@
}
static inline uint64_t
-ctx_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
+domain_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
{
uint64_t iotlbr;
@@ -740,7 +755,8 @@
}
void
-ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size)
+domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size)
{
struct dmar_unit *unit;
dmar_gaddr_t isize;
@@ -747,14 +763,14 @@
uint64_t iotlbr;
int am, iro;
- unit = ctx->dmar;
+ unit = domain->dmar;
KASSERT(!unit->qi_enabled, ("dmar%d: sync iotlb flush call",
unit->unit));
iro = DMAR_ECAP_IRO(unit->hw_ecap) * 16;
DMAR_LOCK(unit);
if ((unit->hw_cap & DMAR_CAP_PSI) == 0 || size > 2 * 1024 * 1024) {
- iotlbr = ctx_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
- DMAR_IOTLB_DID(ctx->domain), iro);
+ iotlbr = domain_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
+ DMAR_IOTLB_DID(domain->domain), iro);
KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
DMAR_IOTLB_IAIG_INVLD,
("dmar%d: invalidation failed %jx", unit->unit,
@@ -763,9 +779,9 @@
for (; size > 0; base += isize, size -= isize) {
am = calc_am(unit, base, size, &isize);
dmar_write8(unit, iro, base | am);
- iotlbr = ctx_wait_iotlb_flush(unit,
- DMAR_IOTLB_IIRG_PAGE | DMAR_IOTLB_DID(ctx->domain),
- iro);
+ iotlbr = domain_wait_iotlb_flush(unit,
+ DMAR_IOTLB_IIRG_PAGE |
+ DMAR_IOTLB_DID(domain->domain), iro);
KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
DMAR_IOTLB_IAIG_INVLD,
("dmar%d: PSI invalidation failed "
Added: trunk/sys/x86/iommu/intel_intrmap.c
===================================================================
--- trunk/sys/x86/iommu/intel_intrmap.c (rev 0)
+++ trunk/sys/x86/iommu/intel_intrmap.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,381 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_intrmap.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <sys/vmem.h>
+#include <machine/bus.h>
+#include <machine/intr_machdep.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <x86/include/apicreg.h>
+#include <x86/include/apicvar.h>
+#include <x86/include/busdma_impl.h>
+#include <x86/iommu/intel_reg.h>
+#include <x86/iommu/busdma_dmar.h>
+#include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcivar.h>
+#include <x86/iommu/iommu_intrmap.h>
+
+static struct dmar_unit *dmar_ir_find(device_t src, uint16_t *rid,
+ int *is_dmar);
+static void dmar_ir_program_irte(struct dmar_unit *unit, u_int idx,
+ uint64_t low, uint16_t rid);
+static int dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie);
+
+int
+iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count)
+{
+ struct dmar_unit *unit;
+ vmem_addr_t vmem_res;
+ u_int idx, i;
+ int error;
+
+ unit = dmar_ir_find(src, NULL, NULL);
+ if (unit == NULL || !unit->ir_enabled) {
+ for (i = 0; i < count; i++)
+ cookies[i] = -1;
+ return (EOPNOTSUPP);
+ }
+
+ error = vmem_alloc(unit->irtids, count, M_FIRSTFIT | M_NOWAIT,
+ &vmem_res);
+ if (error != 0) {
+ KASSERT(error != EOPNOTSUPP,
+ ("impossible EOPNOTSUPP from vmem"));
+ return (error);
+ }
+ idx = vmem_res;
+ for (i = 0; i < count; i++)
+ cookies[i] = idx + i;
+ return (0);
+}
+
+int
+iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+ uint64_t *addr, uint32_t *data)
+{
+ struct dmar_unit *unit;
+ uint64_t low;
+ uint16_t rid;
+ int is_dmar;
+
+ unit = dmar_ir_find(src, &rid, &is_dmar);
+ if (is_dmar) {
+ KASSERT(unit == NULL, ("DMAR cannot translate itself"));
+
+ /*
+ * See VT-d specification, 5.1.6 Remapping Hardware -
+ * Interrupt Programming.
+ */
+ *data = vector;
+ *addr = MSI_INTEL_ADDR_BASE | ((cpu & 0xff) << 12);
+ if (x2apic_mode)
+ *addr |= ((uint64_t)cpu & 0xffffff00) << 32;
+ else
+ KASSERT(cpu <= 0xff, ("cpu id too big %d", cpu));
+ return (0);
+ }
+ if (unit == NULL || !unit->ir_enabled || cookie == -1)
+ return (EOPNOTSUPP);
+
+ low = (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+ DMAR_IRTE1_DST_xAPIC(cpu)) | DMAR_IRTE1_V(vector) |
+ DMAR_IRTE1_DLM_FM | DMAR_IRTE1_TM_EDGE | DMAR_IRTE1_RH_DIRECT |
+ DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+ dmar_ir_program_irte(unit, cookie, low, rid);
+
+ if (addr != NULL) {
+ /*
+ * See VT-d specification, 5.1.5.2 MSI and MSI-X
+ * Register Programming.
+ */
+ *addr = MSI_INTEL_ADDR_BASE | ((cookie & 0x7fff) << 5) |
+ ((cookie & 0x8000) << 2) | 0x18;
+ *data = 0;
+ }
+ return (0);
+}
+
+int
+iommu_unmap_msi_intr(device_t src, u_int cookie)
+{
+ struct dmar_unit *unit;
+
+ if (cookie == -1)
+ return (0);
+ unit = dmar_ir_find(src, NULL, NULL);
+ return (dmar_ir_free_irte(unit, cookie));
+}
+
+int
+iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+ bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo)
+{
+ struct dmar_unit *unit;
+ vmem_addr_t vmem_res;
+ uint64_t low, iorte;
+ u_int idx;
+ int error;
+ uint16_t rid;
+
+ unit = dmar_find_ioapic(ioapic_id, &rid);
+ if (unit == NULL || !unit->ir_enabled) {
+ *cookie = -1;
+ return (EOPNOTSUPP);
+ }
+
+ error = vmem_alloc(unit->irtids, 1, M_FIRSTFIT | M_NOWAIT, &vmem_res);
+ if (error != 0) {
+ KASSERT(error != EOPNOTSUPP,
+ ("impossible EOPNOTSUPP from vmem"));
+ return (error);
+ }
+ idx = vmem_res;
+ low = 0;
+ switch (irq) {
+ case IRQ_EXTINT:
+ low |= DMAR_IRTE1_DLM_ExtINT;
+ break;
+ case IRQ_NMI:
+ low |= DMAR_IRTE1_DLM_NMI;
+ break;
+ case IRQ_SMI:
+ low |= DMAR_IRTE1_DLM_SMI;
+ break;
+ default:
+ KASSERT(vector != 0, ("No vector for IRQ %u", irq));
+ low |= DMAR_IRTE1_DLM_FM | DMAR_IRTE1_V(vector);
+ break;
+ }
+ low |= (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+ DMAR_IRTE1_DST_xAPIC(cpu)) |
+ (edge ? DMAR_IRTE1_TM_EDGE : DMAR_IRTE1_TM_LEVEL) |
+ DMAR_IRTE1_RH_DIRECT | DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+ dmar_ir_program_irte(unit, idx, low, rid);
+
+ if (hi != NULL) {
+ /*
+ * See VT-d specification, 5.1.5.1 I/OxAPIC
+ * Programming.
+ */
+ iorte = (1ULL << 48) | ((uint64_t)(idx & 0x7fff) << 49) |
+ ((idx & 0x8000) != 0 ? (1 << 11) : 0) |
+ (edge ? IOART_TRGREDG : IOART_TRGRLVL) |
+ (activehi ? IOART_INTAHI : IOART_INTALO) |
+ IOART_DELFIXED | vector;
+ *hi = iorte >> 32;
+ *lo = iorte;
+ }
+ *cookie = idx;
+ return (0);
+}
+
+int
+iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie)
+{
+ struct dmar_unit *unit;
+ u_int idx;
+
+ idx = *cookie;
+ if (idx == -1)
+ return (0);
+ *cookie = -1;
+ unit = dmar_find_ioapic(ioapic_id, NULL);
+ KASSERT(unit != NULL && unit->ir_enabled,
+ ("unmap: cookie %d unit %p", idx, unit));
+ return (dmar_ir_free_irte(unit, idx));
+}
+
+static struct dmar_unit *
+dmar_ir_find(device_t src, uint16_t *rid, int *is_dmar)
+{
+ devclass_t src_class;
+ struct dmar_unit *unit;
+
+ /*
+ * We need to determine if the interrupt source generates FSB
+ * interrupts. If yes, it is either DMAR, in which case
+ * interrupts are not remapped. Or it is HPET, and interrupts
+ * are remapped. For HPET, source id is reported by HPET
+ * record in DMAR ACPI table.
+ */
+ if (is_dmar != NULL)
+ *is_dmar = FALSE;
+ src_class = device_get_devclass(src);
+ if (src_class == devclass_find("dmar")) {
+ unit = NULL;
+ if (is_dmar != NULL)
+ *is_dmar = TRUE;
+ } else if (src_class == devclass_find("hpet")) {
+ unit = dmar_find_hpet(src, rid);
+ } else {
+ unit = dmar_find(src);
+ if (unit != NULL && rid != NULL)
+ dmar_get_requester(src, rid);
+ }
+ return (unit);
+}
+
+static void
+dmar_ir_program_irte(struct dmar_unit *unit, u_int idx, uint64_t low,
+ uint16_t rid)
+{
+ dmar_irte_t *irte;
+ uint64_t high;
+
+ KASSERT(idx < unit->irte_cnt,
+ ("bad cookie %d %d", idx, unit->irte_cnt));
+ irte = &(unit->irt[idx]);
+ high = DMAR_IRTE2_SVT_RID | DMAR_IRTE2_SQ_RID |
+ DMAR_IRTE2_SID_RID(rid);
+ device_printf(unit->dev,
+ "programming irte[%d] rid %#x high %#jx low %#jx\n",
+ idx, rid, (uintmax_t)high, (uintmax_t)low);
+ DMAR_LOCK(unit);
+ if ((irte->irte1 & DMAR_IRTE1_P) != 0) {
+ /*
+ * The rte is already valid. Assume that the request
+ * is to remap the interrupt for balancing. Only low
+ * word of rte needs to be changed. Assert that the
+ * high word contains expected value.
+ */
+ KASSERT(irte->irte2 == high,
+ ("irte2 mismatch, %jx %jx", (uintmax_t)irte->irte2,
+ (uintmax_t)high));
+ dmar_pte_update(&irte->irte1, low);
+ } else {
+ dmar_pte_store(&irte->irte2, high);
+ dmar_pte_store(&irte->irte1, low);
+ }
+ dmar_qi_invalidate_iec(unit, idx, 1);
+ DMAR_UNLOCK(unit);
+
+}
+
+static int
+dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie)
+{
+ dmar_irte_t *irte;
+
+ KASSERT(unit != NULL && unit->ir_enabled,
+ ("unmap: cookie %d unit %p", cookie, unit));
+ KASSERT(cookie < unit->irte_cnt,
+ ("bad cookie %u %u", cookie, unit->irte_cnt));
+ irte = &(unit->irt[cookie]);
+ dmar_pte_clear(&irte->irte1);
+ dmar_pte_clear(&irte->irte2);
+ DMAR_LOCK(unit);
+ dmar_qi_invalidate_iec(unit, cookie, 1);
+ DMAR_UNLOCK(unit);
+ vmem_free(unit->irtids, cookie, 1);
+ return (0);
+}
+
+static u_int
+clp2(u_int v)
+{
+
+ return (powerof2(v) ? v : 1 << fls(v));
+}
+
+int
+dmar_init_irt(struct dmar_unit *unit)
+{
+
+ if ((unit->hw_ecap & DMAR_ECAP_IR) == 0)
+ return (0);
+ unit->ir_enabled = 1;
+ TUNABLE_INT_FETCH("hw.dmar.ir", &unit->ir_enabled);
+ if (!unit->ir_enabled)
+ return (0);
+ if (!unit->qi_enabled) {
+ unit->ir_enabled = 0;
+ if (bootverbose)
+ device_printf(unit->dev,
+ "QI disabled, disabling interrupt remapping\n");
+ return (0);
+ }
+ unit->irte_cnt = clp2(num_io_irqs);
+ unit->irt = (dmar_irte_t *)(uintptr_t)kmem_alloc_contig(kernel_arena,
+ unit->irte_cnt * sizeof(dmar_irte_t), M_ZERO | M_WAITOK, 0,
+ dmar_high, PAGE_SIZE, 0, DMAR_IS_COHERENT(unit) ?
+ VM_MEMATTR_DEFAULT : VM_MEMATTR_UNCACHEABLE);
+ if (unit->irt == NULL)
+ return (ENOMEM);
+ unit->irt_phys = pmap_kextract((vm_offset_t)unit->irt);
+ unit->irtids = vmem_create("dmarirt", 0, unit->irte_cnt, 1, 0,
+ M_FIRSTFIT | M_NOWAIT);
+ DMAR_LOCK(unit);
+ dmar_load_irt_ptr(unit);
+ dmar_qi_invalidate_iec_glob(unit);
+ DMAR_UNLOCK(unit);
+
+	/*
+	 * Initialize mappings for the already configured interrupt
+	 * pins.  This is required because otherwise those interrupts
+	 * fault, having no irtes.
+	 */
+ intr_reprogram();
+
+ DMAR_LOCK(unit);
+ dmar_enable_ir(unit);
+ DMAR_UNLOCK(unit);
+ return (0);
+}
+
+void
+dmar_fini_irt(struct dmar_unit *unit)
+{
+
+ unit->ir_enabled = 0;
+ if (unit->irt != NULL) {
+ dmar_disable_ir(unit);
+ dmar_qi_invalidate_iec_glob(unit);
+ vmem_destroy(unit->irtids);
+ kmem_free(kernel_arena, (vm_offset_t)unit->irt,
+ unit->irte_cnt * sizeof(dmar_irte_t));
+ }
+}
Property changes on: trunk/sys/x86/iommu/intel_intrmap.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/iommu/intel_qi.c
===================================================================
--- trunk/sys/x86/iommu/intel_qi.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_qi.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_qi.c 284019 2015-06-05 08:23:33Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_qi.c 320357 2017-06-26 12:30:39Z kib $");
#include "opt_acpi.h"
@@ -41,7 +41,9 @@
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/taskqueue.h>
+#include <sys/time.h>
#include <sys/tree.h>
+#include <sys/vmem.h>
#include <machine/bus.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
@@ -70,27 +72,27 @@
static int
dmar_enable_qi(struct dmar_unit *unit)
{
+ int error;
DMAR_ASSERT_LOCKED(unit);
unit->hw_gcmd |= DMAR_GCMD_QIE;
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) == 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+ != 0));
+ return (error);
}
static int
dmar_disable_qi(struct dmar_unit *unit)
{
+ int error;
DMAR_ASSERT_LOCKED(unit);
unit->hw_gcmd &= ~DMAR_GCMD_QIE;
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) != 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+ == 0));
+ return (error);
}
static void
@@ -170,7 +172,8 @@
}
static void
-dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq)
+dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq,
+ bool emit_wait)
{
struct dmar_qi_genseq gsec;
uint32_t seq;
@@ -191,17 +194,21 @@
seq = unit->inv_waitd_seq++;
pseq->gen = unit->inv_waitd_gen;
pseq->seq = seq;
- dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+ if (emit_wait) {
+ dmar_qi_ensure(unit, 1);
+ dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+ }
}
static void
-dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq)
+dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq,
+ bool nowait)
{
DMAR_ASSERT_LOCKED(unit);
unit->inv_seq_waiters++;
while (!dmar_qi_seq_processed(unit, gseq)) {
- if (cold) {
+ if (cold || nowait) {
cpu_spinwait();
} else {
msleep(&unit->inv_seq_waiters, &unit->lock, 0,
@@ -212,14 +219,14 @@
}
void
-dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
- dmar_gaddr_t size, struct dmar_qi_genseq *pseq)
+dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+ dmar_gaddr_t size, struct dmar_qi_genseq *pseq, bool emit_wait)
{
struct dmar_unit *unit;
dmar_gaddr_t isize;
int am;
- unit = ctx->dmar;
+ unit = domain->dmar;
DMAR_ASSERT_LOCKED(unit);
for (; size > 0; base += isize, size -= isize) {
am = calc_am(unit, base, size, &isize);
@@ -227,13 +234,10 @@
dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV |
DMAR_IQ_DESCR_IOTLB_PAGE | DMAR_IQ_DESCR_IOTLB_DW |
DMAR_IQ_DESCR_IOTLB_DR |
- DMAR_IQ_DESCR_IOTLB_DID(ctx->domain),
+ DMAR_IQ_DESCR_IOTLB_DID(domain->domain),
base | am);
}
- if (pseq != NULL) {
- dmar_qi_ensure(unit, 1);
- dmar_qi_emit_wait_seq(unit, pseq);
- }
+ dmar_qi_emit_wait_seq(unit, pseq, emit_wait);
dmar_qi_advance_tail(unit);
}
@@ -245,9 +249,9 @@
DMAR_ASSERT_LOCKED(unit);
dmar_qi_ensure(unit, 2);
dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0);
- dmar_qi_emit_wait_seq(unit, &gseq);
+ dmar_qi_emit_wait_seq(unit, &gseq, true);
dmar_qi_advance_tail(unit);
- dmar_qi_wait_for_seq(unit, &gseq);
+ dmar_qi_wait_for_seq(unit, &gseq, false);
}
void
@@ -259,11 +263,64 @@
dmar_qi_ensure(unit, 2);
dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB |
DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0);
- dmar_qi_emit_wait_seq(unit, &gseq);
+ dmar_qi_emit_wait_seq(unit, &gseq, true);
dmar_qi_advance_tail(unit);
- dmar_qi_wait_for_seq(unit, &gseq);
+ dmar_qi_wait_for_seq(unit, &gseq, false);
}
+void
+dmar_qi_invalidate_iec_glob(struct dmar_unit *unit)
+{
+ struct dmar_qi_genseq gseq;
+
+ DMAR_ASSERT_LOCKED(unit);
+ dmar_qi_ensure(unit, 2);
+ dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV, 0);
+ dmar_qi_emit_wait_seq(unit, &gseq, true);
+ dmar_qi_advance_tail(unit);
+ dmar_qi_wait_for_seq(unit, &gseq, false);
+}
+
+void
+dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt)
+{
+ struct dmar_qi_genseq gseq;
+ u_int c, l;
+
+ DMAR_ASSERT_LOCKED(unit);
+ KASSERT(start < unit->irte_cnt && start < start + cnt &&
+ start + cnt <= unit->irte_cnt,
+ ("inv iec overflow %d %d %d", unit->irte_cnt, start, cnt));
+ for (; cnt > 0; cnt -= c, start += c) {
+ l = ffs(start | cnt) - 1;
+ c = 1 << l;
+ dmar_qi_ensure(unit, 1);
+ dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV |
+ DMAR_IQ_DESCR_IEC_IDX | DMAR_IQ_DESCR_IEC_IIDX(start) |
+ DMAR_IQ_DESCR_IEC_IM(l), 0);
+ }
+ dmar_qi_ensure(unit, 1);
+ dmar_qi_emit_wait_seq(unit, &gseq, true);
+ dmar_qi_advance_tail(unit);
+
+	/*
+	 * The caller of this function, in particular
+	 * dmar_ir_program_irte(), may run in a context where sleeping
+	 * is forbidden (in fact, the intr_table_lock mutex may be
+	 * held, locked from intr_shuffle_irqs()).  Wait for the
+	 * invalidation to complete using a busy wait.
+	 *
+	 * The impact on the interrupt input setup code is small; the
+	 * expected overhead is comparable to a chipset register read.
+	 * It is more harmful to parallel DMA operations, since we own
+	 * the dmar unit lock until the whole invalidation queue is
+	 * processed, which includes requests possibly issued before
+	 * our request.
+	 */
+ dmar_qi_wait_for_seq(unit, &gseq, true);
+}
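
dmar_qi_invalidate_iec() above has to cover an arbitrary [start, start + cnt) range using descriptors whose index-mask form names a naturally aligned block of 2^l entries. ffs(start | cnt) - 1 yields the largest power of two dividing both the current start and the remaining count, so every emitted block stays aligned and never overshoots the range. A standalone sketch of just that chunking (invented names, userland C):

#include <stdio.h>
#include <strings.h>			/* ffs() */

/* Split [start, start + cnt) into aligned power-of-two blocks. */
static void
emit_chunks(unsigned int start, unsigned int cnt)
{
	unsigned int c, l;

	for (; cnt > 0; cnt -= c, start += c) {
		l = ffs(start | cnt) - 1;
		c = 1u << l;
		printf("invalidate %u entries at index %u (mask %u)\n",
		    c, start, l);
	}
}

int
main(void)
{

	/* start = 4, cnt = 6 produces blocks 2@4, 2@6 and 2@8. */
	emit_chunks(4, 6);
	return (0);
}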
+
int
dmar_qi_intr(void *arg)
{
@@ -271,7 +328,7 @@
unit = arg;
KASSERT(unit->qi_enabled, ("dmar%d: QI is not enabled", unit->unit));
- taskqueue_enqueue_fast(unit->qi_taskqueue, &unit->qi_task);
+ taskqueue_enqueue(unit->qi_taskqueue, &unit->qi_task);
return (FILTER_HANDLED);
}
@@ -289,12 +346,11 @@
entry = TAILQ_FIRST(&unit->tlb_flush_entries);
if (entry == NULL)
break;
- if ((entry->gseq.gen == 0 && entry->gseq.seq == 0) ||
- !dmar_qi_seq_processed(unit, &entry->gseq))
+ if (!dmar_qi_seq_processed(unit, &entry->gseq))
break;
TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link);
DMAR_UNLOCK(unit);
- dmar_ctx_free_entry(entry, (entry->flags &
+ dmar_domain_free_entry(entry, (entry->flags &
DMAR_MAP_ENTRY_QI_NF) == 0);
DMAR_LOCK(unit);
}
@@ -324,7 +380,7 @@
TAILQ_INIT(&unit->tlb_flush_entries);
TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit);
- unit->qi_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+ unit->qi_taskqueue = taskqueue_create_fast("dmarqf", M_WAITOK,
taskqueue_thread_enqueue, &unit->qi_taskqueue);
taskqueue_start_threads(&unit->qi_taskqueue, 1, PI_AV,
"dmar%d qi taskq", unit->unit);
@@ -377,9 +433,9 @@
DMAR_LOCK(unit);
 	/* quiesce */
dmar_qi_ensure(unit, 1);
- dmar_qi_emit_wait_seq(unit, &gseq);
+ dmar_qi_emit_wait_seq(unit, &gseq, true);
dmar_qi_advance_tail(unit);
- dmar_qi_wait_for_seq(unit, &gseq);
+ dmar_qi_wait_for_seq(unit, &gseq, false);
 	/* only after the quiesce, disable the queue */
dmar_disable_qi_intr(unit);
dmar_disable_qi(unit);
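
Several hunks in this file, and more below in intel_utils.c, replace open-ended "while (...) cpu_spinwait();" loops with DMAR_WAIT_UNTIL(), which evidently sets the local "error" that each converted function now declares and returns. The macro itself is not visible in this excerpt; purely as an illustration of the bounded-poll pattern being assumed (invented names, userland C using clock_gettime()):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Poll cond(arg) until it returns true or timeout_ns nanoseconds elapse. */
static int
wait_until(bool (*cond)(void *), void *arg, long long timeout_ns)
{
	struct timespec start, now;
	long long elapsed_ns;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (;;) {
		if (cond(arg))
			return (0);
		clock_gettime(CLOCK_MONOTONIC, &now);
		elapsed_ns = (now.tv_sec - start.tv_sec) * 1000000000LL +
		    (now.tv_nsec - start.tv_nsec);
		if (elapsed_ns > timeout_ns)
			return (ETIMEDOUT);
	}
}

/* Example condition: a flag that some other context would set. */
static volatile bool done;

static bool
check_done(void *arg)
{

	(void)arg;
	return (done);
}

int
main(void)
{

	/* With "done" never set, this times out after about 1 ms. */
	printf("wait_until: %d\n", wait_until(check_done, NULL, 1000000LL));
	return (0);
}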
Modified: trunk/sys/x86/iommu/intel_quirks.c
===================================================================
--- trunk/sys/x86/iommu/intel_quirks.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_quirks.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
/* $MidnightBSD$ */
/*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013, 2015 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_quirks.c 257251 2013-10-28 13:33:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_quirks.c 280260 2015-03-19 13:57:47Z kib $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -43,6 +43,7 @@
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
+#include <sys/vmem.h>
#include <machine/bus.h>
#include <contrib/dev/acpica/include/acpi.h>
#include <contrib/dev/acpica/include/accommon.h>
@@ -60,7 +61,7 @@
#include <x86/iommu/intel_dmar.h>
#include <dev/pci/pcivar.h>
-typedef void (*dmar_quirk_fun)(struct dmar_unit *);
+typedef void (*dmar_quirk_cpu_fun)(struct dmar_unit *);
struct intel_dmar_quirk_cpu {
u_int ext_family;
@@ -68,17 +69,21 @@
u_int family_code;
u_int model;
u_int stepping;
- dmar_quirk_fun quirk;
+ dmar_quirk_cpu_fun quirk;
const char *descr;
};
+typedef void (*dmar_quirk_nb_fun)(struct dmar_unit *, device_t nb);
+
struct intel_dmar_quirk_nb {
u_int dev_id;
u_int rev_no;
- dmar_quirk_fun quirk;
+ dmar_quirk_nb_fun quirk;
const char *descr;
};
+#define QUIRK_NB_ALL_REV 0xffffffff
+
static void
dmar_match_quirks(struct dmar_unit *dmar,
const struct intel_dmar_quirk_nb *nb_quirks, int nb_quirks_len,
@@ -100,13 +105,14 @@
for (i = 0; i < nb_quirks_len; i++) {
nb_quirk = &nb_quirks[i];
if (nb_quirk->dev_id == dev_id &&
- nb_quirk->rev_no == rev_no) {
+ (nb_quirk->rev_no == rev_no ||
+ nb_quirk->rev_no == QUIRK_NB_ALL_REV)) {
if (bootverbose) {
device_printf(dmar->dev,
"NB IOMMU quirk %s\n",
nb_quirk->descr);
}
- nb_quirk->quirk(dmar);
+ nb_quirk->quirk(dmar, nb);
}
}
} else {
@@ -140,12 +146,29 @@
}
static void
-nb_5400_no_low_high_prot_mem(struct dmar_unit *unit)
+nb_5400_no_low_high_prot_mem(struct dmar_unit *unit, device_t nb __unused)
{
unit->hw_cap &= ~(DMAR_CAP_PHMR | DMAR_CAP_PLMR);
}
+static void
+nb_no_ir(struct dmar_unit *unit, device_t nb __unused)
+{
+
+ unit->hw_ecap &= ~(DMAR_ECAP_IR | DMAR_ECAP_EIM);
+}
+
+static void
+nb_5500_no_ir_rev13(struct dmar_unit *unit, device_t nb)
+{
+ u_int rev_no;
+
+ rev_no = pci_get_revid(nb);
+ if (rev_no <= 0x13)
+ nb_no_ir(unit, nb);
+}
+
static const struct intel_dmar_quirk_nb pre_use_nb[] = {
{
.dev_id = 0x4001, .rev_no = 0x20,
@@ -157,6 +180,26 @@
.quirk = nb_5400_no_low_high_prot_mem,
.descr = "5400 E23" /* no low/high protected memory */
},
+ {
+ .dev_id = 0x3403, .rev_no = QUIRK_NB_ALL_REV,
+ .quirk = nb_5500_no_ir_rev13,
+ .descr = "5500 E47, E53" /* interrupt remapping does not work */
+ },
+ {
+ .dev_id = 0x3405, .rev_no = QUIRK_NB_ALL_REV,
+ .quirk = nb_5500_no_ir_rev13,
+ .descr = "5500 E47, E53" /* interrupt remapping does not work */
+ },
+ {
+ .dev_id = 0x3405, .rev_no = 0x22,
+ .quirk = nb_no_ir,
+ .descr = "5500 E47, E53" /* interrupt remapping does not work */
+ },
+ {
+ .dev_id = 0x3406, .rev_no = QUIRK_NB_ALL_REV,
+ .quirk = nb_5500_no_ir_rev13,
+ .descr = "5500 E47, E53" /* interrupt remapping does not work */
+ },
};
static void
Modified: trunk/sys/x86/iommu/intel_reg.h
===================================================================
--- trunk/sys/x86/iommu/intel_reg.h 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_reg.h 2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
*/
#ifndef __X86_IOMMU_INTEL_REG_H
Modified: trunk/sys/x86/iommu/intel_utils.c
===================================================================
--- trunk/sys/x86/iommu/intel_utils.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_utils.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_utils.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_utils.c 327785 2018-01-10 20:39:26Z markj $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -47,7 +47,9 @@
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+#include <sys/time.h>
#include <sys/tree.h>
+#include <sys/vmem.h>
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -58,6 +60,8 @@
#include <vm/vm_pageout.h>
#include <machine/bus.h>
#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/include/apicvar.h>
#include <x86/include/busdma_impl.h>
#include <x86/iommu/intel_reg.h>
#include <x86/iommu/busdma_dmar.h>
@@ -98,7 +102,6 @@
{.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL,
.pglvl = 6}
};
-#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0]))
bool
dmar_pglvl_supported(struct dmar_unit *unit, int pglvl)
@@ -105,7 +108,7 @@
{
int i;
- for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+ for (i = 0; i < nitems(sagaw_bits); i++) {
if (sagaw_bits[i].pglvl != pglvl)
continue;
if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
@@ -115,26 +118,23 @@
}
int
-ctx_set_agaw(struct dmar_ctx *ctx, int mgaw)
+domain_set_agaw(struct dmar_domain *domain, int mgaw)
{
int sagaw, i;
- ctx->mgaw = mgaw;
- sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap);
- for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+ domain->mgaw = mgaw;
+ sagaw = DMAR_CAP_SAGAW(domain->dmar->hw_cap);
+ for (i = 0; i < nitems(sagaw_bits); i++) {
if (sagaw_bits[i].agaw >= mgaw) {
- ctx->agaw = sagaw_bits[i].agaw;
- ctx->pglvl = sagaw_bits[i].pglvl;
- ctx->awlvl = sagaw_bits[i].awlvl;
+ domain->agaw = sagaw_bits[i].agaw;
+ domain->pglvl = sagaw_bits[i].pglvl;
+ domain->awlvl = sagaw_bits[i].awlvl;
return (0);
}
}
- device_printf(ctx->dmar->dev,
- "context request mgaw %d for pci%d:%d:%d:%d, "
- "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment,
- pci_get_bus(ctx->ctx_tag.owner),
- pci_get_slot(ctx->ctx_tag.owner),
- pci_get_function(ctx->ctx_tag.owner), sagaw);
+ device_printf(domain->dmar->dev,
+ "context request mgaw %d: no agaw found, sagaw %x\n",
+ mgaw, sagaw);
return (EINVAL);
}
@@ -150,18 +150,18 @@
{
int i;
- for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+ for (i = 0; i < nitems(sagaw_bits); i++) {
if ((1ULL << sagaw_bits[i].agaw) >= maxaddr &&
(DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
break;
}
- if (allow_less && i == SIZEOF_SAGAW_BITS) {
+ if (allow_less && i == nitems(sagaw_bits)) {
do {
i--;
} while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap)
== 0);
}
- if (i < SIZEOF_SAGAW_BITS)
+ if (i < nitems(sagaw_bits))
return (sagaw_bits[i].agaw);
KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d",
(uintmax_t) maxaddr, allow_less));
@@ -190,7 +190,7 @@
* the context ctx.
*/
int
-ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl)
+domain_is_sp_lvl(struct dmar_domain *domain, int lvl)
{
int alvl, cap_sps;
static const int sagaw_sp[] = {
@@ -200,10 +200,9 @@
DMAR_CAP_SPS_1T
};
- alvl = ctx->pglvl - lvl - 1;
- cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap);
- return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) &&
- (sagaw_sp[alvl] & cap_sps) != 0);
+ alvl = domain->pglvl - lvl - 1;
+ cap_sps = DMAR_CAP_SPS(domain->dmar->hw_cap);
+ return (alvl < nitems(sagaw_sp) && (sagaw_sp[alvl] & cap_sps) != 0);
}
dmar_gaddr_t
@@ -222,16 +221,15 @@
KASSERT(lvl >= 0 && lvl < total_pglvl,
("total %d lvl %d", total_pglvl, lvl));
rlvl = total_pglvl - lvl - 1;
- KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]),
- ("sizeof pg_sz lvl %d", lvl));
+ KASSERT(rlvl < nitems(pg_sz), ("sizeof pg_sz lvl %d", lvl));
return (pg_sz[rlvl]);
}
dmar_gaddr_t
-ctx_page_size(struct dmar_ctx *ctx, int lvl)
+domain_page_size(struct dmar_domain *domain, int lvl)
{
- return (pglvl_page_size(ctx->pglvl, lvl));
+ return (pglvl_page_size(domain->pglvl, lvl));
}
int
@@ -260,9 +258,12 @@
dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags)
{
vm_page_t m;
- int zeroed;
+ int zeroed, aflags;
zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
+ aflags = zeroed | VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP |
+ ((flags & DMAR_PGF_WAITOK) != 0 ? VM_ALLOC_WAITFAIL :
+ VM_ALLOC_NOWAIT);
for (;;) {
if ((flags & DMAR_PGF_OBJL) == 0)
VM_OBJECT_WLOCK(obj);
@@ -272,8 +273,7 @@
VM_OBJECT_WUNLOCK(obj);
break;
}
- m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY |
- VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0,
+ m = vm_page_alloc_contig(obj, idx, aflags, 1, 0,
dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
if ((flags & DMAR_PGF_OBJL) == 0)
VM_OBJECT_WUNLOCK(obj);
@@ -285,11 +285,6 @@
}
if ((flags & DMAR_PGF_WAITOK) == 0)
break;
- if ((flags & DMAR_PGF_OBJL) != 0)
- VM_OBJECT_WUNLOCK(obj);
- VM_WAIT;
- if ((flags & DMAR_PGF_OBJL) != 0)
- VM_OBJECT_WLOCK(obj);
}
return (m);
}
@@ -405,6 +400,7 @@
dmar_load_root_entry_ptr(struct dmar_unit *unit)
{
vm_page_t root_entry;
+ int error;
/*
* Access to the GCMD register must be serialized while the
@@ -417,10 +413,9 @@
VM_OBJECT_RUNLOCK(unit->ctx_obj);
dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry));
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS)
+ != 0));
+ return (error);
}
/*
@@ -430,6 +425,7 @@
int
dmar_inv_ctx_glob(struct dmar_unit *unit)
{
+ int error;
/*
* Access to the CCMD register must be serialized while the
@@ -445,10 +441,9 @@
* writes the upper dword last.
*/
dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32)
+ == 0));
+ return (error);
}
/*
@@ -457,7 +452,7 @@
int
dmar_inv_iotlb_glob(struct dmar_unit *unit)
{
- int reg;
+ int error, reg;
DMAR_ASSERT_LOCKED(unit);
KASSERT(!unit->qi_enabled, ("QI enabled"));
@@ -466,11 +461,9 @@
/* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. */
dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT |
DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
- DMAR_IOTLB_IVT32) != 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
+ DMAR_IOTLB_IVT32) == 0));
+ return (error);
}
/*
@@ -480,6 +473,7 @@
int
dmar_flush_write_bufs(struct dmar_unit *unit)
{
+ int error;
DMAR_ASSERT_LOCKED(unit);
@@ -490,38 +484,86 @@
("dmar%d: no RWBF", unit->unit));
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS)
+ != 0));
+ return (error);
}
int
dmar_enable_translation(struct dmar_unit *unit)
{
+ int error;
DMAR_ASSERT_LOCKED(unit);
unit->hw_gcmd |= DMAR_GCMD_TE;
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+ != 0));
+ return (error);
}
int
dmar_disable_translation(struct dmar_unit *unit)
{
+ int error;
DMAR_ASSERT_LOCKED(unit);
unit->hw_gcmd &= ~DMAR_GCMD_TE;
dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
- /* XXXKIB should have a timeout */
- while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0)
- cpu_spinwait();
- return (0);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+ == 0));
+ return (error);
}
+int
+dmar_load_irt_ptr(struct dmar_unit *unit)
+{
+ uint64_t irta, s;
+ int error;
+
+ DMAR_ASSERT_LOCKED(unit);
+ irta = unit->irt_phys;
+ if (DMAR_X2APIC(unit))
+ irta |= DMAR_IRTA_EIME;
+ s = fls(unit->irte_cnt) - 2;
+ KASSERT(unit->irte_cnt >= 2 && s <= DMAR_IRTA_S_MASK &&
+ powerof2(unit->irte_cnt),
+ ("IRTA_REG_S overflow %x", unit->irte_cnt));
+ irta |= s;
+ dmar_write8(unit, DMAR_IRTA_REG, irta);
+ dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SIRTP);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRTPS)
+ != 0));
+ return (error);
+}
+
+int
+dmar_enable_ir(struct dmar_unit *unit)
+{
+ int error;
+
+ DMAR_ASSERT_LOCKED(unit);
+ unit->hw_gcmd |= DMAR_GCMD_IRE;
+ unit->hw_gcmd &= ~DMAR_GCMD_CFI;
+ dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+ != 0));
+ return (error);
+}
+
+int
+dmar_disable_ir(struct dmar_unit *unit)
+{
+ int error;
+
+ DMAR_ASSERT_LOCKED(unit);
+ unit->hw_gcmd &= ~DMAR_GCMD_IRE;
+ dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+ DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+ == 0));
+ return (error);
+}
+
#define BARRIER_F \
u_int f_done, f_inproc, f_wakeup; \
\
@@ -573,18 +615,62 @@
}
int dmar_match_verbose;
+int dmar_batch_coalesce = 100;
+struct timespec dmar_hw_timeout = {
+ .tv_sec = 0,
+ .tv_nsec = 1000000
+};
-static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL,
- "");
-SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN,
+static const uint64_t d = 1000000000;
+
+void
+dmar_update_timeout(uint64_t newval)
+{
+
+ /* XXXKIB not atomic */
+ dmar_hw_timeout.tv_sec = newval / d;
+ dmar_hw_timeout.tv_nsec = newval % d;
+}
+
+uint64_t
+dmar_get_timeout(void)
+{
+
+ return ((uint64_t)dmar_hw_timeout.tv_sec * d +
+ dmar_hw_timeout.tv_nsec);
+}
+
+static int
+dmar_timeout_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int error;
+
+ val = dmar_get_timeout();
+ error = sysctl_handle_long(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ dmar_update_timeout(val);
+ return (error);
+}
+
+static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
+SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
&dmar_tbl_pagecnt, 0,
"Count of pages used for DMAR pagetables");
-SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
&dmar_match_verbose, 0,
"Verbose matching of the PCI devices to DMAR paths");
+SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RWTUN,
+ &dmar_batch_coalesce, 0,
+ "Number of qi batches between interrupt");
+SYSCTL_PROC(_hw_dmar, OID_AUTO, timeout,
+ CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ dmar_timeout_sysctl, "QU",
+ "Timeout for command wait, in nanoseconds");
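
With the SYSCTL_PROC above, the register-wait timeout becomes readable and writable at runtime as hw.dmar.timeout, expressed in nanoseconds; the dmar_hw_timeout initializer earlier in this hunk corresponds to a default of 1 ms (tv_nsec = 1000000). For example, sysctl hw.dmar.timeout=5000000 would raise the bound to 5 ms (an illustrative value, not one taken from the commit).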
#ifdef INVARIANTS
int dmar_check_free;
-SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
&dmar_check_free, 0,
"Check the GPA RBtree for free_down and free_after validity");
#endif
Added: trunk/sys/x86/iommu/iommu_intrmap.h
===================================================================
--- trunk/sys/x86/iommu/iommu_intrmap.h (rev 0)
+++ trunk/sys/x86/iommu/iommu_intrmap.h 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,44 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/iommu/iommu_intrmap.h 280260 2015-03-19 13:57:47Z kib $
+ */
+
+#ifndef __X86_IOMMU_IOMMU_INTRMAP_H
+#define __X86_IOMMU_IOMMU_INTRMAP_H
+
+int iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count);
+int iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+ uint64_t *addr, uint32_t *data);
+int iommu_unmap_msi_intr(device_t src, u_int cookie);
+int iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+ bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo);
+int iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie);
+
+#endif
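
The header above only declares the entry points; as a rough, hypothetical sketch of how the MSI half might be driven (the function name is invented, error handling is simplified, and the cleanup-on-failure convention is an assumption rather than code from the tree):

#include <sys/param.h>
#include <sys/bus.h>

#include <x86/iommu/iommu_intrmap.h>

/* Allocate a remapping cookie and obtain the MSI address/data to program. */
static int
example_msi_remap(device_t dev, u_int cpu, u_int vector,
    uint64_t *addr, uint32_t *data)
{
	u_int cookie;
	int error;

	error = iommu_alloc_msi_intr(dev, &cookie, 1);
	if (error != 0)
		return (error);
	error = iommu_map_msi_intr(dev, cpu, vector, cookie, addr, data);
	if (error != 0) {
		iommu_unmap_msi_intr(dev, cookie);
		return (error);
	}
	/* The caller would now write *addr and *data into the device. */
	return (0);
}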
Property changes on: trunk/sys/x86/iommu/iommu_intrmap.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/isa/atpic.c
===================================================================
--- trunk/sys/x86/isa/atpic.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atpic.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,10 +30,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atpic.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atpic.c 340016 2018-11-01 18:34:26Z jhb $");
#include "opt_auto_eoi.h"
#include "opt_isa.h"
+#include "opt_mca.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -55,9 +56,12 @@
#ifdef PC98
#include <pc98/cbus/cbus.h>
#else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
#endif
#include <isa/isavar.h>
+#ifdef DEV_MCA
+#include <i386/bios/mca_machdep.h>
+#endif
#ifdef __amd64__
#define SDT_ATPIC SDT_SYSIGT
@@ -70,12 +74,12 @@
#define MASTER 0
#define SLAVE 1
+#define IMEN_MASK(ai) (IRQ_MASK((ai)->at_irq))
+
#define NUM_ISA_IRQS 16
static void atpic_init(void *dummy);
-unsigned int imen; /* XXX */
-
inthand_t
IDTVEC(atpic_intr0), IDTVEC(atpic_intr1), IDTVEC(atpic_intr2),
IDTVEC(atpic_intr3), IDTVEC(atpic_intr4), IDTVEC(atpic_intr5),
@@ -83,19 +87,42 @@
IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until pti comes */
+inthand_t
+ IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+ IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+ IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+ IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+ IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+ IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+ IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+ IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
#define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq)
-#define ATPIC(io, base, eoi, imenptr) \
- { { atpic_enable_source, atpic_disable_source, (eoi), \
- atpic_enable_intr, atpic_disable_intr, atpic_vector, \
- atpic_source_pending, NULL, atpic_resume, atpic_config_intr,\
- atpic_assign_cpu }, (io), (base), IDT_IO_INTS + (base), \
- (imenptr) }
+#define ATPIC(io, base, eoi) { \
+ .at_pic = { \
+ .pic_register_sources = atpic_register_sources, \
+ .pic_enable_source = atpic_enable_source, \
+ .pic_disable_source = atpic_disable_source, \
+ .pic_eoi_source = (eoi), \
+ .pic_enable_intr = atpic_enable_intr, \
+ .pic_disable_intr = atpic_disable_intr, \
+ .pic_vector = atpic_vector, \
+ .pic_source_pending = atpic_source_pending, \
+ .pic_resume = atpic_resume, \
+ .pic_config_intr = atpic_config_intr, \
+ .pic_assign_cpu = atpic_assign_cpu \
+ }, \
+ .at_ioaddr = (io), \
+ .at_irqbase = (base), \
+ .at_intbase = IDT_IO_INTS + (base), \
+ .at_imen = 0xff, \
+ }
#define INTSRC(irq) \
{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \
- (irq) % 8 }
+ IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
struct atpic {
struct pic at_pic;
@@ -102,12 +129,12 @@
int at_ioaddr;
int at_irqbase;
uint8_t at_intbase;
- uint8_t *at_imen;
+ uint8_t at_imen;
};
struct atpic_intsrc {
struct intsrc at_intsrc;
- inthand_t *at_intr;
+ inthand_t *at_intr, *at_intr_pti;
int at_irq; /* Relative to PIC base. */
enum intr_trigger at_trigger;
u_long at_count;
@@ -114,6 +141,7 @@
u_long at_straycount;
};
+static void atpic_register_sources(struct pic *pic);
static void atpic_enable_source(struct intsrc *isrc);
static void atpic_disable_source(struct intsrc *isrc, int eoi);
static void atpic_eoi_master(struct intsrc *isrc);
@@ -129,8 +157,8 @@
static void i8259_init(struct atpic *pic, int slave);
static struct atpic atpics[] = {
- ATPIC(IO_ICU1, 0, atpic_eoi_master, (uint8_t *)&imen),
- ATPIC(IO_ICU2, 8, atpic_eoi_slave, ((uint8_t *)&imen) + 1)
+ ATPIC(IO_ICU1, 0, atpic_eoi_master),
+ ATPIC(IO_ICU2, 8, atpic_eoi_slave)
};
static struct atpic_intsrc atintrs[] = {
@@ -152,7 +180,7 @@
INTSRC(15),
};
-CTASSERT(sizeof(atintrs) / sizeof(atintrs[0]) == NUM_ISA_IRQS);
+CTASSERT(nitems(atintrs) == NUM_ISA_IRQS);
static __inline void
_atpic_eoi_master(struct intsrc *isrc)
@@ -184,6 +212,42 @@
}
static void
+atpic_register_sources(struct pic *pic)
+{
+ struct atpic *ap = (struct atpic *)pic;
+ struct atpic_intsrc *ai;
+ int i;
+
+ /*
+ * If any of the ISA IRQs have an interrupt source already, then
+ * assume that the I/O APICs are being used and don't register any
+ * of our interrupt sources. This makes sure we don't accidentally
+ * use mixed mode. The "accidental" use could otherwise occur on
+ * machines that route the ACPI SCI interrupt to a different ISA
+ * IRQ (at least one machine routes it to IRQ 13) thus disabling
+ * that APIC ISA routing and allowing the ATPIC source for that IRQ
+ * to leak through. We used to depend on this feature for routing
+ * IRQ0 via mixed mode, but now we don't use mixed mode at all.
+ *
+	 * To avoid the slave failing to register its sources after the
+	 * master registers its own, register all IRQs when this function
+	 * is called on the master.
+ */
+ if (ap != &atpics[MASTER])
+ return;
+ for (i = 0; i < NUM_ISA_IRQS; i++)
+ if (intr_lookup_source(i) != NULL)
+ return;
+
+ /* Loop through all interrupt sources and add them. */
+ for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
+ if (i == ICU_SLAVEID)
+ continue;
+ intr_register_source(&ai->at_intsrc);
+ }
+}
+
+static void
atpic_enable_source(struct intsrc *isrc)
{
struct atpic_intsrc *ai = (struct atpic_intsrc *)isrc;
@@ -190,9 +254,9 @@
struct atpic *ap = (struct atpic *)isrc->is_pic;
spinlock_enter();
- if (*ap->at_imen & IMEN_MASK(ai)) {
- *ap->at_imen &= ~IMEN_MASK(ai);
- outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+ if (ap->at_imen & IMEN_MASK(ai)) {
+ ap->at_imen &= ~IMEN_MASK(ai);
+ outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
}
spinlock_exit();
}
@@ -205,8 +269,8 @@
spinlock_enter();
if (ai->at_trigger != INTR_TRIGGER_EDGE) {
- *ap->at_imen |= IMEN_MASK(ai);
- outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+ ap->at_imen |= IMEN_MASK(ai);
+ outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
}
/*
@@ -400,7 +464,7 @@
outb(imr_addr, MASTER_MODE);
/* Set interrupt enable mask. */
- outb(imr_addr, *pic->at_imen);
+ outb(imr_addr, pic->at_imen);
/* Reset is finished, default to IRR on read. */
outb(pic->at_ioaddr, OCW3_SEL | OCW3_RR);
@@ -420,7 +484,6 @@
int i;
/* Start off with all interrupts disabled. */
- imen = 0xffff;
i8259_init(&atpics[MASTER], 0);
i8259_init(&atpics[SLAVE], 1);
atpic_enable_source((struct intsrc *)&atintrs[ICU_SLAVEID]);
@@ -432,7 +495,8 @@
ai->at_intsrc.is_count = &ai->at_count;
ai->at_intsrc.is_straycount = &ai->at_straycount;
setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
- ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+ ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+ SEL_KPL, GSEL_ATPIC);
}
#ifdef DEV_MCA
@@ -492,8 +556,6 @@
static void
atpic_init(void *dummy __unused)
{
- struct atpic_intsrc *ai;
- int i;
/*
* Register our PICs, even if we aren't going to use any of their
@@ -503,29 +565,10 @@
intr_register_pic(&atpics[1].at_pic) != 0)
panic("Unable to register ATPICs");
- /*
- * If any of the ISA IRQs have an interrupt source already, then
- * assume that the APICs are being used and don't register any
- * of our interrupt sources. This makes sure we don't accidentally
- * use mixed mode. The "accidental" use could otherwise occur on
- * machines that route the ACPI SCI interrupt to a different ISA
- * IRQ (at least one machines routes it to IRQ 13) thus disabling
- * that APIC ISA routing and allowing the ATPIC source for that IRQ
- * to leak through. We used to depend on this feature for routing
- * IRQ0 via mixed mode, but now we don't use mixed mode at all.
- */
- for (i = 0; i < NUM_ISA_IRQS; i++)
- if (intr_lookup_source(i) != NULL)
- return;
-
- /* Loop through all interrupt sources and add them. */
- for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
- if (i == ICU_SLAVEID)
- continue;
- intr_register_source(&ai->at_intsrc);
- }
+ if (num_io_irqs == 0)
+ num_io_irqs = NUM_ISA_IRQS;
}
-SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_SECOND + 1, atpic_init, NULL);
+SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_FOURTH, atpic_init, NULL);
void
atpic_handle_intr(u_int vector, struct trapframe *frame)
Modified: trunk/sys/x86/isa/atrtc.c
===================================================================
--- trunk/sys/x86/isa/atrtc.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atrtc.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -25,12 +25,13 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $
+ * $FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $");
+#include "opt_acpi.h"
#include "opt_isa.h"
#include <sys/param.h>
@@ -53,10 +54,24 @@
#endif
#include <machine/intr_machdep.h>
#include "clock_if.h"
+#ifdef DEV_ACPI
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <dev/acpica/acpivar.h>
+#include <machine/md_var.h>
+#endif
-#define RTC_LOCK do { if (!kdb_active) mtx_lock_spin(&clock_lock); } while (0)
-#define RTC_UNLOCK do { if (!kdb_active) mtx_unlock_spin(&clock_lock); } while (0)
+/*
+ * atrtc_lock protects low-level access to individual hardware registers.
+ * atrtc_time_lock protects the entire sequence of accessing multiple registers
+ * to read or write the date and time.
+ */
+static struct mtx atrtc_lock;
+MTX_SYSINIT(atrtc_lock_init, &atrtc_lock, "atrtc", MTX_SPIN);
+struct mtx atrtc_time_lock;
+MTX_SYSINIT(atrtc_time_lock_init, &atrtc_time_lock, "atrtc_time", MTX_DEF);
+
int atrtcclock_disable = 0;
static int rtc_reg = -1;
@@ -63,16 +78,19 @@
static u_char rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF;
static u_char rtc_statusb = RTCSB_24HR;
+#ifdef DEV_ACPI
+#define _COMPONENT ACPI_TIMER
+ACPI_MODULE_NAME("ATRTC")
+#endif
+
/*
* RTC support routines
*/
-int
-rtcin(int reg)
+static inline u_char
+rtcin_locked(int reg)
{
- u_char val;
- RTC_LOCK;
if (rtc_reg != reg) {
inb(0x84);
outb(IO_RTC, reg);
@@ -79,16 +97,13 @@
rtc_reg = reg;
inb(0x84);
}
- val = inb(IO_RTC + 1);
- RTC_UNLOCK;
- return (val);
+ return (inb(IO_RTC + 1));
}
-void
-writertc(int reg, u_char val)
+static inline void
+rtcout_locked(int reg, u_char val)
{
- RTC_LOCK;
if (rtc_reg != reg) {
inb(0x84);
outb(IO_RTC, reg);
@@ -97,21 +112,36 @@
}
outb(IO_RTC + 1, val);
inb(0x84);
- RTC_UNLOCK;
}
-static __inline int
-readrtc(int port)
+int
+rtcin(int reg)
{
- return(bcd2bin(rtcin(port)));
+ u_char val;
+
+ mtx_lock_spin(&atrtc_lock);
+ val = rtcin_locked(reg);
+ mtx_unlock_spin(&atrtc_lock);
+ return (val);
}
+void
+writertc(int reg, u_char val)
+{
+
+ mtx_lock_spin(&atrtc_lock);
+ rtcout_locked(reg, val);
+ mtx_unlock_spin(&atrtc_lock);
+}
+
static void
atrtc_start(void)
{
- writertc(RTC_STATUSA, rtc_statusa);
- writertc(RTC_STATUSB, RTCSB_24HR);
+ mtx_lock_spin(&atrtc_lock);
+ rtcout_locked(RTC_STATUSA, rtc_statusa);
+ rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+ mtx_unlock_spin(&atrtc_lock);
}
static void
@@ -127,8 +157,10 @@
{
rtc_statusb |= RTCSB_PINTR;
- writertc(RTC_STATUSB, rtc_statusb);
- rtcin(RTC_INTR);
+ mtx_lock_spin(&atrtc_lock);
+ rtcout_locked(RTC_STATUSB, rtc_statusb);
+ rtcin_locked(RTC_INTR);
+ mtx_unlock_spin(&atrtc_lock);
}
static void
@@ -136,8 +168,10 @@
{
rtc_statusb &= ~RTCSB_PINTR;
- writertc(RTC_STATUSB, rtc_statusb);
- rtcin(RTC_INTR);
+ mtx_lock_spin(&atrtc_lock);
+ rtcout_locked(RTC_STATUSB, rtc_statusb);
+ rtcin_locked(RTC_INTR);
+ mtx_unlock_spin(&atrtc_lock);
}
void
@@ -145,11 +179,13 @@
{
/* Restore all of the RTC's "status" (actually, control) registers. */
- rtcin(RTC_STATUSA); /* dummy to get rtc_reg set */
- writertc(RTC_STATUSB, RTCSB_24HR);
- writertc(RTC_STATUSA, rtc_statusa);
- writertc(RTC_STATUSB, rtc_statusb);
- rtcin(RTC_INTR);
+ mtx_lock_spin(&atrtc_lock);
+ rtcin_locked(RTC_STATUSA); /* dummy to get rtc_reg set */
+ rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+ rtcout_locked(RTC_STATUSA, rtc_statusa);
+ rtcout_locked(RTC_STATUSB, rtc_statusb);
+ rtcin_locked(RTC_INTR);
+ mtx_unlock_spin(&atrtc_lock);
}
/**********************************************************************
@@ -162,6 +198,9 @@
struct resource *intr_res;
void *intr_handler;
struct eventtimer et;
+#ifdef DEV_ACPI
+ ACPI_HANDLE acpi_handle;
+#endif
};
static int
@@ -216,7 +255,145 @@
return(flag ? FILTER_HANDLED : FILTER_STRAY);
}
+#ifdef DEV_ACPI
/*
+ * ACPI RTC CMOS address space handler
+ */
+#define ATRTC_LAST_REG 0x40
+
+static void
+rtcin_region(int reg, void *buf, int len)
+{
+ u_char *ptr = buf;
+
+ /* Drop lock after each IO as intr and settime have greater priority */
+ while (len-- > 0)
+ *ptr++ = rtcin(reg++) & 0xff;
+}
+
+static void
+rtcout_region(int reg, const void *buf, int len)
+{
+ const u_char *ptr = buf;
+
+ while (len-- > 0)
+ writertc(reg++, *ptr++);
+}
+
+static bool
+atrtc_check_cmos_access(bool is_read, ACPI_PHYSICAL_ADDRESS addr, UINT32 len)
+{
+
+ /* Block address space wrapping on out-of-bound access */
+ if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG)
+ return (false);
+
+ if (is_read) {
+ /* Reading 0x0C will muck with interrupts */
+ if (addr <= RTC_INTR && addr + len > RTC_INTR)
+ return (false);
+ } else {
+ /*
+ * Allow single-byte writes to alarm registers and
+ * multi-byte writes to addr >= 0x30, else deny.
+ */
+ if (!((len == 1 && (addr == RTC_SECALRM ||
+ addr == RTC_MINALRM ||
+ addr == RTC_HRSALRM)) ||
+ addr >= 0x30))
+ return (false);
+ }
+ return (true);
+}
+
+static ACPI_STATUS
+atrtc_acpi_cmos_handler(UINT32 func, ACPI_PHYSICAL_ADDRESS addr,
+ UINT32 bitwidth, UINT64 *value, void *context, void *region_context)
+{
+ device_t dev = context;
+ UINT32 bytewidth = howmany(bitwidth, 8);
+ bool is_read = func == ACPI_READ;
+
+ /* ACPICA is very verbose on CMOS handler failures, so we, too */
+#define CMOS_HANDLER_ERR(fmt, ...) \
+ device_printf(dev, "ACPI [SystemCMOS] handler: " fmt, ##__VA_ARGS__)
+
+ ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
+
+ if (value == NULL) {
+ CMOS_HANDLER_ERR("NULL parameter\n");
+ return (AE_BAD_PARAMETER);
+ }
+ if (bitwidth == 0 || (bitwidth & 0x07) != 0) {
+ CMOS_HANDLER_ERR("Invalid bitwidth: %u\n", bitwidth);
+ return (AE_BAD_PARAMETER);
+ }
+ if (!atrtc_check_cmos_access(is_read, addr, bytewidth)) {
+ CMOS_HANDLER_ERR("%s access rejected: addr=%#04jx, len=%u\n",
+ is_read ? "Read" : "Write", (uintmax_t)addr, bytewidth);
+ return (AE_BAD_PARAMETER);
+ }
+
+ switch (func) {
+ case ACPI_READ:
+ rtcin_region(addr, value, bytewidth);
+ break;
+ case ACPI_WRITE:
+ rtcout_region(addr, value, bytewidth);
+ break;
+ default:
+ CMOS_HANDLER_ERR("Invalid function: %u\n", func);
+ return (AE_BAD_PARAMETER);
+ }
+
+ ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
+ "ACPI RTC CMOS %s access: addr=%#04x, len=%u, val=%*D\n",
+ is_read ? "read" : "write", (unsigned)addr, bytewidth,
+ bytewidth, value, " ");
+
+ return (AE_OK);
+}
+
+static int
+atrtc_reg_acpi_cmos_handler(device_t dev)
+{
+ struct atrtc_softc *sc = device_get_softc(dev);
+
+ ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+ /* Don't handle address space events if driver is disabled. */
+ if (acpi_disabled("atrtc"))
+ return (ENXIO);
+
+ sc->acpi_handle = acpi_get_handle(dev);
+ if (sc->acpi_handle == NULL ||
+ ACPI_FAILURE(AcpiInstallAddressSpaceHandler(sc->acpi_handle,
+ ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler, NULL, dev))) {
+ sc->acpi_handle = NULL;
+ device_printf(dev,
+ "Can't register ACPI CMOS address space handler\n");
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static int
+atrtc_unreg_acpi_cmos_handler(device_t dev)
+{
+ struct atrtc_softc *sc = device_get_softc(dev);
+
+ ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+ if (sc->acpi_handle != NULL)
+ AcpiRemoveAddressSpaceHandler(sc->acpi_handle,
+ ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler);
+
+ return (0);
+}
+#endif /* DEV_ACPI */
+
+/*
* Attach to the ISA PnP descriptors for the timer and realtime clock.
*/
static struct isa_pnp_id atrtc_ids[] = {
@@ -242,7 +419,7 @@
atrtc_attach(device_t dev)
{
struct atrtc_softc *sc;
- u_long s;
+ rman_res_t s;
int i;
sc = device_get_softc(dev);
@@ -288,6 +465,37 @@
}
static int
+atrtc_isa_attach(device_t dev)
+{
+
+ return (atrtc_attach(dev));
+}
+
+#ifdef DEV_ACPI
+static int
+atrtc_acpi_attach(device_t dev)
+{
+ int ret;
+
+ ret = atrtc_attach(dev);
+ if (ret)
+ return (ret);
+
+ (void)atrtc_reg_acpi_cmos_handler(dev);
+
+ return (0);
+}
+
+static int
+atrtc_acpi_detach(device_t dev)
+{
+
+ (void)atrtc_unreg_acpi_cmos_handler(dev);
+ return (0);
+}
+#endif /* DEV_ACPI */
+
+static int
atrtc_resume(device_t dev)
{
@@ -298,28 +506,38 @@
static int
atrtc_settime(device_t dev __unused, struct timespec *ts)
{
- struct clocktime ct;
+ struct bcd_clocktime bct;
- clock_ts_to_ct(ts, &ct);
+ clock_ts_to_bcd(ts, &bct, false);
+ clock_dbgprint_bcd(dev, CLOCK_DBG_WRITE, &bct);
- /* Disable RTC updates and interrupts. */
- writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
+ mtx_lock(&atrtc_time_lock);
+ mtx_lock_spin(&atrtc_lock);
- writertc(RTC_SEC, bin2bcd(ct.sec)); /* Write back Seconds */
- writertc(RTC_MIN, bin2bcd(ct.min)); /* Write back Minutes */
- writertc(RTC_HRS, bin2bcd(ct.hour)); /* Write back Hours */
+ /* Disable RTC updates and interrupts. */
+ rtcout_locked(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
- writertc(RTC_WDAY, ct.dow + 1); /* Write back Weekday */
- writertc(RTC_DAY, bin2bcd(ct.day)); /* Write back Day */
- writertc(RTC_MONTH, bin2bcd(ct.mon)); /* Write back Month */
- writertc(RTC_YEAR, bin2bcd(ct.year % 100)); /* Write back Year */
+ /* Write all the time registers. */
+ rtcout_locked(RTC_SEC, bct.sec);
+ rtcout_locked(RTC_MIN, bct.min);
+ rtcout_locked(RTC_HRS, bct.hour);
+ rtcout_locked(RTC_WDAY, bct.dow + 1);
+ rtcout_locked(RTC_DAY, bct.day);
+ rtcout_locked(RTC_MONTH, bct.mon);
+ rtcout_locked(RTC_YEAR, bct.year & 0xff);
#ifdef USE_RTC_CENTURY
- writertc(RTC_CENTURY, bin2bcd(ct.year / 100)); /* ... and Century */
+ rtcout_locked(RTC_CENTURY, bct.year >> 8);
#endif
- /* Reenable RTC updates and interrupts. */
- writertc(RTC_STATUSB, rtc_statusb);
- rtcin(RTC_INTR);
+ /*
+ * Re-enable RTC updates and interrupts.
+ */
+ rtcout_locked(RTC_STATUSB, rtc_statusb);
+ rtcin_locked(RTC_INTR);
+
+ mtx_unlock_spin(&atrtc_lock);
+ mtx_unlock(&atrtc_time_lock);
+
return (0);
}
@@ -326,7 +544,7 @@
static int
atrtc_gettime(device_t dev, struct timespec *ts)
{
- struct clocktime ct;
+ struct bcd_clocktime bct;
/* Look if we have a RTC present and the time is valid */
if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) {
@@ -341,32 +559,32 @@
* to make sure that no more than 240us pass after we start reading,
* and try again if so.
*/
+ mtx_lock(&atrtc_time_lock);
while (rtcin(RTC_STATUSA) & RTCSA_TUP)
continue;
- critical_enter();
- ct.nsec = 0;
- ct.sec = readrtc(RTC_SEC);
- ct.min = readrtc(RTC_MIN);
- ct.hour = readrtc(RTC_HRS);
- ct.day = readrtc(RTC_DAY);
- ct.dow = readrtc(RTC_WDAY) - 1;
- ct.mon = readrtc(RTC_MONTH);
- ct.year = readrtc(RTC_YEAR);
+ mtx_lock_spin(&atrtc_lock);
+ bct.sec = rtcin_locked(RTC_SEC);
+ bct.min = rtcin_locked(RTC_MIN);
+ bct.hour = rtcin_locked(RTC_HRS);
+ bct.day = rtcin_locked(RTC_DAY);
+ bct.mon = rtcin_locked(RTC_MONTH);
+ bct.year = rtcin_locked(RTC_YEAR);
#ifdef USE_RTC_CENTURY
- ct.year += readrtc(RTC_CENTURY) * 100;
-#else
- ct.year += (ct.year < 80 ? 2000 : 1900);
+ bct.year |= rtcin_locked(RTC_CENTURY) << 8;
#endif
- critical_exit();
- /* Set dow = -1 because some clocks don't set it correctly. */
- ct.dow = -1;
- return (clock_ct_to_ts(&ct, ts));
+ mtx_unlock_spin(&atrtc_lock);
+ mtx_unlock(&atrtc_time_lock);
+ /* dow is unused in timespec conversion and we have no nsec info. */
+ bct.dow = 0;
+ bct.nsec = 0;
+ clock_dbgprint_bcd(dev, CLOCK_DBG_READ, &bct);
+ return (clock_bcd_to_ts(&bct, ts, false));
}
-static device_method_t atrtc_methods[] = {
+static device_method_t atrtc_isa_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, atrtc_probe),
- DEVMETHOD(device_attach, atrtc_attach),
+ DEVMETHOD(device_attach, atrtc_isa_attach),
DEVMETHOD(device_detach, bus_generic_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
@@ -380,26 +598,38 @@
{ 0, 0 }
};
-static driver_t atrtc_driver = {
+static driver_t atrtc_isa_driver = {
"atrtc",
- atrtc_methods,
+ atrtc_isa_methods,
sizeof(struct atrtc_softc),
};
-static devclass_t atrtc_devclass;
+#ifdef DEV_ACPI
+static device_method_t atrtc_acpi_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, atrtc_probe),
+ DEVMETHOD(device_attach, atrtc_acpi_attach),
+ DEVMETHOD(device_detach, atrtc_acpi_detach),
+ /* XXX stop statclock? */
+ DEVMETHOD(device_resume, atrtc_resume),
-DRIVER_MODULE(atrtc, isa, atrtc_driver, atrtc_devclass, 0, 0);
-DRIVER_MODULE(atrtc, acpi, atrtc_driver, atrtc_devclass, 0, 0);
+ /* clock interface */
+ DEVMETHOD(clock_gettime, atrtc_gettime),
+ DEVMETHOD(clock_settime, atrtc_settime),
-#include "opt_ddb.h"
-#ifdef DDB
-#include <ddb/ddb.h>
+ { 0, 0 }
+};
-DB_SHOW_COMMAND(rtc, rtc)
-{
- printf("%02x/%02x/%02x %02x:%02x:%02x, A = %02x, B = %02x, C = %02x\n",
- rtcin(RTC_YEAR), rtcin(RTC_MONTH), rtcin(RTC_DAY),
- rtcin(RTC_HRS), rtcin(RTC_MIN), rtcin(RTC_SEC),
- rtcin(RTC_STATUSA), rtcin(RTC_STATUSB), rtcin(RTC_INTR));
-}
-#endif /* DDB */
+static driver_t atrtc_acpi_driver = {
+ "atrtc",
+ atrtc_acpi_methods,
+ sizeof(struct atrtc_softc),
+};
+#endif /* DEV_ACPI */
+
+static devclass_t atrtc_devclass;
+
+DRIVER_MODULE(atrtc, isa, atrtc_isa_driver, atrtc_devclass, 0, 0);
+#ifdef DEV_ACPI
+DRIVER_MODULE(atrtc, acpi, atrtc_acpi_driver, atrtc_devclass, 0, 0);
+#endif
Modified: trunk/sys/x86/isa/clock.c
===================================================================
--- trunk/sys/x86/isa/clock.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/clock.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/clock.c 254373 2013-08-15 17:21:06Z brooks $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/clock.c 331722 2018-03-29 02:50:57Z eadler $");
/*
* Routines to handle clock hardware.
@@ -66,6 +66,7 @@
#include <machine/intr_machdep.h>
#include <machine/ppireg.h>
#include <machine/timerreg.h>
+#include <x86/init.h>
#ifdef PC98
#include <pc98/pc98/pc98_machdep.h>
@@ -98,7 +99,7 @@
int i8254_max_count;
static int i8254_timecounter = 1;
-struct mtx clock_lock;
+static struct mtx clock_lock;
static struct intsrc *i8254_intsrc;
static uint16_t i8254_lastcount;
static uint16_t i8254_offset;
@@ -140,6 +141,15 @@
static unsigned i8254_get_timecount(struct timecounter *tc);
static void set_i8254_freq(int mode, uint32_t period);
+void
+clock_init(void)
+{
+ /* Init the clock lock */
+ mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
+ /* Init the clock in order to use DELAY */
+ init_ops.early_clock_source_init();
+}
+
static int
clkintr(void *arg)
{
@@ -157,7 +167,7 @@
mtx_unlock_spin(&clock_lock);
}
- if (sc && sc->et.et_active && sc->mode != MODE_STOP)
+ if (sc->et.et_active && sc->mode != MODE_STOP)
sc->et.et_event_cb(&sc->et, sc->et.et_arg);
#ifdef DEV_MCA
@@ -248,54 +258,6 @@
return ((high << 8) | low);
}
-#ifndef DELAYDEBUG
-static u_int
-get_tsc(__unused struct timecounter *tc)
-{
-
- return (rdtsc32());
-}
-
-static __inline int
-delay_tc(int n)
-{
- struct timecounter *tc;
- timecounter_get_t *func;
- uint64_t end, freq, now;
- u_int last, mask, u;
-
- tc = timecounter;
- freq = atomic_load_acq_64(&tsc_freq);
- if (tsc_is_invariant && freq != 0) {
- func = get_tsc;
- mask = ~0u;
- } else {
- if (tc->tc_quality <= 0)
- return (0);
- func = tc->tc_get_timecount;
- mask = tc->tc_counter_mask;
- freq = tc->tc_frequency;
- }
- now = 0;
- end = freq * n / 1000000;
- if (func == get_tsc)
- sched_pin();
- last = func(tc) & mask;
- do {
- cpu_spinwait();
- u = func(tc) & mask;
- if (u < last)
- now += mask - last + u + 1;
- else
- now += u - last;
- last = u;
- } while (now < end);
- if (func == get_tsc)
- sched_unpin();
- return (1);
-}
-#endif
-
/*
* Wait "n" microseconds.
* Relies on timer 1 counting down from (i8254_freq / hz)
@@ -302,7 +264,7 @@
* Note: timer had better have been programmed before this is first used!
*/
void
-DELAY(int n)
+i8254_delay(int n)
{
int delta, prev_tick, tick, ticks_left;
#ifdef DELAYDEBUG
@@ -318,9 +280,6 @@
}
if (state == 1)
printf("DELAY(%d)...", n);
-#else
- if (delay_tc(n))
- return;
#endif
/*
* Read the counter first, so that the rest of the setup overhead is
@@ -500,7 +459,6 @@
i8254_init(void)
{
- mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
#ifdef PC98
if (pc98_machine_type & M_8M)
i8254_freq = 1996800L; /* 1.9968 MHz */
@@ -518,8 +476,27 @@
void
cpu_initclocks(void)
{
+#ifdef EARLY_AP_STARTUP
+ struct thread *td;
+ int i;
+ td = curthread;
cpu_initclocks_bsp();
+ CPU_FOREACH(i) {
+ if (i == 0)
+ continue;
+ thread_lock(td);
+ sched_bind(td, i);
+ thread_unlock(td);
+ cpu_initclocks_ap();
+ }
+ thread_lock(td);
+ if (sched_is_bound(td))
+ sched_unbind(td);
+ thread_unlock(td);
+#else
+ cpu_initclocks_bsp();
+#endif
}
static int
@@ -699,7 +676,7 @@
attimer_attach(device_t dev)
{
struct attimer_softc *sc;
- u_long s;
+ rman_res_t s;
int i;
attimer_sc = sc = device_get_softc(dev);
Modified: trunk/sys/x86/isa/elcr.c
===================================================================
--- trunk/sys/x86/isa/elcr.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/elcr.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/elcr.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/elcr.c 261520 2014-02-05 18:13:27Z jhb $");
/*
* The ELCR is a register that controls the trigger mode and polarity of
Modified: trunk/sys/x86/isa/icu.h
===================================================================
--- trunk/sys/x86/isa/icu.h 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/icu.h 2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
* SUCH DAMAGE.
*
* from: @(#)icu.h 5.6 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/isa/icu.h 233031 2012-03-16 12:13:44Z nyan $
+ * $FreeBSD: stable/11/sys/x86/isa/icu.h 339928 2018-10-30 19:10:41Z jhb $
*/
/*
@@ -88,7 +88,6 @@
#endif
#define IRQ_MASK(irq) (1 << (irq))
-#define IMEN_MASK(ai) (IRQ_MASK((ai)->at_irq))
void atpic_handle_intr(u_int vector, struct trapframe *frame);
void atpic_startup(void);
Modified: trunk/sys/x86/isa/isa.c
===================================================================
--- trunk/sys/x86/isa/isa.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa.c 221526 2011-05-06 13:48:53Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa.c 295832 2016-02-20 01:32:58Z jhibbits $");
/*-
* Modifications for Intel architecture by Garrett A. Wollman.
@@ -89,13 +89,13 @@
*/
struct resource *
isa_alloc_resource(device_t bus, device_t child, int type, int *rid,
- u_long start, u_long end, u_long count, u_int flags)
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
/*
* Consider adding a resource definition.
*/
int passthrough = (device_get_parent(child) != bus);
- int isdefault = (start == 0UL && end == ~0UL);
+ int isdefault = RMAN_IS_DEFAULT_RANGE(start, end);
struct isa_device* idev = DEVTOISA(child);
struct resource_list *rl = &idev->id_resources;
struct resource_list_entry *rle;
@@ -242,3 +242,8 @@
* On this platform, isa can also attach to the legacy bus.
*/
DRIVER_MODULE(isa, legacy, isa_driver, isa_devclass, 0, 0);
+
+/*
+ * Attach the ISA bus to the xenpv bus in order to get syscons.
+ */
+DRIVER_MODULE(isa, xenpv, isa_driver, isa_devclass, 0, 0);
Modified: trunk/sys/x86/isa/isa_dma.c
===================================================================
--- trunk/sys/x86/isa/isa_dma.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa_dma.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa_dma.c 233675 2012-03-29 18:58:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa_dma.c 332304 2018-04-08 20:52:09Z emaste $");
/*
* code to manage AT bus
@@ -62,7 +62,7 @@
#include <isa/isavar.h>
#include <isa/isa_dmareg.h>
-#define ISARAM_END RAM_END
+#define ISARAM_END 0x1000000
static int isa_dmarangecheck(caddr_t va, u_int length, int chan);
@@ -145,8 +145,7 @@
* in open() or during its initialization.
*/
int
-isa_dma_acquire(chan)
- int chan;
+isa_dma_acquire(int chan)
{
#ifdef DIAGNOSTIC
if (chan & ~VALID_DMA_MASK)
@@ -171,8 +170,7 @@
* during close() or during its shutdown.
*/
void
-isa_dma_release(chan)
- int chan;
+isa_dma_release(int chan)
{
#ifdef DIAGNOSTIC
if (chan & ~VALID_DMA_MASK)
@@ -206,8 +204,7 @@
* external dma control by a board.
*/
void
-isa_dmacascade(chan)
- int chan;
+isa_dmacascade(int chan)
{
#ifdef DIAGNOSTIC
if (chan & ~VALID_DMA_MASK)
Modified: trunk/sys/x86/isa/nmi.c
===================================================================
--- trunk/sys/x86/isa/nmi.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/nmi.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/nmi.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/nmi.c 331722 2018-03-29 02:50:57Z eadler $");
#include "opt_mca.h"
Modified: trunk/sys/x86/isa/orm.c
===================================================================
--- trunk/sys/x86/isa/orm.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/orm.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/orm.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/orm.c 299392 2016-05-10 22:28:06Z bz $");
/*
* Driver to take care of holes in ISA I/O memory occupied
@@ -59,7 +59,7 @@
{ 0, NULL },
};
-#define MAX_ROMS 16
+#define MAX_ROMS 32
struct orm_softc {
int rnum;
@@ -92,6 +92,9 @@
struct orm_softc *sc;
u_int8_t buf[3];
+ if (resource_disabled("orm", 0))
+ return;
+
child = BUS_ADD_CHILD(parent, ISA_ORDER_SENSITIVE, "orm", -1);
device_set_driver(child, driver);
isa_set_logicalid(child, ORM_ID);
@@ -98,7 +101,7 @@
isa_set_vendorid(child, ORM_ID);
sc = device_get_softc(child);
sc->rnum = 0;
- while (chunk < IOMEM_END) {
+ while (sc->rnum < MAX_ROMS && chunk < IOMEM_END) {
bus_set_resource(child, SYS_RES_MEMORY, sc->rnum, chunk,
IOMEM_STEP);
rid = sc->rnum;
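
A minimal sketch of the bounded scan introduced in the orm.c hunk: the loop now stops either at the end of the option-ROM window or after MAX_ROMS entries, whichever comes first. The window and step constants below are illustrative stand-ins, not necessarily the driver's actual IOMEM_* values.

#include <stdio.h>

#define IOMEM_START     0x0c0000u       /* illustrative option-ROM window */
#define IOMEM_END       0x100000u
#define IOMEM_STEP      0x000800u
#define MAX_ROMS        32

int
main(void)
{
        unsigned int chunk;
        int rnum = 0;

        /* Stop at the window end or the MAX_ROMS resource limit. */
        for (chunk = IOMEM_START; rnum < MAX_ROMS && chunk < IOMEM_END;
            chunk += IOMEM_STEP)
                rnum++;                 /* stand-in for bus_set_resource() */

        printf("registered %d candidate regions\n", rnum);
        return (0);
}
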
Modified: trunk/sys/x86/pci/pci_bus.c
===================================================================
--- trunk/sys/x86/pci/pci_bus.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/pci_bus.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/pci_bus.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/pci_bus.c 294883 2016-01-27 02:23:54Z jhibbits $");
#include "opt_cpu.h"
@@ -525,7 +525,7 @@
device_probe_and_attach(pir);
}
#endif
- device_add_child(dev, "pci", bus);
+ device_add_child(dev, "pci", -1);
return bus_generic_attach(dev);
}
@@ -576,12 +576,11 @@
SYSCTL_DECL(_hw_pci);
static unsigned long host_mem_start = 0x80000000;
-TUNABLE_ULONG("hw.pci.host_mem_start", &host_mem_start);
SYSCTL_ULONG(_hw_pci, OID_AUTO, host_mem_start, CTLFLAG_RDTUN, &host_mem_start,
0, "Limit the host bridge memory to being above this address.");
-u_long
-hostb_alloc_start(int type, u_long start, u_long end, u_long count)
+rman_res_t
+hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count)
{
if (start + count - 1 != end) {
@@ -595,7 +594,7 @@
struct resource *
legacy_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
- u_long start, u_long end, u_long count, u_int flags)
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
@@ -611,7 +610,7 @@
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
int
legacy_pcib_adjust_resource(device_t dev, device_t child, int type,
- struct resource *r, u_long start, u_long end)
+ struct resource *r, rman_res_t start, rman_res_t end)
{
if (type == PCI_RES_BUS)
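
A hedged sketch of hostb_alloc_start() as shown above. The full function body is not in the hunk, so the exact semantics here are an assumption inferred from the hw.pci.host_mem_start tunable description: only a wildcard request (one whose range is wider than the requested count) has its start raised to host_mem_start.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t rman_res_t;

static rman_res_t host_mem_start = 0x80000000;  /* hw.pci.host_mem_start */

/*
 * Assumed wildcard clamp: if the caller did not request an exact
 * address range, raise the start of the search window.
 */
static rman_res_t
alloc_start(rman_res_t start, rman_res_t end, rman_res_t count)
{
        if (start + count - 1 != end && start < host_mem_start)
                start = host_mem_start;
        return (start);
}

int
main(void)
{
        printf("wildcard: 0x%jx\n",
            (uintmax_t)alloc_start(0, ~(rman_res_t)0, 0x1000));
        printf("exact:    0x%jx\n",
            (uintmax_t)alloc_start(0x1000, 0x1fff, 0x1000));
        return (0);
}
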
Modified: trunk/sys/x86/pci/qpi.c
===================================================================
--- trunk/sys/x86/pci/qpi.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/qpi.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,14 +27,14 @@
*/
/*
- * This driver provides a psuedo-bus to enumerate the PCI buses
- * present on a sytem using a QPI chipset. It creates a qpi0 bus that
- * is a child of nexus0 and then creates two Host-PCI bridges as a
+ * This driver provides a pseudo-bus to enumerate the PCI buses
+ * present on a system using a QPI chipset. It creates a qpi0 bus that
+ * is a child of nexus0 and then creates Host-PCI bridges as a
* child of that.
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/qpi.c 283927 2015-06-02 19:20:39Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/qpi.c 323609 2017-09-15 09:03:01Z kib $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -64,17 +64,23 @@
static void
qpi_identify(driver_t *driver, device_t parent)
{
+ int do_qpi;
- /* Check CPUID to ensure this is an i7 CPU of some sort. */
- if (!(cpu_vendor_id == CPU_VENDOR_INTEL &&
- CPUID_TO_FAMILY(cpu_id) == 0x6 &&
- (CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2c)))
- return;
+ /* Check CPUID to ensure this is an i7 CPU of some sort. */
+ if (cpu_vendor_id != CPU_VENDOR_INTEL ||
+ CPUID_TO_FAMILY(cpu_id) != 0x6)
+ return;
- /* PCI config register access is required. */
- if (pci_cfgregopen() == 0)
- return;
+ /* Only discover buses with configuration devices if allowed by user */
+ do_qpi = 0;
+ TUNABLE_INT_FETCH("hw.attach_intel_csr_pci", &do_qpi);
+ if (!do_qpi)
+ return;
+ /* PCI config register access is required. */
+ if (pci_cfgregopen() == 0)
+ return;
+
/* Add a qpi bus device. */
if (BUS_ADD_CHILD(parent, 20, "qpi", -1) == NULL)
panic("Failed to add qpi bus");
@@ -98,6 +104,7 @@
struct qpi_device *qdev;
device_t child;
uint32_t devid;
+ int s;
/*
* If a PCI bus already exists for this bus number, then
@@ -107,18 +114,23 @@
return (EEXIST);
/*
- * Attempt to read the device id for device 0, function 0 on
- * the bus. A value of 0xffffffff means that the bus is not
- * present.
+ * Attempt to read the device id for every slot, function 0 on
+ * the bus. If all read values are 0xffffffff this means that
+ * the bus is not present.
*/
- devid = pci_cfgregread(bus, 0, 0, PCIR_DEVVENDOR, 4);
+ for (s = 0; s <= PCI_SLOTMAX; s++) {
+ devid = pci_cfgregread(bus, s, 0, PCIR_DEVVENDOR, 4);
+ if (devid != 0xffffffff)
+ break;
+ }
if (devid == 0xffffffff)
return (ENOENT);
if ((devid & 0xffff) != 0x8086) {
- device_printf(dev,
- "Device at pci%d.0.0 has non-Intel vendor 0x%x\n", bus,
- devid & 0xffff);
+ if (bootverbose)
+ device_printf(dev,
+ "Device at pci%d.%d.0 has non-Intel vendor 0x%x\n",
+ bus, s, devid & 0xffff);
return (ENXIO);
}
@@ -138,12 +150,12 @@
int bus;
/*
- * Each processor socket has a dedicated PCI bus counting down from
- * 255. We keep probing buses until one fails.
+ * Each processor socket has a dedicated PCI bus, sometimes
+ * not enumerated by ACPI. Probe all unattached buses from 0
+ * to 255.
*/
- for (bus = 255;; bus--)
- if (qpi_probe_pcib(dev, bus) != 0)
- break;
+ for (bus = PCI_BUSMAX; bus >= 0; bus--)
+ qpi_probe_pcib(dev, bus);
return (bus_generic_attach(dev));
}
@@ -219,8 +231,8 @@
qpi_pcib_attach(device_t dev)
{
- device_add_child(dev, "pci", pcib_get_bus(dev));
- return (bus_generic_attach(dev));
+ device_add_child(dev, "pci", -1);
+ return (bus_generic_attach(dev));
}
static int
@@ -242,7 +254,7 @@
#if defined(NEW_PCIB) && defined(PCI_RES_BUS)
static struct resource *
qpi_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
- u_long start, u_long end, u_long count, u_int flags)
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
if (type == PCI_RES_BUS)
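
To make the new qpi probe loop concrete, a userland sketch of scanning every slot's function 0 and treating all-ones as "nothing decodes here". cfgread_devvendor() and the example device ID are fabricated stand-ins for pci_cfgregread().

#include <stdint.h>
#include <stdio.h>

#define PCI_SLOTMAX     31
#define EMPTY_DEVID     0xffffffffu

/* Fabricated stand-in for pci_cfgregread(bus, slot, 0, PCIR_DEVVENDOR, 4). */
static uint32_t
cfgread_devvendor(int bus, int slot)
{
        return ((bus == 255 && slot == 8) ? 0x3c008086u : EMPTY_DEVID);
}

/*
 * A bus is considered present if any slot's function 0 returns
 * something other than all-ones.
 */
static int
bus_present(int bus, uint32_t *devid)
{
        int s;

        for (s = 0; s <= PCI_SLOTMAX; s++) {
                *devid = cfgread_devvendor(bus, s);
                if (*devid != EMPTY_DEVID)
                        return (1);
        }
        return (0);
}

int
main(void)
{
        uint32_t devid;

        if (bus_present(255, &devid))
                printf("bus 255 present, devid 0x%08x\n", devid);
        if (!bus_present(254, &devid))
                printf("bus 254 absent\n");
        return (0);
}
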
Added: trunk/sys/x86/x86/autoconf.c
===================================================================
--- trunk/sys/x86/x86/autoconf.c (rev 0)
+++ trunk/sys/x86/x86/autoconf.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,162 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)autoconf.c 7.1 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/autoconf.c 332304 2018-04-08 20:52:09Z emaste $");
+
+/*
+ * Setup the system to run on the current machine.
+ *
+ * Configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring. Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+#include "opt_bootp.h"
+#include "opt_isa.h"
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/reboot.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cons.h>
+
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+
+#ifdef PC98
+#include <machine/bootinfo.h>
+#endif
+#include <machine/md_var.h>
+
+#ifdef DEV_ISA
+#include <isa/isavar.h>
+
+device_t isa_bus_device = 0;
+#endif
+
+static void configure_first(void *);
+static void configure(void *);
+static void configure_final(void *);
+
+SYSINIT(configure1, SI_SUB_CONFIGURE, SI_ORDER_FIRST, configure_first, NULL);
+/* SI_ORDER_SECOND is hookable */
+SYSINIT(configure2, SI_SUB_CONFIGURE, SI_ORDER_THIRD, configure, NULL);
+/* SI_ORDER_MIDDLE is hookable */
+SYSINIT(configure3, SI_SUB_CONFIGURE, SI_ORDER_ANY, configure_final, NULL);
+
+/*
+ * Determine i/o configuration for a machine.
+ */
+static void
+configure_first(void *dummy)
+{
+
+ /* nexus0 is the top of the x86 device tree */
+ device_add_child(root_bus, "nexus", 0);
+}
+
+static void
+configure(void *dummy)
+{
+
+ /* initialize new bus architecture */
+ root_bus_configure();
+
+#ifdef DEV_ISA
+ /*
+ * Explicitly probe and attach ISA last. The isa bus saves
+	 * its device node at attach time for us here.
+ */
+ if (isa_bus_device)
+ isa_probe_children(isa_bus_device);
+#endif
+}
+
+static void
+configure_final(void *dummy)
+{
+
+ cninit_finish();
+
+ if (bootverbose) {
+#ifdef PC98
+ int i;
+
+ /*
+ * Print out the BIOS's idea of the disk geometries.
+ */
+ printf("BIOS Geometries:\n");
+ for (i = 0; i < N_BIOS_GEOM; i++) {
+ unsigned long bios_geom;
+ int max_cylinder, max_head, max_sector;
+
+ bios_geom = bootinfo.bi_bios_geom[i];
+
+ /*
+ * XXX the bootstrap punts a 1200K floppy geometry
+ * when the get-disk-geometry interrupt fails. Skip
+ * drives that have this geometry.
+ */
+ if (bios_geom == 0x4f020f)
+ continue;
+
+ printf(" %x:%08lx ", i, bios_geom);
+ max_cylinder = bios_geom >> 16;
+ max_head = (bios_geom >> 8) & 0xff;
+ max_sector = bios_geom & 0xff;
+ printf(
+ "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n",
+ max_cylinder, max_cylinder + 1,
+ max_head, max_head + 1,
+ max_sector, max_sector);
+ }
+ printf(" %d accounted for\n", bootinfo.bi_n_bios_used);
+#endif
+
+ printf("Device configuration finished.\n");
+ }
+ cold = 0;
+}
Property changes on: trunk/sys/x86/x86/autoconf.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
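
A toy model of the staged configuration performed by autoconf.c: SYSINIT-style entries carry an order key and run lowest-first, leaving gaps free for other subsystems to hook. The numeric order values below are illustrative only, not the real SI_ORDER_* constants.

#include <stdio.h>
#include <stdlib.h>

struct initent {
        int     order;
        void    (*fn)(void);
};

static void configure_first(void) { printf("1: add nexus0\n"); }
static void configure(void)       { printf("2: probe and attach the tree\n"); }
static void configure_final(void) { printf("3: finish console, clear cold\n"); }

static struct initent inits[] = {
        { 300, configure },             /* registered out of order on purpose */
        { 900, configure_final },
        { 100, configure_first },
};

static int
initcmp(const void *a, const void *b)
{
        return (((const struct initent *)a)->order -
            ((const struct initent *)b)->order);
}

int
main(void)
{
        size_t i, n = sizeof(inits) / sizeof(inits[0]);

        qsort(inits, n, sizeof(inits[0]), initcmp);
        for (i = 0; i < n; i++)
                inits[i].fn();
        return (0);
}
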
Modified: trunk/sys/x86/x86/bus_machdep.c
===================================================================
--- trunk/sys/x86/x86/bus_machdep.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/bus_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/bus_machdep.c 287126 2015-08-25 14:39:40Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/bus_machdep.c 286667 2015-08-12 15:26:32Z marcel $");
#include <sys/param.h>
#include <sys/systm.h>
Modified: trunk/sys/x86/x86/busdma_bounce.c
===================================================================
--- trunk/sys/x86/x86/busdma_bounce.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_bounce.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_bounce.c 318977 2017-05-27 08:17:59Z hselasky $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_bounce.c 343361 2019-01-23 20:49:14Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -80,7 +80,8 @@
vm_offset_t vaddr; /* kva of bounce buffer */
bus_addr_t busaddr; /* Physical address */
vm_offset_t datavaddr; /* kva of client data */
- bus_addr_t dataaddr; /* client physical address */
+ vm_offset_t dataoffs; /* page offset of client data */
+ vm_page_t datapage[2]; /* physical page(s) of client data */
bus_size_t datacount; /* client data count */
STAILQ_ENTRY(bounce_page) links;
};
@@ -135,10 +136,9 @@
static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
int commit);
static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
- vm_offset_t vaddr, bus_addr_t addr,
- bus_size_t size);
+ vm_offset_t vaddr, bus_addr_t addr1,
+ bus_addr_t addr2, bus_size_t size);
static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
-int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
static void _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
pmap_t pmap, void *buf, bus_size_t buflen,
int flags);
@@ -148,11 +148,6 @@
static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
int flags);
-#ifdef XEN
-#undef pmap_kextract
-#define pmap_kextract pmap_kextract_ma
-#endif
-
/*
* Allocate a device specific dma_tag.
*/
@@ -494,7 +489,8 @@
while (buflen != 0) {
sgsize = MIN(buflen, dmat->common.maxsegsz);
if (bus_dma_run_filter(&dmat->common, curaddr)) {
- sgsize = MIN(sgsize, PAGE_SIZE);
+ sgsize = MIN(sgsize,
+ PAGE_SIZE - (curaddr & PAGE_MASK));
map->pagesneeded++;
}
curaddr += sgsize;
@@ -544,6 +540,51 @@
}
}
+static void
+_bus_dmamap_count_ma(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma,
+ int ma_offs, bus_size_t buflen, int flags)
+{
+ bus_size_t sg_len, max_sgsize;
+ int page_index;
+ vm_paddr_t paddr;
+
+ if ((map != &nobounce_dmamap && map->pagesneeded == 0)) {
+ CTR4(KTR_BUSDMA, "lowaddr= %d Maxmem= %d, boundary= %d, "
+ "alignment= %d", dmat->common.lowaddr,
+ ptoa((vm_paddr_t)Maxmem),
+ dmat->common.boundary, dmat->common.alignment);
+ CTR3(KTR_BUSDMA, "map= %p, nobouncemap= %p, pagesneeded= %d",
+ map, &nobounce_dmamap, map->pagesneeded);
+
+ /*
+ * Count the number of bounce pages
+ * needed in order to complete this transfer
+ */
+ page_index = 0;
+ while (buflen > 0) {
+ paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+ sg_len = PAGE_SIZE - ma_offs;
+ max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+ sg_len = MIN(sg_len, max_sgsize);
+ if (bus_dma_run_filter(&dmat->common, paddr) != 0) {
+ sg_len = roundup2(sg_len,
+ dmat->common.alignment);
+ sg_len = MIN(sg_len, max_sgsize);
+ KASSERT((sg_len & (dmat->common.alignment - 1))
+ == 0, ("Segment size is not aligned"));
+ map->pagesneeded++;
+ }
+ if (((ma_offs + sg_len) & ~PAGE_MASK) != 0)
+ page_index++;
+ ma_offs = (ma_offs + sg_len) & PAGE_MASK;
+ KASSERT(buflen >= sg_len,
+ ("Segment length overruns original buffer"));
+ buflen -= sg_len;
+ }
+ CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded);
+ }
+}
+
static int
_bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags)
{
@@ -648,8 +689,8 @@
if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
map->pagesneeded != 0 &&
bus_dma_run_filter(&dmat->common, curaddr)) {
- sgsize = MIN(sgsize, PAGE_SIZE);
- curaddr = add_bounce_page(dmat, map, 0, curaddr,
+ sgsize = MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK));
+ curaddr = add_bounce_page(dmat, map, 0, curaddr, 0,
sgsize);
}
sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs,
@@ -677,7 +718,7 @@
{
bus_size_t sgsize, max_sgsize;
bus_addr_t curaddr;
- vm_offset_t vaddr;
+ vm_offset_t kvaddr, vaddr;
int error;
if (map == NULL)
@@ -700,22 +741,25 @@
/*
* Get the physical address for this segment.
*/
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap) {
curaddr = pmap_kextract(vaddr);
- else
+ kvaddr = vaddr;
+ } else {
curaddr = pmap_extract(pmap, vaddr);
+ kvaddr = 0;
+ }
/*
* Compute the segment size, and adjust counts.
*/
max_sgsize = MIN(buflen, dmat->common.maxsegsz);
- sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK);
+ sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
map->pagesneeded != 0 &&
bus_dma_run_filter(&dmat->common, curaddr)) {
sgsize = roundup2(sgsize, dmat->common.alignment);
sgsize = MIN(sgsize, max_sgsize);
- curaddr = add_bounce_page(dmat, map, vaddr, curaddr,
+ curaddr = add_bounce_page(dmat, map, kvaddr, curaddr, 0,
sgsize);
} else {
sgsize = MIN(sgsize, max_sgsize);
@@ -734,6 +778,88 @@
return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
}
+static int
+bounce_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct vm_page **ma, bus_size_t buflen, int ma_offs, int flags,
+ bus_dma_segment_t *segs, int *segp)
+{
+ vm_paddr_t paddr, next_paddr;
+ int error, page_index;
+ bus_size_t sgsize, max_sgsize;
+
+ if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
+ /*
+ * If we have to keep the offset of each page this function
+ * is not suitable, switch back to bus_dmamap_load_ma_triv
+ * which is going to do the right thing in this case.
+ */
+ error = bus_dmamap_load_ma_triv(dmat, map, ma, buflen, ma_offs,
+ flags, segs, segp);
+ return (error);
+ }
+
+ if (map == NULL)
+ map = &nobounce_dmamap;
+
+ if (segs == NULL)
+ segs = dmat->segments;
+
+ if ((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) {
+ _bus_dmamap_count_ma(dmat, map, ma, ma_offs, buflen, flags);
+ if (map->pagesneeded != 0) {
+ error = _bus_dmamap_reserve_pages(dmat, map, flags);
+ if (error)
+ return (error);
+ }
+ }
+
+ page_index = 0;
+ while (buflen > 0) {
+ /*
+ * Compute the segment size, and adjust counts.
+ */
+ paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+ max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+ sgsize = PAGE_SIZE - ma_offs;
+ if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+ map->pagesneeded != 0 &&
+ bus_dma_run_filter(&dmat->common, paddr)) {
+ sgsize = roundup2(sgsize, dmat->common.alignment);
+ sgsize = MIN(sgsize, max_sgsize);
+ KASSERT((sgsize & (dmat->common.alignment - 1)) == 0,
+ ("Segment size is not aligned"));
+ /*
+ * Check if two pages of the user provided buffer
+ * are used.
+ */
+ if ((ma_offs + sgsize) > PAGE_SIZE)
+ next_paddr =
+ VM_PAGE_TO_PHYS(ma[page_index + 1]);
+ else
+ next_paddr = 0;
+ paddr = add_bounce_page(dmat, map, 0, paddr,
+ next_paddr, sgsize);
+ } else {
+ sgsize = MIN(sgsize, max_sgsize);
+ }
+ sgsize = _bus_dmamap_addseg(dmat, map, paddr, sgsize, segs,
+ segp);
+ if (sgsize == 0)
+ break;
+ KASSERT(buflen >= sgsize,
+ ("Segment length overruns original buffer"));
+ buflen -= sgsize;
+ if (((ma_offs + sgsize) & ~PAGE_MASK) != 0)
+ page_index++;
+ ma_offs = (ma_offs + sgsize) & PAGE_MASK;
+ }
+
+ /*
+ * Did we fit?
+ */
+ return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
+}
+
static void
bounce_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map,
struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
@@ -779,6 +905,8 @@
bus_dmasync_op_t op)
{
struct bounce_page *bpage;
+ vm_offset_t datavaddr, tempvaddr;
+ bus_size_t datacount1, datacount2;
if (map == NULL || (bpage = STAILQ_FIRST(&map->bpages)) == NULL)
return;
@@ -792,13 +920,40 @@
if ((op & BUS_DMASYNC_PREWRITE) != 0) {
while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->datavaddr,
- (void *)bpage->vaddr, bpage->datacount);
- } else {
- physcopyout(bpage->dataaddr,
- (void *)bpage->vaddr, bpage->datacount);
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ datacount1 = bpage->datacount;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage[0]);
+ datavaddr = tempvaddr | bpage->dataoffs;
+ datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+ datacount1);
}
+
+ bcopy((void *)datavaddr,
+ (void *)bpage->vaddr, datacount1);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+
+ if (bpage->datapage[1] == 0) {
+ KASSERT(datacount1 == bpage->datacount,
+ ("Mismatch between data size and provided memory space"));
+ goto next_w;
+ }
+
+ /*
+ * We are dealing with an unmapped buffer that expands
+ * over two pages.
+ */
+ datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+ datacount2 = bpage->datacount - datacount1;
+ bcopy((void *)datavaddr,
+ (void *)(bpage->vaddr + datacount1), datacount2);
+ pmap_quick_remove_page(datavaddr);
+
+next_w:
bpage = STAILQ_NEXT(bpage, links);
}
dmat->bounce_zone->total_bounced++;
@@ -806,14 +961,40 @@
if ((op & BUS_DMASYNC_POSTREAD) != 0) {
while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->vaddr,
- (void *)bpage->datavaddr,
- bpage->datacount);
- } else {
- physcopyin((void *)bpage->vaddr,
- bpage->dataaddr, bpage->datacount);
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ datacount1 = bpage->datacount;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage[0]);
+ datavaddr = tempvaddr | bpage->dataoffs;
+ datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+ datacount1);
}
+
+ bcopy((void *)bpage->vaddr, (void *)datavaddr,
+ datacount1);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+
+ if (bpage->datapage[1] == 0) {
+ KASSERT(datacount1 == bpage->datacount,
+ ("Mismatch between data size and provided memory space"));
+ goto next_r;
+ }
+
+ /*
+			 * We are dealing with an unmapped buffer that spans
+			 * two pages.
+ */
+ datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+ datacount2 = bpage->datacount - datacount1;
+ bcopy((void *)(bpage->vaddr + datacount1),
+ (void *)datavaddr, datacount2);
+ pmap_quick_remove_page(datavaddr);
+
+next_r:
bpage = STAILQ_NEXT(bpage, links);
}
dmat->bounce_zone->total_bounced++;
@@ -979,7 +1160,7 @@
static bus_addr_t
add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
- bus_addr_t addr, bus_size_t size)
+ bus_addr_t addr1, bus_addr_t addr2, bus_size_t size)
{
struct bounce_zone *bz;
struct bounce_page *bpage;
@@ -1009,11 +1190,16 @@
if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
/* Page offset needs to be preserved. */
- bpage->vaddr |= addr & PAGE_MASK;
- bpage->busaddr |= addr & PAGE_MASK;
+ bpage->vaddr |= addr1 & PAGE_MASK;
+ bpage->busaddr |= addr1 & PAGE_MASK;
+ KASSERT(addr2 == 0,
+ ("Trying to bounce multiple pages with BUS_DMA_KEEP_PG_OFFSET"));
}
bpage->datavaddr = vaddr;
- bpage->dataaddr = addr;
+ bpage->datapage[0] = PHYS_TO_VM_PAGE(addr1);
+ KASSERT((addr2 & PAGE_MASK) == 0, ("Second page is not aligned"));
+ bpage->datapage[1] = PHYS_TO_VM_PAGE(addr2);
+ bpage->dataoffs = addr1 & PAGE_MASK;
bpage->datacount = size;
STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
return (bpage->busaddr);
@@ -1085,7 +1271,7 @@
.mem_free = bounce_bus_dmamem_free,
.load_phys = bounce_bus_dmamap_load_phys,
.load_buffer = bounce_bus_dmamap_load_buffer,
- .load_ma = bus_dmamap_load_ma_triv,
+ .load_ma = bounce_bus_dmamap_load_ma,
.map_waitok = bounce_bus_dmamap_waitok,
.map_complete = bounce_bus_dmamap_complete,
.map_unload = bounce_bus_dmamap_unload,
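
To illustrate the page-boundary capping that busdma_bounce.c now applies when sizing bounced segments, a standalone sketch follows; the buffer address, length and maxsegsz are arbitrary example values.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE       4096u
#define PAGE_MASK       (PAGE_SIZE - 1)

static uint32_t
min3(uint32_t a, uint32_t b, uint32_t c)
{
        uint32_t m;

        m = (a < b) ? a : b;
        return ((m < c) ? m : c);
}

int
main(void)
{
        /* Arbitrary unaligned buffer start, length and segment limit. */
        uint64_t curaddr = 0x12345f00;
        uint32_t buflen = 10000, maxsegsz = 65536, sgsize;

        /*
         * Each bounced segment is capped at the distance to the next page
         * boundary of the source address, so its page offset can be kept
         * consistent with the bounce page.
         */
        while (buflen > 0) {
                sgsize = min3(buflen, maxsegsz,
                    PAGE_SIZE - (uint32_t)(curaddr & PAGE_MASK));
                printf("segment at 0x%jx, length %u\n",
                    (uintmax_t)curaddr, sgsize);
                curaddr += sgsize;
                buflen -= sgsize;
        }
        return (0);
}
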
Modified: trunk/sys/x86/x86/busdma_machdep.c
===================================================================
--- trunk/sys/x86/x86/busdma_machdep.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_machdep.c 259511 2013-12-17 13:39:50Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_machdep.c 257230 2013-10-27 22:05:10Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
Added: trunk/sys/x86/x86/cpu_machdep.c
===================================================================
--- trunk/sys/x86/x86/cpu_machdep.c (rev 0)
+++ trunk/sys/x86/x86/cpu_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1359 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/cpu_machdep.c 355701 2019-12-13 06:54:41Z scottl $");
+
+#include "opt_atpic.h"
+#include "opt_compat.h"
+#include "opt_cpu.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_isa.h"
+#include "opt_kdb.h"
+#include "opt_kstack_pages.h"
+#include "opt_maxmem.h"
+#include "opt_mp_watchdog.h"
+#include "opt_perfmon.h"
+#include "opt_platform.h"
+#ifdef __i386__
+#include "opt_apic.h"
+#include "opt_xbox.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+#include <machine/mp_watchdog.h>
+#ifdef PERFMON
+#include <machine/perfmon.h>
+#endif
+#include <machine/tss.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+#ifdef CPU_ELAN
+#include <machine/elan_mmcr.h>
+#endif
+#include <x86/acpica_machdep.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#ifndef PC98
+#include <isa/isareg.h>
+#endif
+
+#define STATE_RUNNING 0x0
+#define STATE_MWAIT 0x1
+#define STATE_SLEEPING 0x2
+
+#ifdef SMP
+static u_int cpu_reset_proxyid;
+static volatile u_int cpu_reset_proxy_active;
+#endif
+
+struct msr_op_arg {
+ u_int msr;
+ int op;
+ uint64_t arg1;
+};
+
+static void
+x86_msr_op_one(void *argp)
+{
+ struct msr_op_arg *a;
+ uint64_t v;
+
+ a = argp;
+ switch (a->op) {
+ case MSR_OP_ANDNOT:
+ v = rdmsr(a->msr);
+ v &= ~a->arg1;
+ wrmsr(a->msr, v);
+ break;
+ case MSR_OP_OR:
+ v = rdmsr(a->msr);
+ v |= a->arg1;
+ wrmsr(a->msr, v);
+ break;
+ case MSR_OP_WRITE:
+ wrmsr(a->msr, a->arg1);
+ break;
+ }
+}
+
+#define MSR_OP_EXMODE_MASK 0xf0000000
+#define MSR_OP_OP_MASK 0x000000ff
+
+void
+x86_msr_op(u_int msr, u_int op, uint64_t arg1)
+{
+ struct thread *td;
+ struct msr_op_arg a;
+ u_int exmode;
+ int bound_cpu, i, is_bound;
+
+ a.op = op & MSR_OP_OP_MASK;
+ MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
+ a.op == MSR_OP_WRITE);
+ exmode = op & MSR_OP_EXMODE_MASK;
+ MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
+ exmode == MSR_OP_RENDEZVOUS);
+ a.msr = msr;
+ a.arg1 = arg1;
+ switch (exmode) {
+ case MSR_OP_LOCAL:
+ x86_msr_op_one(&a);
+ break;
+ case MSR_OP_SCHED:
+ td = curthread;
+ thread_lock(td);
+ is_bound = sched_is_bound(td);
+ bound_cpu = td->td_oncpu;
+ CPU_FOREACH(i) {
+ sched_bind(td, i);
+ x86_msr_op_one(&a);
+ }
+ if (is_bound)
+ sched_bind(td, bound_cpu);
+ else
+ sched_unbind(td);
+ thread_unlock(td);
+ break;
+ case MSR_OP_RENDEZVOUS:
+ smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
+ break;
+ }
+}
+
+/*
+ * Machine dependent boot() routine
+ *
+ * I haven't seen anything to put here yet
+ * Possibly some stuff might be grafted back here from boot()
+ */
+void
+cpu_boot(int howto)
+{
+}
+
+/*
+ * Flush the D-cache for non-DMA I/O so that the I-cache can
+ * be made coherent later.
+ */
+void
+cpu_flush_dcache(void *ptr, size_t len)
+{
+ /* Not applicable */
+}
+
+void
+acpi_cpu_c1(void)
+{
+
+ __asm __volatile("sti; hlt");
+}
+
+/*
+ * Use mwait to pause execution while waiting for an interrupt or
+ * another thread to signal that there is more work.
+ *
+ * NOTE: Interrupts will cause a wakeup; however, this function does
+ * not enable interrupt handling. The caller is responsible for
+ * enabling interrupts.
+ */
+void
+acpi_cpu_idle_mwait(uint32_t mwait_hint)
+{
+ int *state;
+ uint64_t v;
+
+ /*
+	 * A comment in a Linux patch claims that 'CPUs run faster with
+ * speculation protection disabled. All CPU threads in a core
+ * must disable speculation protection for it to be
+ * disabled. Disable it while we are idle so the other
+ * hyperthread can run fast.'
+ *
+ * XXXKIB. Software coordination mode should be supported,
+ * but all Intel CPUs provide hardware coordination.
+ */
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ KASSERT(atomic_load_int(state) == STATE_SLEEPING,
+ ("cpu_mwait_cx: wrong monitorbuf state"));
+ atomic_store_int(state, STATE_MWAIT);
+ if (PCPU_GET(ibpb_set) || hw_ssb_active) {
+ v = rdmsr(MSR_IA32_SPEC_CTRL);
+ wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
+ IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
+ } else {
+ v = 0;
+ }
+ cpu_monitor(state, 0, 0);
+ if (atomic_load_int(state) == STATE_MWAIT)
+ cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+
+ /*
+ * SSB cannot be disabled while we sleep, or rather, if it was
+ * disabled, the sysctl thread will bind to our cpu to tweak
+ * MSR.
+ */
+ if (v != 0)
+ wrmsr(MSR_IA32_SPEC_CTRL, v);
+
+ /*
+ * We should exit on any event that interrupts mwait, because
+ * that event might be a wanted interrupt.
+ */
+ atomic_store_int(state, STATE_RUNNING);
+}
+
+/* Get current clock frequency for the given cpu id. */
+int
+cpu_est_clockrate(int cpu_id, uint64_t *rate)
+{
+ uint64_t tsc1, tsc2;
+ uint64_t acnt, mcnt, perf;
+ register_t reg;
+
+ if (pcpu_find(cpu_id) == NULL || rate == NULL)
+ return (EINVAL);
+#ifdef __i386__
+ if ((cpu_feature & CPUID_TSC) == 0)
+ return (EOPNOTSUPP);
+#endif
+
+ /*
+ * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
+ * DELAY(9) based logic fails.
+ */
+ if (tsc_is_invariant && !tsc_perf_stat)
+ return (EOPNOTSUPP);
+
+#ifdef SMP
+ if (smp_cpus > 1) {
+ /* Schedule ourselves on the indicated cpu. */
+ thread_lock(curthread);
+ sched_bind(curthread, cpu_id);
+ thread_unlock(curthread);
+ }
+#endif
+
+ /* Calibrate by measuring a short delay. */
+ reg = intr_disable();
+ if (tsc_is_invariant) {
+ wrmsr(MSR_MPERF, 0);
+ wrmsr(MSR_APERF, 0);
+ tsc1 = rdtsc();
+ DELAY(1000);
+ mcnt = rdmsr(MSR_MPERF);
+ acnt = rdmsr(MSR_APERF);
+ tsc2 = rdtsc();
+ intr_restore(reg);
+ perf = 1000 * acnt / mcnt;
+ *rate = (tsc2 - tsc1) * perf;
+ } else {
+ tsc1 = rdtsc();
+ DELAY(1000);
+ tsc2 = rdtsc();
+ intr_restore(reg);
+ *rate = (tsc2 - tsc1) * 1000;
+ }
+
+#ifdef SMP
+ if (smp_cpus > 1) {
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * Shutdown the CPU as much as possible
+ */
+void
+cpu_halt(void)
+{
+ for (;;)
+ halt();
+}
+
+static void
+cpu_reset_real(void)
+{
+ struct region_descriptor null_idt;
+#ifndef PC98
+ int b;
+#endif
+
+ disable_intr();
+#ifdef CPU_ELAN
+ if (elan_mmcr != NULL)
+ elan_mmcr->RESCFG = 1;
+#endif
+#ifdef __i386__
+ if (cpu == CPU_GEODE1100) {
+ /* Attempt Geode's own reset */
+ outl(0xcf8, 0x80009044ul);
+ outl(0xcfc, 0xf);
+ }
+#endif
+#ifdef PC98
+ /*
+ * Attempt to do a CPU reset via CPU reset port.
+ */
+ if ((inb(0x35) & 0xa0) != 0xa0) {
+ outb(0x37, 0x0f); /* SHUT0 = 0. */
+ outb(0x37, 0x0b); /* SHUT1 = 0. */
+ }
+ outb(0xf0, 0x00); /* Reset. */
+#else
+#if !defined(BROKEN_KEYBOARD_RESET)
+ /*
+ * Attempt to do a CPU reset via the keyboard controller,
+ * do not turn off GateA20, as any machine that fails
+ * to do the reset here would then end up in no man's land.
+ */
+ outb(IO_KBD + 4, 0xFE);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+#endif
+
+ /*
+ * Attempt to force a reset via the Reset Control register at
+ * I/O port 0xcf9. Bit 2 forces a system reset when it
+ * transitions from 0 to 1. Bit 1 selects the type of reset
+ * to attempt: 0 selects a "soft" reset, and 1 selects a
+ * "hard" reset. We try a "hard" reset. The first write sets
+ * bit 1 to select a "hard" reset and clears bit 2. The
+ * second write forces a 0 -> 1 transition in bit 2 to trigger
+ * a reset.
+ */
+ outb(0xcf9, 0x2);
+ outb(0xcf9, 0x6);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+
+ /*
+ * Attempt to force a reset via the Fast A20 and Init register
+ * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
+ * Bit 0 asserts INIT# when set to 1. We are careful to only
+ * preserve bit 1 while setting bit 0. We also must clear bit
+ * 0 before setting it if it isn't already clear.
+ */
+ b = inb(0x92);
+ if (b != 0xff) {
+ if ((b & 0x1) != 0)
+ outb(0x92, b & 0xfe);
+ outb(0x92, b | 0x1);
+ DELAY(500000); /* wait 0.5 sec to see if that did it */
+ }
+#endif /* PC98 */
+
+ printf("No known reset method worked, attempting CPU shutdown\n");
+ DELAY(1000000); /* wait 1 sec for printf to complete */
+
+ /* Wipe the IDT. */
+ null_idt.rd_limit = 0;
+ null_idt.rd_base = 0;
+ lidt(&null_idt);
+
+ /* "good night, sweet prince .... <THUNK!>" */
+ breakpoint();
+
+ /* NOTREACHED */
+ while(1);
+}
+
+#ifdef SMP
+static void
+cpu_reset_proxy(void)
+{
+
+ cpu_reset_proxy_active = 1;
+ while (cpu_reset_proxy_active == 1)
+ ia32_pause(); /* Wait for other cpu to see that we've started */
+
+ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
+ DELAY(1000000);
+ cpu_reset_real();
+}
+#endif
+
+void
+cpu_reset(void)
+{
+#ifdef SMP
+ cpuset_t map;
+ u_int cnt;
+
+ if (smp_started) {
+ map = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &map);
+ CPU_NAND(&map, &stopped_cpus);
+ if (!CPU_EMPTY(&map)) {
+ printf("cpu_reset: Stopping other CPUs\n");
+ stop_cpus(map);
+ }
+
+ if (PCPU_GET(cpuid) != 0) {
+ cpu_reset_proxyid = PCPU_GET(cpuid);
+ cpustop_restartfunc = cpu_reset_proxy;
+ cpu_reset_proxy_active = 0;
+ printf("cpu_reset: Restarting BSP\n");
+
+ /* Restart CPU #0. */
+ CPU_SETOF(0, &started_cpus);
+ wmb();
+
+ cnt = 0;
+ while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+ ia32_pause();
+ cnt++; /* Wait for BSP to announce restart */
+ }
+ if (cpu_reset_proxy_active == 0) {
+ printf("cpu_reset: Failed to restart BSP\n");
+ } else {
+ cpu_reset_proxy_active = 2;
+ while (1)
+ ia32_pause();
+ /* NOTREACHED */
+ }
+ }
+
+ DELAY(1000000);
+ }
+#endif
+ cpu_reset_real();
+ /* NOTREACHED */
+}
+
+bool
+cpu_mwait_usable(void)
+{
+
+ return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
+ (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
+ (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
+}
+
+void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
+static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
+static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
+SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
+ 0, "Use MONITOR/MWAIT for short idle");
+
+#ifndef PC98
+static void
+cpu_idle_acpi(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ atomic_store_int(state, STATE_SLEEPING);
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable())
+ enable_intr();
+ else if (cpu_idle_hook)
+ cpu_idle_hook(sbt);
+ else
+ acpi_cpu_c1();
+ atomic_store_int(state, STATE_RUNNING);
+}
+#endif /* !PC98 */
+
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ atomic_store_int(state, STATE_SLEEPING);
+
+ /*
+ * Since we may be in a critical section from cpu_idle(), if
+ * an interrupt fires during that critical section we may have
+ * a pending preemption. If the CPU halts, then that thread
+ * may not execute until a later interrupt awakens the CPU.
+ * To handle this race, check for a runnable thread after
+ * disabling interrupts and immediately return if one is
+	 * found. Also, we must absolutely guarantee that hlt is
+	 * the next instruction after sti. This ensures that any
+	 * interrupt that fires after the call to disable_intr() will
+	 * immediately awaken the CPU from hlt. Finally, note that on
+	 * x86 this works because interrupts are enabled only after the
+	 * instruction following sti executes, while IF is set to 1
+	 * immediately, allowing the hlt instruction to acknowledge the
+	 * interrupt.
+ */
+ disable_intr();
+ if (sched_runnable())
+ enable_intr();
+ else
+ acpi_cpu_c1();
+ atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+ int *state;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ atomic_store_int(state, STATE_MWAIT);
+
+ /* See comments in cpu_idle_hlt(). */
+ disable_intr();
+ if (sched_runnable()) {
+ atomic_store_int(state, STATE_RUNNING);
+ enable_intr();
+ return;
+ }
+
+ cpu_monitor(state, 0, 0);
+ if (atomic_load_int(state) == STATE_MWAIT)
+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+ else
+ enable_intr();
+ atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_spin(sbintime_t sbt)
+{
+ int *state;
+ int i;
+
+ state = (int *)PCPU_PTR(monitorbuf);
+ atomic_store_int(state, STATE_RUNNING);
+
+ /*
+	 * The sched_runnable() call is racy, but since this runs in a
+	 * loop, missing it once has little impact, if any
+	 * (and it is much better than missing the check at all).
+ */
+ for (i = 0; i < 1000; i++) {
+ if (sched_runnable())
+ return;
+ cpu_spinwait();
+ }
+}
+
+/*
+ * C1E renders the local APIC timer dead, so we disable it by
+ * reading the Interrupt Pending Message register and clearing
+ * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
+ *
+ * Reference:
+ * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
+ * #32559 revision 3.00+
+ */
+#define MSR_AMDK8_IPM 0xc0010055
+#define AMDK8_SMIONCMPHALT (1ULL << 27)
+#define AMDK8_C1EONCMPHALT (1ULL << 28)
+#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
+
+void
+cpu_probe_amdc1e(void)
+{
+
+ /*
+ * Detect the presence of C1E capability mostly on latest
+ * dual-cores (or future) k8 family.
+ */
+ if (cpu_vendor_id == CPU_VENDOR_AMD &&
+ (cpu_id & 0x00000f00) == 0x00000f00 &&
+ (cpu_id & 0x0fff0000) >= 0x00040000) {
+ cpu_ident_amdc1e = 1;
+ }
+}
+
+#if defined(__i386__) && defined(PC98)
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
+#else
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
+#endif
+
+void
+cpu_idle(int busy)
+{
+ uint64_t msr;
+ sbintime_t sbt = -1;
+
+ CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
+ busy, curcpu);
+#ifdef MP_WATCHDOG
+ ap_watchdog(PCPU_GET(cpuid));
+#endif
+
+ /* If we are busy - try to use fast methods. */
+ if (busy) {
+ if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
+ cpu_idle_mwait(busy);
+ goto out;
+ }
+ }
+
+ /* If we have time - switch timers into idle mode. */
+ if (!busy) {
+ critical_enter();
+ sbt = cpu_idleclock();
+ }
+
+ /* Apply AMD APIC timer C1E workaround. */
+ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
+ msr = rdmsr(MSR_AMDK8_IPM);
+ if (msr & AMDK8_CMPHALT)
+ wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
+ }
+
+ /* Call main idle method. */
+ cpu_idle_fn(sbt);
+
+ /* Switch timers back into active mode. */
+ if (!busy) {
+ cpu_activeclock();
+ critical_exit();
+ }
+out:
+ CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
+ busy, curcpu);
+}
+
+static int cpu_idle_apl31_workaround;
+SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
+ &cpu_idle_apl31_workaround, 0,
+ "Apollo Lake APL31 MWAIT bug workaround");
+
+int
+cpu_idle_wakeup(int cpu)
+{
+ int *state;
+
+ state = (int *)pcpu_find(cpu)->pc_monitorbuf;
+ switch (atomic_load_int(state)) {
+ case STATE_SLEEPING:
+ return (0);
+ case STATE_MWAIT:
+ atomic_store_int(state, STATE_RUNNING);
+ return (cpu_idle_apl31_workaround ? 0 : 1);
+ case STATE_RUNNING:
+ return (1);
+ default:
+ panic("bad monitor state");
+ return (1);
+ }
+}
+
+/*
+ * Ordered by speed/power consumption.
+ */
+static struct {
+ void *id_fn;
+ char *id_name;
+ int id_cpuid2_flag;
+} idle_tbl[] = {
+ { .id_fn = cpu_idle_spin, .id_name = "spin" },
+ { .id_fn = cpu_idle_mwait, .id_name = "mwait",
+ .id_cpuid2_flag = CPUID2_MON },
+ { .id_fn = cpu_idle_hlt, .id_name = "hlt" },
+#if !defined(__i386__) || !defined(PC98)
+ { .id_fn = cpu_idle_acpi, .id_name = "acpi" },
+#endif
+};
+
+static int
+idle_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+ char *avail, *p;
+ int error;
+ int i;
+
+ avail = malloc(256, M_TEMP, M_WAITOK);
+ p = avail;
+ for (i = 0; i < nitems(idle_tbl); i++) {
+ if (idle_tbl[i].id_cpuid2_flag != 0 &&
+ (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+ continue;
+#if !defined(__i386__) || !defined(PC98)
+ if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+ cpu_idle_hook == NULL)
+ continue;
+#endif
+ p += sprintf(p, "%s%s", p != avail ? ", " : "",
+ idle_tbl[i].id_name);
+ }
+ error = sysctl_handle_string(oidp, avail, 0, req);
+ free(avail, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, idle_sysctl_available, "A", "list of available idle functions");
+
+static bool
+cpu_idle_selector(const char *new_idle_name)
+{
+ int i;
+
+ for (i = 0; i < nitems(idle_tbl); i++) {
+ if (idle_tbl[i].id_cpuid2_flag != 0 &&
+ (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+ continue;
+#if !defined(__i386__) || !defined(PC98)
+ if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+ cpu_idle_hook == NULL)
+ continue;
+#endif
+ if (strcmp(idle_tbl[i].id_name, new_idle_name))
+ continue;
+ cpu_idle_fn = idle_tbl[i].id_fn;
+ if (bootverbose)
+ printf("CPU idle set to %s\n", idle_tbl[i].id_name);
+ return (true);
+ }
+ return (false);
+}
+
+static int
+cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16], *p;
+ int error, i;
+
+ p = "unknown";
+ for (i = 0; i < nitems(idle_tbl); i++) {
+ if (idle_tbl[i].id_fn == cpu_idle_fn) {
+ p = idle_tbl[i].id_name;
+ break;
+ }
+ }
+ strncpy(buf, p, sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (cpu_idle_selector(buf) ? 0 : EINVAL);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+ cpu_idle_sysctl, "A", "currently selected idle function");
+
+static void
+cpu_idle_tun(void *unused __unused)
+{
+ char tunvar[16];
+
+ if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
+ cpu_idle_selector(tunvar);
+ else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+ CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
+		/* Ryzen errata 1057, 1109. */
+ cpu_idle_selector("hlt");
+ idle_mwait = 0;
+ }
+
+ if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
+ /*
+ * Apollo Lake errata APL31 (public errata APL30).
+ * Stores to the armed address range may not trigger
+ * MWAIT to resume execution. OS needs to use
+ * interrupts to wake processors from MWAIT-induced
+ * sleep states.
+ */
+ cpu_idle_apl31_workaround = 1;
+ }
+ TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
+}
+SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
+
+static int panic_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
+ &panic_on_nmi, 0,
+ "Panic on NMI raised by hardware failure");
+int nmi_is_broadcast = 1;
+SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
+ &nmi_is_broadcast, 0,
+ "Chipset NMI is broadcast");
+#ifdef KDB
+int kdb_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
+ &kdb_on_nmi, 0,
+ "Go to KDB on NMI with unknown source");
+#endif
+
+void
+nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
+{
+ bool claimed = false;
+
+#ifdef DEV_ISA
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(frame->tf_err)) {
+ claimed = true;
+ if (panic_on_nmi)
+ panic("NMI indicates hardware failure");
+ }
+#endif /* DEV_ISA */
+#ifdef KDB
+ if (!claimed && kdb_on_nmi) {
+ /*
+ * NMI can be hooked up to a pushbutton for debugging.
+ */
+ printf("NMI/cpu%d ... going to debugger\n", cpu);
+ kdb_trap(type, 0, frame);
+ }
+#endif /* KDB */
+}
+
+void
+nmi_handle_intr(u_int type, struct trapframe *frame)
+{
+
+#ifdef SMP
+ if (nmi_is_broadcast) {
+ nmi_call_kdb_smp(type, frame);
+ return;
+ }
+#endif
+ nmi_call_kdb(PCPU_GET(cpuid), type, frame);
+}
+
+int hw_ibrs_active;
+int hw_ibrs_disable = 1;
+
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
+ "Indirect Branch Restricted Speculation active");
+
+void
+hw_ibrs_recalculate(void)
+{
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
+ x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL |
+ (hw_ibrs_disable ? MSR_OP_ANDNOT : MSR_OP_OR),
+ IA32_SPEC_CTRL_IBRS);
+ return;
+ }
+ hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
+ !hw_ibrs_disable;
+}
+
+static int
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = hw_ibrs_disable;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ hw_ibrs_disable = val != 0;
+ hw_ibrs_recalculate();
+ return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
+ CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
+ "Disable Indirect Branch Restricted Speculation");
+
+int hw_ssb_active;
+int hw_ssb_disable;
+
+SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
+ &hw_ssb_active, 0,
+ "Speculative Store Bypass Disable active");
+
+static void
+hw_ssb_set(bool enable, bool for_all_cpus)
+{
+
+ if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
+ hw_ssb_active = 0;
+ return;
+ }
+ hw_ssb_active = enable;
+ x86_msr_op(MSR_IA32_SPEC_CTRL,
+ (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+ (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
+}
+
+void
+hw_ssb_recalculate(bool all_cpus)
+{
+
+ switch (hw_ssb_disable) {
+ default:
+ hw_ssb_disable = 0;
+ /* FALLTHROUGH */
+ case 0: /* off */
+ hw_ssb_set(false, all_cpus);
+ break;
+ case 1: /* on */
+ hw_ssb_set(true, all_cpus);
+ break;
+ case 2: /* auto */
+ hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
+ false : true, all_cpus);
+ break;
+ }
+}
+
+static int
+hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = hw_ssb_disable;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ hw_ssb_disable = val;
+ hw_ssb_recalculate(true);
+ return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
+ CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+ hw_ssb_disable_handler, "I",
+ "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto");
+
+int hw_mds_disable;
+
+/*
+ * Handler for Microarchitectural Data Sampling issues. Really not a
+ * pointer to C function: on amd64 the code must not change any CPU
+ * architectural state except possibly %rflags. Also, it is always
+ * called with interrupts disabled.
+ */
+void mds_handler_void(void);
+void mds_handler_verw(void);
+void mds_handler_ivb(void);
+void mds_handler_bdw(void);
+void mds_handler_skl_sse(void);
+void mds_handler_skl_avx(void);
+void mds_handler_skl_avx512(void);
+void mds_handler_silvermont(void);
+void (*mds_handler)(void) = mds_handler_void;
+
+static int
+sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
+{
+ const char *state;
+
+ if (mds_handler == mds_handler_void)
+ state = "inactive";
+ else if (mds_handler == mds_handler_verw)
+ state = "VERW";
+ else if (mds_handler == mds_handler_ivb)
+ state = "software IvyBridge";
+ else if (mds_handler == mds_handler_bdw)
+ state = "software Broadwell";
+ else if (mds_handler == mds_handler_skl_sse)
+ state = "software Skylake SSE";
+ else if (mds_handler == mds_handler_skl_avx)
+ state = "software Skylake AVX";
+ else if (mds_handler == mds_handler_skl_avx512)
+ state = "software Skylake AVX512";
+ else if (mds_handler == mds_handler_silvermont)
+ state = "software Silvermont";
+ else
+ state = "unknown";
+ return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_hw_mds_disable_state_handler, "A",
+ "Microarchitectural Data Sampling Mitigation state");
+
+_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
+
+void
+hw_mds_recalculate(void)
+{
+ struct pcpu *pc;
+ vm_offset_t b64;
+ u_long xcr0;
+ int i;
+
+ /*
+ * Allow user to force VERW variant even if MD_CLEAR is not
+ * reported. For instance, hypervisor might unknowingly
+ * filter the cap out.
+	 * For similar reasons, and for testing, allow enabling the
+	 * mitigation even for RDCL_NO or MDS_NO caps.
+ */
+ if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
+ ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO |
+ IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) {
+ mds_handler = mds_handler_void;
+ } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
+ hw_mds_disable == 3) || hw_mds_disable == 1) {
+ mds_handler = mds_handler_verw;
+ } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+ (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
+ CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
+ CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
+ CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
+ CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
+ CPUID_TO_MODEL(cpu_id) == 0x3a) &&
+ (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+ /*
+ * Nehalem, SandyBridge, IvyBridge
+ */
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ if (pc->pc_mds_buf == NULL) {
+ pc->pc_mds_buf = malloc(672, M_TEMP,
+ M_WAITOK);
+ bzero(pc->pc_mds_buf, 16);
+ }
+ }
+ mds_handler = mds_handler_ivb;
+ } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+ (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
+ CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
+ CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
+ CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
+ (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+ /*
+ * Haswell, Broadwell
+ */
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ if (pc->pc_mds_buf == NULL) {
+ pc->pc_mds_buf = malloc(1536, M_TEMP,
+ M_WAITOK);
+ bzero(pc->pc_mds_buf, 16);
+ }
+ }
+ mds_handler = mds_handler_bdw;
+ } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+ ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
+ CPUID_STEPPING) <= 5) ||
+ CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
+ (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
+ CPUID_STEPPING) <= 0xb) ||
+ (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
+ CPUID_STEPPING) <= 0xc)) &&
+ (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+ /*
+ * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
+ * CascadeLake
+ */
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ if (pc->pc_mds_buf == NULL) {
+ pc->pc_mds_buf = malloc(6 * 1024,
+ M_TEMP, M_WAITOK);
+ b64 = (vm_offset_t)malloc(64 + 63,
+ M_TEMP, M_WAITOK);
+ pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
+ bzero(pc->pc_mds_buf64, 64);
+ }
+ }
+ xcr0 = rxcr(0);
+ if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
+ (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
+ mds_handler = mds_handler_skl_avx512;
+ else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
+ (cpu_feature2 & CPUID2_AVX) != 0)
+ mds_handler = mds_handler_skl_avx;
+ else
+ mds_handler = mds_handler_skl_sse;
+ } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+ ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
+ CPUID_TO_MODEL(cpu_id) == 0x4a ||
+ CPUID_TO_MODEL(cpu_id) == 0x4c ||
+ CPUID_TO_MODEL(cpu_id) == 0x4d ||
+ CPUID_TO_MODEL(cpu_id) == 0x5a ||
+ CPUID_TO_MODEL(cpu_id) == 0x5d ||
+ CPUID_TO_MODEL(cpu_id) == 0x6e ||
+ CPUID_TO_MODEL(cpu_id) == 0x65 ||
+ CPUID_TO_MODEL(cpu_id) == 0x75 ||
+ CPUID_TO_MODEL(cpu_id) == 0x1c ||
+ CPUID_TO_MODEL(cpu_id) == 0x26 ||
+ CPUID_TO_MODEL(cpu_id) == 0x27 ||
+ CPUID_TO_MODEL(cpu_id) == 0x35 ||
+ CPUID_TO_MODEL(cpu_id) == 0x36 ||
+ CPUID_TO_MODEL(cpu_id) == 0x7a))) {
+ /* Silvermont, Airmont */
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ if (pc->pc_mds_buf == NULL)
+ pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
+ }
+ mds_handler = mds_handler_silvermont;
+ } else {
+ hw_mds_disable = 0;
+ mds_handler = mds_handler_void;
+ }
+}
+
+static void
+hw_mds_recalculate_boot(void *arg __unused)
+{
+
+ hw_mds_recalculate();
+}
+SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
+
+static int
+sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = hw_mds_disable;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val < 0 || val > 3)
+ return (EINVAL);
+ hw_mds_disable = val;
+ hw_mds_recalculate();
+ return (0);
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
+ CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_mds_disable_handler, "I",
+ "Microarchitectural Data Sampling Mitigation "
+ "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO");
+
+
+/*
+ * Intel Transactional Memory Asynchronous Abort Mitigation
+ * CVE-2019-11135
+ */
+int x86_taa_enable;
+int x86_taa_state;
+enum {
+ TAA_NONE = 0, /* No mitigation enabled */
+ TAA_TSX_DISABLE = 1, /* Disable TSX via MSR */
+ TAA_VERW = 2, /* Use VERW mitigation */
+ TAA_AUTO = 3, /* Automatically select the mitigation */
+
+ /* The states below are not selectable by the operator */
+
+ TAA_TAA_UC = 4, /* Mitigation present in microcode */
+ TAA_NOT_PRESENT = 5 /* TSX is not present */
+};
+
+static void
+taa_set(bool enable, bool all)
+{
+
+ x86_msr_op(MSR_IA32_TSX_CTRL,
+ (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+ (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
+ IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
+}
+
+void
+x86_taa_recalculate(void)
+{
+ static int taa_saved_mds_disable = 0;
+ int taa_need = 0, taa_state = 0;
+ int mds_disable = 0, need_mds_recalc = 0;
+
+ /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
+ if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
+ (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
+ /* TSX is not present */
+ x86_taa_state = TAA_NOT_PRESENT;
+ return;
+ }
+
+ /* Check to see what mitigation options the CPU gives us */
+ if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
+		/* CPU is not susceptible to TAA */
+ taa_need = TAA_TAA_UC;
+ } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
+ /*
+ * CPU can turn off TSX. This is the next best option
+ * if TAA_NO hardware mitigation isn't present
+ */
+ taa_need = TAA_TSX_DISABLE;
+ } else {
+ /* No TSX/TAA specific remedies are available. */
+ if (x86_taa_enable == TAA_TSX_DISABLE) {
+ if (bootverbose)
+ printf("TSX control not available\n");
+ return;
+ } else
+ taa_need = TAA_VERW;
+ }
+
+ /* Can we automatically take action, or are we being forced? */
+ if (x86_taa_enable == TAA_AUTO)
+ taa_state = taa_need;
+ else
+ taa_state = x86_taa_enable;
+
+ /* No state change, nothing to do */
+ if (taa_state == x86_taa_state) {
+ if (bootverbose)
+ printf("No TSX change made\n");
+ return;
+ }
+
+ /* Does the MSR need to be turned on or off? */
+ if (taa_state == TAA_TSX_DISABLE)
+ taa_set(true, true);
+ else if (x86_taa_state == TAA_TSX_DISABLE)
+ taa_set(false, true);
+
+ /* Does MDS need to be set to turn on VERW? */
+ if (taa_state == TAA_VERW) {
+ taa_saved_mds_disable = hw_mds_disable;
+ mds_disable = hw_mds_disable = 1;
+ need_mds_recalc = 1;
+ } else if (x86_taa_state == TAA_VERW) {
+ mds_disable = hw_mds_disable = taa_saved_mds_disable;
+ need_mds_recalc = 1;
+ }
+ if (need_mds_recalc) {
+ hw_mds_recalculate();
+ if (mds_disable != hw_mds_disable) {
+ if (bootverbose)
+ printf("Cannot change MDS state for TAA\n");
+ /* Don't update our state */
+ return;
+ }
+ }
+
+ x86_taa_state = taa_state;
+ return;
+}
+
+static void
+taa_recalculate_boot(void * arg __unused)
+{
+
+ x86_taa_recalculate();
+}
+SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
+
+SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
+ "TSX Asynchronous Abort Mitigation");
+
+static int
+sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = x86_taa_enable;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val < TAA_NONE || val > TAA_AUTO)
+ return (EINVAL);
+ x86_taa_enable = val;
+ x86_taa_recalculate();
+ return (0);
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
+ CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_taa_handler, "I",
+ "TAA Mitigation enablement control "
+ "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO");
+
+static int
+sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
+{
+ const char *state;
+
+ switch (x86_taa_state) {
+ case TAA_NONE:
+ state = "inactive";
+ break;
+ case TAA_TSX_DISABLE:
+ state = "TSX disabled";
+ break;
+ case TAA_VERW:
+ state = "VERW";
+ break;
+ case TAA_TAA_UC:
+ state = "Mitigated in microcode";
+ break;
+ case TAA_NOT_PRESENT:
+ state = "TSX not present";
+ break;
+ default:
+ state = "unknown";
+ }
+
+ return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_taa_state_handler, "A",
+ "TAA Mitigation state");
+
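For reference, a minimal userland sketch (not part of this commit) showing how the knobs added above can be queried and set with sysctlbyname(3). The OID names (hw.mds_disable, machdep.mitigations.taa.enable, machdep.mitigations.taa.state) are taken from the diff; the state handler emits the string without a terminating NUL, so the returned length must be honored.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char state[64];
	size_t len = sizeof(state);
	int automode = 3;	/* AUTO in both the MDS and TAA enums above */

	if (sysctlbyname("machdep.mitigations.taa.state", state, &len,
	    NULL, 0) == 0)
		printf("TAA state: %.*s\n", (int)len, state);

	/* Both writes require root; errors are ignored in this sketch. */
	sysctlbyname("hw.mds_disable", NULL, NULL, &automode,
	    sizeof(automode));
	sysctlbyname("machdep.mitigations.taa.enable", NULL, NULL,
	    &automode, sizeof(automode));
	return (0);
}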
Property changes on: trunk/sys/x86/x86/cpu_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/delay.c
===================================================================
--- trunk/sys/x86/x86/delay.c (rev 0)
+++ trunk/sys/x86/x86/delay.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,138 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz and Don Ahn.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)clock.c 7.2 (Berkeley) 5/12/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/delay.c 340270 2018-11-08 22:42:55Z jhb $");
+
+/* Generic x86 routines to handle delay */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/sched.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <x86/init.h>
+
+static void
+delay_tsc(int n)
+{
+ uint64_t end, now;
+
+ /*
+ * Pin the current thread to ensure correct behavior if the TSCs
+ * on different CPUs are not in sync.
+ */
+ sched_pin();
+ now = rdtsc();
+ end = now + tsc_freq * n / 1000000;
+ do {
+ cpu_spinwait();
+ now = rdtsc();
+ } while (now < end);
+ sched_unpin();
+}
+
+static int
+delay_tc(int n)
+{
+ struct timecounter *tc;
+ timecounter_get_t *func;
+ uint64_t end, freq, now;
+ u_int last, mask, u;
+
+ /*
+ * Only use the TSC if it is P-state invariant. If the TSC is
+ * not P-state invariant and the CPU is not running at the
+ * "full" P-state, then the TSC will increment at some rate
+ * less than tsc_freq and delay_tsc() will wait too long.
+ */
+ if (tsc_is_invariant && tsc_freq != 0) {
+ delay_tsc(n);
+ return (1);
+ }
+ tc = timecounter;
+ if (tc->tc_quality <= 0)
+ return (0);
+ func = tc->tc_get_timecount;
+ mask = tc->tc_counter_mask;
+ freq = tc->tc_frequency;
+ now = 0;
+ end = freq * n / 1000000;
+ last = func(tc) & mask;
+ do {
+ cpu_spinwait();
+ u = func(tc) & mask;
+ if (u < last)
+ now += mask - last + u + 1;
+ else
+ now += u - last;
+ last = u;
+ } while (now < end);
+ return (1);
+}
+
+void
+DELAY(int n)
+{
+
+ if (delay_tc(n))
+ return;
+
+ init_ops.early_delay(n);
+}
+
+void
+cpu_lock_delay(void)
+{
+
+ /*
+ * Use TSC to wait for a usec if present, otherwise fall back
+ * to reading from port 0x84. We can't call into timecounters
+ * for this delay since timecounters might use spin locks.
+ *
+ * Note that unlike delay_tc(), this uses the TSC even if it
+ * is not P-state invariant. For this function it is ok to
+ * wait even a few usecs.
+ */
+ if (tsc_freq != 0)
+ delay_tsc(1);
+ else
+ inb(0x84);
+}
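A standalone sketch (not from the commit) of the wrap-safe tick accumulation that delay_tc() above performs against tc_counter_mask, shown here for a hypothetical 16-bit free-running counter supplied by read_counter():

#include <stdint.h>

extern uint16_t read_counter(void);	/* assumed hardware counter read */

void
wait_ticks(uint64_t ticks)
{
	const uint32_t mask = 0xffff;	/* analogue of tc_counter_mask */
	uint64_t elapsed = 0;
	uint32_t cur, last;

	last = read_counter() & mask;
	while (elapsed < ticks) {
		cur = read_counter() & mask;
		if (cur < last)		/* the counter wrapped around */
			elapsed += mask - last + cur + 1;
		else
			elapsed += cur - last;
		last = cur;
	}
}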
Property changes on: trunk/sys/x86/x86/delay.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/dump_machdep.c
===================================================================
--- trunk/sys/x86/x86/dump_machdep.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/dump_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,355 +26,30 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/dump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/dump_machdep.c 276772 2015-01-07 01:01:39Z markj $");
#include "opt_watchdog.h"
#include <sys/param.h>
-#include <sys/systm.h>
#include <sys/conf.h>
-#include <sys/cons.h>
+#include <sys/kerneldump.h>
#include <sys/sysctl.h>
-#include <sys/kernel.h>
-#include <sys/kerneldump.h>
-#include <sys/watchdog.h>
+#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/pmap.h>
-#include <machine/elf.h>
-#include <machine/md_var.h>
-#ifdef __amd64__
-#define KERNELDUMP_VERSION KERNELDUMP_AMD64_VERSION
-#define EM_VALUE EM_X86_64
-#else
-#define KERNELDUMP_VERSION KERNELDUMP_I386_VERSION
-#define EM_VALUE EM_386
-#endif
-
-CTASSERT(sizeof(struct kerneldumpheader) == 512);
-
int do_minidump = 1;
-TUNABLE_INT("debug.minidump", &do_minidump);
-SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RW, &do_minidump, 0,
+SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RWTUN, &do_minidump, 0,
"Enable mini crash dumps");
-/*
- * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
- * is to protect us from metadata and to protect metadata from us.
- */
-#define SIZEOF_METADATA (64*1024)
-
-#define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
-#define DEV_ALIGN(x) (((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
-
-struct md_pa {
- vm_paddr_t md_start;
- vm_paddr_t md_size;
-};
-
-typedef int callback_t(struct md_pa *, int, void *);
-
-static struct kerneldumpheader kdh;
-static off_t dumplo, fileofs;
-
-/* Handle buffered writes. */
-static char buffer[DEV_BSIZE];
-static size_t fragsz;
-
-/* 20 phys_avail entry pairs correspond to 10 md_pa's */
-static struct md_pa dump_map[10];
-
-static void
-md_pa_init(void)
-{
- int n, idx;
-
- bzero(dump_map, sizeof(dump_map));
- for (n = 0; n < sizeof(dump_map) / sizeof(dump_map[0]); n++) {
- idx = n * 2;
- if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0)
- break;
- dump_map[n].md_start = dump_avail[idx];
- dump_map[n].md_size = dump_avail[idx + 1] - dump_avail[idx];
- }
-}
-
-static struct md_pa *
-md_pa_first(void)
-{
-
- return (&dump_map[0]);
-}
-
-static struct md_pa *
-md_pa_next(struct md_pa *mdp)
-{
-
- mdp++;
- if (mdp->md_size == 0)
- mdp = NULL;
- return (mdp);
-}
-
-static int
-buf_write(struct dumperinfo *di, char *ptr, size_t sz)
-{
- size_t len;
- int error;
-
- while (sz) {
- len = DEV_BSIZE - fragsz;
- if (len > sz)
- len = sz;
- bcopy(ptr, buffer + fragsz, len);
- fragsz += len;
- ptr += len;
- sz -= len;
- if (fragsz == DEV_BSIZE) {
- error = dump_write(di, buffer, 0, dumplo,
- DEV_BSIZE);
- if (error)
- return error;
- dumplo += DEV_BSIZE;
- fragsz = 0;
- }
- }
-
- return (0);
-}
-
-static int
-buf_flush(struct dumperinfo *di)
-{
- int error;
-
- if (fragsz == 0)
- return (0);
-
- error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE);
- dumplo += DEV_BSIZE;
- fragsz = 0;
- return (error);
-}
-
-#define PG2MB(pgs) ((pgs + (1 << 8) - 1) >> 8)
-
-static int
-cb_dumpdata(struct md_pa *mdp, int seqnr, void *arg)
-{
- struct dumperinfo *di = (struct dumperinfo*)arg;
- vm_paddr_t a, pa;
- void *va;
- uint64_t pgs;
- size_t counter, sz, chunk;
- int i, c, error, twiddle;
- u_int maxdumppgs;
-
- error = 0; /* catch case in which chunk size is 0 */
- counter = 0; /* Update twiddle every 16MB */
- twiddle = 0;
- va = 0;
- pgs = mdp->md_size / PAGE_SIZE;
- pa = mdp->md_start;
- maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS);
- if (maxdumppgs == 0) /* seatbelt */
- maxdumppgs = 1;
-
- printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs),
- (uintmax_t)pgs);
-
- while (pgs) {
- chunk = pgs;
- if (chunk > maxdumppgs)
- chunk = maxdumppgs;
- sz = chunk << PAGE_SHIFT;
- counter += sz;
- if (counter >> 24) {
- printf(" %ju", (uintmax_t)PG2MB(pgs));
- counter &= (1<<24) - 1;
- }
- for (i = 0; i < chunk; i++) {
- a = pa + i * PAGE_SIZE;
- va = pmap_kenter_temporary(trunc_page(a), i);
- }
-
- wdog_kern_pat(WD_LASTVAL);
-
- error = dump_write(di, va, 0, dumplo, sz);
- if (error)
- break;
- dumplo += sz;
- pgs -= chunk;
- pa += sz;
-
- /* Check for user abort. */
- c = cncheckc();
- if (c == 0x03)
- return (ECANCELED);
- if (c != -1)
- printf(" (CTRL-C to abort) ");
- }
- printf(" ... %s\n", (error) ? "fail" : "ok");
- return (error);
-}
-
-static int
-cb_dumphdr(struct md_pa *mdp, int seqnr, void *arg)
-{
- struct dumperinfo *di = (struct dumperinfo*)arg;
- Elf_Phdr phdr;
- uint64_t size;
- int error;
-
- size = mdp->md_size;
- bzero(&phdr, sizeof(phdr));
- phdr.p_type = PT_LOAD;
- phdr.p_flags = PF_R; /* XXX */
- phdr.p_offset = fileofs;
- phdr.p_vaddr = mdp->md_start;
- phdr.p_paddr = mdp->md_start;
- phdr.p_filesz = size;
- phdr.p_memsz = size;
- phdr.p_align = PAGE_SIZE;
-
- error = buf_write(di, (char*)&phdr, sizeof(phdr));
- fileofs += phdr.p_filesz;
- return (error);
-}
-
-static int
-cb_size(struct md_pa *mdp, int seqnr, void *arg)
-{
- uint64_t *sz = (uint64_t*)arg;
-
- *sz += (uint64_t)mdp->md_size;
- return (0);
-}
-
-static int
-foreach_chunk(callback_t cb, void *arg)
-{
- struct md_pa *mdp;
- int error, seqnr;
-
- seqnr = 0;
- mdp = md_pa_first();
- while (mdp != NULL) {
- error = (*cb)(mdp, seqnr++, arg);
- if (error)
- return (-error);
- mdp = md_pa_next(mdp);
- }
- return (seqnr);
-}
-
void
-dumpsys(struct dumperinfo *di)
+dumpsys_map_chunk(vm_paddr_t pa, size_t chunk, void **va)
{
- Elf_Ehdr ehdr;
- uint64_t dumpsize;
- off_t hdrgap;
- size_t hdrsz;
- int error;
+ int i;
+ vm_paddr_t a;
- if (do_minidump) {
- minidumpsys(di);
- return;
+ for (i = 0; i < chunk; i++) {
+ a = pa + i * PAGE_SIZE;
+ *va = pmap_kenter_temporary(trunc_page(a), i);
}
- bzero(&ehdr, sizeof(ehdr));
- ehdr.e_ident[EI_MAG0] = ELFMAG0;
- ehdr.e_ident[EI_MAG1] = ELFMAG1;
- ehdr.e_ident[EI_MAG2] = ELFMAG2;
- ehdr.e_ident[EI_MAG3] = ELFMAG3;
- ehdr.e_ident[EI_CLASS] = ELF_CLASS;
-#if BYTE_ORDER == LITTLE_ENDIAN
- ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
-#else
- ehdr.e_ident[EI_DATA] = ELFDATA2MSB;
-#endif
- ehdr.e_ident[EI_VERSION] = EV_CURRENT;
- ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */
- ehdr.e_type = ET_CORE;
- ehdr.e_machine = EM_VALUE;
- ehdr.e_phoff = sizeof(ehdr);
- ehdr.e_flags = 0;
- ehdr.e_ehsize = sizeof(ehdr);
- ehdr.e_phentsize = sizeof(Elf_Phdr);
- ehdr.e_shentsize = sizeof(Elf_Shdr);
-
- md_pa_init();
-
- /* Calculate dump size. */
- dumpsize = 0L;
- ehdr.e_phnum = foreach_chunk(cb_size, &dumpsize);
- hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize;
- fileofs = MD_ALIGN(hdrsz);
- dumpsize += fileofs;
- hdrgap = fileofs - DEV_ALIGN(hdrsz);
-
- /* Determine dump offset on device. */
- if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
- error = ENOSPC;
- goto fail;
- }
- dumplo = di->mediaoffset + di->mediasize - dumpsize;
- dumplo -= sizeof(kdh) * 2;
-
- mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_VERSION, dumpsize,
- di->blocksize);
-
- printf("Dumping %llu MB (%d chunks)\n", (long long)dumpsize >> 20,
- ehdr.e_phnum);
-
- /* Dump leader */
- error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
- if (error)
- goto fail;
- dumplo += sizeof(kdh);
-
- /* Dump ELF header */
- error = buf_write(di, (char*)&ehdr, sizeof(ehdr));
- if (error)
- goto fail;
-
- /* Dump program headers */
- error = foreach_chunk(cb_dumphdr, di);
- if (error < 0)
- goto fail;
- buf_flush(di);
-
- /*
- * All headers are written using blocked I/O, so we know the
- * current offset is (still) block aligned. Skip the alignement
- * in the file to have the segment contents aligned at page
- * boundary. We cannot use MD_ALIGN on dumplo, because we don't
- * care and may very well be unaligned within the dump device.
- */
- dumplo += hdrgap;
-
- /* Dump memory chunks (updates dumplo) */
- error = foreach_chunk(cb_dumpdata, di);
- if (error < 0)
- goto fail;
-
- /* Dump trailer */
- error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
- if (error)
- goto fail;
-
- /* Signal completion, signoff and exit stage left. */
- dump_write(di, NULL, 0, 0, 0);
- printf("\nDump complete\n");
- return;
-
- fail:
- if (error < 0)
- error = -error;
-
- if (error == ECANCELED)
- printf("\nDump aborted\n");
- else if (error == ENOSPC)
- printf("\nDump failed. Partition too small.\n");
- else
- printf("\n** DUMP FAILED (ERROR %d) **\n", error);
}
Modified: trunk/sys/x86/x86/fdt_machdep.c
===================================================================
--- trunk/sys/x86/x86/fdt_machdep.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/fdt_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/fdt_machdep.c 250840 2013-05-21 03:05:49Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/fdt_machdep.c 287000 2015-08-21 15:57:57Z royger $");
#include "opt_platform.h"
@@ -55,7 +55,7 @@
mdp = preload_search_by_type("elf kernel");
if (mdp == NULL)
mdp = preload_search_by_type("elf32 kernel");
- dtbp = (mdp != NULL) ? MD_FETCH(mdp, MODINFOMD_DTBP, void *) : NULL;
+ dtbp = MD_FETCH(mdp, MODINFOMD_DTBP, void *);
#if defined(FDT_DTB_STATIC)
/*
Modified: trunk/sys/x86/x86/identcpu.c
===================================================================
--- trunk/sys/x86/x86/identcpu.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/identcpu.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -40,7 +40,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/identcpu.c 332743 2018-04-19 00:11:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/identcpu.c 354658 2019-11-12 19:35:46Z scottl $");
#include "opt_cpu.h"
@@ -84,9 +84,46 @@
static void print_via_padlock_info(void);
static void print_vmx_info(void);
+#ifdef __i386__
+int cpu; /* Are we 386, 386sx, 486, etc? */
int cpu_class;
+#endif
+u_int cpu_feature; /* Feature flags */
+u_int cpu_feature2; /* Feature flags */
+u_int amd_feature; /* AMD feature flags */
+u_int amd_feature2; /* AMD feature flags */
+u_int amd_pminfo; /* AMD advanced power management info */
+u_int amd_extended_feature_extensions;
+u_int via_feature_rng; /* VIA RNG features */
+u_int via_feature_xcrypt; /* VIA ACE features */
+u_int cpu_high; /* Highest arg to CPUID */
+u_int cpu_exthigh; /* Highest arg to extended CPUID */
+u_int cpu_id; /* Stepping ID */
+u_int cpu_procinfo; /* HyperThreading Info / Brand Index / CLFLUSH */
+u_int cpu_procinfo2; /* Multicore info */
+char cpu_vendor[20]; /* CPU Origin code */
+u_int cpu_vendor_id; /* CPU vendor ID */
+u_int cpu_fxsr; /* SSE enabled */
+u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */
+u_int cpu_clflush_line_size = 32;
+u_int cpu_stdext_feature; /* %ebx */
+u_int cpu_stdext_feature2; /* %ecx */
+u_int cpu_stdext_feature3; /* %edx */
+uint64_t cpu_ia32_arch_caps;
+u_int cpu_max_ext_state_size;
+u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */
+u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */
+u_int cpu_mon_max_size; /* MONITOR maximum range size, bytes */
+u_int cpu_maxphyaddr; /* Max phys addr width in bits */
char machine[] = MACHINE;
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
+ &via_feature_rng, 0,
+ "VIA RNG feature available in CPU");
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD,
+ &via_feature_xcrypt, 0,
+ "VIA xcrypt feature available in CPU");
+
#ifdef __amd64__
#ifdef SCTL_MASK32
extern int adaptive_machine_arch;
@@ -109,8 +146,8 @@
return (error);
}
-SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD,
- NULL, 0, sysctl_hw_machine, "A", "Machine class");
+SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine, "A", "Machine class");
#else
SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD,
machine, 0, "Machine class");
@@ -117,7 +154,7 @@
#endif
static char cpu_model[128];
-SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD,
+SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD | CTLFLAG_MPSAFE,
cpu_model, 0, "Machine model");
static int hw_clockrate;
@@ -126,8 +163,8 @@
u_int hv_high;
char hv_vendor[16];
-SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD, hv_vendor, 0,
- "Hypervisor vendor");
+SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD | CTLFLAG_MPSAFE, hv_vendor,
+ 0, "Hypervisor vendor");
static eventhandler_tag tsc_post_tag;
@@ -147,13 +184,11 @@
NULL,
"Intel Pentium 4"
};
-#endif
static struct {
char *cpu_name;
int cpu_class;
} cpus[] = {
-#ifdef __i386__
{ "Intel 80286", CPUCLASS_286 }, /* CPU_286 */
{ "i386SX", CPUCLASS_386 }, /* CPU_386SX */
{ "i386DX", CPUCLASS_386 }, /* CPU_386 */
@@ -171,11 +206,8 @@
{ "Pentium II", CPUCLASS_686 }, /* CPU_PII */
{ "Pentium III", CPUCLASS_686 }, /* CPU_PIII */
{ "Pentium 4", CPUCLASS_686 }, /* CPU_P4 */
-#else
- { "Clawhammer", CPUCLASS_K8 }, /* CPU_CLAWHAMMER */
- { "Sledgehammer", CPUCLASS_K8 }, /* CPU_SLEDGEHAMMER */
+};
#endif
-};
static struct {
char *vendor;
@@ -205,9 +237,13 @@
u_int regs[4], i;
char *brand;
+ printf("CPU: ");
+#ifdef __i386__
cpu_class = cpus[cpu].cpu_class;
- printf("CPU: ");
strncpy(cpu_model, cpus[cpu].cpu_name, sizeof (cpu_model));
+#else
+ strncpy(cpu_model, "Hammer", sizeof (cpu_model));
+#endif
/* Check for extended CPUID information and a processor name. */
if (cpu_exthigh >= 0x80000004) {
@@ -660,8 +696,8 @@
(intmax_t)(tsc_freq + 4999) / 1000000,
(u_int)((tsc_freq + 4999) / 10000) % 100);
}
+#ifdef __i386__
switch(cpu_class) {
-#ifdef __i386__
case CPUCLASS_286:
printf("286");
break;
@@ -683,14 +719,12 @@
printf("686");
break;
#endif
-#else
- case CPUCLASS_K8:
- printf("K8");
- break;
-#endif
default:
printf("Unknown"); /* will panic below... */
}
+#else
+ printf("K8");
+#endif
printf("-class CPU)\n");
if (*cpu_vendor)
printf(" Origin=\"%s\"", cpu_vendor);
@@ -914,6 +948,7 @@
"\020PQE"
/* AVX512 Foundation */
"\021AVX512F"
+ "\022AVX512DQ"
/* Enhanced NRBG */
"\023RDSEED"
/* ADCX + ADOX */
@@ -920,12 +955,17 @@
"\024ADX"
/* Supervisor Mode Access Prevention */
"\025SMAP"
+ "\026AVX512IFMA"
+ "\027PCOMMIT"
"\030CLFLUSHOPT"
+ "\031CLWB"
"\032PROCTRACE"
"\033AVX512PF"
"\034AVX512ER"
"\035AVX512CD"
"\036SHA"
+ "\037AVX512BW"
+ "\040AVX512VL"
);
}
@@ -934,14 +974,35 @@
cpu_stdext_feature2,
"\020"
"\001PREFETCHWT1"
+ "\002AVX512VBMI"
"\003UMIP"
"\004PKU"
"\005OSPKE"
+ "\006WAITPKG"
+ "\011GFNI"
"\027RDPID"
+ "\032CLDEMOTE"
+ "\034MOVDIRI"
+ "\035MOVDIRI64B"
"\037SGXLC"
);
}
+ if (cpu_stdext_feature3 != 0) {
+ printf("\n Structured Extended Features3=0x%b",
+ cpu_stdext_feature3,
+ "\020"
+ "\013MD_CLEAR"
+ "\016TSXFA"
+ "\033IBPB"
+ "\034STIBP"
+ "\035L1DFL"
+ "\036ARCH_CAP"
+ "\037CORE_CAP"
+ "\040SSBD"
+ );
+ }
+
if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
cpuid_count(0xd, 0x1, regs);
if (regs[0] != 0) {
@@ -955,6 +1016,31 @@
}
}
+ if (cpu_ia32_arch_caps != 0) {
+ printf("\n IA32_ARCH_CAPS=0x%b",
+ (u_int)cpu_ia32_arch_caps,
+ "\020"
+ "\001RDCL_NO"
+ "\002IBRS_ALL"
+ "\003RSBA"
+ "\004SKIP_L1DFL_VME"
+ "\005SSB_NO"
+ "\006MDS_NO"
+ "\010TSX_CTRL"
+ "\011TAA_NO"
+ );
+ }
+
+ if (amd_extended_feature_extensions != 0) {
+ printf("\n "
+ "AMD Extended Feature Extensions ID EBX="
+ "0x%b", amd_extended_feature_extensions,
+ "\020"
+ "\001CLZERO"
+ "\002IRPerf"
+ "\003XSaveErPtr");
+ }
+
if (via_feature_rng != 0 || via_feature_xcrypt != 0)
print_via_padlock_info();
@@ -1008,11 +1094,11 @@
print_hypervisor_info();
}
+#ifdef __i386__
void
panicifcpuunsupported(void)
{
-#ifdef __i386__
#if !defined(lint)
#if !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU)
#error This kernel is not configured for one of the supported CPUs
@@ -1019,17 +1105,11 @@
#endif
#else /* lint */
#endif /* lint */
-#else /* __amd64__ */
-#ifndef HAMMER
-#error "You need to specify a cpu type"
-#endif
-#endif
/*
* Now that we have told the user what they have,
* let them know if that machine type isn't configured.
*/
switch (cpu_class) {
-#ifdef __i386__
case CPUCLASS_286: /* a 286 should not make it this far, anyway */
case CPUCLASS_386:
#if !defined(I486_CPU)
@@ -1041,12 +1121,6 @@
#if !defined(I686_CPU)
case CPUCLASS_686:
#endif
-#else /* __amd64__ */
- case CPUCLASS_X86:
-#ifndef HAMMER
- case CPUCLASS_K8:
-#endif
-#endif
panic("CPU class not configured");
default:
break;
@@ -1053,7 +1127,6 @@
}
}
-#ifdef __i386__
static volatile u_int trap_by_rdmsr;
/*
@@ -1210,7 +1283,6 @@
SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL);
-#ifndef XEN
static const char *const vm_bnames[] = {
"QEMU", /* QEMU */
"Plex86", /* Plex86 */
@@ -1270,6 +1342,10 @@
vm_guest = VM_GUEST_VMWARE;
else if (strcmp(hv_vendor, "Microsoft Hv") == 0)
vm_guest = VM_GUEST_HV;
+ else if (strcmp(hv_vendor, "KVMKVMKVM") == 0)
+ vm_guest = VM_GUEST_KVM;
+ else if (strcmp(hv_vendor, "bhyve bhyve") == 0)
+ vm_guest = VM_GUEST_BHYVE;
}
return;
}
@@ -1277,7 +1353,7 @@
/*
* Examine SMBIOS strings for older hypervisors.
*/
- p = getenv("smbios.system.serial");
+ p = kern_getenv("smbios.system.serial");
if (p != NULL) {
if (strncmp(p, "VMware-", 7) == 0 || strncmp(p, "VMW", 3) == 0) {
vmware_hvcall(VMW_HVCMD_GETVERSION, regs);
@@ -1294,7 +1370,7 @@
* XXX: Some of these entries may not be needed since they were
* added to FreeBSD before the checks above.
*/
- p = getenv("smbios.bios.vendor");
+ p = kern_getenv("smbios.bios.vendor");
if (p != NULL) {
for (i = 0; vm_bnames[i] != NULL; i++)
if (strcmp(p, vm_bnames[i]) == 0) {
@@ -1304,7 +1380,7 @@
}
freeenv(p);
}
- p = getenv("smbios.system.product");
+ p = kern_getenv("smbios.system.product");
if (p != NULL) {
for (i = 0; vm_pnames[i] != NULL; i++)
if (strcmp(p, vm_pnames[i]) == 0) {
@@ -1315,7 +1391,6 @@
freeenv(p);
}
}
-#endif
bool
fix_cpuid(void)
@@ -1360,9 +1435,8 @@
return (false);
}
-#ifdef __amd64__
void
-identify_cpu(void)
+identify_cpu1(void)
{
u_int regs[4];
@@ -1379,8 +1453,34 @@
cpu_feature = regs[3];
cpu_feature2 = regs[2];
}
-#endif
+void
+identify_cpu2(void)
+{
+ u_int regs[4], cpu_stdext_disable;
+
+ if (cpu_high >= 7) {
+ cpuid_count(7, 0, regs);
+ cpu_stdext_feature = regs[1];
+
+ /*
+ * Some hypervisors failed to filter out unsupported
+ * extended features. Allow disabling those extensions
+ * whose activation requires setting a bit in CR4, and
+ * which VM monitors do not support.
+ */
+ cpu_stdext_disable = 0;
+ TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+ cpu_stdext_feature &= ~cpu_stdext_disable;
+
+ cpu_stdext_feature2 = regs[2];
+ cpu_stdext_feature3 = regs[3];
+
+ if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0)
+ cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
+ }
+}
+
/*
* Final stage of CPU identification.
*/
@@ -1387,7 +1487,7 @@
void
finishidentcpu(void)
{
- u_int regs[4], cpu_stdext_disable;
+ u_int regs[4];
#ifdef __i386__
u_char ccr3;
#endif
@@ -1406,26 +1506,8 @@
cpu_mon_max_size = regs[1] & CPUID5_MON_MAX_SIZE;
}
- if (cpu_high >= 7) {
- cpuid_count(7, 0, regs);
- cpu_stdext_feature = regs[1];
+ identify_cpu2();
- /*
- * Some hypervisors fail to filter out unsupported
- * extended features. For now, disable the
- * extensions, activation of which requires setting a
- * bit in CR4, and which VM monitors do not support.
- */
- if (cpu_feature2 & CPUID2_HV) {
- cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
- CPUID_STDEXT_SMEP;
- } else
- cpu_stdext_disable = 0;
- TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
- cpu_stdext_feature &= ~cpu_stdext_disable;
- cpu_stdext_feature2 = regs[2];
- }
-
#ifdef __i386__
if (cpu_high > 0 &&
(cpu_vendor_id == CPU_VENDOR_INTEL ||
@@ -1457,6 +1539,7 @@
if (cpu_exthigh >= 0x80000008) {
do_cpuid(0x80000008, regs);
cpu_maxphyaddr = regs[0] & 0xff;
+ amd_extended_feature_extensions = regs[1];
cpu_procinfo2 = regs[2];
} else {
cpu_maxphyaddr = (cpu_feature & CPUID_PAE) != 0 ? 36 : 32;
@@ -1550,18 +1633,26 @@
return;
}
}
-#else
- /* XXX */
- cpu = CPU_CLAWHAMMER;
#endif
}
+int
+pti_get_default(void)
+{
+
+ if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
+ return (0);
+ if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
+ return (0);
+ return (1);
+}
+
static u_int
find_cpu_vendor_id(void)
{
int i;
- for (i = 0; i < sizeof(cpu_vendors) / sizeof(cpu_vendors[0]); i++)
+ for (i = 0; i < nitems(cpu_vendors); i++)
if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0)
return (cpu_vendors[i].vendor_id);
return (0);
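The feature banners above rely on the kernel printf(9) "%b" conversion: the first byte of the format string selects the output base ("\20" is hexadecimal) and each subsequent "\<bit>NAME" pair names a 1-based bit. A small userland sketch (not part of this commit) that mimics the decoding for the IA32_ARCH_CAPS string, using two of the bit names from the diff:

#include <stdio.h>

static void
print_bits(const char *label, unsigned v, const char *fmt)
{
	int any = 0, bit, set;

	printf("%s=%#x", label, v);	/* fmt[0] is the base; hex assumed */
	for (fmt++; *fmt != '\0';) {
		bit = *fmt++;			/* 1-based bit number */
		set = (v & (1u << (bit - 1))) != 0;
		if (set) {
			putchar(any ? ',' : '<');
			any = 1;
		}
		for (; *fmt > ' '; fmt++)	/* name characters */
			if (set)
				putchar(*fmt);
	}
	if (any)
		putchar('>');
	putchar('\n');
}

int
main(void)
{
	/* Bits 1 (RDCL_NO) and 8 (TSX_CTRL) set: 0x01 | 0x80 = 0x81. */
	print_bits("IA32_ARCH_CAPS", 0x81, "\020\001RDCL_NO\010TSX_CTRL");
	return (0);
}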
Modified: trunk/sys/x86/x86/intr_machdep.c
===================================================================
--- trunk/sys/x86/x86/intr_machdep.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/intr_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -24,7 +24,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: stable/10/sys/x86/x86/intr_machdep.c 307244 2016-10-14 02:03:53Z sephe $
+ * $FreeBSD: stable/11/sys/x86/x86/intr_machdep.c 340016 2018-11-01 18:34:26Z jhb $
*/
/*
@@ -37,6 +37,7 @@
#include "opt_atpic.h"
#include "opt_ddb.h"
+#include "opt_smp.h"
#include <sys/param.h>
#include <sys/bus.h>
@@ -44,6 +45,7 @@
#include <sys/ktr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
+#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/smp.h>
@@ -50,6 +52,7 @@
#include <sys/sx.h>
#include <sys/syslog.h>
#include <sys/systm.h>
+#include <sys/vmmeter.h>
#include <machine/clock.h>
#include <machine/intr_machdep.h>
#include <machine/smp.h>
@@ -65,7 +68,7 @@
#ifdef PC98
#include <pc98/cbus/cbus.h>
#else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
#endif
#endif
@@ -74,22 +77,26 @@
typedef void (*mask_fn)(void *);
static int intrcnt_index;
-static struct intsrc *interrupt_sources[NUM_IO_INTS];
+static struct intsrc **interrupt_sources;
static struct sx intrsrc_lock;
static struct mtx intrpic_lock;
static struct mtx intrcnt_lock;
static TAILQ_HEAD(pics_head, pic) pics;
+u_int num_io_irqs;
-#ifdef SMP
+#if defined(SMP) && !defined(EARLY_AP_STARTUP)
static int assign_cpu;
#endif
-u_long intrcnt[INTRCNT_COUNT];
-char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)];
+u_long *intrcnt;
+char *intrnames;
size_t sintrcnt = sizeof(intrcnt);
size_t sintrnames = sizeof(intrnames);
+int nintrcnt;
-static int intr_assign_cpu(void *arg, u_char cpu);
+static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
+
+static int intr_assign_cpu(void *arg, int cpu);
static void intr_disable_src(void *arg);
static void intr_init(void *__dummy);
static int intr_pic_registered(struct pic *pic);
@@ -97,6 +104,18 @@
static void intrcnt_updatename(struct intsrc *is);
static void intrcnt_register(struct intsrc *is);
+/*
+ * SYSINIT levels for SI_SUB_INTR:
+ *
+ * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
+ * SI_ORDER_SECOND: Xen PICs
+ * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
+ * SI_ORDER_FOURTH: Add 8259A PICs
+ * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
+ * SI_ORDER_MIDDLE: SMP interrupt counters
+ * SI_ORDER_ANY: Enable interrupts on BSP
+ */
+
static int
intr_pic_registered(struct pic *pic)
{
@@ -132,6 +151,56 @@
}
/*
+ * Allocate interrupt source arrays and register interrupt sources
+ * once the number of interrupts is known.
+ */
+static void
+intr_init_sources(void *arg)
+{
+ struct pic *pic;
+
+ MPASS(num_io_irqs > 0);
+
+ interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
+ M_INTR, M_WAITOK | M_ZERO);
+
+ /*
+ * - 1 ??? dummy counter.
+ * - 2 counters for each I/O interrupt.
+ * - 1 counter for each CPU for lapic timer.
+ * - 1 counter for each CPU for the Hyper-V vmbus driver.
+ * - 8 counters for each CPU for IPI counters for SMP.
+ */
+ nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
+#ifdef COUNT_IPIS
+ if (mp_ncpus > 1)
+ nintrcnt += 8 * mp_ncpus;
+#endif
+ intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
+ M_ZERO);
+ intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
+ M_ZERO);
+ sintrcnt = nintrcnt * sizeof(u_long);
+ sintrnames = nintrcnt * (MAXCOMLEN + 1);
+
+ intrcnt_setname("???", 0);
+ intrcnt_index = 1;
+
+ /*
+ * NB: intrpic_lock is not held here to avoid LORs due to
+ * malloc() in intr_register_source(). However, we are still
+ * single-threaded at this point in startup so the list of
+ * PICs shouldn't change.
+ */
+ TAILQ_FOREACH(pic, &pics, pics) {
+ if (pic->pic_register_sources != NULL)
+ pic->pic_register_sources(pic);
+ }
+}
+SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
+ NULL);
+
+/*
* Register a new interrupt source with the global interrupt system.
* The global interrupts need to be disabled when this function is
* called.
@@ -143,6 +212,8 @@
KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
vector = isrc->is_pic->pic_vector(isrc);
+ KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
+ num_io_irqs));
if (interrupt_sources[vector] != NULL)
return (EEXIST);
error = intr_event_create(&isrc->is_event, isrc, 0, vector,
@@ -168,6 +239,8 @@
intr_lookup_source(int vector)
{
+ if (vector < 0 || vector >= num_io_irqs)
+ return (NULL);
return (interrupt_sources[vector]);
}
@@ -308,17 +381,24 @@
}
static int
-intr_assign_cpu(void *arg, u_char cpu)
+intr_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
struct intsrc *isrc;
int error;
+#ifdef EARLY_AP_STARTUP
+ MPASS(mp_ncpus == 1 || smp_started);
+
+ /* Nothing to do if there is only a single CPU. */
+ if (mp_ncpus > 1 && cpu != NOCPU) {
+#else
/*
* Don't do anything during early boot. We will pick up the
* assignment once the APs are started.
*/
if (assign_cpu && cpu != NOCPU) {
+#endif
isrc = arg;
sx_xlock(&intrsrc_lock);
error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
@@ -353,6 +433,7 @@
KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
mtx_lock_spin(&intrcnt_lock);
+ MPASS(intrcnt_index + 2 <= nintrcnt);
is->is_index = intrcnt_index;
intrcnt_index += 2;
snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
@@ -369,6 +450,7 @@
{
mtx_lock_spin(&intrcnt_lock);
+ MPASS(intrcnt_index < nintrcnt);
*countp = &intrcnt[intrcnt_index];
intrcnt_setname(name, intrcnt_index);
intrcnt_index++;
@@ -379,8 +461,6 @@
intr_init(void *dummy __unused)
{
- intrcnt_setname("???", 0);
- intrcnt_index = 1;
TAILQ_INIT(&pics);
mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
sx_init(&intrsrc_lock, "intrsrc");
@@ -388,6 +468,21 @@
}
SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
+static void
+intr_init_final(void *dummy __unused)
+{
+
+ /*
+ * Enable interrupts on the BSP after all of the interrupt
+ * controllers are initialized. Device interrupts are still
+ * disabled in the interrupt controllers until interrupt
+ * handlers are registered. Interrupts are enabled on each AP
+ * after their first context switch.
+ */
+ enable_intr();
+}
+SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
+
#ifndef DEV_ATPIC
/* Initialize the two 8259A's to a known-good shutdown state. */
void
@@ -427,6 +522,23 @@
return (0);
}
+void
+intr_reprogram(void)
+{
+ struct intsrc *is;
+ u_int v;
+
+ sx_xlock(&intrsrc_lock);
+ for (v = 0; v < num_io_irqs; v++) {
+ is = interrupt_sources[v];
+ if (is == NULL)
+ continue;
+ if (is->is_pic->pic_reprogram_pin != NULL)
+ is->is_pic->pic_reprogram_pin(is);
+ }
+ sx_xunlock(&intrsrc_lock);
+}
+
#ifdef DDB
/*
* Dump data about interrupt handlers
@@ -434,7 +546,8 @@
DB_SHOW_COMMAND(irqs, db_show_irqs)
{
struct intsrc **isrc;
- int i, verbose;
+ u_int i;
+ int verbose;
if (strcmp(modif, "v") == 0)
verbose = 1;
@@ -441,7 +554,7 @@
else
verbose = 0;
isrc = interrupt_sources;
- for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++)
+ for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
if (*isrc != NULL)
db_dump_intr_event((*isrc)->is_event, verbose);
}
@@ -453,7 +566,7 @@
* allocate CPUs round-robin.
*/
-static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
+cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
static int current_cpu;
/*
@@ -465,9 +578,15 @@
{
u_int apic_id;
+#ifdef EARLY_AP_STARTUP
+ MPASS(mp_ncpus == 1 || smp_started);
+ if (mp_ncpus == 1)
+ return (PCPU_GET(apic_id));
+#else
/* Leave all interrupts on the BSP during boot. */
if (!assign_cpu)
return (PCPU_GET(apic_id));
+#endif
mtx_lock_spin(&icu_lock);
apic_id = cpu_apic_ids[current_cpu];
@@ -509,6 +628,7 @@
CPU_SET(cpu, &intr_cpus);
}
+#ifndef EARLY_AP_STARTUP
/*
* Distribute all the interrupt sources among the available CPUs once the
* AP's have been launched.
@@ -517,15 +637,8 @@
intr_shuffle_irqs(void *arg __unused)
{
struct intsrc *isrc;
- int i;
+ u_int i;
-#ifdef XEN
- /*
- * Doesn't work yet
- */
- return;
-#endif
-
/* Don't bother on UP. */
if (mp_ncpus == 1)
return;
@@ -533,7 +646,7 @@
/* Round-robin assign a CPU to each enabled source. */
sx_xlock(&intrsrc_lock);
assign_cpu = 1;
- for (i = 0; i < NUM_IO_INTS; i++) {
+ for (i = 0; i < num_io_irqs; i++) {
isrc = interrupt_sources[i];
if (isrc != NULL && isrc->is_handlers > 0) {
/*
@@ -556,6 +669,7 @@
}
SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
NULL);
+#endif
#else
/*
* Always route interrupts to the current processor in the UP case.
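A userland sketch (not from the commit, with made-up example values) of the counter sizing that intr_init_sources() above performs, and of why mallocarray(9) is used for it: the element-count multiplication is rejected on overflow instead of silently wrapping. calloc() already performs an equivalent check; the explicit test is kept here only to make the idea visible.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *
xmallocarray(size_t nmemb, size_t size)
{
	if (size != 0 && nmemb > SIZE_MAX / size)
		return (NULL);		/* nmemb * size would overflow */
	return (calloc(nmemb, size));
}

int
main(void)
{
	unsigned num_io_irqs = 48, mp_ncpus = 4;	/* example values only */
	unsigned nintrcnt;
	unsigned long *intrcnt;

	/* 1 dummy + 2 per I/O IRQ + 2 per CPU, as in the comment above. */
	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
	if (mp_ncpus > 1)		/* the COUNT_IPIS case in the kernel */
		nintrcnt += 8 * mp_ncpus;

	intrcnt = xmallocarray(nintrcnt, sizeof(*intrcnt));
	if (intrcnt != NULL)
		printf("nintrcnt=%u (%zu bytes)\n", nintrcnt,
		    nintrcnt * sizeof(*intrcnt));
	free(intrcnt);
	return (0);
}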
Modified: trunk/sys/x86/x86/io_apic.c
===================================================================
--- trunk/sys/x86/x86/io_apic.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/io_apic.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,8 +26,9 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/io_apic.c 330959 2018-03-14 23:59:52Z marius $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/io_apic.c 340016 2018-11-01 18:34:26Z jhb $");
+#include "opt_acpi.h"
#include "opt_isa.h"
#include <sys/param.h>
@@ -38,6 +39,7 @@
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
+#include <sys/rman.h>
#include <sys/sysctl.h>
#include <dev/pci/pcireg.h>
@@ -49,9 +51,10 @@
#include <x86/apicreg.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
#include <machine/resource.h>
#include <machine/segments.h>
+#include <x86/iommu/iommu_intrmap.h>
#define IOAPIC_ISA_INTS 16
#define IOAPIC_MEM_REGION 32
@@ -58,11 +61,6 @@
#define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2)
#define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1)
-#define IRQ_EXTINT (NUM_IO_INTS + 1)
-#define IRQ_NMI (NUM_IO_INTS + 2)
-#define IRQ_SMI (NUM_IO_INTS + 3)
-#define IRQ_DISABLED (NUM_IO_INTS + 4)
-
static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
/*
@@ -81,15 +79,16 @@
struct ioapic_intsrc {
struct intsrc io_intsrc;
- u_int io_irq;
+ int io_irq;
u_int io_intpin:8;
u_int io_vector:8;
- u_int io_cpu:8;
+ u_int io_cpu;
u_int io_activehi:1;
u_int io_edgetrigger:1;
u_int io_masked:1;
int io_bus:4;
uint32_t io_lowreg;
+ u_int io_remap_cookie;
};
struct ioapic {
@@ -98,9 +97,13 @@
u_int io_apic_id:4;
u_int io_intbase:8; /* System Interrupt base */
u_int io_numintr:8;
+ u_int io_haseoi:1;
volatile ioapic_t *io_addr; /* XXX: should use bus_space */
vm_paddr_t io_paddr;
STAILQ_ENTRY(ioapic) io_next;
+ device_t pci_dev; /* matched pci device, if found */
+ struct resource *pci_wnd; /* BAR 0, should be same or alias to
+ io_paddr */
struct ioapic_intsrc io_pins[0];
};
@@ -108,6 +111,7 @@
static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val);
static const char *ioapic_bus_string(int bus_type);
static void ioapic_print_irq(struct ioapic_intsrc *intpin);
+static void ioapic_register_sources(struct pic *pic);
static void ioapic_enable_source(struct intsrc *isrc);
static void ioapic_disable_source(struct intsrc *isrc, int eoi);
static void ioapic_eoi_source(struct intsrc *isrc);
@@ -120,27 +124,79 @@
static void ioapic_resume(struct pic *pic, bool suspend_cancelled);
static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
static void ioapic_program_intpin(struct ioapic_intsrc *intpin);
+static void ioapic_reprogram_intpin(struct intsrc *isrc);
static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
-struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source,
- ioapic_eoi_source, ioapic_enable_intr,
- ioapic_disable_intr, ioapic_vector,
- ioapic_source_pending, NULL, ioapic_resume,
- ioapic_config_intr, ioapic_assign_cpu };
+struct pic ioapic_template = {
+ .pic_register_sources = ioapic_register_sources,
+ .pic_enable_source = ioapic_enable_source,
+ .pic_disable_source = ioapic_disable_source,
+ .pic_eoi_source = ioapic_eoi_source,
+ .pic_enable_intr = ioapic_enable_intr,
+ .pic_disable_intr = ioapic_disable_intr,
+ .pic_vector = ioapic_vector,
+ .pic_source_pending = ioapic_source_pending,
+ .pic_suspend = NULL,
+ .pic_resume = ioapic_resume,
+ .pic_config_intr = ioapic_config_intr,
+ .pic_assign_cpu = ioapic_assign_cpu,
+ .pic_reprogram_pin = ioapic_reprogram_intpin,
+};
-static int next_ioapic_base;
+static u_int next_ioapic_base;
static u_int next_id;
-static SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
static int enable_extint;
SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0,
"Enable the ExtINT pin in the first I/O APIC");
-TUNABLE_INT("hw.apic.enable_extint", &enable_extint);
-static __inline void
-_ioapic_eoi_source(struct intsrc *isrc)
+static void
+_ioapic_eoi_source(struct intsrc *isrc, int locked)
{
+ struct ioapic_intsrc *src;
+ struct ioapic *io;
+ volatile uint32_t *apic_eoi;
+ uint32_t low1;
+
lapic_eoi();
+ if (!lapic_eoi_suppression)
+ return;
+ src = (struct ioapic_intsrc *)isrc;
+ if (src->io_edgetrigger)
+ return;
+ io = (struct ioapic *)isrc->is_pic;
+
+ /*
+ * Handle targeted EOI for level-triggered pins, if broadcast
+ * EOI suppression is supported by LAPICs.
+ */
+ if (io->io_haseoi) {
+ /*
+ * If IOAPIC has EOI Register, simply write vector
+ * number into the reg.
+ */
+ apic_eoi = (volatile uint32_t *)((volatile char *)
+ io->io_addr + IOAPIC_EOIR);
+ *apic_eoi = src->io_vector;
+ } else {
+ /*
+ * Otherwise, if IO-APIC is too old to provide EOIR,
+ * do what Intel did for the Linux kernel. Temporarily
+ * switch the pin to edge-trigger and back, masking
+ * the pin during the trick.
+ */
+ if (!locked)
+ mtx_lock_spin(&icu_lock);
+ low1 = src->io_lowreg;
+ low1 &= ~IOART_TRGRLVL;
+ low1 |= IOART_TRGREDG | IOART_INTMSET;
+ ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+ low1);
+ ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+ src->io_lowreg);
+ if (!locked)
+ mtx_unlock_spin(&icu_lock);
+ }
}
static u_int
@@ -195,7 +251,7 @@
printf("SMI");
break;
default:
- printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus),
+ printf("%s IRQ %d", ioapic_bus_string(intpin->io_bus),
intpin->io_irq);
}
}
@@ -233,7 +289,7 @@
}
if (eoi == PIC_EOI)
- _ioapic_eoi_source(isrc);
+ _ioapic_eoi_source(isrc, 1);
mtx_unlock_spin(&icu_lock);
}
@@ -242,7 +298,7 @@
ioapic_eoi_source(struct intsrc *isrc)
{
- _ioapic_eoi_source(isrc);
+ _ioapic_eoi_source(isrc, 0);
}
/*
@@ -254,6 +310,9 @@
{
struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic;
uint32_t low, high;
+#ifdef ACPI_DMAR
+ int error;
+#endif
/*
* If a pin is completely invalid or if it is valid but hasn't
@@ -260,7 +319,7 @@
* been enabled yet, just ensure that the pin is masked.
*/
mtx_assert(&icu_lock, MA_OWNED);
- if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS &&
+ if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq >= 0 &&
intpin->io_vector == 0)) {
low = ioapic_read(io->io_addr,
IOAPIC_REDTBL_LO(intpin->io_intpin));
@@ -268,9 +327,34 @@
ioapic_write(io->io_addr,
IOAPIC_REDTBL_LO(intpin->io_intpin),
low | IOART_INTMSET);
+#ifdef ACPI_DMAR
+ mtx_unlock_spin(&icu_lock);
+ iommu_unmap_ioapic_intr(io->io_apic_id,
+ &intpin->io_remap_cookie);
+ mtx_lock_spin(&icu_lock);
+#endif
return;
}
+#ifdef ACPI_DMAR
+ mtx_unlock_spin(&icu_lock);
+ error = iommu_map_ioapic_intr(io->io_apic_id,
+ intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+ intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie,
+ &high, &low);
+ mtx_lock_spin(&icu_lock);
+ if (error == 0) {
+ ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin),
+ high);
+ intpin->io_lowreg = low;
+ ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
+ low);
+ return;
+ } else if (error != EOPNOTSUPP) {
+ return;
+ }
+#endif
+
/*
* Set the destination. Note that with Intel interrupt remapping,
* the previously reserved bits 55:48 now have a purpose so ensure
@@ -318,6 +402,15 @@
ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
}
+static void
+ioapic_reprogram_intpin(struct intsrc *isrc)
+{
+
+ mtx_lock_spin(&icu_lock);
+ ioapic_program_intpin((struct ioapic_intsrc *)isrc);
+ mtx_unlock_spin(&icu_lock);
+}
+
static int
ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id)
{
@@ -537,6 +630,8 @@
io = malloc(sizeof(struct ioapic) +
numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
io->io_pic = ioapic_template;
+ io->pci_dev = NULL;
+ io->pci_wnd = NULL;
mtx_lock_spin(&icu_lock);
io->io_id = next_id++;
io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
@@ -557,11 +652,29 @@
io->io_id, intbase, next_ioapic_base);
io->io_intbase = intbase;
next_ioapic_base = intbase + numintr;
+ if (next_ioapic_base > num_io_irqs)
+ num_io_irqs = next_ioapic_base;
io->io_numintr = numintr;
io->io_addr = apic;
io->io_paddr = addr;
+ if (bootverbose) {
+ printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id,
+ (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR)
+ >> MAXREDIRSHIFT);
+ }
/*
+ * The summary information about IO-APIC versions is taken from
+ * the Linux kernel source:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ * IO-APICs with version >= 0x20 have working EOIR register.
+ */
+ io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20;
+
+ /*
* Initialize pins. Start off with interrupts disabled. Default
* to active-hi and edge-triggered for ISA interrupts and active-lo
* and level-triggered for all others.
@@ -599,6 +712,15 @@
intpin->io_cpu = PCPU_GET(apic_id);
value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
+#ifdef ACPI_DMAR
+ /* dummy, but sets cookie */
+ mtx_unlock_spin(&icu_lock);
+ iommu_map_ioapic_intr(io->io_apic_id,
+ intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+ intpin->io_activehi, intpin->io_irq,
+ &intpin->io_remap_cookie, NULL, NULL);
+ mtx_lock_spin(&icu_lock);
+#endif
}
mtx_unlock_spin(&icu_lock);
@@ -640,7 +762,7 @@
io = (struct ioapic *)cookie;
if (pin >= io->io_numintr || vector < 0)
return (EINVAL);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
io->io_pins[pin].io_irq = vector;
if (bootverbose)
@@ -659,7 +781,7 @@
io = (struct ioapic *)cookie;
if (pin >= io->io_numintr)
return (EINVAL);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
if (io->io_pins[pin].io_bus == bus_type)
return (0);
@@ -680,7 +802,7 @@
return (EINVAL);
if (io->io_pins[pin].io_irq == IRQ_NMI)
return (0);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
io->io_pins[pin].io_irq = IRQ_NMI;
@@ -703,7 +825,7 @@
return (EINVAL);
if (io->io_pins[pin].io_irq == IRQ_SMI)
return (0);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
io->io_pins[pin].io_irq = IRQ_SMI;
@@ -726,7 +848,7 @@
return (EINVAL);
if (io->io_pins[pin].io_irq == IRQ_EXTINT)
return (0);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
io->io_pins[pin].io_irq = IRQ_EXTINT;
@@ -751,7 +873,7 @@
io = (struct ioapic *)cookie;
if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
return (EINVAL);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
activehi = (pol == INTR_POLARITY_HIGH);
if (io->io_pins[pin].io_activehi == activehi)
@@ -772,7 +894,7 @@
io = (struct ioapic *)cookie;
if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
return (EINVAL);
- if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+ if (io->io_pins[pin].io_irq < 0)
return (EINVAL);
edgetrigger = (trigger == INTR_TRIGGER_EDGE);
if (io->io_pins[pin].io_edgetrigger == edgetrigger)
@@ -808,14 +930,26 @@
/*
* Reprogram pins to handle special case pins (such as NMI and
- * SMI) and register valid pins as interrupt sources.
+ * SMI) and disable normal pins until a handler is registered.
*/
intr_register_pic(&io->io_pic);
+ for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++)
+ ioapic_reprogram_intpin(&pin->io_intsrc);
+}
+
+/*
+ * Add interrupt sources for I/O APIC interrupt pins.
+ */
+static void
+ioapic_register_sources(struct pic *pic)
+{
+ struct ioapic_intsrc *pin;
+ struct ioapic *io;
+ int i;
+
+ io = (struct ioapic *)pic;
for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) {
- mtx_lock_spin(&icu_lock);
- ioapic_program_intpin(pin);
- mtx_unlock_spin(&icu_lock);
- if (pin->io_irq < NUM_IO_INTS)
+ if (pin->io_irq >= 0)
intr_register_source(&pin->io_intsrc);
}
}
@@ -846,7 +980,72 @@
static int
ioapic_pci_attach(device_t dev)
{
+ struct resource *res;
+ volatile ioapic_t *apic;
+ struct ioapic *io;
+ int rid;
+ u_int apic_id;
+ /*
+ * Try to match the enumerated ioapic. Match BAR start
+ * against io_paddr. Due to a fear that PCI window is not the
+ * same as the MADT reported io window, but an alias, read the
+ * APIC ID from the mapped BAR and match against it.
+ */
+ rid = PCIR_BAR(0);
+ res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
+ RF_ACTIVE | RF_SHAREABLE);
+ if (res == NULL) {
+ if (bootverbose)
+ device_printf(dev, "cannot activate BAR0\n");
+ return (ENXIO);
+ }
+ apic = (volatile ioapic_t *)rman_get_virtual(res);
+ if (rman_get_size(res) < IOAPIC_WND_SIZE) {
+ if (bootverbose)
+ device_printf(dev,
+ "BAR0 too small (%jd) for IOAPIC window\n",
+ (uintmax_t)rman_get_size(res));
+ goto fail;
+ }
+ mtx_lock_spin(&icu_lock);
+ apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
+ /* First match by io window address */
+ STAILQ_FOREACH(io, &ioapic_list, io_next) {
+ if (io->io_paddr == (vm_paddr_t)rman_get_start(res))
+ goto found;
+ }
+ /* Then by apic id */
+ STAILQ_FOREACH(io, &ioapic_list, io_next) {
+ if (io->io_apic_id == apic_id)
+ goto found;
+ }
+ mtx_unlock_spin(&icu_lock);
+ if (bootverbose)
+ device_printf(dev,
+ "cannot match pci bar apic id %d against MADT\n",
+ apic_id);
+fail:
+ bus_release_resource(dev, SYS_RES_MEMORY, rid, res);
+ return (ENXIO);
+found:
+ KASSERT(io->pci_dev == NULL,
+ ("ioapic %d pci_dev not NULL", io->io_id));
+ KASSERT(io->pci_wnd == NULL,
+ ("ioapic %d pci_wnd not NULL", io->io_id));
+
+ io->pci_dev = dev;
+ io->pci_wnd = res;
+ if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) ||
+ io->io_apic_id != apic_id)) {
+ device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d "
+ "MADT id %d paddr@%jx\n",
+ pci_get_domain(dev), pci_get_bus(dev),
+ pci_get_slot(dev), pci_get_function(dev),
+ (uintmax_t)rman_get_start(res), apic_id,
+ io->io_apic_id, (uintmax_t)io->io_paddr);
+ }
+ mtx_unlock_spin(&icu_lock);
return (0);
}
@@ -863,6 +1062,28 @@
static devclass_t ioapic_devclass;
DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
+int
+ioapic_get_rid(u_int apic_id, uint16_t *ridp)
+{
+ struct ioapic *io;
+ uintptr_t rid;
+ int error;
+
+ mtx_lock_spin(&icu_lock);
+ STAILQ_FOREACH(io, &ioapic_list, io_next) {
+ if (io->io_apic_id == apic_id)
+ break;
+ }
+ mtx_unlock_spin(&icu_lock);
+ if (io == NULL || io->pci_dev == NULL)
+ return (EINVAL);
+ error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid);
+ if (error != 0)
+ return (error);
+ *ridp = rid;
+ return (0);
+}
+
/*
* A new-bus driver to consume the memory resources associated with
* the APICs in the system. On some systems ACPI or PnPBIOS system
@@ -896,19 +1117,11 @@
{
int error;
-#ifdef PAE
- /*
- * Resources use long's to track resources, so we can't
- * include memory regions above 4GB.
- */
- if (base >= ~0ul)
- return;
-#endif
error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
if (error)
panic("apic_add_resource: resource %d failed set with %d", rid,
error);
- bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
+ bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE);
}
static int
@@ -918,7 +1131,7 @@
int i;
/* Reserve the local APIC. */
- apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t));
+ apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION);
i = 1;
STAILQ_FOREACH(io, &ioapic_list, io_next) {
apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION);
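A small sketch (not from the commit) of the version test behind the new io_haseoi flag in io_apic.c: the low byte of the IOAPIC version register is the version and, per the table quoted in the diff, 0x20 or newer implies a working EOI register, which _ioapic_eoi_source() then prefers over the mask/edge-trigger fallback. The mask and shift values mirror IOART_VER_VERSION, IOART_VER_MAXREDIR and MAXREDIRSHIFT; reading the register itself requires icu_lock in the real code.

#include <stdint.h>

static inline int
ioapic_version_has_eoir(uint32_t ver_reg)
{
	uint32_t version = ver_reg & 0xff;		/* IOART_VER_VERSION */
	uint32_t maxredir = (ver_reg >> 16) & 0xff;	/* IOART_VER_MAXREDIR */

	(void)maxredir;		/* ioapic_create() derives numintr from this */
	return (version >= 0x20);
}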
Modified: trunk/sys/x86/x86/legacy.c
===================================================================
--- trunk/sys/x86/x86/legacy.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/legacy.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,7 +33,7 @@
#include "opt_mca.h"
#endif
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
/*
* This code implements a system driver for legacy systems that do not
Modified: trunk/sys/x86/x86/local_apic.c
===================================================================
--- trunk/sys/x86/x86/local_apic.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/local_apic.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,11 +33,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/local_apic.c 314662 2017-03-04 12:04:24Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/local_apic.c 351757 2019-09-03 16:27:23Z emaste $");
#include "opt_atpic.h"
#include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
#include "opt_ddb.h"
@@ -51,6 +50,7 @@
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
+#include <sys/sysctl.h>
#include <sys/timeet.h>
#include <vm/vm.h>
@@ -58,14 +58,16 @@
#include <x86/apicreg.h>
#include <machine/clock.h>
+#include <machine/cpufunc.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
+#include <x86/init.h>
#ifdef DDB
#include <sys/interrupt.h>
@@ -88,12 +90,24 @@
CTASSERT(APIC_LOCAL_INTS == 240);
CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
-/* Magic IRQ values for the timer and syscalls. */
-#define IRQ_TIMER (NUM_IO_INTS + 1)
-#define IRQ_SYSCALL (NUM_IO_INTS + 2)
-#define IRQ_DTRACE_RET (NUM_IO_INTS + 3)
-#define IRQ_EVTCHN (NUM_IO_INTS + 4)
+/*
+ * I/O interrupts use non-negative IRQ values. The negative values
+ * below mark unused IDT entries or IDT entries reserved for a non-I/O
+ * interrupt.
+ */
+#define IRQ_FREE -1
+#define IRQ_TIMER -2
+#define IRQ_SYSCALL -3
+#define IRQ_DTRACE_RET -4
+#define IRQ_EVTCHN -5
+enum lat_timer_mode {
+ LAT_MODE_UNDEF = 0,
+ LAT_MODE_PERIODIC = 1,
+ LAT_MODE_ONESHOT = 2,
+ LAT_MODE_DEADLINE = 3,
+};
+
/*
* Support for local APICs. Local APICs manage interrupts on each
* individual processor as opposed to I/O APICs which receive interrupts
@@ -114,14 +128,16 @@
struct lapic {
struct lvt la_lvts[APIC_LVT_MAX + 1];
+ struct lvt la_elvts[APIC_ELVT_MAX + 1];
u_int la_id:8;
u_int la_cluster:4;
u_int la_cluster_id:2;
u_int la_present:1;
u_long *la_timer_count;
- u_long la_timer_period;
- u_int la_timer_mode;
- uint32_t lvt_timer_cache;
+ uint64_t la_timer_period;
+ enum lat_timer_mode la_timer_mode;
+ uint32_t lvt_timer_base;
+ uint32_t lvt_timer_last;
/* Include IDT_SYSCALL to make indexing easier. */
int la_ioint_irqs[APIC_NUM_IOINTS + 1];
} static lapics[MAX_APIC_ID + 1];
@@ -137,6 +153,14 @@
{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */
};
+/* Global defaults for AMD local APIC ELVT entries. */
+static struct lvt elvts[APIC_ELVT_MAX + 1] = {
+ { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+ { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
+ { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+ { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+};
+
static inthand_t *ioint_handlers[] = {
NULL, /* 0 - 31 */
IDTVEC(apic_isr1), /* 32 - 63 */
@@ -148,6 +172,16 @@
IDTVEC(apic_isr7), /* 224 - 255 */
};
+static inthand_t *ioint_pti_handlers[] = {
+ NULL, /* 0 - 31 */
+ IDTVEC(apic_isr1_pti), /* 32 - 63 */
+ IDTVEC(apic_isr2_pti), /* 64 - 95 */
+ IDTVEC(apic_isr3_pti), /* 96 - 127 */
+ IDTVEC(apic_isr4_pti), /* 128 - 159 */
+ IDTVEC(apic_isr5_pti), /* 160 - 191 */
+ IDTVEC(apic_isr6_pti), /* 192 - 223 */
+ IDTVEC(apic_isr7_pti), /* 224 - 255 */
+};
static u_int32_t lapic_timer_divisors[] = {
APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
@@ -154,42 +188,223 @@
APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
};
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
-volatile lapic_t *lapic;
+volatile char *lapic_map;
vm_paddr_t lapic_paddr;
-static u_long lapic_timer_divisor;
+int x2apic_mode;
+int lapic_eoi_suppression;
+static int lapic_timer_tsc_deadline;
+static u_long lapic_timer_divisor, count_freq;
static struct eventtimer lapic_et;
#ifdef SMP
static uint64_t lapic_ipi_wait_mult;
#endif
+SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
+SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
+ &lapic_eoi_suppression, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
+ &lapic_timer_tsc_deadline, 0, "");
+
+static void lapic_calibrate_initcount(struct lapic *la);
+static void lapic_calibrate_deadline(struct lapic *la);
+
+static uint32_t
+lapic_read32(enum LAPIC_REGISTERS reg)
+{
+ uint32_t res;
+
+ if (x2apic_mode) {
+ res = rdmsr32(MSR_APIC_000 + reg);
+ } else {
+ res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
+ }
+ return (res);
+}
+
+static void
+lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+ if (x2apic_mode) {
+ mfence();
+ lfence();
+ wrmsr(MSR_APIC_000 + reg, val);
+ } else {
+ *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+ }
+}
+
+static void
+lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+ if (x2apic_mode) {
+ wrmsr(MSR_APIC_000 + reg, val);
+ } else {
+ *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+ }
+}
+
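
lapic_read32()/lapic_write32() hide the two access methods behind one register index: in xAPIC mode the index is scaled by LAPIC_MEM_MUL into an offset within the mapped registers page, in x2APIC mode it is added to the base of the MSR range. A small sketch of that address arithmetic; the numeric constants below are assumptions chosen to match the conventional layout (the authoritative values live in the x86 APIC headers):

#include <stdio.h>

#define LAPIC_MEM_MUL   0x10    /* assumed xAPIC register stride */
#define MSR_APIC_000    0x800   /* assumed base of the x2APIC MSR range */

/* A few register indices in the style of the new accessors (assumed). */
#define LAPIC_ID        0x2
#define LAPIC_VERSION   0x3
#define LAPIC_EOI       0xb

static void
show(const char *name, unsigned int reg)
{
        printf("%-13s xAPIC MMIO offset 0x%03x, x2APIC MSR 0x%03x\n",
            name, reg * LAPIC_MEM_MUL, MSR_APIC_000 + reg);
}

int
main(void)
{
        show("LAPIC_ID", LAPIC_ID);
        show("LAPIC_VERSION", LAPIC_VERSION);
        show("LAPIC_EOI", LAPIC_EOI);
        return (0);
}
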
+#ifdef SMP
+static uint64_t
+lapic_read_icr(void)
+{
+ uint64_t v;
+ uint32_t vhi, vlo;
+
+ if (x2apic_mode) {
+ v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
+ } else {
+ vhi = lapic_read32(LAPIC_ICR_HI);
+ vlo = lapic_read32(LAPIC_ICR_LO);
+ v = ((uint64_t)vhi << 32) | vlo;
+ }
+ return (v);
+}
+
+static uint64_t
+lapic_read_icr_lo(void)
+{
+
+ return (lapic_read32(LAPIC_ICR_LO));
+}
+
+static void
+lapic_write_icr(uint32_t vhi, uint32_t vlo)
+{
+ uint64_t v;
+
+ if (x2apic_mode) {
+ v = ((uint64_t)vhi << 32) | vlo;
+ mfence();
+ wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
+ } else {
+ lapic_write32(LAPIC_ICR_HI, vhi);
+ lapic_write32(LAPIC_ICR_LO, vlo);
+ }
+}
+#endif /* SMP */
+
+static void
+native_lapic_enable_x2apic(void)
+{
+ uint64_t apic_base;
+
+ apic_base = rdmsr(MSR_APICBASE);
+ apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
+ wrmsr(MSR_APICBASE, apic_base);
+}
+
+static bool
+native_lapic_is_x2apic(void)
+{
+ uint64_t apic_base;
+
+ apic_base = rdmsr(MSR_APICBASE);
+ return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
+ (APICBASE_X2APIC | APICBASE_ENABLED));
+}
+
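
native_lapic_is_x2apic() only reports x2APIC when both the enable and the x2APIC bits are set in the APIC base MSR. A sketch of the resulting decode; the macro names mirror the code above, but the bit positions used here (EN = bit 11, EXTD = bit 10) are assumptions based on the usual architectural layout:

#include <stdio.h>
#include <stdint.h>

#define APICBASE_ENABLED        (1ULL << 11)    /* assumed: EN bit */
#define APICBASE_X2APIC         (1ULL << 10)    /* assumed: EXTD bit */

static const char *
apic_mode(uint64_t apic_base)
{
        if ((apic_base & APICBASE_ENABLED) == 0)
                return ("APIC disabled");
        if ((apic_base & APICBASE_X2APIC) != 0)
                return ("x2APIC mode");
        return ("xAPIC mode");
}

int
main(void)
{
        uint64_t samples[] = {
                0,
                APICBASE_ENABLED,
                APICBASE_ENABLED | APICBASE_X2APIC,
        };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("APIC base %#llx -> %s\n",
                    (unsigned long long)samples[i], apic_mode(samples[i]));
        return (0);
}
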
static void lapic_enable(void);
static void lapic_resume(struct pic *pic, bool suspend_cancelled);
-static void lapic_timer_oneshot(struct lapic *,
- u_int count, int enable_int);
-static void lapic_timer_periodic(struct lapic *,
- u_int count, int enable_int);
+static void lapic_timer_oneshot(struct lapic *);
+static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
+static void lapic_timer_periodic(struct lapic *);
+static void lapic_timer_deadline(struct lapic *);
static void lapic_timer_stop(struct lapic *);
static void lapic_timer_set_divisor(u_int divisor);
static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value);
static int lapic_et_start(struct eventtimer *et,
- sbintime_t first, sbintime_t period);
+ sbintime_t first, sbintime_t period);
static int lapic_et_stop(struct eventtimer *et);
+static u_int apic_idt_to_irq(u_int apic_id, u_int vector);
+static void lapic_set_tpr(u_int vector);
struct pic lapic_pic = { .pic_resume = lapic_resume };
+/* Forward declarations for apic_ops */
+static void native_lapic_create(u_int apic_id, int boot_cpu);
+static void native_lapic_init(vm_paddr_t addr);
+static void native_lapic_xapic_mode(void);
+static void native_lapic_setup(int boot);
+static void native_lapic_dump(const char *str);
+static void native_lapic_disable(void);
+static void native_lapic_eoi(void);
+static int native_lapic_id(void);
+static int native_lapic_intr_pending(u_int vector);
+static u_int native_apic_cpuid(u_int apic_id);
+static u_int native_apic_alloc_vector(u_int apic_id, u_int irq);
+static u_int native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
+ u_int count, u_int align);
+static void native_apic_disable_vector(u_int apic_id, u_int vector);
+static void native_apic_enable_vector(u_int apic_id, u_int vector);
+static void native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
+static void native_lapic_set_logical_id(u_int apic_id, u_int cluster,
+ u_int cluster_id);
+static int native_lapic_enable_pmc(void);
+static void native_lapic_disable_pmc(void);
+static void native_lapic_reenable_pmc(void);
+static void native_lapic_enable_cmc(void);
+static int native_lapic_enable_mca_elvt(void);
+static int native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
+ u_char masked);
+static int native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
+ uint32_t mode);
+static int native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
+ enum intr_polarity pol);
+static int native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+ enum intr_trigger trigger);
+#ifdef SMP
+static void native_lapic_ipi_raw(register_t icrlo, u_int dest);
+static void native_lapic_ipi_vectored(u_int vector, int dest);
+static int native_lapic_ipi_wait(int delay);
+#endif /* SMP */
+static int native_lapic_ipi_alloc(inthand_t *ipifunc);
+static void native_lapic_ipi_free(int vector);
+
+struct apic_ops apic_ops = {
+ .create = native_lapic_create,
+ .init = native_lapic_init,
+ .xapic_mode = native_lapic_xapic_mode,
+ .is_x2apic = native_lapic_is_x2apic,
+ .setup = native_lapic_setup,
+ .dump = native_lapic_dump,
+ .disable = native_lapic_disable,
+ .eoi = native_lapic_eoi,
+ .id = native_lapic_id,
+ .intr_pending = native_lapic_intr_pending,
+ .set_logical_id = native_lapic_set_logical_id,
+ .cpuid = native_apic_cpuid,
+ .alloc_vector = native_apic_alloc_vector,
+ .alloc_vectors = native_apic_alloc_vectors,
+ .enable_vector = native_apic_enable_vector,
+ .disable_vector = native_apic_disable_vector,
+ .free_vector = native_apic_free_vector,
+ .enable_pmc = native_lapic_enable_pmc,
+ .disable_pmc = native_lapic_disable_pmc,
+ .reenable_pmc = native_lapic_reenable_pmc,
+ .enable_cmc = native_lapic_enable_cmc,
+ .enable_mca_elvt = native_lapic_enable_mca_elvt,
+#ifdef SMP
+ .ipi_raw = native_lapic_ipi_raw,
+ .ipi_vectored = native_lapic_ipi_vectored,
+ .ipi_wait = native_lapic_ipi_wait,
+#endif
+ .ipi_alloc = native_lapic_ipi_alloc,
+ .ipi_free = native_lapic_ipi_free,
+ .set_lvt_mask = native_lapic_set_lvt_mask,
+ .set_lvt_mode = native_lapic_set_lvt_mode,
+ .set_lvt_polarity = native_lapic_set_lvt_polarity,
+ .set_lvt_triggermode = native_lapic_set_lvt_triggermode,
+};
+
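
The apic_ops table is the indirection that lets the Xen code under sys/x86/xen install its own local APIC backend while callers keep using the stable lapic_*()/apic_*() names, which become thin wrappers around the table. A stripped-down user-space model of that dispatch pattern; the wrapper shape is illustrative, not copied from the kernel headers:

#include <stdio.h>

/* A tiny subset of the method table, modelled after struct apic_ops. */
struct apic_ops {
        void    (*eoi)(void);
        int     (*id)(void);
};

static void native_eoi(void) { printf("native EOI\n"); }
static int  native_id(void)  { return (0); }

static void hv_eoi(void) { printf("hypervisor EOI\n"); }
static int  hv_id(void)  { return (0); }

/* Default to the native backend; an enumerator may replace it at boot. */
static struct apic_ops apic_ops = { .eoi = native_eoi, .id = native_id };

/* Callers use stable wrapper names and never see the backend. */
static void lapic_eoi(void) { apic_ops.eoi(); }
static int  lapic_id(void)  { return (apic_ops.id()); }

int
main(void)
{
        lapic_eoi();                    /* dispatches to native_eoi */
        apic_ops.eoi = hv_eoi;          /* a Xen-style backend swap */
        apic_ops.id = hv_id;
        lapic_eoi();                    /* now dispatches to hv_eoi */
        printf("lapic_id() = %d\n", lapic_id());
        return (0);
}
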
static uint32_t
-lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
{
- struct lvt *lvt;
- KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin));
- if (la->la_lvts[pin].lvt_active)
- lvt = &la->la_lvts[pin];
- else
- lvt = &lvts[pin];
-
value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
APIC_LVT_VECTOR);
if (lvt->lvt_edgetrigger == 0)
@@ -204,7 +419,7 @@
case APIC_LVT_DM_SMI:
case APIC_LVT_DM_INIT:
case APIC_LVT_DM_EXTINT:
- if (!lvt->lvt_edgetrigger) {
+ if (!lvt->lvt_edgetrigger && bootverbose) {
printf("lapic%u: Forcing LINT%u to edge trigger\n",
la->la_id, pin);
value &= ~APIC_LVT_TM;
@@ -220,23 +435,70 @@
return (value);
}
+static uint32_t
+lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+{
+ struct lvt *lvt;
+
+ KASSERT(pin <= APIC_LVT_MAX,
+ ("%s: pin %u out of range", __func__, pin));
+ if (la->la_lvts[pin].lvt_active)
+ lvt = &la->la_lvts[pin];
+ else
+ lvt = &lvts[pin];
+
+ return (lvt_mode_impl(la, lvt, pin, value));
+}
+
+static uint32_t
+elvt_mode(struct lapic *la, u_int idx, uint32_t value)
+{
+ struct lvt *elvt;
+
+ KASSERT(idx <= APIC_ELVT_MAX,
+ ("%s: idx %u out of range", __func__, idx));
+
+ elvt = &la->la_elvts[idx];
+ KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
+ KASSERT(elvt->lvt_edgetrigger,
+ ("%s: ELVT%u is not edge triggered", __func__, idx));
+ KASSERT(elvt->lvt_activehi,
+ ("%s: ELVT%u is not active high", __func__, idx));
+ return (lvt_mode_impl(la, elvt, idx, value));
+}
+
/*
* Map the local APIC and setup necessary interrupt vectors.
*/
-void
-lapic_init(vm_paddr_t addr)
+static void
+native_lapic_init(vm_paddr_t addr)
{
#ifdef SMP
uint64_t r, r1, r2, rx;
#endif
+ uint32_t ver;
u_int regs[4];
int i, arat;
- /* Map the local APIC and setup the spurious interrupt handler. */
+ /*
+ * Enable x2APIC mode if possible. Map the local APIC
+ * registers page.
+ *
+ * Keep the LAPIC registers page mapped uncached for x2APIC
+ * mode too, to have direct map page attribute set to
+ * uncached. This is needed to work around CPU errata present
+ * on all Intel processors.
+ */
KASSERT(trunc_page(addr) == addr,
("local APIC not aligned on a page boundary"));
lapic_paddr = addr;
- lapic = pmap_mapdev(addr, sizeof(lapic_t));
+ lapic_map = pmap_mapdev(addr, PAGE_SIZE);
+ if (x2apic_mode) {
+ native_lapic_enable_x2apic();
+ lapic_map = NULL;
+ }
+
+ /* Setup the spurious interrupt handler. */
setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
GSEL_APIC);
@@ -247,15 +509,18 @@
PCPU_SET(apic_id, lapic_id());
/* Local APIC timer interrupt. */
- setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+ SDT_APIC, SEL_KPL, GSEL_APIC);
/* Local APIC error interrupt. */
- setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+ SDT_APIC, SEL_KPL, GSEL_APIC);
/* XXX: Thermal interrupt */
/* Local APIC CMCI. */
- setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+ SDT_APICT, SEL_KPL, GSEL_APIC);
if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
arat = 0;
@@ -264,6 +529,9 @@
do_cpuid(0x06, regs);
if ((regs[0] & CPUTPM1_ARAT) != 0)
arat = 1;
+ } else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+ CPUID_TO_FAMILY(cpu_id) >= 0x12) {
+ arat = 1;
}
bzero(&lapic_et, sizeof(lapic_et));
lapic_et.et_name = "LAPIC";
@@ -272,8 +540,16 @@
lapic_et.et_quality = 600;
if (!arat) {
lapic_et.et_flags |= ET_FLAGS_C3STOP;
- lapic_et.et_quality -= 200;
+ lapic_et.et_quality = 100;
}
+ if ((cpu_feature & CPUID_TSC) != 0 &&
+ (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
+ tsc_is_invariant && tsc_freq != 0) {
+ lapic_timer_tsc_deadline = 1;
+ TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
+ &lapic_timer_tsc_deadline);
+ }
+
lapic_et.et_frequency = 0;
/* We don't know frequency yet, so trying to guess. */
lapic_et.et_min_period = 0x00001000LL;
@@ -284,6 +560,29 @@
et_register(&lapic_et);
}
+ /*
+ * Set lapic_eoi_suppression after lapic_enable(), to not
+ * enable suppression in the hardware prematurely. Note that
+ * we enable suppression by default even when the system has
+ * only one I/O APIC, since otherwise EOIs are broadcast to all
+ * APIC agents, including CPUs.
+ *
+ * It seems that at least some KVM versions report the
+ * EOI_SUPPRESSION bit, but auto-EOI does not work.
+ */
+ ver = lapic_read32(LAPIC_VERSION);
+ if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
+ lapic_eoi_suppression = 1;
+ if (vm_guest == VM_GUEST_KVM) {
+ if (bootverbose)
+ printf(
+ "KVM -- disabling lapic eoi suppression\n");
+ lapic_eoi_suppression = 0;
+ }
+ TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
+ &lapic_eoi_suppression);
+ }
+
#ifdef SMP
#define LOOPS 100000
/*
@@ -299,20 +598,22 @@
*/
KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
("TSC not initialized"));
- r = rdtsc();
- for (rx = 0; rx < LOOPS; rx++) {
- (void)lapic->icr_lo;
- ia32_pause();
+ if (!x2apic_mode) {
+ r = rdtsc();
+ for (rx = 0; rx < LOOPS; rx++) {
+ (void)lapic_read_icr_lo();
+ ia32_pause();
+ }
+ r = rdtsc() - r;
+ r1 = tsc_freq * LOOPS;
+ r2 = r * 1000000;
+ lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
+ if (bootverbose) {
+ printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
+ "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
+ (uintmax_t)r, (uintmax_t)tsc_freq);
+ }
}
- r = rdtsc() - r;
- r1 = tsc_freq * LOOPS;
- r2 = r * 1000000;
- lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
- if (bootverbose) {
- printf("LAPIC: ipi_wait() us multiplier %ju (r %ju tsc %ju)\n",
- (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r,
- (uintmax_t)tsc_freq);
- }
#undef LOOPS
#endif /* SMP */
}
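
The reworked calibration still computes lapic_ipi_wait_mult as the number of ICR polls that fit in one microsecond: LOOPS polls cost r TSC cycles, so the multiplier is tsc_freq * LOOPS / (r * 1000000), clamped to at least 1. A worked version of that arithmetic with made-up inputs:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* Illustrative inputs, not measured values. */
        uint64_t tsc_freq = 2400000000ULL;      /* 2.4 GHz TSC */
        uint64_t loops = 100000;                /* LOOPS in the code above */
        uint64_t r = 60000000;                  /* TSC cycles the loop took */
        uint64_t r1, r2, mult;

        r1 = tsc_freq * loops;          /* (cycles per second) * polls */
        r2 = r * 1000000;               /* cycles * (microseconds per second) */
        mult = r1 >= r2 ? r1 / r2 : 1;  /* polls per microsecond, >= 1 */

        /* 2.4e9 * 1e5 / (6e7 * 1e6) = 4 polls per microsecond. */
        printf("ipi_wait() us multiplier: %ju\n", (uintmax_t)mult);
        return (0);
}
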
@@ -320,8 +621,8 @@
/*
* Create a local APIC instance.
*/
-void
-lapic_create(u_int apic_id, int boot_cpu)
+static void
+native_lapic_create(u_int apic_id, int boot_cpu)
{
int i;
@@ -344,8 +645,12 @@
lapics[apic_id].la_lvts[i] = lvts[i];
lapics[apic_id].la_lvts[i].lvt_active = 0;
}
+ for (i = 0; i <= APIC_ELVT_MAX; i++) {
+ lapics[apic_id].la_elvts[i] = elvts[i];
+ lapics[apic_id].la_elvts[i].lvt_active = 0;
+ }
for (i = 0; i <= APIC_NUM_IOINTS; i++)
- lapics[apic_id].la_ioint_irqs[i] = -1;
+ lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE;
lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
IRQ_TIMER;
@@ -363,41 +668,100 @@
#endif
}
+static inline uint32_t
+amd_read_ext_features(void)
+{
+ uint32_t version;
+
+ if (cpu_vendor_id != CPU_VENDOR_AMD)
+ return (0);
+ version = lapic_read32(LAPIC_VERSION);
+ if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
+ return (lapic_read32(LAPIC_EXT_FEATURES));
+ else
+ return (0);
+}
+
+static inline uint32_t
+amd_read_elvt_count(void)
+{
+ uint32_t extf;
+ uint32_t count;
+
+ extf = amd_read_ext_features();
+ count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
+ count = min(count, APIC_ELVT_MAX + 1);
+ return (count);
+}
+
/*
* Dump contents of local APIC registers
*/
-void
-lapic_dump(const char* str)
+static void
+native_lapic_dump(const char* str)
{
+ uint32_t version;
uint32_t maxlvt;
+ uint32_t extf;
+ int elvt_count;
+ int i;
- maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+ version = lapic_read32(LAPIC_VERSION);
+ maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
- printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n",
- lapic->id, lapic->version, lapic->ldr, lapic->dfr);
- printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
- lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
+ printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
+ lapic_read32(LAPIC_ID), version,
+ lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
+ if ((cpu_feature2 & CPUID2_X2APIC) != 0)
+ printf(" x2APIC: %d", x2apic_mode);
+ printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
+ lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
+ lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x",
- lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error);
+ lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
+ lapic_read32(LAPIC_LVT_ERROR));
if (maxlvt >= APIC_LVT_PMC)
- printf(" pmc: 0x%08x", lapic->lvt_pcint);
+ printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
printf("\n");
if (maxlvt >= APIC_LVT_CMCI)
- printf(" cmci: 0x%08x\n", lapic->lvt_cmci);
+ printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
+ extf = amd_read_ext_features();
+ if (extf != 0) {
+ printf(" AMD ext features: 0x%08x\n", extf);
+ elvt_count = amd_read_elvt_count();
+ for (i = 0; i < elvt_count; i++)
+ printf(" AMD elvt%d: 0x%08x\n", i,
+ lapic_read32(LAPIC_EXT_LVT0 + i));
+ }
}
-void
-lapic_setup(int boot)
+static void
+native_lapic_xapic_mode(void)
{
+ register_t saveintr;
+
+ saveintr = intr_disable();
+ if (x2apic_mode)
+ native_lapic_enable_x2apic();
+ intr_restore(saveintr);
+}
+
+static void
+native_lapic_setup(int boot)
+{
struct lapic *la;
- u_int32_t maxlvt;
+ uint32_t version;
+ uint32_t maxlvt;
register_t saveintr;
- char buf[MAXCOMLEN + 1];
+ int elvt_count;
+ int i;
+ saveintr = intr_disable();
+
la = &lapics[lapic_id()];
KASSERT(la->la_present, ("missing APIC structure"));
- saveintr = intr_disable();
- maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+ version = lapic_read32(LAPIC_VERSION);
+ maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
/* Initialize the TPR to allow all interrupts. */
lapic_set_tpr(0);
@@ -406,54 +770,103 @@
lapic_enable();
/* Program LINT[01] LVT entries. */
- lapic->lvt_lint0 = lvt_mode(la, APIC_LVT_LINT0, lapic->lvt_lint0);
- lapic->lvt_lint1 = lvt_mode(la, APIC_LVT_LINT1, lapic->lvt_lint1);
+ lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
+ lapic_read32(LAPIC_LVT_LINT0)));
+ lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
+ lapic_read32(LAPIC_LVT_LINT1)));
/* Program the PMC LVT entry if present. */
- if (maxlvt >= APIC_LVT_PMC)
- lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+ if (maxlvt >= APIC_LVT_PMC) {
+ lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+ LAPIC_LVT_PCINT));
+ }
- /* Program timer LVT and setup handler. */
- la->lvt_timer_cache = lapic->lvt_timer =
- lvt_mode(la, APIC_LVT_TIMER, lapic->lvt_timer);
- if (boot) {
- snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
- intrcnt_add(buf, &la->la_timer_count);
+ /* Program timer LVT. */
+ la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
+ lapic_read32(LAPIC_LVT_TIMER));
+ la->lvt_timer_last = la->lvt_timer_base;
+ lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);
+
+ /* Calibrate the timer parameters using BSP. */
+ if (boot && IS_BSP()) {
+ lapic_calibrate_initcount(la);
+ if (lapic_timer_tsc_deadline)
+ lapic_calibrate_deadline(la);
}
/* Setup the timer if configured. */
- if (la->la_timer_mode != 0) {
+ if (la->la_timer_mode != LAT_MODE_UNDEF) {
KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
lapic_id()));
- lapic_timer_set_divisor(lapic_timer_divisor);
- if (la->la_timer_mode == 1)
- lapic_timer_periodic(la, la->la_timer_period, 1);
- else
- lapic_timer_oneshot(la, la->la_timer_period, 1);
+ switch (la->la_timer_mode) {
+ case LAT_MODE_PERIODIC:
+ lapic_timer_set_divisor(lapic_timer_divisor);
+ lapic_timer_periodic(la);
+ break;
+ case LAT_MODE_ONESHOT:
+ lapic_timer_set_divisor(lapic_timer_divisor);
+ lapic_timer_oneshot(la);
+ break;
+ case LAT_MODE_DEADLINE:
+ lapic_timer_deadline(la);
+ break;
+ default:
+ panic("corrupted la_timer_mode %p %d", la,
+ la->la_timer_mode);
+ }
}
/* Program error LVT and clear any existing errors. */
- lapic->lvt_error = lvt_mode(la, APIC_LVT_ERROR, lapic->lvt_error);
- lapic->esr = 0;
+ lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
+ lapic_read32(LAPIC_LVT_ERROR)));
+ lapic_write32(LAPIC_ESR, 0);
/* XXX: Thermal LVT */
/* Program the CMCI LVT entry if present. */
- if (maxlvt >= APIC_LVT_CMCI)
- lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci);
+ if (maxlvt >= APIC_LVT_CMCI) {
+ lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
+ lapic_read32(LAPIC_LVT_CMCI)));
+ }
+ elvt_count = amd_read_elvt_count();
+ for (i = 0; i < elvt_count; i++) {
+ if (la->la_elvts[i].lvt_active)
+ lapic_write32(LAPIC_EXT_LVT0 + i,
+ elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
+ }
+
intr_restore(saveintr);
}
-void
-lapic_reenable_pmc(void)
+static void
+native_lapic_intrcnt(void *dummy __unused)
{
+ struct pcpu *pc;
+ struct lapic *la;
+ char buf[MAXCOMLEN + 1];
+
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ la = &lapics[pc->pc_apic_id];
+ if (!la->la_present)
+ continue;
+
+ snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid);
+ intrcnt_add(buf, &la->la_timer_count);
+ }
+}
+SYSINIT(native_lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, native_lapic_intrcnt,
+ NULL);
+
+static void
+native_lapic_reenable_pmc(void)
+{
#ifdef HWPMC_HOOKS
uint32_t value;
- value = lapic->lvt_pcint;
+ value = lapic_read32(LAPIC_LVT_PCINT);
value &= ~APIC_LVT_M;
- lapic->lvt_pcint = value;
+ lapic_write32(LAPIC_LVT_PCINT, value);
#endif
}
@@ -464,27 +877,32 @@
struct lapic *la;
la = &lapics[lapic_id()];
- lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+ lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+ lapic_read32(LAPIC_LVT_PCINT)));
}
#endif
-int
-lapic_enable_pmc(void)
+static int
+native_lapic_enable_pmc(void)
{
#ifdef HWPMC_HOOKS
u_int32_t maxlvt;
/* Fail if the local APIC is not present. */
- if (lapic == NULL)
+ if (!x2apic_mode && lapic_map == NULL)
return (0);
/* Fail if the PMC LVT is not present. */
- maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
if (maxlvt < APIC_LVT_PMC)
return (0);
lvts[APIC_LVT_PMC].lvt_masked = 0;
+#ifdef EARLY_AP_STARTUP
+ MPASS(mp_ncpus == 1 || smp_started);
+ smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
+#else
#ifdef SMP
/*
* If hwpmc was loaded at boot time then the APs may not be
@@ -496,6 +914,7 @@
else
#endif
lapic_update_pmc(NULL);
+#endif
return (1);
#else
return (0);
@@ -502,18 +921,18 @@
#endif
}
-void
-lapic_disable_pmc(void)
+static void
+native_lapic_disable_pmc(void)
{
#ifdef HWPMC_HOOKS
u_int32_t maxlvt;
/* Fail if the local APIC is not present. */
- if (lapic == NULL)
+ if (!x2apic_mode && lapic_map == NULL)
return;
/* Fail if the PMC LVT is not present. */
- maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
if (maxlvt < APIC_LVT_PMC)
return;
@@ -527,45 +946,89 @@
#endif
}
+static void
+lapic_calibrate_initcount(struct lapic *la)
+{
+ u_long value;
+
+ /* Start off with a divisor of 2 (power on reset default). */
+ lapic_timer_divisor = 2;
+ /* Try to calibrate the local APIC timer. */
+ do {
+ lapic_timer_set_divisor(lapic_timer_divisor);
+ lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
+ DELAY(1000000);
+ value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
+ if (value != APIC_TIMER_MAX_COUNT)
+ break;
+ lapic_timer_divisor <<= 1;
+ } while (lapic_timer_divisor <= 128);
+ if (lapic_timer_divisor > 128)
+ panic("lapic: Divisor too big");
+ if (bootverbose) {
+ printf("lapic: Divisor %lu, Frequency %lu Hz\n",
+ lapic_timer_divisor, value);
+ }
+ count_freq = value;
+}
+
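
lapic_calibrate_initcount() raises the divisor until a one second DELAY() no longer exhausts the 32-bit down-counter; the surviving value is the timer frequency at that divisor and is cached in count_freq. A user-space model of the search, assuming APIC_TIMER_MAX_COUNT is 0xffffffff and using a made-up input clock:

#include <stdio.h>
#include <stdint.h>

#define APIC_TIMER_MAX_COUNT    0xffffffffUL    /* assumed 32-bit counter */

int
main(void)
{
        uint64_t bus_hz = 200000000;    /* pretend 200 MHz timer input clock */
        uint64_t divisor, value = 0;

        for (divisor = 2; divisor <= 128; divisor <<= 1) {
                /* Ticks a one second DELAY() consumes at this divisor. */
                value = bus_hz / divisor;
                if (value >= APIC_TIMER_MAX_COUNT)
                        value = APIC_TIMER_MAX_COUNT;   /* counter wrapped */
                if (value != APIC_TIMER_MAX_COUNT)
                        break;                          /* usable divisor */
        }
        if (divisor > 128)
                printf("no usable divisor\n");
        else
                printf("divisor %ju, count_freq %ju Hz\n",
                    (uintmax_t)divisor, (uintmax_t)value);
        return (0);
}
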
+static void
+lapic_calibrate_deadline(struct lapic *la __unused)
+{
+
+ if (bootverbose) {
+ printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
+ (uintmax_t)tsc_freq);
+ }
+}
+
+static void
+lapic_change_mode(struct eventtimer *et, struct lapic *la,
+ enum lat_timer_mode newmode)
+{
+
+ if (la->la_timer_mode == newmode)
+ return;
+ switch (newmode) {
+ case LAT_MODE_PERIODIC:
+ lapic_timer_set_divisor(lapic_timer_divisor);
+ et->et_frequency = count_freq;
+ break;
+ case LAT_MODE_DEADLINE:
+ et->et_frequency = tsc_freq;
+ break;
+ case LAT_MODE_ONESHOT:
+ lapic_timer_set_divisor(lapic_timer_divisor);
+ et->et_frequency = count_freq;
+ break;
+ default:
+ panic("lapic_change_mode %d", newmode);
+ }
+ la->la_timer_mode = newmode;
+ et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
+ et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
+}
+
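
lapic_change_mode() derives the eventtimer limits from the selected frequency, and lapic_et_start() converts the requested sbintime_t into hardware ticks the same way; sbintime_t is 32.32 fixed-point seconds, so ticks = (freq * period) >> 32 and the min/max periods are (ticks << 32) / freq. A short sketch of both conversions with an assumed count_freq:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t freq = 100000000;      /* pretend count_freq, Hz */
        uint64_t period_sbt, ticks, min_period, max_period;

        /* 1 millisecond as a 32.32 fixed-point sbintime_t value. */
        period_sbt = (1ULL << 32) / 1000;

        /* sbintime -> timer ticks, as in lapic_et_start(). */
        ticks = (freq * period_sbt) >> 32;      /* ~100000 ticks */

        /* tick bounds -> sbintime limits, as in lapic_change_mode(). */
        min_period = (0x00000002ULL << 32) / freq;
        max_period = (0xfffffffeULL << 32) / freq;

        printf("1 ms -> %ju timer ticks\n", (uintmax_t)ticks);
        printf("min period %ju, max period %ju (sbintime units)\n",
            (uintmax_t)min_period, (uintmax_t)max_period);
        return (0);
}
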
static int
lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
struct lapic *la;
- u_long value;
la = &lapics[PCPU_GET(apic_id)];
- if (et->et_frequency == 0) {
- /* Start off with a divisor of 2 (power on reset default). */
- lapic_timer_divisor = 2;
- /* Try to calibrate the local APIC timer. */
- do {
- lapic_timer_set_divisor(lapic_timer_divisor);
- lapic_timer_oneshot(la, APIC_TIMER_MAX_COUNT, 0);
- DELAY(1000000);
- value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer;
- if (value != APIC_TIMER_MAX_COUNT)
- break;
- lapic_timer_divisor <<= 1;
- } while (lapic_timer_divisor <= 128);
- if (lapic_timer_divisor > 128)
- panic("lapic: Divisor too big");
- if (bootverbose)
- printf("lapic: Divisor %lu, Frequency %lu Hz\n",
- lapic_timer_divisor, value);
- et->et_frequency = value;
- et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
- et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
- }
- if (la->la_timer_mode == 0)
- lapic_timer_set_divisor(lapic_timer_divisor);
if (period != 0) {
- la->la_timer_mode = 1;
- la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32;
- lapic_timer_periodic(la, la->la_timer_period, 1);
+ lapic_change_mode(et, la, LAT_MODE_PERIODIC);
+ la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
+ 32;
+ lapic_timer_periodic(la);
+ } else if (lapic_timer_tsc_deadline) {
+ lapic_change_mode(et, la, LAT_MODE_DEADLINE);
+ la->la_timer_period = (et->et_frequency * first) >> 32;
+ lapic_timer_deadline(la);
} else {
- la->la_timer_mode = 2;
- la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32;
- lapic_timer_oneshot(la, la->la_timer_period, 1);
+ lapic_change_mode(et, la, LAT_MODE_ONESHOT);
+ la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
+ 32;
+ lapic_timer_oneshot(la);
}
return (0);
}
@@ -573,34 +1036,37 @@
static int
lapic_et_stop(struct eventtimer *et)
{
- struct lapic *la = &lapics[PCPU_GET(apic_id)];
+ struct lapic *la;
- la->la_timer_mode = 0;
+ la = &lapics[PCPU_GET(apic_id)];
lapic_timer_stop(la);
+ la->la_timer_mode = LAT_MODE_UNDEF;
return (0);
}
-void
-lapic_disable(void)
+static void
+native_lapic_disable(void)
{
uint32_t value;
/* Software disable the local APIC. */
- value = lapic->svr;
+ value = lapic_read32(LAPIC_SVR);
value &= ~APIC_SVR_SWEN;
- lapic->svr = value;
+ lapic_write32(LAPIC_SVR, value);
}
static void
lapic_enable(void)
{
- u_int32_t value;
+ uint32_t value;
/* Program the spurious vector to enable the local APIC. */
- value = lapic->svr;
+ value = lapic_read32(LAPIC_SVR);
value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
- value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT);
- lapic->svr = value;
+ value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
+ if (lapic_eoi_suppression)
+ value |= APIC_SVR_EOI_SUPPRESSION;
+ lapic_write32(LAPIC_SVR, value);
}
/* Reset the local APIC on the BSP during resume. */
@@ -611,34 +1077,36 @@
lapic_setup(0);
}
-int
-lapic_id(void)
+static int
+native_lapic_id(void)
{
+ uint32_t v;
- KASSERT(lapic != NULL, ("local APIC is not mapped"));
- return (lapic->id >> APIC_ID_SHIFT);
+ KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
+ v = lapic_read32(LAPIC_ID);
+ if (!x2apic_mode)
+ v >>= APIC_ID_SHIFT;
+ return (v);
}
-int
-lapic_intr_pending(u_int vector)
+static int
+native_lapic_intr_pending(u_int vector)
{
- volatile u_int32_t *irr;
+ uint32_t irr;
/*
- * The IRR registers are an array of 128-bit registers each of
- * which only describes 32 interrupts in the low 32 bits.. Thus,
- * we divide the vector by 32 to get the 128-bit index. We then
- * multiply that index by 4 to get the equivalent index from
- * treating the IRR as an array of 32-bit registers. Finally, we
- * modulus the vector by 32 to determine the individual bit to
- * test.
+ * The IRR registers are an array of registers each of which
+ * only describes 32 interrupts in the low 32 bits. Thus, we
+ * divide the vector by 32 to get the register index.
+ * Finally, we modulus the vector by 32 to determine the
+ * individual bit to test.
*/
- irr = &lapic->irr0;
- return (irr[(vector / 32) * 4] & 1 << (vector % 32));
+ irr = lapic_read32(LAPIC_IRR0 + vector / 32);
+ return (irr & 1 << (vector % 32));
}
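
With the flat register indexing the pending test reduces to a divide and a modulo: vector v is tracked in IRR register v / 32, bit v % 32. For example vector 0x45 lands in LAPIC_IRR0 + 2, bit 5:

#include <stdio.h>

int
main(void)
{
        unsigned int vector = 0x45;     /* any vector 0..255 */

        printf("vector 0x%02x -> IRR register index %u, bit %u\n",
            vector, vector / 32, vector % 32);
        return (0);
}
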
-void
-lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+static void
+native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
{
struct lapic *la;
@@ -653,8 +1121,8 @@
la->la_cluster_id = cluster_id;
}
-int
-lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
+static int
+native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
{
if (pin > APIC_LVT_MAX)
@@ -676,8 +1144,8 @@
return (0);
}
-int
-lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
+static int
+native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
{
struct lvt *lvt;
@@ -732,8 +1200,8 @@
return (0);
}
-int
-lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
+static int
+native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
{
if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
@@ -757,8 +1225,9 @@
return (0);
}
-int
-lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger)
+static int
+native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
+ enum intr_trigger trigger)
{
if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
@@ -786,25 +1255,25 @@
* Adjust the TPR of the current CPU so that it blocks all interrupts below
* the passed in vector.
*/
-void
+static void
lapic_set_tpr(u_int vector)
{
#ifdef CHEAP_TPR
- lapic->tpr = vector;
+ lapic_write32(LAPIC_TPR, vector);
#else
- u_int32_t tpr;
+ uint32_t tpr;
- tpr = lapic->tpr & ~APIC_TPR_PRIO;
+ tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
tpr |= vector;
- lapic->tpr = tpr;
+ lapic_write32(LAPIC_TPR, tpr);
#endif
}
-void
-lapic_eoi(void)
+static void
+native_lapic_eoi(void)
{
- lapic->eoi = 0;
+ lapic_write32_nofence(LAPIC_EOI, 0);
}
void
@@ -864,48 +1333,82 @@
{
KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
- KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) /
- sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor));
- lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1];
+ KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
+ ("lapic: invalid divisor %u", divisor));
+ lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
}
static void
-lapic_timer_oneshot(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot(struct lapic *la)
{
- u_int32_t value;
+ uint32_t value;
- value = la->lvt_timer_cache;
- value &= ~APIC_LVTT_TM;
+ value = la->lvt_timer_base;
+ value &= ~(APIC_LVTT_TM | APIC_LVT_M);
value |= APIC_LVTT_TM_ONE_SHOT;
- if (enable_int)
- value &= ~APIC_LVT_M;
- lapic->lvt_timer = value;
- lapic->icr_timer = count;
+ la->lvt_timer_last = value;
+ lapic_write32(LAPIC_LVT_TIMER, value);
+ lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
}
static void
-lapic_timer_periodic(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
{
- u_int32_t value;
+ uint32_t value;
- value = la->lvt_timer_cache;
+ value = la->lvt_timer_base;
value &= ~APIC_LVTT_TM;
+ value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
+ la->lvt_timer_last = value;
+ lapic_write32(LAPIC_LVT_TIMER, value);
+ lapic_write32(LAPIC_ICR_TIMER, count);
+}
+
+static void
+lapic_timer_periodic(struct lapic *la)
+{
+ uint32_t value;
+
+ value = la->lvt_timer_base;
+ value &= ~(APIC_LVTT_TM | APIC_LVT_M);
value |= APIC_LVTT_TM_PERIODIC;
- if (enable_int)
- value &= ~APIC_LVT_M;
- lapic->lvt_timer = value;
- lapic->icr_timer = count;
+ la->lvt_timer_last = value;
+ lapic_write32(LAPIC_LVT_TIMER, value);
+ lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
}
static void
+lapic_timer_deadline(struct lapic *la)
+{
+ uint32_t value;
+
+ value = la->lvt_timer_base;
+ value &= ~(APIC_LVTT_TM | APIC_LVT_M);
+ value |= APIC_LVTT_TM_TSCDLT;
+ if (value != la->lvt_timer_last) {
+ la->lvt_timer_last = value;
+ lapic_write32_nofence(LAPIC_LVT_TIMER, value);
+ if (!x2apic_mode)
+ mfence();
+ }
+ wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
+}
+
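
In deadline mode la_timer_period holds TSC ticks rather than an APIC timer count; arming the timer is a single write of rdtsc() plus that period to MSR_TSC_DEADLINE, and lapic_timer_stop() disarms it by writing 0. A sketch of the deadline computation with assumed numbers:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t tsc_freq = 2400000000ULL;      /* pretend 2.4 GHz TSC */
        uint64_t now = 123456789ULL;            /* stand-in for rdtsc() */
        uint64_t first_sbt, period, deadline;

        /* First event requested 250 us from now (32.32 fixed point). */
        first_sbt = (1ULL << 32) / 4000;

        /* As in lapic_et_start(): sbintime -> TSC ticks. */
        period = (tsc_freq * first_sbt) >> 32;  /* ~600000 TSC ticks */

        /* As in lapic_timer_deadline(): absolute value for the MSR. */
        deadline = now + period;
        printf("arm deadline MSR with %ju (now %ju + %ju ticks)\n",
            (uintmax_t)deadline, (uintmax_t)now, (uintmax_t)period);
        return (0);
}
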
+static void
lapic_timer_stop(struct lapic *la)
{
- u_int32_t value;
+ uint32_t value;
- value = la->lvt_timer_cache;
- value &= ~APIC_LVTT_TM;
- value |= APIC_LVT_M;
- lapic->lvt_timer = value;
+ if (la->la_timer_mode == LAT_MODE_DEADLINE) {
+ wrmsr(MSR_TSC_DEADLINE, 0);
+ mfence();
+ } else {
+ value = la->lvt_timer_base;
+ value &= ~APIC_LVTT_TM;
+ value |= APIC_LVT_M;
+ la->lvt_timer_last = value;
+ lapic_write32(LAPIC_LVT_TIMER, value);
+ }
}
void
@@ -922,13 +1425,13 @@
* is called prior to lapic_setup() during boot, this just needs to unmask
* this CPU's LVT_CMCI entry.
*/
-void
-lapic_enable_cmc(void)
+static void
+native_lapic_enable_cmc(void)
{
u_int apic_id;
#ifdef DEV_ATPIC
- if (lapic == NULL)
+ if (!x2apic_mode && lapic_map == NULL)
return;
#endif
apic_id = PCPU_GET(apic_id);
@@ -940,10 +1443,41 @@
printf("lapic%u: CMCI unmasked\n", apic_id);
}
+static int
+native_lapic_enable_mca_elvt(void)
+{
+ u_int apic_id;
+ uint32_t value;
+ int elvt_count;
+
+#ifdef DEV_ATPIC
+ if (lapic_map == NULL)
+ return (-1);
+#endif
+
+ apic_id = PCPU_GET(apic_id);
+ KASSERT(lapics[apic_id].la_present,
+ ("%s: missing APIC %u", __func__, apic_id));
+ elvt_count = amd_read_elvt_count();
+ if (elvt_count <= APIC_ELVT_MCA)
+ return (-1);
+
+ value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
+ if ((value & APIC_LVT_M) == 0) {
+ printf("AMD MCE Thresholding Extended LVT is already active\n");
+ return (-1);
+ }
+ lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
+ lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
+ if (bootverbose)
+ printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
+ return (APIC_ELVT_MCA);
+}
+
void
lapic_handle_error(void)
{
- u_int32_t esr;
+ uint32_t esr;
/*
* Read the contents of the error status register. Write to
@@ -951,15 +1485,15 @@
* to update its value to indicate any errors that have
* occurred since the previous write to the register.
*/
- lapic->esr = 0;
- esr = lapic->esr;
+ lapic_write32(LAPIC_ESR, 0);
+ esr = lapic_read32(LAPIC_ESR);
printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
lapic_eoi();
}
-u_int
-apic_cpuid(u_int apic_id)
+static u_int
+native_apic_cpuid(u_int apic_id)
{
#ifdef SMP
return apic_cpuids[apic_id];
@@ -969,12 +1503,12 @@
}
/* Request a free IDT vector to be used by the specified IRQ. */
-u_int
-apic_alloc_vector(u_int apic_id, u_int irq)
+static u_int
+native_apic_alloc_vector(u_int apic_id, u_int irq)
{
u_int vector;
- KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+ KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
/*
* Search for a free vector. Currently we just use a very simple
@@ -982,7 +1516,7 @@
*/
mtx_lock_spin(&icu_lock);
for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
- if (lapics[apic_id].la_ioint_irqs[vector] != -1)
+ if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE)
continue;
lapics[apic_id].la_ioint_irqs[vector] = irq;
mtx_unlock_spin(&icu_lock);
@@ -998,8 +1532,8 @@
* aligned on a boundary of 'align'. If the request cannot be
* satisfied, 0 is returned.
*/
-u_int
-apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+static u_int
+native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
{
u_int first, run, vector;
@@ -1008,7 +1542,7 @@
KASSERT(align >= count, ("align < count"));
#ifdef INVARIANTS
for (run = 0; run < count; run++)
- KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
+ KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u",
irqs[run], run));
#endif
@@ -1022,7 +1556,7 @@
for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
/* Vector is in use, end run. */
- if (lapics[apic_id].la_ioint_irqs[vector] != -1) {
+ if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) {
run = 0;
first = 0;
continue;
@@ -1058,8 +1592,8 @@
* which do not have the vector configured would report spurious interrupts
* should it fire.
*/
-void
-apic_enable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_enable_vector(u_int apic_id, u_int vector)
{
KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1069,12 +1603,12 @@
KASSERT(vector != IDT_DTRACE_RET,
("Attempt to overwrite DTrace entry"));
#endif
- setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
- GSEL_APIC);
+ setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+ SDT_APIC, SEL_KPL, GSEL_APIC);
}
-void
-apic_disable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_disable_vector(u_int apic_id, u_int vector)
{
KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1089,13 +1623,14 @@
* We can not currently clear the idt entry because other cpus
* may have a valid vector at this offset.
*/
- setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ SEL_KPL, GSEL_APIC);
#endif
}
/* Release an APIC vector when it's no longer in use. */
-void
-apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+static void
+native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
{
struct thread *td;
@@ -1102,7 +1637,7 @@
KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
("Vector %u does not map to an IRQ line", vector));
- KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+ KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
irq, ("IRQ mismatch"));
#ifdef KDTRACE_HOOKS
@@ -1123,7 +1658,7 @@
thread_unlock(td);
}
mtx_lock_spin(&icu_lock);
- lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
+ lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE;
mtx_unlock_spin(&icu_lock);
if (!rebooting) {
thread_lock(td);
@@ -1133,7 +1668,7 @@
}
/* Map an IDT vector (APIC) to an IRQ (interrupt source). */
-u_int
+static u_int
apic_idt_to_irq(u_int apic_id, u_int vector)
{
int irq;
@@ -1174,7 +1709,7 @@
db_printf("Interrupts bound to lapic %u\n", apic_id);
for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
irq = lapics[apic_id].la_ioint_irqs[i];
- if (irq == -1 || irq == IRQ_SYSCALL)
+ if (irq == IRQ_FREE || irq == IRQ_SYSCALL)
continue;
#ifdef KDTRACE_HOOKS
if (irq == IRQ_DTRACE_RET)
@@ -1187,7 +1722,7 @@
db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
if (irq == IRQ_TIMER)
db_printf("lapic timer\n");
- else if (irq < NUM_IO_INTS) {
+ else if (irq < num_io_irqs) {
isrc = intr_lookup_source(irq);
if (isrc == NULL || verbose == 0)
db_printf("IRQ %u\n", irq);
@@ -1224,48 +1759,49 @@
uint32_t v;
db_printf("lapic ID = %d\n", lapic_id());
- v = lapic->version;
+ v = lapic_read32(LAPIC_VERSION);
db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
v & 0xf);
db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
- v = lapic->svr;
+ v = lapic_read32(LAPIC_SVR);
db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR,
v & APIC_SVR_ENABLE ? "enabled" : "disabled");
- db_printf("TPR = %02x\n", lapic->tpr);
+ db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR));
-#define dump_field(prefix, index) \
- dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index, \
+#define dump_field(prefix, regn, index) \
+ dump_mask(__XSTRING(prefix ## index), \
+ lapic_read32(LAPIC_ ## regn ## index), \
index * 32)
db_printf("In-service Interrupts:\n");
- dump_field(isr, 0);
- dump_field(isr, 1);
- dump_field(isr, 2);
- dump_field(isr, 3);
- dump_field(isr, 4);
- dump_field(isr, 5);
- dump_field(isr, 6);
- dump_field(isr, 7);
+ dump_field(isr, ISR, 0);
+ dump_field(isr, ISR, 1);
+ dump_field(isr, ISR, 2);
+ dump_field(isr, ISR, 3);
+ dump_field(isr, ISR, 4);
+ dump_field(isr, ISR, 5);
+ dump_field(isr, ISR, 6);
+ dump_field(isr, ISR, 7);
db_printf("TMR Interrupts:\n");
- dump_field(tmr, 0);
- dump_field(tmr, 1);
- dump_field(tmr, 2);
- dump_field(tmr, 3);
- dump_field(tmr, 4);
- dump_field(tmr, 5);
- dump_field(tmr, 6);
- dump_field(tmr, 7);
+ dump_field(tmr, TMR, 0);
+ dump_field(tmr, TMR, 1);
+ dump_field(tmr, TMR, 2);
+ dump_field(tmr, TMR, 3);
+ dump_field(tmr, TMR, 4);
+ dump_field(tmr, TMR, 5);
+ dump_field(tmr, TMR, 6);
+ dump_field(tmr, TMR, 7);
db_printf("IRR Interrupts:\n");
- dump_field(irr, 0);
- dump_field(irr, 1);
- dump_field(irr, 2);
- dump_field(irr, 3);
- dump_field(irr, 4);
- dump_field(irr, 5);
- dump_field(irr, 6);
- dump_field(irr, 7);
+ dump_field(irr, IRR, 0);
+ dump_field(irr, IRR, 1);
+ dump_field(irr, IRR, 2);
+ dump_field(irr, IRR, 3);
+ dump_field(irr, IRR, 4);
+ dump_field(irr, IRR, 5);
+ dump_field(irr, IRR, 6);
+ dump_field(irr, IRR, 7);
#undef dump_field
}
@@ -1391,20 +1927,18 @@
* Local APIC must be registered before other PICs and pseudo PICs
* for proper suspend/resume order.
*/
-#ifndef XEN
intr_register_pic(&lapic_pic);
-#endif
retval = best_enum->apic_setup_io();
if (retval != 0)
printf("%s: Failed to setup I/O APICs: returned %d\n",
best_enum->apic_name, retval);
-#ifdef XEN
- return;
-#endif
+
/*
- * Finish setting up the local APIC on the BSP once we know how to
- * properly program the LINT pins.
+ * Finish setting up the local APIC on the BSP once we know
+ * how to properly program the LINT pins. In particular, this
+ * enables EOI suppression, if the LAPIC supports it and the
+ * user did not disable it.
*/
lapic_setup(1);
if (bootverbose)
@@ -1411,9 +1945,13 @@
lapic_dump("BSP");
/* Enable the MSI "pic". */
- msi_init();
+ init_ops.msi_init();
+
+#ifdef XENHVM
+ xen_intr_alloc_irqs();
+#endif
}
-SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL);
+SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
#ifdef SMP
/*
@@ -1426,13 +1964,18 @@
* Wait delay microseconds for IPI to be sent. If delay is -1, we
* wait forever.
*/
-int
-lapic_ipi_wait(int delay)
+static int
+native_lapic_ipi_wait(int delay)
{
uint64_t rx;
+ /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
+ if (x2apic_mode)
+ return (1);
+
for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
- if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE)
+ if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
+ APIC_DELSTAT_IDLE)
return (1);
ia32_pause();
}
@@ -1439,33 +1982,51 @@
return (0);
}
-void
-lapic_ipi_raw(register_t icrlo, u_int dest)
+static void
+native_lapic_ipi_raw(register_t icrlo, u_int dest)
{
- register_t value, saveintr;
+ uint64_t icr;
+ uint32_t vhi, vlo;
+ register_t saveintr;
/* XXX: Need more sanity checking of icrlo? */
- KASSERT(lapic != NULL, ("%s called too early", __func__));
- KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+ KASSERT(x2apic_mode || lapic_map != NULL,
+ ("%s called too early", __func__));
+ KASSERT(x2apic_mode ||
+ (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
("%s: invalid dest field", __func__));
KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
("%s: reserved bits set in ICR LO register", __func__));
/* Set destination in ICR HI register if it is being used. */
- saveintr = intr_disable();
+ if (!x2apic_mode) {
+ saveintr = intr_disable();
+ icr = lapic_read_icr();
+ }
+
if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
- value = lapic->icr_hi;
- value &= ~APIC_ID_MASK;
- value |= dest << APIC_ID_SHIFT;
- lapic->icr_hi = value;
+ if (x2apic_mode) {
+ vhi = dest;
+ } else {
+ vhi = icr >> 32;
+ vhi &= ~APIC_ID_MASK;
+ vhi |= dest << APIC_ID_SHIFT;
+ }
+ } else {
+ vhi = 0;
}
/* Program the contents of the IPI and dispatch it. */
- value = lapic->icr_lo;
- value &= APIC_ICRLO_RESV_MASK;
- value |= icrlo;
- lapic->icr_lo = value;
- intr_restore(saveintr);
+ if (x2apic_mode) {
+ vlo = icrlo;
+ } else {
+ vlo = icr;
+ vlo &= APIC_ICRLO_RESV_MASK;
+ vlo |= icrlo;
+ }
+ lapic_write_icr(vhi, vlo);
+ if (!x2apic_mode)
+ intr_restore(saveintr);
}
#define BEFORE_SPIN 50000
@@ -1473,8 +2034,8 @@
#define AFTER_SPIN 50
#endif
-void
-lapic_ipi_vectored(u_int vector, int dest)
+static void
+native_lapic_ipi_vectored(u_int vector, int dest)
{
register_t icrlo, destfield;
@@ -1484,11 +2045,10 @@
icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;
/*
- * IPI_STOP_HARD is just a "fake" vector used to send a NMI.
- * Use special rules regard NMI if passed, otherwise specify
- * the vector.
+ * NMI IPIs are just fake vectors used to send an NMI. Use special rules
+ * regarding NMIs if passed, otherwise specify the vector.
*/
- if (vector == IPI_STOP_HARD)
+ if (vector >= IPI_NMI_FIRST)
icrlo |= APIC_DELMODE_NMI;
else
icrlo |= vector | APIC_DELMODE_FIXED;
@@ -1504,7 +2064,8 @@
icrlo |= APIC_DEST_ALLESELF;
break;
default:
- KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+ KASSERT(x2apic_mode ||
+ (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
("%s: invalid destination 0x%x", __func__, dest));
destfield = dest;
}
@@ -1541,10 +2102,70 @@
printf("APIC: IPI might be stuck\n");
#else /* !needsattention */
/* Wait until the message is sent, without a timeout. */
- while (lapic->icr_lo & APIC_DELSTAT_PEND)
+ while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
ia32_pause();
#endif /* needsattention */
}
#endif /* DETECT_DEADLOCK */
}
+
#endif /* SMP */
+
+/*
+ * Since the IDT is shared by all CPUs the IPI slot update needs to be globally
+ * visible.
+ *
+ * Consider the case where an IPI is generated immediately after allocation:
+ * vector = lapic_ipi_alloc(ipifunc);
+ * ipi_selected(other_cpus, vector);
+ *
+ * In xAPIC mode a write to ICR_LO has serializing semantics because the
+ * APIC page is mapped as an uncached region. In x2APIC mode there is an
+ * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
+ * the IDT slot update is globally visible before the IPI is delivered.
+ */
+static int
+native_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+ struct gate_descriptor *ip;
+ long func;
+ int idx, vector;
+
+ KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+ ("invalid ipifunc %p", ipifunc));
+
+ vector = -1;
+ mtx_lock_spin(&icu_lock);
+ for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
+ ip = &idt[idx];
+ func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+ if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
+ (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
+ vector = idx;
+ setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
+ break;
+ }
+ }
+ mtx_unlock_spin(&icu_lock);
+ return (vector);
+}
+
+static void
+native_lapic_ipi_free(int vector)
+{
+ struct gate_descriptor *ip;
+ long func;
+
+ KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
+ ("%s: invalid vector %d", __func__, vector));
+
+ mtx_lock_spin(&icu_lock);
+ ip = &idt[vector];
+ func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+ KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+ func != (uintptr_t)&IDTVEC(rsvd_pti),
+ ("invalid idtfunc %#lx", func));
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ SEL_KPL, GSEL_APIC);
+ mtx_unlock_spin(&icu_lock);
+}
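
native_lapic_ipi_alloc() scans the dynamic IPI range of the IDT for a slot that still points at the reserved handler and claims the first one; native_lapic_ipi_free() puts the reserved handler back. A user-space model of that scan and release; the range and table sizes are illustrative, not the kernel's:

#include <stdio.h>

#define IPI_DYN_FIRST   2       /* illustrative range, not the kernel's */
#define IPI_DYN_LAST    6
#define NIDT            8

typedef void handler_t(void);

static void rsvd_handler(void) { }      /* stands in for IDTVEC(rsvd) */
static void my_ipi_handler(void) { printf("IPI handler ran\n"); }

static handler_t *idt[NIDT];

static int
ipi_alloc(handler_t *func)
{
        int idx;

        for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
                if (idt[idx] == rsvd_handler) {
                        idt[idx] = func;        /* claim the free slot */
                        return (idx);
                }
        }
        return (-1);                            /* range exhausted */
}

static void
ipi_free(int vector)
{
        idt[vector] = rsvd_handler;             /* release the slot */
}

int
main(void)
{
        int i, vector;

        for (i = 0; i < NIDT; i++)
                idt[i] = rsvd_handler;
        vector = ipi_alloc(my_ipi_handler);
        printf("allocated vector %d\n", vector);
        idt[vector]();                          /* pretend the IPI arrived */
        ipi_free(vector);
        return (0);
}
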
Modified: trunk/sys/x86/x86/mca.c
===================================================================
--- trunk/sys/x86/x86/mca.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mca.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mca.c 314667 2017-03-04 13:03:31Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mca.c 333159 2018-05-02 07:38:38Z kib $");
#ifdef __amd64__
#define DEV_APIC
@@ -53,7 +53,7 @@
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
@@ -76,6 +76,11 @@
int max_threshold;
time_t last_intr;
};
+
+struct amd_et_state {
+ int cur_threshold;
+ time_t last_intr;
+};
#endif
struct mca_internal {
@@ -93,22 +98,20 @@
"Machine Check Architecture");
static int mca_enabled = 1;
-TUNABLE_INT("hw.mca.enabled", &mca_enabled);
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
"Administrative toggle for machine check support");
static int amd10h_L1TP = 1;
-TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
"Administrative toggle for logging of level one TLB parity (L1TP) errors");
static int intel6h_HSD131;
-TUNABLE_INT("hw.mca.intel6h_hsd131", &intel6h_HSD131);
SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
"Administrative toggle for logging of spurious corrected errors");
int workaround_erratum383;
-SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
+SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
+ &workaround_erratum383, 0,
"Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
static STAILQ_HEAD(, mca_internal) mca_freelist;
@@ -121,8 +124,18 @@
static struct mtx mca_lock;
#ifdef DEV_APIC
-static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */
+static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */
+static struct amd_et_state *amd_et_state; /* Indexed by cpuid. */
static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
+
+static int amd_elvt = -1;
+
+static inline bool
+amd_thresholding_supported(void)
+{
+ return (cpu_vendor_id == CPU_VENDOR_AMD &&
+ CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
+}
#endif
static int
@@ -511,8 +524,8 @@
STAILQ_INSERT_TAIL(&mca_records, rec, link);
mca_count++;
mtx_unlock_spin(&mca_lock);
- if (mode == CMCI)
- taskqueue_enqueue_fast(mca_tq, &mca_refill_task);
+ if (mode == CMCI && !cold)
+ taskqueue_enqueue(mca_tq, &mca_refill_task);
}
#ifdef DEV_APIC
@@ -524,19 +537,15 @@
* cmc_throttle seconds or the periodic scan. If a periodic scan
* finds that the threshold is too high, it is lowered.
*/
-static void
-cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+static int
+update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
+ int cur_threshold, int max_threshold)
{
- struct cmc_state *cc;
- uint64_t ctl;
u_int delta;
- int count, limit;
+ int limit;
- /* Fetch the current limit for this bank. */
- cc = &cmc_state[PCPU_GET(cpuid)][bank];
- ctl = rdmsr(MSR_MC_CTL2(bank));
- count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
- delta = (u_int)(time_uptime - cc->last_intr);
+ delta = (u_int)(time_uptime - last_intr);
+ limit = cur_threshold;
/*
* If an interrupt was received less than cmc_throttle seconds
@@ -545,16 +554,11 @@
* double the threshold up to the max.
*/
if (mode == CMCI && valid) {
- limit = ctl & MC_CTL2_THRESHOLD;
if (delta < cmc_throttle && count >= limit &&
- limit < cc->max_threshold) {
- limit = min(limit << 1, cc->max_threshold);
- ctl &= ~MC_CTL2_THRESHOLD;
- ctl |= limit;
- wrmsr(MSR_MC_CTL2(bank), ctl);
+ limit < max_threshold) {
+ limit = min(limit << 1, max_threshold);
}
- cc->last_intr = time_uptime;
- return;
+ return (limit);
}
/*
@@ -562,11 +566,11 @@
* should be lowered.
*/
if (mode != POLLED)
- return;
+ return (limit);
/* If a CMCI occurred recently, do nothing for now. */
if (delta < cmc_throttle)
- return;
+ return (limit);
/*
* Compute a new limit based on the average rate of events per
@@ -573,20 +577,70 @@
* cmc_throttle seconds since the last interrupt.
*/
if (valid) {
- count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
limit = count * cmc_throttle / delta;
if (limit <= 0)
limit = 1;
- else if (limit > cc->max_threshold)
- limit = cc->max_threshold;
- } else
+ else if (limit > max_threshold)
+ limit = max_threshold;
+ } else {
limit = 1;
- if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+ }
+ return (limit);
+}
+
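
update_threshold() now carries the pure throttling policy shared by the Intel CMCI and AMD thresholding paths: another interrupt within cmc_throttle seconds with the counter at the limit doubles the threshold (capped at the bank maximum), while a later poll of a quiet bank lowers it toward the observed event rate. A standalone model of that policy with an example run:

#include <stdio.h>

enum scan_mode { POLLED, MCE, CMCI };

static int cmc_throttle = 60;   /* seconds, the driver's default */

static int
update_threshold(enum scan_mode mode, int valid, long now, long last_intr,
    int count, int cur_threshold, int max_threshold)
{
        unsigned int delta = (unsigned int)(now - last_intr);
        int limit = cur_threshold;

        if (mode == CMCI && valid) {
                /* Another interrupt within the window: double the limit. */
                if (delta < (unsigned int)cmc_throttle && count >= limit &&
                    limit < max_threshold) {
                        limit <<= 1;
                        if (limit > max_threshold)
                                limit = max_threshold;
                }
                return (limit);
        }
        if (mode != POLLED || delta < (unsigned int)cmc_throttle)
                return (limit);
        /* The periodic scan saw a quiet interval: lower the limit. */
        if (valid) {
                limit = count * cmc_throttle / delta;
                if (limit <= 0)
                        limit = 1;
                else if (limit > max_threshold)
                        limit = max_threshold;
        } else {
                limit = 1;
        }
        return (limit);
}

int
main(void)
{
        int t = 1;

        /* Two CMCIs 10 s apart with the counter at the limit: 1 -> 2 -> 4. */
        t = update_threshold(CMCI, 1, 100, 90, 1, t, 32);
        t = update_threshold(CMCI, 1, 110, 100, 2, t, 32);
        printf("after bursts: threshold %d\n", t);

        /* A poll 120 s later that saw 2 events: 2 * 60 / 120 = 1. */
        t = update_threshold(POLLED, 1, 230, 110, 2, t, 32);
        printf("after quiet period: threshold %d\n", t);
        return (0);
}
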
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+ int cur_threshold, new_threshold;
+ int count;
+
+ /* Fetch the current limit for this bank. */
+ cc = &cmc_state[PCPU_GET(cpuid)][bank];
+ ctl = rdmsr(MSR_MC_CTL2(bank));
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ cur_threshold = ctl & MC_CTL2_THRESHOLD;
+
+ new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+ cur_threshold, cc->max_threshold);
+
+ if (mode == CMCI && valid)
+ cc->last_intr = time_uptime;
+ if (new_threshold != cur_threshold) {
ctl &= ~MC_CTL2_THRESHOLD;
- ctl |= limit;
+ ctl |= new_threshold;
wrmsr(MSR_MC_CTL2(bank), ctl);
}
}
+
+static void
+amd_thresholding_update(enum scan_mode mode, int bank, int valid)
+{
+ struct amd_et_state *cc;
+ uint64_t misc;
+ int new_threshold;
+ int count;
+
+ KASSERT(bank == MC_AMDNB_BANK,
+ ("%s: unexpected bank %d", __func__, bank));
+ cc = &amd_et_state[PCPU_GET(cpuid)];
+ misc = rdmsr(MSR_MC_MISC(bank));
+ count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
+ count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);
+
+ new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+ cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);
+
+ cc->cur_threshold = new_threshold;
+ misc &= ~MC_MISC_AMDNB_CNT_MASK;
+ misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+ << MC_MISC_AMDNB_CNT_SHIFT;
+ misc &= ~MC_MISC_AMDNB_OVERFLOW;
+ wrmsr(MSR_MC_MISC(bank), misc);
+ if (mode == CMCI && valid)
+ cc->last_intr = time_uptime;
+}
#endif
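
The AMD error counter in the bank's MISC register counts up and raises the configured extended LVT interrupt when it overflows, so amd_thresholding_start() programs the start value as CNT_MAX - threshold and amd_thresholding_update() recovers the number of errors seen as raw_count - (CNT_MAX - threshold). A small worked version of that encoding; the counter width used below is an assumption, kept symbolic in the real code:

#include <stdio.h>

int
main(void)
{
        int cnt_max = 4095;     /* illustrative stand-in for the field maximum */
        int threshold = 8;      /* interrupt after 8 corrected errors */
        int start, raw, seen;

        /* Programmed into the MISC register by amd_thresholding_start(). */
        start = cnt_max - threshold;

        /* Pretend the hardware has since counted five errors. */
        raw = start + 5;

        /* Read back: errors observed since the counter was armed. */
        seen = raw - (cnt_max - threshold);
        printf("start %d, raw %d -> %d errors seen of threshold %d\n",
            start, raw, seen, threshold);
        return (0);
}
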
/*
@@ -600,7 +654,7 @@
* count of the number of valid MC records found.
*/
static int
-mca_scan(enum scan_mode mode)
+mca_scan(enum scan_mode mode, int *recoverablep)
{
struct mca_record rec;
uint64_t mcg_cap, ucmask;
@@ -641,13 +695,19 @@
* If this is a bank this CPU monitors via CMCI,
* update the threshold.
*/
- if (PCPU_GET(cmci_mask) & 1 << i)
- cmci_update(mode, i, valid, &rec);
+ if (PCPU_GET(cmci_mask) & 1 << i) {
+ if (cmc_state != NULL)
+ cmci_update(mode, i, valid, &rec);
+ else
+ amd_thresholding_update(mode, i, valid);
+ }
#endif
}
if (mode == POLLED)
mca_fill_freelist();
- return (mode == MCE ? recoverable : count);
+ if (recoverablep != NULL)
+ *recoverablep = recoverable;
+ return (count);
}
/*
@@ -669,7 +729,7 @@
CPU_FOREACH(cpu) {
sched_bind(td, cpu);
thread_unlock(td);
- count += mca_scan(POLLED);
+ count += mca_scan(POLLED, NULL);
thread_lock(td);
sched_unbind(td);
}
@@ -690,7 +750,7 @@
mca_periodic_scan(void *arg)
{
- taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+ taskqueue_enqueue(mca_tq, &mca_scan_task);
callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
@@ -704,7 +764,7 @@
if (error)
return (error);
if (i)
- taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+ taskqueue_enqueue(mca_tq, &mca_scan_task);
return (0);
}
@@ -717,6 +777,9 @@
mca_tq = taskqueue_create_fast("mca", M_WAITOK,
taskqueue_thread_enqueue, &mca_tq);
taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
+
+ /* CMCIs during boot may have claimed items from the freelist. */
+ mca_fill_freelist();
}
SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
@@ -729,7 +792,11 @@
callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
}
+#ifdef EARLY_AP_STARTUP
+SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
+#else
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
+#endif
#ifdef DEV_APIC
static void
@@ -747,6 +814,18 @@
&cmc_throttle, 0, sysctl_positive_int, "I",
"Interval in seconds to throttle corrected MC interrupts");
}
+
+static void
+amd_thresholding_setup(void)
+{
+
+ amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
+ M_MCA, M_WAITOK | M_ZERO);
+ SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+ "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &cmc_throttle, 0, sysctl_positive_int, "I",
+ "Interval in seconds to throttle corrected MC interrupts");
+}
#endif
static void
@@ -785,6 +864,8 @@
#ifdef DEV_APIC
if (mcg_cap & MCG_CAP_CMCI_P)
cmci_setup();
+ else if (amd_thresholding_supported())
+ amd_thresholding_setup();
#endif
}
@@ -859,6 +940,82 @@
ctl |= MC_CTL2_CMCI_EN | 1;
wrmsr(MSR_MC_CTL2(i), ctl);
}
+
+static void
+amd_thresholding_start(struct amd_et_state *cc)
+{
+ uint64_t misc;
+
+ KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
+ misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+ misc &= ~MC_MISC_AMDNB_INT_MASK;
+ misc |= MC_MISC_AMDNB_INT_LVT;
+ misc &= ~MC_MISC_AMDNB_LVT_MASK;
+ misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
+ misc &= ~MC_MISC_AMDNB_CNT_MASK;
+ misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+ << MC_MISC_AMDNB_CNT_SHIFT;
+ misc &= ~MC_MISC_AMDNB_OVERFLOW;
+ misc |= MC_MISC_AMDNB_CNTEN;
+
+ wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
+}
+
+static void
+amd_thresholding_init(void)
+{
+ struct amd_et_state *cc;
+ uint64_t misc;
+
+ /* The counter must be valid and present. */
+ misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+ if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
+ (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
+ return;
+
+ /* The register should not be locked. */
+ if ((misc & MC_MISC_AMDNB_LOCK) != 0)
+ return;
+
+ /*
+ * If the counter is enabled then either the firmware or another CPU
+ * has already claimed it.
+ */
+ if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
+ return;
+
+ /*
+ * Configure an Extended Interrupt LVT register for reporting
+ * counter overflows if that feature is supported and the first
+ * extended register is available.
+ */
+ amd_elvt = lapic_enable_mca_elvt();
+ if (amd_elvt < 0)
+ return;
+
+ /* Re-use Intel CMC support infrastructure. */
+ cc = &amd_et_state[PCPU_GET(cpuid)];
+ cc->cur_threshold = 1;
+ amd_thresholding_start(cc);
+
+ /* Mark the NB bank as monitored. */
+ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
+}
+
+static void
+amd_thresholding_resume(void)
+{
+ struct amd_et_state *cc;
+
+ /* Nothing to do if this CPU doesn't monitor the NB bank. */
+ if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
+ return;
+
+ cc = &amd_et_state[PCPU_GET(cpuid)];
+ cc->last_intr = 0;
+ cc->cur_threshold = 1;
+ amd_thresholding_start(cc);
+}
#endif
/*
@@ -884,7 +1041,7 @@
if (mcg_cap & MCG_CAP_CTL_P)
/* Enable MCA features. */
wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
- if (PCPU_GET(cpuid) == 0 && boot)
+ if (IS_BSP() && boot)
mca_setup(mcg_cap);
/*
@@ -900,6 +1057,14 @@
if ((mask & (1UL << 5)) == 0)
wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
}
+
+ /*
+ * The cmci_monitor() must not be executed
+ * simultaneously by several CPUs.
+ */
+ if (boot)
+ mtx_lock_spin(&mca_lock);
+
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
/* By default enable logging of all errors. */
ctl = 0xffffffffffffffffUL;
@@ -934,10 +1099,30 @@
/* Clear all errors. */
wrmsr(MSR_MC_STATUS(i), 0);
}
+ if (boot)
+ mtx_unlock_spin(&mca_lock);
#ifdef DEV_APIC
- if (PCPU_GET(cmci_mask) != 0 && boot)
+ /*
+ * AMD Processors from families 10h - 16h provide support
+ * for Machine Check Error Thresholding.
+ * The processors support counters of MC errors and they
+ * can be configured to generate an interrupt when a counter
+ * overflows.
+ * The counters are all associated with Bank 4 and each
+ * of them covers a group of errors reported via that bank.
+ * At the moment only the DRAM Error Threshold Group is
+ * supported.
+ */
+ if (amd_thresholding_supported() &&
+ (mcg_cap & MCG_CAP_COUNT) >= 4) {
+ if (boot)
+ amd_thresholding_init();
+ else
+ amd_thresholding_resume();
+ } else if (PCPU_GET(cmci_mask) != 0 && boot) {
lapic_enable_cmc();
+ }
#endif
}
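
To make the new AMD thresholding path concrete: a minimal userland sketch (not part of the diff) of how amd_thresholding_start() above turns the per-CPU threshold into the counter preload. The 12-bit field width and the bit position are assumptions taken from the AMD BKDGs, not values confirmed by this commit.

/*
 * Illustration only: with an assumed 12-bit count field
 * (MC_MISC_AMDNB_CNT_MAX == 0xfff) at an assumed bit position 32,
 * a threshold of 1 preloads the counter one step below the maximum,
 * so the very next corrected error overflows it and raises the
 * thresholding interrupt.
 */
#include <stdint.h>
#include <stdio.h>

#define CNT_MAX   0xfffu        /* assumed width of the count field */
#define CNT_SHIFT 32            /* assumed position within MC_MISC */

int
main(void)
{
        unsigned int threshold = 1;
        uint64_t cnt_bits = (uint64_t)(CNT_MAX - threshold) << CNT_SHIFT;

        printf("threshold %u -> preload %#x (MISC field bits %#jx)\n",
            threshold, CNT_MAX - threshold, (uintmax_t)cnt_bits);
        return (0);
}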
@@ -978,7 +1163,7 @@
mca_intr(void)
{
uint64_t mcg_status;
- int old_count, recoverable;
+ int recoverable, count;
if (!(cpu_feature & CPUID_MCA)) {
/*
@@ -992,8 +1177,7 @@
}
/* Scan the banks and check for any non-recoverable errors. */
- old_count = mca_count;
- recoverable = mca_scan(MCE);
+ count = mca_scan(MCE, &recoverable);
mcg_status = rdmsr(MSR_MCG_STATUS);
if (!(mcg_status & MCG_STATUS_RIPV))
recoverable = 0;
@@ -1000,12 +1184,11 @@
if (!recoverable) {
/*
- * Wait for at least one error to be logged before
- * panic'ing. Some errors will assert a machine check
- * on all CPUs, but only certain CPUs will find a valid
- * bank to log.
+ * Only panic if the error was detected local to this CPU.
+ * Some errors will assert a machine check on all CPUs, but
+ * only certain CPUs will find a valid bank to log.
*/
- while (mca_count == old_count)
+ while (count == 0)
cpu_spinwait();
panic("Unrecoverable machine check exception");
@@ -1027,7 +1210,7 @@
* Serialize MCA bank scanning to prevent collisions from
* sibling threads.
*/
- count = mca_scan(CMCI);
+ count = mca_scan(CMCI, NULL);
/* If we found anything, log them to the console. */
if (count != 0) {
Added: trunk/sys/x86/x86/mp_watchdog.c
===================================================================
--- trunk/sys/x86/x86/mp_watchdog.c (rev 0)
+++ trunk/sys/x86/x86/mp_watchdog.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,211 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2004 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/x86/mp_watchdog.c 303912 2016-08-10 13:38:44Z kib $
+ */
+
+#include "opt_mp_watchdog.h"
+#include "opt_sched.h"
+
+#ifdef SCHED_ULE
+#error MP_WATCHDOG cannot currently be used with SCHED_ULE
+#endif
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+#include <machine/mp_watchdog.h>
+
+/*
+ * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work
+ * from being scheduled there, and uses it as a "watchdog" to detect kernel
+ * failure on other CPUs. This is made reasonable by inclusion of logical
+ * processors in Xeon hardware. The watchdog is configured by setting the
+ * debug.watchdog sysctl/tunable to the CPU of interest. A callout will then
+ * begin executing, resetting a timer that is gradually lowered by the watching
+ * thread. If the timer reaches 0, the watchdog fires by either dropping
+ * directly to the debugger, or by sending an NMI IPI to the boot processor.
+ * This is a somewhat less efficient substitute for dedicated watchdog
+ * hardware, but can be quite an effective tool for debugging hangs.
+ *
+ * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but
+ * doesn't yet.
+ */
+static int watchdog_cpu = -1;
+static int watchdog_dontfire = 1;
+static int watchdog_timer = -1;
+static int watchdog_nmi = 1;
+
+SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RWTUN, &watchdog_nmi, 0,
+ "IPI the boot processor with an NMI to enter the debugger");
+
+static struct callout watchdog_callout;
+
+static void watchdog_change(int wdcpu);
+
+/*
+ * Number of seconds before the watchdog will fire if the callout fails to
+ * reset the timer.
+ */
+#define WATCHDOG_THRESHOLD 10
+
+static void
+watchdog_init(void *arg)
+{
+
+ callout_init(&watchdog_callout, 1);
+ if (watchdog_cpu != -1)
+ watchdog_change(watchdog_cpu);
+}
+
+/*
+ * This callout resets a timer until the watchdog kicks in. It acquires some
+ * critical locks to make sure things haven't gotten wedged with those locks
+ * held.
+ */
+static void
+watchdog_function(void *arg)
+{
+
+ /*
+ * Since the timer ran, we must not be wedged. Acquire some critical
+ * locks to make sure. Then reset the timer.
+ */
+ mtx_lock(&Giant);
+ watchdog_timer = WATCHDOG_THRESHOLD;
+ mtx_unlock(&Giant);
+ callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL);
+}
+SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL);
+
+static void
+watchdog_change(int wdcpu)
+{
+
+ if (wdcpu == -1 || wdcpu == 0xffffffff) {
+ /*
+ * Disable the watchdog.
+ */
+ watchdog_cpu = -1;
+ watchdog_dontfire = 1;
+ callout_stop(&watchdog_callout);
+ printf("watchdog stopped\n");
+ } else {
+ watchdog_timer = WATCHDOG_THRESHOLD;
+ watchdog_dontfire = 0;
+ watchdog_cpu = wdcpu;
+ callout_reset(&watchdog_callout, 1 * hz, watchdog_function,
+ NULL);
+ }
+}
+
+/*
+ * This sysctl sets which CPU is the watchdog CPU. Set to -1 or 0xffffffff
+ * to disable the watchdog.
+ */
+static int
+sysctl_watchdog(SYSCTL_HANDLER_ARGS)
+{
+ int error, temp;
+
+ temp = watchdog_cpu;
+ error = sysctl_handle_int(oidp, &temp, 0, req);
+ if (error)
+ return (error);
+
+ if (req->newptr != NULL)
+ watchdog_change(temp);
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_watchdog, "I", "");
+
+/*
+ * Drop into the debugger by sending an IPI NMI to the boot processor.
+ */
+static void
+watchdog_ipi_nmi(void)
+{
+
+ /*
+ * Deliver NMI to the boot processor. Why not?
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI,
+ boot_cpu_id);
+ lapic_ipi_wait(-1);
+}
+
+/*
+ * ap_watchdog() is called by the SMP idle loop code. It works on the same
+ * premise that the disabling of logical processors does: that if the cpu is
+ * idle, then it can ignore the world from then on, as nothing will be
+ * scheduled on it. Leaving aside multi-runqueue schedulers (SCHED_ULE) and
+ * explicit process migration (sched_bind()), this is not an unreasonable
+ * assumption.
+ */
+void
+ap_watchdog(u_int cpuid)
+{
+ char old_pcomm[MAXCOMLEN + 1];
+ struct proc *p;
+
+ if (watchdog_cpu != cpuid)
+ return;
+
+ printf("watchdog started on cpu %d\n", cpuid);
+ p = curproc;
+ bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1);
+ snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid);
+ while (1) {
+ DELAY(1000000); /* One second. */
+ if (watchdog_cpu != cpuid)
+ break;
+ atomic_subtract_int(&watchdog_timer, 1);
+ if (watchdog_timer < 4)
+ printf("Watchdog timer: %d\n", watchdog_timer);
+ if (watchdog_timer == 0 && watchdog_dontfire == 0) {
+ printf("Watchdog firing!\n");
+ watchdog_dontfire = 1;
+ if (watchdog_nmi)
+ watchdog_ipi_nmi();
+ else
+ kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog");
+ }
+ }
+ bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1);
+ printf("watchdog stopped on cpu %d\n", cpuid);
+}
Property changes on: trunk/sys/x86/x86/mp_watchdog.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/mp_x86.c
===================================================================
--- trunk/sys/x86/x86/mp_x86.c (rev 0)
+++ trunk/sys/x86/x86/mp_x86.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1640 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * Copyright (c) 2003, by Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 349958 2019-07-12 22:31:12Z jhb $");
+
+#ifdef __i386__
+#include "opt_apic.h"
+#endif
+#include "opt_cpu.h"
+#include "opt_kstack_pages.h"
+#include "opt_pmap.h"
+#include "opt_sched.h"
+#include "opt_smp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cons.h> /* cngetc() */
+#include <sys/cpuset.h>
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
+#include <x86/apicreg.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <x86/ucode.h>
+
+/* lock region used by kernel profiling */
+int mcount_lock;
+
+int mp_naps; /* # of Application processors */
+int boot_cpu_id = -1; /* designated BSP */
+
+extern struct pcpu __pcpu[];
+
+/* AP uses this during bootstrap. Do not staticize. */
+char *bootSTK;
+int bootAP;
+
+/* Free these after use */
+void *bootstacks[MAXCPU];
+void *dpcpu;
+
+struct pcb stoppcbs[MAXCPU];
+struct susppcb **susppcbs;
+
+#ifdef COUNT_IPIS
+/* Interrupt counts. */
+static u_long *ipi_preempt_counts[MAXCPU];
+static u_long *ipi_ast_counts[MAXCPU];
+u_long *ipi_invltlb_counts[MAXCPU];
+u_long *ipi_invlrng_counts[MAXCPU];
+u_long *ipi_invlpg_counts[MAXCPU];
+u_long *ipi_invlcache_counts[MAXCPU];
+u_long *ipi_rendezvous_counts[MAXCPU];
+static u_long *ipi_hardclock_counts[MAXCPU];
+#endif
+
+/* Default cpu_ops implementation. */
+struct cpu_ops cpu_ops;
+
+/*
+ * Local data and functions.
+ */
+
+static volatile cpuset_t ipi_stop_nmi_pending;
+
+volatile cpuset_t resuming_cpus;
+volatile cpuset_t toresume_cpus;
+
+/* used to hold the APs until we are ready to release them */
+struct mtx ap_boot_mtx;
+
+/* Set to 1 once we're ready to let the APs out of the pen. */
+volatile int aps_ready = 0;
+
+/*
+ * Store data from cpu_add() until later in the boot when we actually setup
+ * the APs.
+ */
+struct cpu_info cpu_info[MAX_APIC_ID + 1];
+int apic_cpuids[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
+
+/* Holds pending bitmap based IPIs per CPU */
+volatile u_int cpu_ipi_pending[MAXCPU];
+
+static void release_aps(void *dummy);
+static void cpustop_handler_post(u_int cpu);
+
+static int hyperthreading_allowed = 1;
+SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
+ &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
+
+static struct topo_node topo_root;
+
+static int pkg_id_shift;
+static int core_id_shift;
+static int disabled_cpus;
+
+struct cache_info {
+ int id_shift;
+ int present;
+} static caches[MAX_CACHE_LEVELS];
+
+void
+mem_range_AP_init(void)
+{
+
+ if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
+ mem_range_softc.mr_op->initAP(&mem_range_softc);
+}
+
+/*
+ * Round up to the next power of two, if necessary, and then
+ * take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+mask_width(u_int x)
+{
+
+ return (fls(x << (1 - powerof2(x))) - 1);
+}
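
Since mask_width() is the building block for every shift computed below, a standalone sketch of its rounding behaviour may help; it assumes the BSD fls() from <strings.h> and supplies a local powerof2().

/*
 * Round the argument up to a power of two, then take log2:
 * 6 rounds up to 8, so mask_width(6) == 3; exact powers of two are
 * left alone.  mask_width(0) would return -1.
 */
#include <assert.h>
#include <strings.h>

#define powerof2(x)     ((((x) - 1) & (x)) == 0)

static int
mask_width(unsigned int x)
{

        return (fls(x << (1 - powerof2(x))) - 1);
}

int
main(void)
{

        assert(mask_width(1) == 0);
        assert(mask_width(2) == 1);
        assert(mask_width(6) == 3);     /* rounds up to 8 */
        assert(mask_width(8) == 3);
        return (0);
}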
+
+/*
+ * Add a cache level to the cache topology description.
+ */
+static int
+add_deterministic_cache(int type, int level, int share_count)
+{
+
+ if (type == 0)
+ return (0);
+ if (type > 3) {
+ printf("unexpected cache type %d\n", type);
+ return (1);
+ }
+ if (type == 2) /* ignore instruction cache */
+ return (1);
+ if (level == 0 || level > MAX_CACHE_LEVELS) {
+ printf("unexpected cache level %d\n", type);
+ return (1);
+ }
+
+ if (caches[level - 1].present) {
+ printf("WARNING: multiple entries for L%u data cache\n", level);
+ printf("%u => %u\n", caches[level - 1].id_shift,
+ mask_width(share_count));
+ }
+ caches[level - 1].id_shift = mask_width(share_count);
+ caches[level - 1].present = 1;
+
+ if (caches[level - 1].id_shift > pkg_id_shift) {
+ printf("WARNING: L%u data cache covers more "
+ "APIC IDs than a package\n", level);
+ printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
+ caches[level - 1].id_shift = pkg_id_shift;
+ }
+ if (caches[level - 1].id_shift < core_id_shift) {
+ printf("WARNING: L%u data cache covers less "
+ "APIC IDs than a core\n", level);
+ printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
+ caches[level - 1].id_shift = core_id_shift;
+ }
+
+ return (1);
+}
+
+/*
+ * Determine topology of processing units and caches for AMD CPUs.
+ * See:
+ * - AMD CPUID Specification (Publication # 25481)
+ * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
+ * - BKDG For AMD Family 10h Processors (Publication # 31116)
+ * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
+ * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
+ * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
+ */
+static void
+topo_probe_amd(void)
+{
+ u_int p[4];
+ uint64_t v;
+ int level;
+ int nodes_per_socket;
+ int share_count;
+ int type;
+ int i;
+
+ /* No multi-core capability. */
+ if ((amd_feature2 & AMDID2_CMP) == 0)
+ return;
+
+ /* For families 10h and newer. */
+ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
+ AMDID_COREID_SIZE_SHIFT;
+
+ /* For 0Fh family. */
+ if (pkg_id_shift == 0)
+ pkg_id_shift =
+ mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
+
+ /*
+ * Families prior to 16h define the following value as
+ * cores per compute unit and we don't really care about the AMD
+ * compute units at the moment. Perhaps we should treat them as
+ * cores and cores within the compute units as hardware threads,
+ * but that's up for debate.
+ * Later families define the value as threads per compute unit,
+ * so we are following AMD's nomenclature here.
+ */
+ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
+ CPUID_TO_FAMILY(cpu_id) >= 0x16) {
+ cpuid_count(0x8000001e, 0, p);
+ share_count = ((p[1] >> 8) & 0xff) + 1;
+ core_id_shift = mask_width(share_count);
+ }
+
+ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
+ for (i = 0; ; i++) {
+ cpuid_count(0x8000001d, i, p);
+ type = p[0] & 0x1f;
+ level = (p[0] >> 5) & 0x7;
+ share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+ if (!add_deterministic_cache(type, level, share_count))
+ break;
+ }
+ } else {
+ if (cpu_exthigh >= 0x80000005) {
+ cpuid_count(0x80000005, 0, p);
+ if (((p[2] >> 24) & 0xff) != 0) {
+ caches[0].id_shift = 0;
+ caches[0].present = 1;
+ }
+ }
+ if (cpu_exthigh >= 0x80000006) {
+ cpuid_count(0x80000006, 0, p);
+ if (((p[2] >> 16) & 0xffff) != 0) {
+ caches[1].id_shift = 0;
+ caches[1].present = 1;
+ }
+ if (((p[3] >> 18) & 0x3fff) != 0) {
+ nodes_per_socket = 1;
+ if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
+ /*
+ * Handle multi-node processors that
+ * have multiple chips, each with its
+ * own L3 cache, on the same die.
+ */
+ v = rdmsr(0xc001100c);
+ nodes_per_socket = 1 + ((v >> 3) & 0x7);
+ }
+ caches[2].id_shift =
+ pkg_id_shift - mask_width(nodes_per_socket);
+ caches[2].present = 1;
+ }
+ }
+ }
+}
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 1 and Leaf 4, if supported.
+ * See:
+ * - Intel 64 Architecture Processor Topology Enumeration
+ * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
+ * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0x4(void)
+{
+ u_int p[4];
+ int max_cores;
+ int max_logical;
+
+ /* Both zero and one here mean one logical processor per package. */
+ max_logical = (cpu_feature & CPUID_HTT) != 0 ?
+ (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
+ if (max_logical <= 1)
+ return;
+
+ if (cpu_high >= 0x4) {
+ cpuid_count(0x04, 0, p);
+ max_cores = ((p[0] >> 26) & 0x3f) + 1;
+ } else
+ max_cores = 1;
+
+ core_id_shift = mask_width(max_logical/max_cores);
+ KASSERT(core_id_shift >= 0,
+ ("intel topo: max_cores > max_logical\n"));
+ pkg_id_shift = core_id_shift + mask_width(max_cores);
+}
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 11, if supported.
+ * See:
+ * - Intel 64 Architecture Processor Topology Enumeration
+ * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
+ * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0xb(void)
+{
+ u_int p[4];
+ int bits;
+ int type;
+ int i;
+
+ /* Fall back if CPU leaf 11 doesn't really exist. */
+ cpuid_count(0x0b, 0, p);
+ if (p[1] == 0) {
+ topo_probe_intel_0x4();
+ return;
+ }
+
+ /* We only support three levels for now. */
+ for (i = 0; ; i++) {
+ cpuid_count(0x0b, i, p);
+
+ bits = p[0] & 0x1f;
+ type = (p[2] >> 8) & 0xff;
+
+ if (type == 0)
+ break;
+
+ /* TODO: check for duplicate (re-)assignment */
+ if (type == CPUID_TYPE_SMT)
+ core_id_shift = bits;
+ else if (type == CPUID_TYPE_CORE)
+ pkg_id_shift = bits;
+ else
+ printf("unknown CPU level type %d\n", type);
+ }
+
+ if (pkg_id_shift < core_id_shift) {
+ printf("WARNING: core covers more APIC IDs than a package\n");
+ core_id_shift = pkg_id_shift;
+ }
+}
+
+/*
+ * Determine topology of caches for Intel CPUs.
+ * See:
+ * - Intel 64 Architecture Processor Topology Enumeration
+ * - Intel 64 and IA-32 Architectures Software Developer’s Manual
+ * Volume 2A: Instruction Set Reference, A-M,
+ * CPUID instruction
+ */
+static void
+topo_probe_intel_caches(void)
+{
+ u_int p[4];
+ int level;
+ int share_count;
+ int type;
+ int i;
+
+ if (cpu_high < 0x4) {
+ /*
+ * Available cache level and sizes can be determined
+ * via CPUID leaf 2, but that requires a huge table of hardcoded
+ * values, so for now just assume L1 and L2 caches potentially
+ * shared only by HTT processing units, if HTT is present.
+ */
+ caches[0].id_shift = pkg_id_shift;
+ caches[0].present = 1;
+ caches[1].id_shift = pkg_id_shift;
+ caches[1].present = 1;
+ return;
+ }
+
+ for (i = 0; ; i++) {
+ cpuid_count(0x4, i, p);
+ type = p[0] & 0x1f;
+ level = (p[0] >> 5) & 0x7;
+ share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+ if (!add_deterministic_cache(type, level, share_count))
+ break;
+ }
+}
+
+/*
+ * Determine topology of processing units and caches for Intel CPUs.
+ * See:
+ * - Intel 64 Architecture Processor Topology Enumeration
+ */
+static void
+topo_probe_intel(void)
+{
+
+ /*
+ * Note that 0x1 <= cpu_high < 4 case should be
+ * compatible with topo_probe_intel_0x4() logic when
+ * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
+ * or it should trigger the fallback otherwise.
+ */
+ if (cpu_high >= 0xb)
+ topo_probe_intel_0xb();
+ else if (cpu_high >= 0x1)
+ topo_probe_intel_0x4();
+
+ topo_probe_intel_caches();
+}
+
+/*
+ * Topology information is queried only on BSP, on which this
+ * code runs and for which it can query CPUID information.
+ * Then topology is extrapolated on all packages using an
+ * assumption that APIC ID to hardware component ID mapping is
+ * homogeneous.
+ * That doesn't necessarily imply that the topology is uniform.
+ */
+void
+topo_probe(void)
+{
+ static int cpu_topo_probed = 0;
+ struct x86_topo_layer {
+ int type;
+ int subtype;
+ int id_shift;
+ } topo_layers[MAX_CACHE_LEVELS + 3];
+ struct topo_node *parent;
+ struct topo_node *node;
+ int layer;
+ int nlayers;
+ int node_id;
+ int i;
+
+ if (cpu_topo_probed)
+ return;
+
+ CPU_ZERO(&logical_cpus_mask);
+
+ if (mp_ncpus <= 1)
+ ; /* nothing */
+ else if (cpu_vendor_id == CPU_VENDOR_AMD)
+ topo_probe_amd();
+ else if (cpu_vendor_id == CPU_VENDOR_INTEL)
+ topo_probe_intel();
+
+ KASSERT(pkg_id_shift >= core_id_shift,
+ ("bug in APIC topology discovery"));
+
+ nlayers = 0;
+ bzero(topo_layers, sizeof(topo_layers));
+
+ topo_layers[nlayers].type = TOPO_TYPE_PKG;
+ topo_layers[nlayers].id_shift = pkg_id_shift;
+ if (bootverbose)
+ printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
+ nlayers++;
+
+ /*
+ * Consider all caches to be within a package/chip
+ * and "in front" of all sub-components like
+ * cores and hardware threads.
+ */
+ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
+ if (caches[i].present) {
+ KASSERT(caches[i].id_shift <= pkg_id_shift,
+ ("bug in APIC topology discovery"));
+ KASSERT(caches[i].id_shift >= core_id_shift,
+ ("bug in APIC topology discovery"));
+
+ topo_layers[nlayers].type = TOPO_TYPE_CACHE;
+ topo_layers[nlayers].subtype = i + 1;
+ topo_layers[nlayers].id_shift = caches[i].id_shift;
+ if (bootverbose)
+ printf("L%u cache ID shift: %u\n",
+ topo_layers[nlayers].subtype,
+ topo_layers[nlayers].id_shift);
+ nlayers++;
+ }
+ }
+
+ if (pkg_id_shift > core_id_shift) {
+ topo_layers[nlayers].type = TOPO_TYPE_CORE;
+ topo_layers[nlayers].id_shift = core_id_shift;
+ if (bootverbose)
+ printf("Core ID shift: %u\n",
+ topo_layers[nlayers].id_shift);
+ nlayers++;
+ }
+
+ topo_layers[nlayers].type = TOPO_TYPE_PU;
+ topo_layers[nlayers].id_shift = 0;
+ nlayers++;
+
+ topo_init_root(&topo_root);
+ for (i = 0; i <= MAX_APIC_ID; ++i) {
+ if (!cpu_info[i].cpu_present)
+ continue;
+
+ parent = &topo_root;
+ for (layer = 0; layer < nlayers; ++layer) {
+ node_id = i >> topo_layers[layer].id_shift;
+ parent = topo_add_node_by_hwid(parent, node_id,
+ topo_layers[layer].type,
+ topo_layers[layer].subtype);
+ }
+ }
+
+ parent = &topo_root;
+ for (layer = 0; layer < nlayers; ++layer) {
+ node_id = boot_cpu_id >> topo_layers[layer].id_shift;
+ node = topo_find_node_by_hwid(parent, node_id,
+ topo_layers[layer].type,
+ topo_layers[layer].subtype);
+ topo_promote_child(node);
+ parent = node;
+ }
+
+ cpu_topo_probed = 1;
+}
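
A short illustration of what the id_shift values computed above mean when topo_add_node_by_hwid() walks the layers: each layer's node ID is just the APIC ID shifted right by that layer's shift. The shift values here are hypothetical (2 threads per core, 8 cores per package), not read from any particular CPU.

#include <stdio.h>

int
main(void)
{
        unsigned int apic_id = 0x1d;            /* hypothetical APIC ID */
        unsigned int core_id_shift = 1;         /* 2 threads per core */
        unsigned int pkg_id_shift = 4;          /* 8 cores per package */

        printf("APIC ID %#x: pkg-layer node %u, core-layer node %u, "
            "PU-layer node %u\n", apic_id,
            apic_id >> pkg_id_shift,
            apic_id >> core_id_shift,
            apic_id >> 0);
        return (0);
}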
+
+/*
+ * Assign logical CPU IDs to local APICs.
+ */
+void
+assign_cpu_ids(void)
+{
+ struct topo_node *node;
+ u_int smt_mask;
+
+ smt_mask = (1u << core_id_shift) - 1;
+
+ /*
+ * Assign CPU IDs to local APIC IDs and disable any CPUs
+ * beyond MAXCPU. CPU 0 is always assigned to the BSP.
+ */
+ mp_ncpus = 0;
+ TOPO_FOREACH(node, &topo_root) {
+ if (node->type != TOPO_TYPE_PU)
+ continue;
+
+ if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
+ cpu_info[node->hwid].cpu_hyperthread = 1;
+
+ if (resource_disabled("lapic", node->hwid)) {
+ if (node->hwid != boot_cpu_id)
+ cpu_info[node->hwid].cpu_disabled = 1;
+ else
+ printf("Cannot disable BSP, APIC ID = %d\n",
+ node->hwid);
+ }
+
+ if (!hyperthreading_allowed &&
+ cpu_info[node->hwid].cpu_hyperthread)
+ cpu_info[node->hwid].cpu_disabled = 1;
+
+ if (mp_ncpus >= MAXCPU)
+ cpu_info[node->hwid].cpu_disabled = 1;
+
+ if (cpu_info[node->hwid].cpu_disabled) {
+ disabled_cpus++;
+ continue;
+ }
+
+ cpu_apic_ids[mp_ncpus] = node->hwid;
+ apic_cpuids[node->hwid] = mp_ncpus;
+ topo_set_pu_id(node, mp_ncpus);
+ mp_ncpus++;
+ }
+
+ KASSERT(mp_maxid >= mp_ncpus - 1,
+ ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+ mp_ncpus));
+}
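
The hyperthread test above only compares the SMT bits of each APIC ID against the BSP's. A sketch with hypothetical IDs (core_id_shift = 1, BSP at APIC ID 0x10):

#include <stdio.h>

int
main(void)
{
        unsigned int core_id_shift = 1;
        unsigned int smt_mask = (1u << core_id_shift) - 1;
        unsigned int boot_cpu_id = 0x10;        /* BSP on thread 0 */
        unsigned int hwid;

        for (hwid = 0x10; hwid <= 0x13; hwid++)
                printf("APIC ID %#x: %s\n", hwid,
                    (hwid & smt_mask) != (boot_cpu_id & smt_mask) ?
                    "hyperthread" : "primary thread");
        return (0);
}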
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+ struct topo_node *node;
+ const char *hyperthread;
+ int pkg_count;
+ int cores_per_pkg;
+ int thrs_per_core;
+
+ printf("FreeBSD/SMP: ");
+ if (topo_analyze(&topo_root, 1, &pkg_count,
+ &cores_per_pkg, &thrs_per_core)) {
+ printf("%d package(s)", pkg_count);
+ if (cores_per_pkg > 0)
+ printf(" x %d core(s)", cores_per_pkg);
+ if (thrs_per_core > 1)
+ printf(" x %d hardware threads", thrs_per_core);
+ } else {
+ printf("Non-uniform topology");
+ }
+ printf("\n");
+
+ if (disabled_cpus) {
+ printf("FreeBSD/SMP Online: ");
+ if (topo_analyze(&topo_root, 0, &pkg_count,
+ &cores_per_pkg, &thrs_per_core)) {
+ printf("%d package(s)", pkg_count);
+ if (cores_per_pkg > 0)
+ printf(" x %d core(s)", cores_per_pkg);
+ if (thrs_per_core > 1)
+ printf(" x %d hardware threads", thrs_per_core);
+ } else {
+ printf("Non-uniform topology");
+ }
+ printf("\n");
+ }
+
+ if (!bootverbose)
+ return;
+
+ TOPO_FOREACH(node, &topo_root) {
+ switch (node->type) {
+ case TOPO_TYPE_PKG:
+ printf("Package HW ID = %u (%#x)\n",
+ node->hwid, node->hwid);
+ break;
+ case TOPO_TYPE_CORE:
+ printf("\tCore HW ID = %u (%#x)\n",
+ node->hwid, node->hwid);
+ break;
+ case TOPO_TYPE_PU:
+ if (cpu_info[node->hwid].cpu_hyperthread)
+ hyperthread = "/HT";
+ else
+ hyperthread = "";
+
+ if (node->subtype == 0)
+ printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
+ "(disabled)\n", hyperthread, node->hwid,
+ node->hwid);
+ else if (node->id == 0)
+ printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
+ node->hwid, node->hwid);
+ else
+ printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
+ node->id, hyperthread, node->hwid,
+ node->hwid);
+ break;
+ default:
+ /* ignored */
+ break;
+ }
+ }
+}
+
+/*
+ * Add a scheduling group, a group of logical processors sharing
+ * a particular cache (and, thus having an affinity), to the scheduling
+ * topology.
+ * This function recursively works on lower level caches.
+ */
+static void
+x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
+{
+ struct topo_node *node;
+ int nchildren;
+ int ncores;
+ int i;
+
+ KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
+ ("x86topo_add_sched_group: bad type: %u", root->type));
+ CPU_COPY(&root->cpuset, &cg_root->cg_mask);
+ cg_root->cg_count = root->cpu_count;
+ if (root->type == TOPO_TYPE_SYSTEM)
+ cg_root->cg_level = CG_SHARE_NONE;
+ else
+ cg_root->cg_level = root->subtype;
+
+ /*
+ * Check how many core nodes we have under the given root node.
+ * If we have multiple logical processors, but not multiple
+ * cores, then those processors must be hardware threads.
+ */
+ ncores = 0;
+ node = root;
+ while (node != NULL) {
+ if (node->type != TOPO_TYPE_CORE) {
+ node = topo_next_node(root, node);
+ continue;
+ }
+
+ ncores++;
+ node = topo_next_nonchild_node(root, node);
+ }
+
+ if (cg_root->cg_level != CG_SHARE_NONE &&
+ root->cpu_count > 1 && ncores < 2)
+ cg_root->cg_flags = CG_FLAG_SMT;
+
+ /*
+ * Find out how many cache nodes we have under the given root node.
+ * We ignore cache nodes that cover all the same processors as the
+ * root node. Also, we do not descend below found cache nodes.
+ * That is, we count top-level "non-redundant" caches under the root
+ * node.
+ */
+ nchildren = 0;
+ node = root;
+ while (node != NULL) {
+ if (node->type != TOPO_TYPE_CACHE ||
+ (root->type != TOPO_TYPE_SYSTEM &&
+ CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+ node = topo_next_node(root, node);
+ continue;
+ }
+ nchildren++;
+ node = topo_next_nonchild_node(root, node);
+ }
+
+ cg_root->cg_child = smp_topo_alloc(nchildren);
+ cg_root->cg_children = nchildren;
+
+ /*
+ * Now find again the same cache nodes as above and recursively
+ * build scheduling topologies for them.
+ */
+ node = root;
+ i = 0;
+ while (node != NULL) {
+ if (node->type != TOPO_TYPE_CACHE ||
+ (root->type != TOPO_TYPE_SYSTEM &&
+ CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+ node = topo_next_node(root, node);
+ continue;
+ }
+ cg_root->cg_child[i].cg_parent = cg_root;
+ x86topo_add_sched_group(node, &cg_root->cg_child[i]);
+ i++;
+ node = topo_next_nonchild_node(root, node);
+ }
+}
+
+/*
+ * Build the MI scheduling topology from the discovered hardware topology.
+ */
+struct cpu_group *
+cpu_topo(void)
+{
+ struct cpu_group *cg_root;
+
+ if (mp_ncpus <= 1)
+ return (smp_topo_none());
+
+ cg_root = smp_topo_alloc(1);
+ x86topo_add_sched_group(&topo_root, cg_root);
+ return (cg_root);
+}
+
+
+/*
+ * Add a logical CPU to the topology.
+ */
+void
+cpu_add(u_int apic_id, char boot_cpu)
+{
+
+ if (apic_id > MAX_APIC_ID) {
+ panic("SMP: APIC ID %d too high", apic_id);
+ return;
+ }
+ KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
+ apic_id));
+ cpu_info[apic_id].cpu_present = 1;
+ if (boot_cpu) {
+ KASSERT(boot_cpu_id == -1,
+ ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
+ boot_cpu_id));
+ boot_cpu_id = apic_id;
+ cpu_info[apic_id].cpu_bsp = 1;
+ }
+ if (mp_ncpus < MAXCPU) {
+ mp_ncpus++;
+ mp_maxid = mp_ncpus - 1;
+ }
+ if (bootverbose)
+ printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
+ "AP");
+}
+
+void
+cpu_mp_setmaxid(void)
+{
+
+ /*
+ * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
+ * If there were no calls to cpu_add() assume this is a UP system.
+ */
+ if (mp_ncpus == 0)
+ mp_ncpus = 1;
+}
+
+int
+cpu_mp_probe(void)
+{
+
+ /*
+ * Always record BSP in CPU map so that the mbuf init code works
+ * correctly.
+ */
+ CPU_SETOF(0, &all_cpus);
+ return (mp_ncpus > 1);
+}
+
+/*
+ * AP CPUs call this to initialize themselves.
+ */
+void
+init_secondary_tail(void)
+{
+ u_int cpuid;
+
+ pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
+
+ /*
+ * On real hardware, switch to x2apic mode if possible. Do it
+ * after aps_ready was signalled, to avoid manipulating the
+ * mode while BSP might still want to send some IPI to us
+ * (second startup IPI is ignored on modern hardware etc).
+ */
+ lapic_xapic_mode();
+
+ /* Initialize the PAT MSR. */
+ pmap_init_pat();
+
+ /* set up CPU registers and state */
+ cpu_setregs();
+
+ /* set up SSE/NX */
+ initializecpu();
+
+ /* set up FPU state on the AP */
+#ifdef __amd64__
+ fpuinit();
+#else
+ npxinit(false);
+#endif
+
+ if (cpu_ops.cpu_init)
+ cpu_ops.cpu_init();
+
+ /* A quick check from sanity claus */
+ cpuid = PCPU_GET(cpuid);
+ if (PCPU_GET(apic_id) != lapic_id()) {
+ printf("SMP: cpuid = %d\n", cpuid);
+ printf("SMP: actual apic_id = %d\n", lapic_id());
+ printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
+ panic("cpuid mismatch! boom!!");
+ }
+
+ /* Initialize curthread. */
+ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
+ PCPU_SET(curthread, PCPU_GET(idlethread));
+
+ mtx_lock_spin(&ap_boot_mtx);
+
+ mca_init();
+
+ /* Init local apic for irq's */
+ lapic_setup(1);
+
+ /* Set memory range attributes for this CPU to match the BSP */
+ mem_range_AP_init();
+
+ smp_cpus++;
+
+ CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
+ printf("SMP: AP CPU #%d Launched!\n", cpuid);
+
+ /* Determine if we are a logical CPU. */
+ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
+ CPU_SET(cpuid, &logical_cpus_mask);
+
+ if (bootverbose)
+ lapic_dump("AP");
+
+ if (smp_cpus == mp_ncpus) {
+ /* enable IPI's, tlb shootdown, freezes etc */
+ atomic_store_rel_int(&smp_started, 1);
+ }
+
+#ifdef __amd64__
+ /*
+ * Enable global pages TLB extension
+ * This also implicitly flushes the TLB
+ */
+ load_cr4(rcr4() | CR4_PGE);
+ if (pmap_pcid_enabled)
+ load_cr4(rcr4() | CR4_PCIDE);
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+#endif
+
+ mtx_unlock_spin(&ap_boot_mtx);
+
+ /* Wait until all the APs are up. */
+ while (atomic_load_acq_int(&smp_started) == 0)
+ ia32_pause();
+
+#ifndef EARLY_AP_STARTUP
+ /* Start per-CPU event timers. */
+ cpu_initclocks_ap();
+#endif
+
+ sched_throw(NULL);
+
+ panic("scheduler returned us to %s", __func__);
+ /* NOTREACHED */
+}
+
+/*******************************************************************
+ * local functions and data
+ */
+
+/*
+ * We tell the I/O APIC code about all the CPUs we want to receive
+ * interrupts. If we don't want certain CPUs to receive IRQs we
+ * can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
+ */
+void
+set_interrupt_apic_ids(void)
+{
+ u_int i, apic_id;
+
+ for (i = 0; i < MAXCPU; i++) {
+ apic_id = cpu_apic_ids[i];
+ if (apic_id == -1)
+ continue;
+ if (cpu_info[apic_id].cpu_bsp)
+ continue;
+ if (cpu_info[apic_id].cpu_disabled)
+ continue;
+
+ /* Don't let hyperthreads service interrupts. */
+ if (cpu_info[apic_id].cpu_hyperthread)
+ continue;
+
+ intr_add_cpu(i);
+ }
+}
+
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+ sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+ sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+ sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+ 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
+ /*
+ * This attempts to follow the algorithm described in the
+ * Intel Multiprocessor Specification v1.4 in section B.4.
+ * For each IPI, we allow the local APIC ~20us to deliver the
+ * IPI. If that times out, we panic.
+ */
+
+ /*
+ * first we do an INIT IPI: this INIT IPI might be run, resetting
+ * and running the target CPU. OR this INIT IPI might be latched (P5
+ * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
+ * ignored.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+ lapic_ipi_wait(100);
+
+ /* Explicitly deassert the INIT IPI. */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+ APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
+ apic_id);
+
+ DELAY(10000); /* wait ~10mS */
+
+ /*
+ * next we do a STARTUP IPI: the previous INIT IPI might still be
+ * latched (P5 bug); this 1st STARTUP would then terminate
+ * immediately, and the previously started INIT IPI would continue. OR
+ * the previous INIT IPI has already run, and this STARTUP IPI will
+ * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
+ * will run.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver first STARTUP IPI to APIC %d",
+ apic_id);
+ DELAY(200); /* wait ~200uS */
+
+ /*
+ * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+ * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+ * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+ * recognized after hardware RESET or INIT IPI.
+ */
+ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+ APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+ vector, apic_id);
+ if (!lapic_ipi_wait(100))
+ panic("Failed to deliver second STARTUP IPI to APIC %d",
+ apic_id);
+
+ DELAY(200); /* wait ~200uS */
+}
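
The 8-bit STARTUP vector used above is the page number of the real-mode trampoline, so the AP begins executing at vector << 12. A one-line sketch with a hypothetical trampoline page:

#include <stdio.h>

int
main(void)
{
        unsigned int vector = 0x9f;     /* hypothetical trampoline page */

        printf("STARTUP vector %#x -> AP entry at physical %#x\n",
            vector, vector << 12);
        return (0);
}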
+
+/*
+ * Send an IPI to specified CPU handling the bitmap logic.
+ */
+void
+ipi_send_cpu(int cpu, u_int ipi)
+{
+ u_int bitmap, old_pending, new_pending;
+
+ KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
+
+ if (IPI_IS_BITMAPED(ipi)) {
+ bitmap = 1 << ipi;
+ ipi = IPI_BITMAP_VECTOR;
+ do {
+ old_pending = cpu_ipi_pending[cpu];
+ new_pending = old_pending | bitmap;
+ } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
+ old_pending, new_pending));
+ if (old_pending)
+ return;
+ }
+ lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+}
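
The compare-and-swap loop above is what coalesces bitmapped IPIs: only the sender that finds the per-CPU pending word empty actually raises IPI_BITMAP_VECTOR; later senders just add bits for the handler to drain. A userland sketch of the same pattern, with C11 atomics standing in for atomic_cmpset_int():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int pending;

static bool
post_bitmap_ipi(unsigned int bit)
{
        unsigned int old, new;

        old = atomic_load(&pending);
        do {
                new = old | (1u << bit);
        } while (!atomic_compare_exchange_weak(&pending, &old, new));
        return (old == 0);      /* true: this caller must send the vector */
}

int
main(void)
{

        printf("first post sends the vector: %d\n", post_bitmap_ipi(1));
        printf("second post only coalesces:  %d\n", post_bitmap_ipi(2));
        return (0);
}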
+
+void
+ipi_bitmap_handler(struct trapframe frame)
+{
+ struct trapframe *oldframe;
+ struct thread *td;
+ int cpu = PCPU_GET(cpuid);
+ u_int ipi_bitmap;
+
+ critical_enter();
+ td = curthread;
+ td->td_intr_nesting_level++;
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = &frame;
+ ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
+ if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+#ifdef COUNT_IPIS
+ (*ipi_preempt_counts[cpu])++;
+#endif
+ sched_preempt(td);
+ }
+ if (ipi_bitmap & (1 << IPI_AST)) {
+#ifdef COUNT_IPIS
+ (*ipi_ast_counts[cpu])++;
+#endif
+ /* Nothing to do for AST */
+ }
+ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
+#ifdef COUNT_IPIS
+ (*ipi_hardclock_counts[cpu])++;
+#endif
+ hardclockintr();
+ }
+ td->td_intr_frame = oldframe;
+ td->td_intr_nesting_level--;
+ critical_exit();
+}
+
+/*
+ * send an IPI to a set of cpus.
+ */
+void
+ipi_selected(cpuset_t cpus, u_int ipi)
+{
+ int cpu;
+
+ /*
+ * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+ * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
+
+ while ((cpu = CPU_FFS(&cpus)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &cpus);
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+ ipi_send_cpu(cpu, ipi);
+ }
+}
+
+/*
+ * send an IPI to a specific CPU.
+ */
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+
+ /*
+ * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+ * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
+
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+ ipi_send_cpu(cpu, ipi);
+}
+
+/*
+ * send an IPI to all CPUs EXCEPT myself
+ */
+void
+ipi_all_but_self(u_int ipi)
+{
+ cpuset_t other_cpus;
+
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ if (IPI_IS_BITMAPED(ipi)) {
+ ipi_selected(other_cpus, ipi);
+ return;
+ }
+
+ /*
+ * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+ * of help in order to understand what the source is.
+ * Set the mask of receiving CPUs for this purpose.
+ */
+ if (ipi == IPI_STOP_HARD)
+ CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
+
+ CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+ lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+}
+
+int
+ipi_nmi_handler(void)
+{
+ u_int cpuid;
+
+ /*
+ * As long as there is not a simple way to know about an NMI's
+ * source, if the bitmask for the current CPU is present in
+ * the global pending bitword, an IPI_STOP_HARD has been issued
+ * and should be handled.
+ */
+ cpuid = PCPU_GET(cpuid);
+ if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
+ return (1);
+
+ CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
+ cpustop_handler();
+ return (0);
+}
+
+int nmi_kdb_lock;
+
+void
+nmi_call_kdb_smp(u_int type, struct trapframe *frame)
+{
+ int cpu;
+ bool call_post;
+
+ cpu = PCPU_GET(cpuid);
+ if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
+ nmi_call_kdb(cpu, type, frame);
+ call_post = false;
+ } else {
+ savectx(&stoppcbs[cpu]);
+ CPU_SET_ATOMIC(cpu, &stopped_cpus);
+ while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
+ ia32_pause();
+ call_post = true;
+ }
+ atomic_store_rel_int(&nmi_kdb_lock, 0);
+ if (call_post)
+ cpustop_handler_post(cpu);
+}
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+ u_int cpu;
+
+ cpu = PCPU_GET(cpuid);
+
+ savectx(&stoppcbs[cpu]);
+
+ /* Indicate that we are stopped */
+ CPU_SET_ATOMIC(cpu, &stopped_cpus);
+
+ /* Wait for restart */
+ while (!CPU_ISSET(cpu, &started_cpus))
+ ia32_pause();
+
+ cpustop_handler_post(cpu);
+}
+
+static void
+cpustop_handler_post(u_int cpu)
+{
+
+ CPU_CLR_ATOMIC(cpu, &started_cpus);
+ CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+
+#if defined(__amd64__) && defined(DDB)
+ amd64_db_resume_dbreg();
+#endif
+
+ if (cpu == 0 && cpustop_restartfunc != NULL) {
+ cpustop_restartfunc();
+ cpustop_restartfunc = NULL;
+ }
+}
+
+/*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+ u_int cpu;
+
+ mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
+ cpu = PCPU_GET(cpuid);
+ if (savectx(&susppcbs[cpu]->sp_pcb)) {
+#ifdef __amd64__
+ fpususpend(susppcbs[cpu]->sp_fpususpend);
+#else
+ npxsuspend(susppcbs[cpu]->sp_fpususpend);
+#endif
+ /*
+ * suspended_cpus is cleared shortly after each AP is restarted
+ * by a Startup IPI, so that the BSP can proceed to restarting
+ * the next AP.
+ *
+ * resuming_cpus gets cleared when the AP completes
+ * initialization after having been released by the BSP.
+ * resuming_cpus is probably not the best name for the
+ * variable, because it is actually a set of processors that
+ * haven't resumed yet and haven't necessarily started resuming.
+ *
+ * Note that suspended_cpus is meaningful only for ACPI suspend
+ * as it's not really used for Xen suspend since the APs are
+ * automatically restored to the running state and the correct
+ * context. For the same reason resumectx is never called in
+ * that case.
+ */
+ CPU_SET_ATOMIC(cpu, &suspended_cpus);
+ CPU_SET_ATOMIC(cpu, &resuming_cpus);
+
+ /*
+ * Invalidate the cache after setting the global status bits.
+ * The last AP to set its bit may end up being an Owner of the
+ * corresponding cache line in MOESI protocol. The AP may be
+ * stopped before the cache line is written to the main memory.
+ */
+ wbinvd();
+ } else {
+#ifdef __amd64__
+ fpuresume(susppcbs[cpu]->sp_fpususpend);
+#else
+ npxresume(susppcbs[cpu]->sp_fpususpend);
+#endif
+ pmap_init_pat();
+ initializecpu();
+ PCPU_SET(switchtime, 0);
+ PCPU_SET(switchticks, ticks);
+
+ /* Indicate that we have restarted and restored the context. */
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+ }
+
+ /* Wait for resume directive */
+ while (!CPU_ISSET(cpu, &toresume_cpus))
+ ia32_pause();
+
+ /* Re-apply microcode updates. */
+ ucode_reload();
+
+ if (cpu_ops.cpu_resume)
+ cpu_ops.cpu_resume();
+#ifdef __amd64__
+ if (vmm_resume_p)
+ vmm_resume_p();
+#endif
+
+ /* Resume MCA and local APIC */
+ lapic_xapic_mode();
+ mca_resume();
+ lapic_setup(0);
+
+ /* Indicate that we are resumed */
+ CPU_CLR_ATOMIC(cpu, &resuming_cpus);
+ CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+ CPU_CLR_ATOMIC(cpu, &toresume_cpus);
+}
+
+
+void
+invlcache_handler(void)
+{
+ uint32_t generation;
+
+#ifdef COUNT_IPIS
+ (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ /*
+ * Reading the generation here allows greater parallelism
+ * since wbinvd is a serializing instruction. Without the
+ * temporary, we'd wait for wbinvd to complete, then the read
+ * would execute, then the dependent write, which must then
+ * complete before return from interrupt.
+ */
+ generation = smp_tlb_generation;
+ wbinvd();
+ PCPU_SET(smp_tlb_done, generation);
+}
+
+/*
+ * This is called once the rest of the system is up and running and we're
+ * ready to let the APs out of the pen.
+ */
+static void
+release_aps(void *dummy __unused)
+{
+
+ if (mp_ncpus == 1)
+ return;
+ atomic_store_rel_int(&aps_ready, 1);
+ while (smp_started == 0)
+ ia32_pause();
+}
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
+
+#ifdef COUNT_IPIS
+/*
+ * Setup interrupt counters for IPI handlers.
+ */
+static void
+mp_ipi_intrcnt(void *dummy)
+{
+ char buf[64];
+ int i;
+
+ CPU_FOREACH(i) {
+ snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
+ intrcnt_add(buf, &ipi_invltlb_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
+ intrcnt_add(buf, &ipi_invlrng_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
+ intrcnt_add(buf, &ipi_invlpg_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
+ intrcnt_add(buf, &ipi_invlcache_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
+ intrcnt_add(buf, &ipi_preempt_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:ast", i);
+ intrcnt_add(buf, &ipi_ast_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
+ intrcnt_add(buf, &ipi_rendezvous_counts[i]);
+ snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
+ intrcnt_add(buf, &ipi_hardclock_counts[i]);
+ }
+}
+SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
+#endif
+
+/*
+ * Flush the TLB on other CPUs
+ */
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+pmap_t smp_tlb_pmap;
+volatile uint32_t smp_tlb_generation;
+
+#ifdef __amd64__
+#define read_eflags() read_rflags()
+#endif
+
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+ vm_offset_t addr1, vm_offset_t addr2)
+{
+ cpuset_t other_cpus;
+ volatile uint32_t *p_cpudone;
+ uint32_t generation;
+ int cpu;
+
+ /*
+ * Check for other cpus. Return if none.
+ */
+ if (CPU_ISFULLSET(&mask)) {
+ if (mp_ncpus <= 1)
+ return;
+ } else {
+ CPU_CLR(PCPU_GET(cpuid), &mask);
+ if (CPU_EMPTY(&mask))
+ return;
+ }
+
+ if (!(read_eflags() & PSL_I))
+ panic("%s: interrupts disabled", __func__);
+ mtx_lock_spin(&smp_ipi_mtx);
+ smp_tlb_addr1 = addr1;
+ smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
+ generation = ++smp_tlb_generation;
+ if (CPU_ISFULLSET(&mask)) {
+ ipi_all_but_self(vector);
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ } else {
+ other_cpus = mask;
+ while ((cpu = CPU_FFS(&mask)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &mask);
+ CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+ cpu, vector);
+ ipi_send_cpu(cpu, vector);
+ }
+ }
+ while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &other_cpus);
+ p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+ while (*p_cpudone != generation)
+ ia32_pause();
+ }
+ mtx_unlock_spin(&smp_ipi_mtx);
+}
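
The generation counter is the whole synchronization story here: each target snapshots it before its (serializing) flush and publishes the snapshot afterwards, and the initiator spins until every target has published the new value. A userland sketch of that handshake, with POSIX threads and C11 atomics standing in for IPIs and per-CPU data:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTARGETS 3

static _Atomic unsigned int generation;
static _Atomic unsigned int done[NTARGETS];

static void *
target(void *arg)
{
        unsigned int gen = atomic_load(&generation);    /* snapshot first */

        /* ... invltlb()/invlpg()/wbinvd() would run here ... */
        atomic_store(&done[(long)arg], gen);            /* then publish */
        return (NULL);
}

int
main(void)
{
        pthread_t t[NTARGETS];
        unsigned int gen;
        long i;

        gen = atomic_fetch_add(&generation, 1) + 1;     /* new generation */
        for (i = 0; i < NTARGETS; i++)
                pthread_create(&t[i], NULL, target, (void *)i);
        for (i = 0; i < NTARGETS; i++)                  /* wait for all */
                while (atomic_load(&done[i]) != gen)
                        ;
        for (i = 0; i < NTARGETS; i++)
                pthread_join(t[i], NULL);
        printf("generation %u acknowledged by %d targets\n", gen, NTARGETS);
        return (0);
}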
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_global++;
+#endif
+ }
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_page++;
+#endif
+ }
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+ pmap_t pmap)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
+ addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+ ipi_range++;
+ ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+ }
+}
+
+void
+smp_cache_flush(void)
+{
+
+ if (smp_started) {
+ smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
+ 0, 0);
+ }
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+ uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ /*
+ * Reading the generation here allows greater parallelism
+ * since invalidating the TLB is a serializing operation.
+ */
+ generation = smp_tlb_generation;
+ if (smp_tlb_pmap == kernel_pmap)
+ invltlb_glob();
+ else
+ invltlb();
+ PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_handler(void)
+{
+ uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ generation = smp_tlb_generation; /* Overlap with serialization */
+ invlpg(smp_tlb_addr1);
+ PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_handler(void)
+{
+ vm_offset_t addr, addr2;
+ uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+ xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+ (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ addr = smp_tlb_addr1;
+ addr2 = smp_tlb_addr2;
+ generation = smp_tlb_generation; /* Overlap with serialization */
+ do {
+ invlpg(addr);
+ addr += PAGE_SIZE;
+ } while (addr < addr2);
+
+ PCPU_SET(smp_tlb_done, generation);
+}
Property changes on: trunk/sys/x86/x86/mp_x86.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/mptable.c
===================================================================
--- trunk/sys/x86/x86/mptable.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable.c 262141 2014-02-18 01:15:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable.c 261087 2014-01-23 20:10:22Z jhb $");
#include "opt_mptable_force_htt.h"
#include <sys/param.h>
@@ -51,7 +51,7 @@
#include <x86/mptable.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
#include <machine/md_var.h>
#ifdef NEW_PCIB
#include <machine/resource.h>
@@ -79,6 +79,13 @@
typedef void mptable_entry_handler(u_char *entry, void *arg);
typedef void mptable_extended_entry_handler(ext_entry_ptr entry, void *arg);
+/* descriptions of MP table entries */
+typedef struct BASETABLE_ENTRY {
+ uint8_t type;
+ uint8_t length;
+ uint8_t name[16];
+} basetable_entry;
+
static basetable_entry basetable_entry_types[] =
{
{0, 20, "Processor"},
Modified: trunk/sys/x86/x86/mptable_pci.c
===================================================================
--- trunk/sys/x86/x86/mptable_pci.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable_pci.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable_pci.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable_pci.c 294883 2016-01-27 02:23:54Z jhibbits $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -70,13 +70,13 @@
#ifdef NEW_PCIB
mptable_pci_host_res_init(dev);
#endif
- device_add_child(dev, "pci", pcib_get_bus(dev));
+ device_add_child(dev, "pci", -1);
return (bus_generic_attach(dev));
}
#ifdef NEW_PCIB
static int
-mptable_is_isa_range(u_long start, u_long end)
+mptable_is_isa_range(rman_res_t start, rman_res_t end)
{
if (end >= 0x10000)
@@ -89,7 +89,7 @@
}
static int
-mptable_is_vga_range(u_long start, u_long end)
+mptable_is_vga_range(rman_res_t start, rman_res_t end)
{
if (end >= 0x10000)
return (0);
@@ -102,7 +102,7 @@
static struct resource *
mptable_hostb_alloc_resource(device_t dev, device_t child, int type, int *rid,
- u_long start, u_long end, u_long count, u_int flags)
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
struct mptable_hostb_softc *sc;
@@ -143,7 +143,7 @@
static int
mptable_hostb_adjust_resource(device_t dev, device_t child, int type,
- struct resource *r, u_long start, u_long end)
+ struct resource *r, rman_res_t start, rman_res_t end)
{
struct mptable_hostb_softc *sc;
Modified: trunk/sys/x86/x86/msi.c
===================================================================
--- trunk/sys/x86/x86/msi.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/msi.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -36,11 +36,14 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/msi.c 333126 2018-04-30 20:29:28Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/msi.c 344912 2019-03-08 01:04:19Z jhb $");
+#include "opt_acpi.h"
+
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
@@ -52,7 +55,8 @@
#include <machine/md_var.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/iommu/iommu_intrmap.h>
#include <machine/specialreg.h>
#include <dev/pci/pcivar.h>
@@ -113,10 +117,11 @@
u_int msi_irq; /* IRQ cookie. */
u_int msi_msix; /* MSI-X message. */
u_int msi_vector:8; /* IDT vector. */
- u_int msi_cpu:8; /* Local APIC ID. (g) */
+ u_int msi_cpu; /* Local APIC ID. (g) */
u_int msi_count:8; /* Messages in this group. (g) */
u_int msi_maxcount:8; /* Alignment for this group. (g) */
- int *msi_irqs; /* Group's IRQ list. (g) */
+ u_int *msi_irqs; /* Group's IRQ list. (g) */
+ u_int msi_remap_cookie;
};
static void msi_create_source(void);
@@ -131,11 +136,27 @@
enum intr_polarity pol);
static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
-struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source,
- msi_enable_intr, msi_disable_intr, msi_vector,
- msi_source_pending, NULL, NULL, msi_config_intr,
- msi_assign_cpu };
+struct pic msi_pic = {
+ .pic_enable_source = msi_enable_source,
+ .pic_disable_source = msi_disable_source,
+ .pic_eoi_source = msi_eoi_source,
+ .pic_enable_intr = msi_enable_intr,
+ .pic_disable_intr = msi_disable_intr,
+ .pic_vector = msi_vector,
+ .pic_source_pending = msi_source_pending,
+ .pic_suspend = NULL,
+ .pic_resume = NULL,
+ .pic_config_intr = msi_config_intr,
+ .pic_assign_cpu = msi_assign_cpu,
+ .pic_reprogram_pin = NULL,
+};
+u_int first_msi_irq;
+
+u_int num_msi_irqs = 512;
+SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, &num_msi_irqs, 0,
+ "Number of IRQs reserved for MSI and MSI-X interrupts");
+
#ifdef SMP
/**
* Xen hypervisors prior to 4.6.0 do not properly handle updates to
@@ -153,7 +174,7 @@
#endif
static int msi_enabled;
-static int msi_last_irq;
+static u_int msi_last_irq;
static struct mtx msi_lock;
static void
@@ -314,6 +335,14 @@
}
#endif
+ if (num_msi_irqs == 0)
+ return;
+
+ first_msi_irq = max(MINIMUM_MSI_INT, num_io_irqs);
+ if (num_msi_irqs > UINT_MAX - first_msi_irq)
+ panic("num_msi_irqs too high");
+ num_io_irqs = first_msi_irq + num_msi_irqs;
+
msi_enabled = 1;
intr_register_pic(&msi_pic);
mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
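
With MSI IRQ numbers now placed above the I/O APIC pins instead of at a fixed FIRST_MSI_INT, the usable window is first_msi_irq through first_msi_irq + num_msi_irqs - 1, and the UINT_MAX comparison above merely rejects a tunable large enough to wrap that range. A sketch with hypothetical constants (MINIMUM_MSI_INT and the I/O APIC pin count are stand-ins):

#include <limits.h>
#include <stdio.h>

#define MINIMUM_MSI_INT 24      /* hypothetical floor */

int
main(void)
{
        unsigned int num_io_irqs = 48;          /* hypothetical I/O APIC pins */
        unsigned int num_msi_irqs = 512;        /* default from the diff */
        unsigned int first_msi_irq;

        first_msi_irq = num_io_irqs > MINIMUM_MSI_INT ?
            num_io_irqs : MINIMUM_MSI_INT;
        if (num_msi_irqs > UINT_MAX - first_msi_irq) {
                fprintf(stderr, "num_msi_irqs too high\n");
                return (1);
        }
        printf("MSI IRQs span %u..%u\n", first_msi_irq,
            first_msi_irq + num_msi_irqs - 1);
        return (0);
}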
@@ -326,11 +355,11 @@
u_int irq;
mtx_lock(&msi_lock);
- if (msi_last_irq >= NUM_MSI_INTS) {
+ if (msi_last_irq >= num_msi_irqs) {
mtx_unlock(&msi_lock);
return;
}
- irq = msi_last_irq + FIRST_MSI_INT;
+ irq = msi_last_irq + first_msi_irq;
msi_last_irq++;
mtx_unlock(&msi_lock);
@@ -348,8 +377,12 @@
msi_alloc(device_t dev, int count, int maxcount, int *irqs)
{
struct msi_intsrc *msi, *fsrc;
- u_int cpu;
- int cnt, i, *mirqs, vector;
+ u_int cpu, *mirqs;
+ int cnt, i, vector;
+#ifdef ACPI_DMAR
+ u_int cookies[count];
+ int error;
+#endif
if (!msi_enabled)
return (ENXIO);
@@ -363,7 +396,7 @@
/* Try to find 'count' free IRQs. */
cnt = 0;
- for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+ for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
msi = (struct msi_intsrc *)intr_lookup_source(i);
/* End of allocated sources, so break. */
@@ -382,7 +415,7 @@
/* Do we need to create some new sources? */
if (cnt < count) {
/* If we would exceed the max, give up. */
- if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
+ if (i + (count - cnt) > first_msi_irq + num_msi_irqs) {
mtx_unlock(&msi_lock);
free(mirqs, M_MSI);
return (ENXIO);
@@ -409,6 +442,24 @@
return (ENOSPC);
}
+#ifdef ACPI_DMAR
+ mtx_unlock(&msi_lock);
+ error = iommu_alloc_msi_intr(dev, cookies, count);
+ mtx_lock(&msi_lock);
+ if (error == EOPNOTSUPP)
+ error = 0;
+ if (error != 0) {
+ for (i = 0; i < count; i++)
+ apic_free_vector(cpu, vector + i, irqs[i]);
+ free(mirqs, M_MSI);
+ return (error);
+ }
+ for (i = 0; i < count; i++) {
+ msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
+ msi->msi_remap_cookie = cookies[i];
+ }
+#endif
+
/* Assign IDT vectors and make these messages owned by 'dev'. */
fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
for (i = 0; i < count; i++) {
@@ -430,7 +481,6 @@
bcopy(irqs, mirqs, count * sizeof(*mirqs));
fsrc->msi_irqs = mirqs;
mtx_unlock(&msi_lock);
-
return (0);
}
@@ -474,6 +524,9 @@
msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
KASSERT(msi->msi_first == first, ("message not in group"));
KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
+#ifdef ACPI_DMAR
+ iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
+#endif
msi->msi_first = NULL;
msi->msi_dev = NULL;
apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
@@ -481,6 +534,11 @@
}
/* Clear out the first message. */
+#ifdef ACPI_DMAR
+ mtx_unlock(&msi_lock);
+ iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
+ mtx_lock(&msi_lock);
+#endif
first->msi_first = NULL;
first->msi_dev = NULL;
apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
@@ -498,6 +556,11 @@
msi_map(int irq, uint64_t *addr, uint32_t *data)
{
struct msi_intsrc *msi;
+ int error;
+#ifdef ACPI_DMAR
+ struct msi_intsrc *msi1;
+ int i, k;
+#endif
mtx_lock(&msi_lock);
msi = (struct msi_intsrc *)intr_lookup_source(irq);
@@ -525,10 +588,36 @@
msi = msi->msi_first;
}
- *addr = INTEL_ADDR(msi);
- *data = INTEL_DATA(msi);
+#ifdef ACPI_DMAR
+ if (!msi->msi_msix) {
+ for (k = msi->msi_count - 1, i = first_msi_irq; k > 0 &&
+ i < first_msi_irq + num_msi_irqs; i++) {
+ if (i == msi->msi_irq)
+ continue;
+ msi1 = (struct msi_intsrc *)intr_lookup_source(i);
+ if (!msi1->msi_msix && msi1->msi_first == msi) {
+ mtx_unlock(&msi_lock);
+ iommu_map_msi_intr(msi1->msi_dev,
+ msi1->msi_cpu, msi1->msi_vector,
+ msi1->msi_remap_cookie, NULL, NULL);
+ k--;
+ mtx_lock(&msi_lock);
+ }
+ }
+ }
mtx_unlock(&msi_lock);
- return (0);
+ error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
+ msi->msi_vector, msi->msi_remap_cookie, addr, data);
+#else
+ mtx_unlock(&msi_lock);
+ error = EOPNOTSUPP;
+#endif
+ if (error == EOPNOTSUPP) {
+ *addr = INTEL_ADDR(msi);
+ *data = INTEL_DATA(msi);
+ error = 0;
+ }
+ return (error);
}
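
When interrupt remapping is compiled in but the IOMMU reports EOPNOTSUPP, msi_map() above falls back to the classic Intel MSI message built by INTEL_ADDR()/INTEL_DATA(). A simplified sketch of that non-remapped encoding, keeping only the destination APIC ID and vector and omitting the delivery/trigger-mode bits the real macros also carry:

#include <stdint.h>

#define MSI_ADDR_BASE   0xfee00000u     /* fixed MSI address window */

static void
msi_compose_sketch(uint32_t apic_id, uint8_t vector, uint64_t *addr,
    uint32_t *data)
{
        *addr = MSI_ADDR_BASE | (apic_id << 12);        /* destination ID */
        *data = vector;                 /* fixed delivery mode, edge trigger */
}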
int
@@ -537,6 +626,10 @@
struct msi_intsrc *msi;
u_int cpu;
int i, vector;
+#ifdef ACPI_DMAR
+ u_int cookie;
+ int error;
+#endif
if (!msi_enabled)
return (ENXIO);
@@ -545,7 +638,7 @@
mtx_lock(&msi_lock);
/* Find a free IRQ. */
- for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+ for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
msi = (struct msi_intsrc *)intr_lookup_source(i);
/* End of allocated sources, so break. */
@@ -558,7 +651,7 @@
}
/* Are all IRQs in use? */
- if (i == FIRST_MSI_INT + NUM_MSI_INTS) {
+ if (i == first_msi_irq + num_msi_irqs) {
mtx_unlock(&msi_lock);
return (ENXIO);
}
@@ -579,6 +672,22 @@
mtx_unlock(&msi_lock);
return (ENOSPC);
}
+
+ msi->msi_dev = dev;
+#ifdef ACPI_DMAR
+ mtx_unlock(&msi_lock);
+ error = iommu_alloc_msi_intr(dev, &cookie, 1);
+ mtx_lock(&msi_lock);
+ if (error == EOPNOTSUPP)
+ error = 0;
+ if (error != 0) {
+ msi->msi_dev = NULL;
+ apic_free_vector(cpu, vector, i);
+ return (error);
+ }
+ msi->msi_remap_cookie = cookie;
+#endif
+
if (bootverbose)
printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
msi->msi_irq, cpu, vector);
@@ -585,7 +694,6 @@
/* Setup source. */
msi->msi_cpu = cpu;
- msi->msi_dev = dev;
msi->msi_first = msi;
msi->msi_vector = vector;
msi->msi_msix = 1;
@@ -621,6 +729,11 @@
KASSERT(msi->msi_dev != NULL, ("unowned message"));
/* Clear out the message. */
+#ifdef ACPI_DMAR
+ mtx_unlock(&msi_lock);
+ iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
+ mtx_lock(&msi_lock);
+#endif
msi->msi_first = NULL;
msi->msi_dev = NULL;
apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
Modified: trunk/sys/x86/x86/nexus.c
===================================================================
--- trunk/sys/x86/x86/nexus.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/nexus.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/nexus.c 221324 2011-05-02 14:13:12Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/nexus.c 340016 2018-11-01 18:34:26Z jhb $");
/*
* This code implements a `root nexus' for Intel Architecture
@@ -64,7 +64,6 @@
#include <machine/vmparam.h>
#include <vm/vm.h>
#include <vm/pmap.h>
-#include <machine/pmap.h>
#include <machine/metadata.h>
#include <machine/nexusvar.h>
@@ -80,7 +79,7 @@
#ifdef PC98
#include <pc98/cbus/cbus.h>
#else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
#endif
#endif
#include <sys/rtprio.h>
@@ -100,9 +99,10 @@
static device_t nexus_add_child(device_t bus, u_int order, const char *name,
int unit);
static struct resource *nexus_alloc_resource(device_t, device_t, int, int *,
- u_long, u_long, u_long, u_int);
+ rman_res_t, rman_res_t, rman_res_t,
+ u_int);
static int nexus_adjust_resource(device_t, device_t, int, struct resource *,
- u_long, u_long);
+ rman_res_t, rman_res_t);
#ifdef SMP
static int nexus_bind_intr(device_t, device_t, struct resource *, int);
#endif
@@ -115,6 +115,12 @@
struct resource *);
static int nexus_deactivate_resource(device_t, device_t, int, int,
struct resource *);
+static int nexus_map_resource(device_t bus, device_t child, int type,
+ struct resource *r,
+ struct resource_map_request *argsp,
+ struct resource_map *map);
+static int nexus_unmap_resource(device_t bus, device_t child, int type,
+ struct resource *r, struct resource_map *map);
static int nexus_release_resource(device_t, device_t, int, int,
struct resource *);
static int nexus_setup_intr(device_t, device_t, struct resource *, int flags,
@@ -123,9 +129,13 @@
static int nexus_teardown_intr(device_t, device_t, struct resource *,
void *);
static struct resource_list *nexus_get_reslist(device_t dev, device_t child);
-static int nexus_set_resource(device_t, device_t, int, int, u_long, u_long);
-static int nexus_get_resource(device_t, device_t, int, int, u_long *, u_long *);
+static int nexus_set_resource(device_t, device_t, int, int,
+ rman_res_t, rman_res_t);
+static int nexus_get_resource(device_t, device_t, int, int,
+ rman_res_t *, rman_res_t *);
static void nexus_delete_resource(device_t, device_t, int, int);
+static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t,
+ cpuset_t *);
#ifdef DEV_APIC
static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
@@ -151,6 +161,8 @@
DEVMETHOD(bus_release_resource, nexus_release_resource),
DEVMETHOD(bus_activate_resource, nexus_activate_resource),
DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource),
+ DEVMETHOD(bus_map_resource, nexus_map_resource),
+ DEVMETHOD(bus_unmap_resource, nexus_unmap_resource),
DEVMETHOD(bus_setup_intr, nexus_setup_intr),
DEVMETHOD(bus_teardown_intr, nexus_teardown_intr),
#ifdef SMP
@@ -162,6 +174,7 @@
DEVMETHOD(bus_set_resource, nexus_set_resource),
DEVMETHOD(bus_get_resource, nexus_get_resource),
DEVMETHOD(bus_delete_resource, nexus_delete_resource),
+ DEVMETHOD(bus_get_cpus, nexus_get_cpus),
/* pcib interface */
#ifdef DEV_APIC
@@ -214,7 +227,7 @@
irq_rman.rm_start = 0;
irq_rman.rm_type = RMAN_ARRAY;
irq_rman.rm_descr = "Interrupt request lines";
- irq_rman.rm_end = NUM_IO_INTS - 1;
+ irq_rman.rm_end = num_io_irqs - 1;
if (rman_init(&irq_rman))
panic("nexus_init_resources irq_rman");
@@ -222,7 +235,7 @@
* We search for regions of existing IRQs and add those to the IRQ
* resource manager.
*/
- for (irq = 0; irq < NUM_IO_INTS; irq++)
+ for (irq = 0; irq < num_io_irqs; irq++)
if (intr_lookup_source(irq) != NULL)
if (rman_manage_region(&irq_rman, irq, irq) != 0)
panic("nexus_init_resources irq_rman add");
@@ -260,11 +273,15 @@
panic("nexus_init_resources port_rman");
mem_rman.rm_start = 0;
- mem_rman.rm_end = ~0ul;
+#ifndef PAE
+ mem_rman.rm_end = BUS_SPACE_MAXADDR;
+#else
+ mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1);
+#endif
mem_rman.rm_type = RMAN_ARRAY;
mem_rman.rm_descr = "I/O memory addresses";
if (rman_init(&mem_rman)
- || rman_manage_region(&mem_rman, 0, ~0))
+ || rman_manage_region(&mem_rman, 0, mem_rman.rm_end))
panic("nexus_init_resources mem_rman");
}
@@ -296,9 +313,9 @@
if (STAILQ_FIRST(rl))
retval += printf(" at");
- retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
- retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx");
- retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
+ retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
+ retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx");
+ retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
return retval;
}
@@ -360,7 +377,8 @@
*/
static struct resource *
nexus_alloc_resource(device_t bus, device_t child, int type, int *rid,
- u_long start, u_long end, u_long count, u_int flags)
+ rman_res_t start, rman_res_t end, rman_res_t count,
+ u_int flags)
{
struct nexus_device *ndev = DEVTONX(child);
struct resource *rv;
@@ -369,12 +387,13 @@
int needactivate = flags & RF_ACTIVE;
/*
- * If this is an allocation of the "default" range for a given RID, and
- * we know what the resources for this device are (ie. they aren't maintained
- * by a child bus), then work out the start/end values.
+ * If this is an allocation of the "default" range for a given
+ * RID, and we know what the resources for this device are
+ * (ie. they aren't maintained by a child bus), then work out
+ * the start/end values.
*/
- if ((start == 0UL) && (end == ~0UL) && (count == 1)) {
- if (ndev == NULL)
+ if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) {
+ if (device_get_parent(child) != bus || ndev == NULL)
return(NULL);
rle = resource_list_find(&ndev->nx_resources, type, *rid);
if (rle == NULL)
@@ -390,7 +409,7 @@
return (NULL);
rv = rman_reserve_resource(rm, start, end, count, flags, child);
- if (rv == 0)
+ if (rv == NULL)
return 0;
rman_set_rid(rv, *rid);
@@ -406,7 +425,7 @@
static int
nexus_adjust_resource(device_t bus, device_t child, int type,
- struct resource *r, u_long start, u_long end)
+ struct resource *r, rman_res_t start, rman_res_t end)
{
struct rman *rm;
@@ -422,12 +441,82 @@
nexus_activate_resource(device_t bus, device_t child, int type, int rid,
struct resource *r)
{
+ struct resource_map map;
+ int error;
+
+ error = rman_activate_resource(r);
+ if (error != 0)
+ return (error);
+
+ if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+ (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+ error = nexus_map_resource(bus, child, type, r, NULL, &map);
+ if (error) {
+ rman_deactivate_resource(r);
+ return (error);
+ }
+
+ rman_set_mapping(r, &map);

+ }
+ return (0);
+}
+
+static int
+nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
+ struct resource *r)
+{
+ struct resource_map map;
+ int error;
+
+ error = rman_deactivate_resource(r);
+ if (error)
+ return (error);
+
+ if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+ (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+ rman_get_mapping(r, &map);
+ nexus_unmap_resource(bus, child, type, r, &map);
+ }
+ return (0);
+}
+
+static int
+nexus_map_resource(device_t bus, device_t child, int type, struct resource *r,
+ struct resource_map_request *argsp, struct resource_map *map)
+{
+ struct resource_map_request args;
+ rman_res_t end, length, start;
#ifdef PC98
- bus_space_handle_t bh;
int error;
#endif
- void *vaddr;
+ /* Resources must be active to be mapped. */
+ if (!(rman_get_flags(r) & RF_ACTIVE))
+ return (ENXIO);
+
+ /* Mappings are only supported on I/O and memory resources. */
+ switch (type) {
+ case SYS_RES_IOPORT:
+ case SYS_RES_MEMORY:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ resource_init_map_request(&args);
+ if (argsp != NULL)
+ bcopy(argsp, &args, imin(argsp->size, args.size));
+ start = rman_get_start(r) + args.offset;
+ if (args.length == 0)
+ length = rman_get_size(r);
+ else
+ length = args.length;
+ end = start + length - 1;
+ if (start > rman_get_end(r) || start < rman_get_start(r))
+ return (EINVAL);
+ if (end > rman_get_end(r) || end < start)
+ return (EINVAL);
+
/*
* If this is a memory resource, map it into the kernel.
*/
@@ -435,58 +524,64 @@
case SYS_RES_IOPORT:
#ifdef PC98
error = i386_bus_space_handle_alloc(X86_BUS_SPACE_IO,
- rman_get_start(r), rman_get_size(r), &bh);
+ start, length, &map->r_bushandle);
if (error)
return (error);
- rman_set_bushandle(r, bh);
#else
- rman_set_bushandle(r, rman_get_start(r));
+ map->r_bushandle = start;
#endif
- rman_set_bustag(r, X86_BUS_SPACE_IO);
+ map->r_bustag = X86_BUS_SPACE_IO;
+ map->r_size = length;
+ map->r_vaddr = NULL;
break;
case SYS_RES_MEMORY:
#ifdef PC98
error = i386_bus_space_handle_alloc(X86_BUS_SPACE_MEM,
- rman_get_start(r), rman_get_size(r), &bh);
+ start, length, &map->r_bushandle);
if (error)
return (error);
#endif
- vaddr = pmap_mapdev(rman_get_start(r), rman_get_size(r));
- rman_set_virtual(r, vaddr);
- rman_set_bustag(r, X86_BUS_SPACE_MEM);
+ map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr);
+ map->r_bustag = X86_BUS_SPACE_MEM;
+ map->r_size = length;
+
+ /*
+ * PC-98 stores the virtual address as a member of the
+ * structure in the handle. On plain x86, the handle is
+ * the virtual address.
+ */
#ifdef PC98
- /* PC-98: the type of bus_space_handle_t is the structure. */
- bh->bsh_base = (bus_addr_t) vaddr;
- rman_set_bushandle(r, bh);
+ map->r_bushandle->bsh_base = (bus_addr_t)map->r_vaddr;
#else
- /* IBM-PC: the type of bus_space_handle_t is u_int */
- rman_set_bushandle(r, (bus_space_handle_t) vaddr);
+ map->r_bushandle = (bus_space_handle_t)map->r_vaddr;
#endif
+ break;
}
- return (rman_activate_resource(r));
+ return (0);
}
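
The bus_map_resource/bus_unmap_resource methods wired up above let a driver allocate a BAR without an implicit mapping and then map sub-ranges with explicit attributes. A hypothetical driver fragment (illustration only, error handling omitted; "dev" is the driver's device_t) might use the KPI like this:

        struct resource_map_request req;
        struct resource_map map;
        struct resource *res;
        int rid;

        rid = PCIR_BAR(0);
        res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
            RF_ACTIVE | RF_UNMAPPED);
        resource_init_map_request(&req);
        req.memattr = VM_MEMATTR_WRITE_COMBINING;
        if (bus_map_resource(dev, SYS_RES_MEMORY, res, &req, &map) == 0)
                bus_space_write_4(map.r_bustag, map.r_bushandle, 0, 1);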
static int
-nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
- struct resource *r)
+nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r,
+ struct resource_map *map)
{
-
+
/*
* If this is a memory resource, unmap it.
*/
- if (type == SYS_RES_MEMORY) {
- pmap_unmapdev((vm_offset_t)rman_get_virtual(r),
- rman_get_size(r));
- }
+ switch (type) {
+ case SYS_RES_MEMORY:
+ pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size);
+ /* FALLTHROUGH */
+ case SYS_RES_IOPORT:
#ifdef PC98
- if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
- bus_space_handle_t bh;
-
- bh = rman_get_bushandle(r);
- i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz);
+ i386_bus_space_handle_free(map->r_bustag, map->r_bushandle,
+ map->r_bushandle->bsh_sz);
+#endif
+ break;
+ default:
+ return (EINVAL);
}
-#endif
- return (rman_deactivate_resource(r));
+ return (0);
}
static int
@@ -493,6 +588,7 @@
nexus_release_resource(device_t bus, device_t child, int type, int rid,
struct resource *r)
{
+
if (rman_get_flags(r) & RF_ACTIVE) {
int error = bus_deactivate_resource(child, type, rid, r);
if (error)
@@ -518,7 +614,7 @@
if (irq == NULL)
panic("nexus_setup_intr: NULL irq resource!");
- *cookiep = 0;
+ *cookiep = NULL;
if ((rman_get_flags(irq) & RF_SHAREABLE) == 0)
flags |= INTR_EXCL;
@@ -573,7 +669,8 @@
}
static int
-nexus_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count)
+nexus_set_resource(device_t dev, device_t child, int type, int rid,
+ rman_res_t start, rman_res_t count)
{
struct nexus_device *ndev = DEVTONX(child);
struct resource_list *rl = &ndev->nx_resources;
@@ -584,7 +681,8 @@
}
static int
-nexus_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp)
+nexus_get_resource(device_t dev, device_t child, int type, int rid,
+ rman_res_t *startp, rman_res_t *countp)
{
struct nexus_device *ndev = DEVTONX(child);
struct resource_list *rl = &ndev->nx_resources;
@@ -609,6 +707,24 @@
resource_list_delete(rl, type, rid);
}
+static int
+nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
+ cpuset_t *cpuset)
+{
+
+ switch (op) {
+#ifdef SMP
+ case INTR_CPUS:
+ if (setsize != sizeof(cpuset_t))
+ return (EINVAL);
+ *cpuset = intr_cpus;
+ return (0);
+#endif
+ default:
+ return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
+ }
+}
+
/* Called from the MSI code to add new IRQs to the IRQ rman. */
void
nexus_add_irq(u_long irq)
@@ -689,11 +805,8 @@
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type(ELF_KERN_STR);
- if (kmdp != NULL)
- smapbase = (struct bios_smap *)preload_search_info(kmdp,
- MODINFO_METADATA | MODINFOMD_SMAP);
- else
- smapbase = NULL;
+ smapbase = (struct bios_smap *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase != NULL) {
smapsize = *((u_int32_t *)smapbase - 1);
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
Added: trunk/sys/x86/x86/pvclock.c
===================================================================
--- trunk/sys/x86/x86/pvclock.c (rev 0)
+++ trunk/sys/x86/x86/pvclock.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,204 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2009 Adrian Chadd
+ * Copyright (c) 2012 Spectra Logic Corporation
+ * Copyright (c) 2014 Bryan Venteicher
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/pvclock.c 278184 2015-02-04 08:33:04Z bryanv $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/atomic.h>
+#include <machine/pvclock.h>
+
+/*
+ * Last time; this guarantees a monotonically increasing clock for when
+ * a stable TSC is not provided.
+ */
+static volatile uint64_t pvclock_last_cycles;
+
+void
+pvclock_resume(void)
+{
+
+ atomic_store_rel_64(&pvclock_last_cycles, 0);
+}
+
+uint64_t
+pvclock_get_last_cycles(void)
+{
+
+ return (atomic_load_acq_64(&pvclock_last_cycles));
+}
+
+uint64_t
+pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
+{
+ uint64_t freq;
+
+ freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
+
+ if (ti->tsc_shift < 0)
+ freq <<= -ti->tsc_shift;
+ else
+ freq >>= ti->tsc_shift;
+
+ return (freq);
+}
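
As a hypothetical example of the formula above: with tsc_to_system_mul = 0x80000000 (0.5 in 32.32 fixed point) and tsc_shift = 0, the computed frequency is (10^9 << 32) / 2^31 = 2,000,000,000 Hz, i.e. a 2 GHz TSC; each step of a negative tsc_shift doubles the result, each step of a positive one halves it.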
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+ uint64_t product;
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#if defined(__i386__)
+ {
+ uint32_t tmp1, tmp2;
+
+ /**
+ * For i386, the formula looks like:
+ *
+ * lower = (mul_frac * (delta & UINT_MAX)) >> 32
+ * upper = mul_frac * (delta >> 32)
+ * product = lower + upper
+ */
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+ "2" (mul_frac) );
+ }
+#elif defined(__amd64__)
+ {
+ unsigned long tmp;
+
+ __asm__ (
+ "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a" (product), [hi]"=d" (tmp)
+ : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+ }
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+
+ return (product);
+}
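
The inline assembly above exists because i386 lacks a native 64x64 multiply; on compilers that provide 128-bit integers the same 32.32 fixed-point scaling can be written portably (a sketch for illustration, not a drop-in replacement):

#include <stdint.h>

static inline uint64_t
scale_delta_portable(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* Widen to 128 bits, multiply, keep bits 95:32 of the product. */
        return ((uint64_t)(((unsigned __int128)delta * mul_frac) >> 32));
}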
+
+static uint64_t
+pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
+{
+ uint64_t delta;
+
+ delta = rdtsc() - ti->tsc_timestamp;
+
+ return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
+ ti->tsc_shift));
+}
+
+static void
+pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
+ uint64_t *cycles, uint8_t *flags)
+{
+ uint32_t version;
+
+ do {
+ version = ti->version;
+ rmb();
+ *cycles = ti->system_time + pvclock_get_nsec_offset(ti);
+ *flags = ti->flags;
+ rmb();
+ } while ((ti->version & 1) != 0 || ti->version != version);
+}
+
+static void
+pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec,
+ uint32_t *nsec)
+{
+ uint32_t version;
+
+ do {
+ version = wc->version;
+ rmb();
+ *sec = wc->sec;
+ *nsec = wc->nsec;
+ rmb();
+ } while ((wc->version & 1) != 0 || wc->version != version);
+}
+
+uint64_t
+pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
+{
+ uint64_t now, last;
+ uint8_t flags;
+
+ pvclock_read_time_info(ti, &now, &flags);
+
+ if (flags & PVCLOCK_FLAG_TSC_STABLE)
+ return (now);
+
+ /*
+ * Enforce a monotonically increasing clock time across all VCPUs.
+ * If our time is too old, use the last time and return. Otherwise,
+ * try to update the last time.
+ */
+ do {
+ last = atomic_load_acq_64(&pvclock_last_cycles);
+ if (last > now)
+ return (last);
+ } while (!atomic_cmpset_64(&pvclock_last_cycles, last, now));
+
+ return (now);
+}
+
+void
+pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
+{
+ uint32_t sec, nsec;
+
+ pvclock_read_wall_clock(wc, &sec, &nsec);
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
Property changes on: trunk/sys/x86/x86/pvclock.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/stack_machdep.c
===================================================================
--- trunk/sys/x86/x86/stack_machdep.c (rev 0)
+++ trunk/sys/x86/x86/stack_machdep.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,182 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 EMC Corporation
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/stack_machdep.c 337976 2018-08-17 16:04:59Z markj $");
+
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/stack.h>
+
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <x86/stack.h>
+
+#ifdef __i386__
+#define PCB_FP(pcb) ((pcb)->pcb_ebp)
+#define TF_FLAGS(tf) ((tf)->tf_eflags)
+#define TF_FP(tf) ((tf)->tf_ebp)
+#define TF_PC(tf) ((tf)->tf_eip)
+
+typedef struct i386_frame *x86_frame_t;
+#else
+#define PCB_FP(pcb) ((pcb)->pcb_rbp)
+#define TF_FLAGS(tf) ((tf)->tf_rflags)
+#define TF_FP(tf) ((tf)->tf_rbp)
+#define TF_PC(tf) ((tf)->tf_rip)
+
+typedef struct amd64_frame *x86_frame_t;
+#endif
+
+#ifdef STACK
+static struct stack *nmi_stack;
+static volatile struct thread *nmi_pending;
+
+#ifdef SMP
+static struct mtx nmi_lock;
+MTX_SYSINIT(nmi_lock, &nmi_lock, "stack_nmi", MTX_SPIN);
+#endif
+#endif
+
+static void
+stack_capture(struct thread *td, struct stack *st, register_t fp)
+{
+ x86_frame_t frame;
+ vm_offset_t callpc;
+
+ stack_zero(st);
+ frame = (x86_frame_t)fp;
+ while (1) {
+ if ((vm_offset_t)frame < td->td_kstack ||
+ (vm_offset_t)frame >= td->td_kstack +
+ td->td_kstack_pages * PAGE_SIZE)
+ break;
+ callpc = frame->f_retaddr;
+ if (!INKERNEL(callpc))
+ break;
+ if (stack_put(st, callpc) == -1)
+ break;
+ if (frame->f_frame <= frame)
+ break;
+ frame = frame->f_frame;
+ }
+}
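
stack_capture() above walks the saved frame-pointer chain: each frame begins with the caller's frame pointer followed by the return address. A userland analogue (illustration only; it assumes the compiler keeps frame pointers, e.g. -fno-omit-frame-pointer):

#include <stdint.h>
#include <stdio.h>

/* Frame layout assumed by the walker (matches the standard x86 prologue). */
struct frame {
        struct frame    *f_frame;       /* saved caller frame pointer */
        uintptr_t        f_retaddr;     /* return address */
};

static void
walk_frames(void)
{
        struct frame *fp;
        int depth;

        fp = __builtin_frame_address(0);
        for (depth = 0; depth < 16; depth++) {
                printf("frame %d: return address %#lx\n", depth,
                    (unsigned long)fp->f_retaddr);
                /* Stop when the chain ends or stops growing upward. */
                if (fp->f_frame == NULL || fp->f_frame <= fp)
                        break;
                fp = fp->f_frame;
        }
}

int
main(void)
{
        walk_frames();
        return (0);
}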
+
+int
+stack_nmi_handler(struct trapframe *tf)
+{
+
+#ifdef STACK
+ /* Don't consume an NMI that wasn't meant for us. */
+ if (nmi_stack == NULL || curthread != nmi_pending)
+ return (0);
+
+ if (!TRAPF_USERMODE(tf) && (TF_FLAGS(tf) & PSL_I) != 0)
+ stack_capture(curthread, nmi_stack, TF_FP(tf));
+ else
+ /* We were running in usermode or had interrupts disabled. */
+ nmi_stack->depth = 0;
+
+ atomic_store_rel_ptr((long *)&nmi_pending, (long)NULL);
+ return (1);
+#else
+ return (0);
+#endif
+}
+
+void
+stack_save_td(struct stack *st, struct thread *td)
+{
+
+ if (TD_IS_SWAPPED(td))
+ panic("stack_save_td: swapped");
+ if (TD_IS_RUNNING(td))
+ panic("stack_save_td: running");
+
+ stack_capture(td, st, PCB_FP(td->td_pcb));
+}
+
+int
+stack_save_td_running(struct stack *st, struct thread *td)
+{
+
+#ifdef STACK
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ MPASS(TD_IS_RUNNING(td));
+
+ if (td == curthread) {
+ stack_save(st);
+ return (0);
+ }
+
+#ifdef SMP
+ mtx_lock_spin(&nmi_lock);
+
+ nmi_stack = st;
+ nmi_pending = td;
+ ipi_cpu(td->td_oncpu, IPI_TRACE);
+ while ((void *)atomic_load_acq_ptr((long *)&nmi_pending) != NULL)
+ cpu_spinwait();
+ nmi_stack = NULL;
+
+ mtx_unlock_spin(&nmi_lock);
+
+ if (st->depth == 0)
+ return (EAGAIN);
+#else /* !SMP */
+ KASSERT(0, ("curthread isn't running"));
+#endif /* SMP */
+ return (0);
+#else /* !STACK */
+ return (EOPNOTSUPP);
+#endif /* STACK */
+}
+
+void
+stack_save(struct stack *st)
+{
+ register_t fp;
+
+#ifdef __i386__
+ __asm __volatile("movl %%ebp,%0" : "=g" (fp));
+#else
+ __asm __volatile("movq %%rbp,%0" : "=g" (fp));
+#endif
+ stack_capture(curthread, st, fp);
+}
Property changes on: trunk/sys/x86/x86/stack_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/tsc.c
===================================================================
--- trunk/sys/x86/x86/tsc.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/tsc.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/tsc.c 280973 2015-04-02 01:02:42Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/tsc.c 353007 2019-10-02 13:46:40Z kib $");
#include "opt_compat.h"
#include "opt_clock.h"
@@ -49,6 +49,7 @@
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <x86/vmware.h>
+#include <dev/acpica/acpi_hpet.h>
#include "cpufreq_if.h"
@@ -60,34 +61,28 @@
SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
&tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
-TUNABLE_INT("kern.timecounter.invariant_tsc", &tsc_is_invariant);
#ifdef SMP
int smp_tsc;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
"Indicates whether the TSC is safe to use in SMP mode");
-TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc);
int smp_tsc_adjust = 0;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
&smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
-TUNABLE_INT("kern.timecounter.smp_tsc_adjust", &smp_tsc_adjust);
#endif
static int tsc_shift = 1;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
&tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
-TUNABLE_INT("kern.timecounter.tsc_shift", &tsc_shift);
static int tsc_disabled;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
"Disable x86 Time Stamp Counter");
-TUNABLE_INT("machdep.disable_tsc", &tsc_disabled);
static int tsc_skip_calibration;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
&tsc_skip_calibration, 0, "Disable TSC frequency calibration");
-TUNABLE_INT("machdep.disable_tsc_calibration", &tsc_skip_calibration);
static void tsc_freq_changed(void *arg, const struct cf_level *level,
int status);
@@ -100,14 +95,22 @@
static unsigned tsc_get_timecount_mfence(struct timecounter *tc);
static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc);
static void tsc_levels_changed(void *arg, int unit);
+static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
+ struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+ struct timecounter *tc);
+#endif
static struct timecounter tsc_timecounter = {
- tsc_get_timecount, /* get_timecount */
- 0, /* no poll_pps */
- ~0u, /* counter_mask */
- 0, /* frequency */
- "TSC", /* name */
- 800, /* quality (adjusted in code) */
+ .tc_get_timecount = tsc_get_timecount,
+ .tc_counter_mask = ~0u,
+ .tc_name = "TSC",
+ .tc_quality = 800, /* adjusted in code */
+ .tc_fill_vdso_timehands = x86_tsc_vdso_timehands,
+#ifdef COMPAT_FREEBSD32
+ .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32,
+#endif
};
static void
@@ -126,6 +129,40 @@
tsc_is_invariant = 1;
}
+/*
+ * Calculate TSC frequency using information from the CPUID leaf 0x15
+ * 'Time Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15
+ * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor
+ * Frequency Information'. Leaf 0x16 is described in the SDM as
+ * informational only, but if 0x15 did not work, and TSC calibration
+ * is disabled, it is the best we can get at all. It should still be
+ * an improvement over the parsing of the CPU model name in
+ * tsc_freq_intel(), when available.
+ */
+static bool
+tsc_freq_cpuid(void)
+{
+ u_int regs[4];
+
+ if (cpu_high < 0x15)
+ return (false);
+ do_cpuid(0x15, regs);
+ if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
+ tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0];
+ return (true);
+ }
+
+ if (cpu_high < 0x16)
+ return (false);
+ do_cpuid(0x16, regs);
+ if (regs[0] != 0) {
+ tsc_freq = (uint64_t)regs[0] * 1000000;
+ return (true);
+ }
+
+ return (false);
+}
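
For illustration with made-up register values: if CPUID leaf 0x15 returned EAX = 2, EBX = 166 and ECX = 24,000,000 (a 24 MHz crystal), the code above computes tsc_freq = 24,000,000 * 166 / 2 = 1,992,000,000 Hz; if leaf 0x15 were unusable and leaf 0x16 reported EAX = 2000, the fallback would yield 2,000,000,000 Hz.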
+
static void
tsc_freq_intel(void)
{
@@ -250,18 +287,19 @@
}
if (tsc_skip_calibration) {
- if (cpu_vendor_id == CPU_VENDOR_INTEL)
+ if (tsc_freq_cpuid())
+ ;
+ else if (cpu_vendor_id == CPU_VENDOR_INTEL)
tsc_freq_intel();
- return;
+ } else {
+ if (bootverbose)
+ printf("Calibrating TSC clock ... ");
+ tsc1 = rdtsc();
+ DELAY(1000000);
+ tsc2 = rdtsc();
+ tsc_freq = tsc2 - tsc1;
}
-
if (bootverbose)
- printf("Calibrating TSC clock ... ");
- tsc1 = rdtsc();
- DELAY(1000000);
- tsc2 = rdtsc();
- tsc_freq = tsc2 - tsc1;
- if (bootverbose)
printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
}
@@ -427,7 +465,7 @@
}
static int
-test_tsc(void)
+test_tsc(int adj_max_count)
{
uint64_t *data, *tsc;
u_int i, size, adj;
@@ -441,12 +479,12 @@
for (i = 0, tsc = data; i < N; i++, tsc += size)
smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
smp_tsc = 1; /* XXX */
- smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc,
- smp_no_rendevous_barrier, data);
- if (!smp_tsc && adj < smp_tsc_adjust) {
+ smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
+ smp_no_rendezvous_barrier, data);
+ if (!smp_tsc && adj < adj_max_count) {
adj++;
- smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc,
- smp_no_rendevous_barrier, data);
+ smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
+ smp_no_rendezvous_barrier, data);
goto retry;
}
free(data, M_TEMP);
@@ -481,19 +519,6 @@
#undef N
-#else
-
-/*
- * The function is not called, it is provided to avoid linking failure
- * on uniprocessor kernel.
- */
-static int
-test_tsc(void)
-{
-
- return (0);
-}
-
#endif /* SMP */
static void
@@ -529,17 +554,22 @@
}
/*
- * We cannot use the TSC if it stops incrementing while idle.
* Intel CPUs without a C-state invariant TSC can stop the TSC
- * in either C2 or C3.
+ * in either C2 or C3. Disable use of C2 and C3 while using
+ * the TSC as the timecounter. The timecounter can be changed
+ * to enable C2 and C3.
+ *
+ * Note that the TSC is used as the cputicker for computing
+ * thread runtime regardless of the timecounter setting, so
+ * using an alternate timecounter and enabling C2 or C3 can
+ * result in incorrect runtimes for kernel idle threads (but not
+ * for any non-idle threads).
*/
- if (cpu_deepest_sleep >= 2 && cpu_vendor_id == CPU_VENDOR_INTEL &&
+ if (cpu_vendor_id == CPU_VENDOR_INTEL &&
(amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
- tsc_timecounter.tc_quality = -1000;
tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
if (bootverbose)
- printf("TSC timecounter disabled: C2/C3 may halt it.\n");
- goto init;
+ printf("TSC timecounter disables C2 and C3.\n");
}
/*
@@ -549,9 +579,12 @@
* non-zero value. The TSC seems unreliable in virtualized SMP
* environments, so it is set to a negative quality in those cases.
*/
+#ifdef SMP
if (mp_ncpus > 1)
- tsc_timecounter.tc_quality = test_tsc();
- else if (tsc_is_invariant)
+ tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
+ else
+#endif /* SMP */
+ if (tsc_is_invariant)
tsc_timecounter.tc_quality = 1000;
max_freq >>= tsc_shift;
@@ -586,6 +619,32 @@
}
SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
+void
+resume_TSC(void)
+{
+#ifdef SMP
+ int quality;
+
+ /* If TSC was not good on boot, it is unlikely to become good now. */
+ if (tsc_timecounter.tc_quality < 0)
+ return;
+ /* Nothing to do with UP. */
+ if (mp_ncpus < 2)
+ return;
+
+ /*
+ * If TSC was good, a single synchronization should be enough,
+ * but honour smp_tsc_adjust if it's set.
+ */
+ quality = test_tsc(MAX(smp_tsc_adjust, 1));
+ if (quality != tsc_timecounter.tc_quality) {
+ printf("TSC timecounter quality changed: %d -> %d\n",
+ tsc_timecounter.tc_quality, quality);
+ tsc_timecounter.tc_quality = quality;
+ }
+#endif /* SMP */
+}
+
/*
* When cpufreq levels change, find out about the (new) max frequency. We
* use this to update CPU accounting in case it got a lower estimate at boot.
@@ -726,22 +785,27 @@
return (tsc_get_timecount_low(tc));
}
-uint32_t
-cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+static uint32_t
+x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
{
- vdso_th->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+ vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
+ vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+ vdso_th->th_x86_hpet_idx = 0xffffffff;
bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
- return (timecounter == &tsc_timecounter);
+ return (1);
}
#ifdef COMPAT_FREEBSD32
-uint32_t
-cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+static uint32_t
+x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+ struct timecounter *tc)
{
- vdso_th32->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+ vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
+ vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+ vdso_th32->th_x86_hpet_idx = 0xffffffff;
bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
- return (timecounter == &tsc_timecounter);
+ return (1);
}
#endif
Added: trunk/sys/x86/x86/ucode.c
===================================================================
--- trunk/sys/x86/x86/ucode.c (rev 0)
+++ trunk/sys/x86/x86/ucode.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,402 @@
+/* $MidnightBSD$ */
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/ucode.c 347700 2019-05-16 14:42:16Z markj $");
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <x86/specialreg.h>
+#include <machine/stdarg.h>
+#include <x86/ucode.h>
+#include <x86/x86_smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+
+static void *ucode_intel_match(uint8_t *data, size_t *len);
+static int ucode_intel_verify(struct ucode_intel_header *hdr,
+ size_t resid);
+
+static struct ucode_ops {
+ const char *vendor;
+ int (*load)(void *, bool, uint64_t *, uint64_t *);
+ void *(*match)(uint8_t *, size_t *);
+} loaders[] = {
+ {
+ .vendor = INTEL_VENDOR_ID,
+ .load = ucode_intel_load,
+ .match = ucode_intel_match,
+ },
+};
+
+/* Selected microcode update data. */
+static void *early_ucode_data;
+static void *ucode_data;
+static struct ucode_ops *ucode_loader;
+
+/* Variables used for reporting success or failure. */
+enum {
+ NO_ERROR,
+ NO_MATCH,
+ VERIFICATION_FAILED,
+} ucode_error = NO_ERROR;
+static uint64_t ucode_nrev, ucode_orev;
+
+static void
+log_msg(void *arg __unused)
+{
+
+ if (ucode_nrev != 0) {
+ printf("CPU microcode: updated from %#jx to %#jx\n",
+ (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev);
+ return;
+ }
+
+ switch (ucode_error) {
+ case NO_MATCH:
+ printf("CPU microcode: no matching update found\n");
+ break;
+ case VERIFICATION_FAILED:
+ printf("CPU microcode: microcode verification failed\n");
+ break;
+ default:
+ break;
+ }
+}
+SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL);
+
+int
+ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp)
+{
+ uint64_t nrev, orev;
+ uint32_t cpuid[4];
+
+ orev = rdmsr(MSR_BIOS_SIGN) >> 32;
+
+ /*
+ * Perform update. Flush caches first to work around seemingly
+ * undocumented errata applying to some Broadwell CPUs.
+ */
+ wbinvd();
+ if (unsafe)
+ wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+ else
+ wrmsr(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+ wrmsr(MSR_BIOS_SIGN, 0);
+
+ /*
+ * Serialize instruction flow.
+ */
+ do_cpuid(0, cpuid);
+
+ /*
+ * Verify that the microcode revision changed.
+ */
+ nrev = rdmsr(MSR_BIOS_SIGN) >> 32;
+ if (nrevp != NULL)
+ *nrevp = nrev;
+ if (orevp != NULL)
+ *orevp = orev;
+ if (nrev <= orev)
+ return (EEXIST);
+ return (0);
+}
+
+static int
+ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid)
+{
+ uint32_t cksum, *data, size;
+ int i;
+
+ if (resid < sizeof(struct ucode_intel_header))
+ return (1);
+ size = hdr->total_size;
+ if (size == 0)
+ size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+ sizeof(struct ucode_intel_header);
+
+ if (hdr->header_version != 1)
+ return (1);
+ if (size % 16 != 0)
+ return (1);
+ if (resid < size)
+ return (1);
+
+ cksum = 0;
+ data = (uint32_t *)hdr;
+ for (i = 0; i < size / sizeof(uint32_t); i++)
+ cksum += data[i];
+ if (cksum != 0)
+ return (1);
+ return (0);
+}
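
ucode_intel_verify() above relies on Intel's rule that the whole update, viewed as 32-bit words, must sum to zero modulo 2^32; the checksum field is chosen as the negative of the sum of everything else. A minimal sketch of that check:

#include <stddef.h>
#include <stdint.h>

/* A blob of nwords 32-bit words verifies iff this returns 0. */
static uint32_t
sum32(const uint32_t *words, size_t nwords)
{
        uint32_t sum;
        size_t i;

        sum = 0;
        for (i = 0; i < nwords; i++)
                sum += words[i];        /* wraps modulo 2^32 by definition */
        return (sum);
}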
+
+static void *
+ucode_intel_match(uint8_t *data, size_t *len)
+{
+ struct ucode_intel_header *hdr;
+ struct ucode_intel_extsig_table *table;
+ struct ucode_intel_extsig *entry;
+ uint64_t platformid;
+ size_t resid;
+ uint32_t data_size, flags, regs[4], sig, total_size;
+ int i;
+
+ do_cpuid(1, regs);
+ sig = regs[0];
+
+ platformid = rdmsr(MSR_IA32_PLATFORM_ID);
+ flags = 1 << ((platformid >> 50) & 0x7);
+
+ for (resid = *len; resid > 0; data += total_size, resid -= total_size) {
+ hdr = (struct ucode_intel_header *)data;
+ if (ucode_intel_verify(hdr, resid) != 0) {
+ ucode_error = VERIFICATION_FAILED;
+ break;
+ }
+
+ data_size = hdr->data_size;
+ total_size = hdr->total_size;
+ if (data_size == 0)
+ data_size = UCODE_INTEL_DEFAULT_DATA_SIZE;
+ if (total_size == 0)
+ total_size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+ sizeof(struct ucode_intel_header);
+ if (data_size > total_size + sizeof(struct ucode_intel_header))
+ table = (struct ucode_intel_extsig_table *)
+ ((uint8_t *)(hdr + 1) + data_size);
+ else
+ table = NULL;
+
+ if (hdr->processor_signature == sig) {
+ if ((hdr->processor_flags & flags) != 0) {
+ *len = data_size;
+ return (hdr + 1);
+ }
+ } else if (table != NULL) {
+ for (i = 0; i < table->signature_count; i++) {
+ entry = &table->entries[i];
+ if (entry->processor_signature == sig &&
+ (entry->processor_flags & flags) != 0) {
+ *len = data_size;
+ return (hdr + 1);
+ }
+ }
+ }
+ }
+ return (NULL);
+}
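
For illustration: (platformid >> 50) & 0x7 extracts bits 52:50 of MSR_IA32_PLATFORM_ID; if they read back as 4, flags becomes 1 << 4 = 0x10, and an update header (or extended-signature entry) is accepted only when its processor_flags word has bit 4 set in addition to an exact processor-signature match.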
+
+/*
+ * Release any memory backing unused microcode blobs back to the system.
+ * We copy the selected update and free the entire microcode file.
+ */
+static void
+ucode_release(void *arg __unused)
+{
+ char *name, *type;
+ caddr_t file;
+ int release;
+
+ if (early_ucode_data == NULL)
+ return;
+ release = 1;
+ TUNABLE_INT_FETCH("debug.ucode.release", &release);
+ if (!release)
+ return;
+
+restart:
+ file = 0;
+ for (;;) {
+ file = preload_search_next_name(file);
+ if (file == 0)
+ break;
+ type = (char *)preload_search_info(file, MODINFO_TYPE);
+ if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+ continue;
+
+ name = preload_search_info(file, MODINFO_NAME);
+ preload_delete_name(name);
+ goto restart;
+ }
+}
+SYSINIT(ucode_release, SI_SUB_KMEM + 1, SI_ORDER_ANY, ucode_release, NULL);
+
+void
+ucode_load_ap(int cpu)
+{
+#ifdef SMP
+ KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present,
+ ("cpu %d not present", cpu));
+
+ if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread)
+ return;
+#endif
+
+ if (ucode_data != NULL)
+ (void)ucode_loader->load(ucode_data, false, NULL, NULL);
+}
+
+static void *
+map_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+ uintptr_t va;
+
+ for (va = free; va < free + len; va += PAGE_SIZE)
+ pmap_kenter(va, (vm_paddr_t)va);
+#else
+ (void)len;
+#endif
+ return ((void *)free);
+}
+
+static void
+unmap_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+ uintptr_t va;
+
+ for (va = free; va < free + len; va += PAGE_SIZE)
+ pmap_kremove(va);
+#else
+ (void)free;
+ (void)len;
+#endif
+}
+
+/*
+ * Search for an applicable microcode update, and load it. APs will load the
+ * selected update once they come online.
+ *
+ * "free" is the address of the next free physical page. If a microcode update
+ * is selected, it will be copied to this region prior to loading in order to
+ * satisfy alignment requirements.
+ */
+size_t
+ucode_load_bsp(uintptr_t free)
+{
+ union {
+ uint32_t regs[4];
+ char vendor[13];
+ } cpuid;
+ uint8_t *addr, *fileaddr, *match;
+ char *type;
+ uint64_t nrev, orev;
+ caddr_t file;
+ size_t i, len;
+ int error;
+
+ KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free));
+
+ do_cpuid(0, cpuid.regs);
+ cpuid.regs[0] = cpuid.regs[1];
+ cpuid.regs[1] = cpuid.regs[3];
+ cpuid.vendor[12] = '\0';
+ for (i = 0; i < nitems(loaders); i++)
+ if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) {
+ ucode_loader = &loaders[i];
+ break;
+ }
+ if (ucode_loader == NULL)
+ return (0);
+
+ file = 0;
+ fileaddr = match = NULL;
+ for (;;) {
+ file = preload_search_next_name(file);
+ if (file == 0)
+ break;
+ type = (char *)preload_search_info(file, MODINFO_TYPE);
+ if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+ continue;
+
+ fileaddr = preload_fetch_addr(file);
+ len = preload_fetch_size(file);
+ match = ucode_loader->match(fileaddr, &len);
+ if (match != NULL) {
+ addr = map_ucode(free, len);
+ /* We can't use memcpy() before ifunc resolution. */
+ for (i = 0; i < len; i++)
+ addr[i] = ((volatile uint8_t *)match)[i];
+ match = addr;
+
+ error = ucode_loader->load(match, false, &nrev, &orev);
+ if (error == 0) {
+ ucode_data = early_ucode_data = match;
+ ucode_nrev = nrev;
+ ucode_orev = orev;
+ return (len);
+ }
+ unmap_ucode(free, len);
+ }
+ }
+ if (fileaddr != NULL && ucode_error == NO_ERROR)
+ ucode_error = NO_MATCH;
+ return (0);
+}
+
+/*
+ * Reload microcode following an ACPI resume.
+ */
+void
+ucode_reload(void)
+{
+
+ ucode_load_ap(PCPU_GET(cpuid));
+}
+
+/*
+ * Replace an existing microcode update.
+ */
+void *
+ucode_update(void *newdata)
+{
+
+ newdata = (void *)atomic_swap_ptr((void *)&ucode_data,
+ (uintptr_t)newdata);
+ if (newdata == early_ucode_data)
+ newdata = NULL;
+ return (newdata);
+}
Property changes on: trunk/sys/x86/x86/ucode.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/x86_mem.c
===================================================================
--- trunk/sys/x86/x86/x86_mem.c (rev 0)
+++ trunk/sys/x86/x86/x86_mem.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,729 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
+ * Copyright (c) 2017 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/x86_mem.c 314591 2017-03-03 10:30:30Z kib $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+/*
+ * Pentium Pro+ memory range operations
+ *
+ * This code will probably be impenetrable without reference to the
+ * Intel Pentium Pro documentation or x86-64 programmers manual vol 2.
+ */
+
+static char *mem_owner_bios = "BIOS";
+
+#define MR686_FIXMTRR (1<<0)
+
+#define mrwithin(mr, a) \
+ (((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len)))
+#define mroverlap(mra, mrb) \
+ (mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base))
+
+#define mrvalid(base, len) \
+ ((!(base & ((1 << 12) - 1))) && /* base is multiple of 4k */ \
+ ((len) >= (1 << 12)) && /* length is >= 4k */ \
+ powerof2((len)) && /* ... and power of two */ \
+ !((base) & ((len) - 1))) /* range is not discontinuous */
+
+#define mrcopyflags(curr, new) \
+ (((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK))
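
For illustration of mrvalid() above: a range with base 0xC0000000 and length 0x10000000 (256 MiB) is valid, since the base is 4 KiB aligned, the length is a power of two no smaller than 4 KiB, and the base is aligned to the length; the same base with length 0x30000000 fails the powerof2() test.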
+
+static int mtrrs_disabled;
+SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN,
+ &mtrrs_disabled, 0,
+ "Disable MTRRs.");
+
+static void x86_mrinit(struct mem_range_softc *sc);
+static int x86_mrset(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+static void x86_mrAPinit(struct mem_range_softc *sc);
+static void x86_mrreinit(struct mem_range_softc *sc);
+
+static struct mem_range_ops x86_mrops = {
+ x86_mrinit,
+ x86_mrset,
+ x86_mrAPinit,
+ x86_mrreinit
+};
+
+/* XXX for AP startup hook */
+static u_int64_t mtrrcap, mtrrdef;
+
+/* The bitmask for the PhysBase and PhysMask fields of the variable MTRRs. */
+static u_int64_t mtrr_physmask;
+
+static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd);
+static void x86_mrfetch(struct mem_range_softc *sc);
+static int x86_mtrrtype(int flags);
+static int x86_mrt2mtrr(int flags, int oldval);
+static int x86_mtrrconflict(int flag1, int flag2);
+static void x86_mrstore(struct mem_range_softc *sc);
+static void x86_mrstoreone(void *arg);
+static struct mem_range_desc *x86_mtrrfixsearch(struct mem_range_softc *sc,
+ u_int64_t addr);
+static int x86_mrsetlow(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+static int x86_mrsetvariable(struct mem_range_softc *sc,
+ struct mem_range_desc *mrd, int *arg);
+
+/* ia32 MTRR type to memory range type conversion */
+static int x86_mtrrtomrt[] = {
+ MDF_UNCACHEABLE,
+ MDF_WRITECOMBINE,
+ MDF_UNKNOWN,
+ MDF_UNKNOWN,
+ MDF_WRITETHROUGH,
+ MDF_WRITEPROTECT,
+ MDF_WRITEBACK
+};
+
+#define MTRRTOMRTLEN nitems(x86_mtrrtomrt)
+
+static int
+x86_mtrr2mrt(int val)
+{
+
+ if (val < 0 || val >= MTRRTOMRTLEN)
+ return (MDF_UNKNOWN);
+ return (x86_mtrrtomrt[val]);
+}
+
+/*
+ * x86 MTRR conflicts. Writeback and uncachable may overlap.
+ */
+static int
+x86_mtrrconflict(int flag1, int flag2)
+{
+
+ flag1 &= MDF_ATTRMASK;
+ flag2 &= MDF_ATTRMASK;
+ if ((flag1 & MDF_UNKNOWN) || (flag2 & MDF_UNKNOWN))
+ return (1);
+ if (flag1 == flag2 ||
+ (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) ||
+ (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE))
+ return (0);
+ return (1);
+}
+
+/*
+ * Look for an exactly-matching range.
+ */
+static struct mem_range_desc *
+mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd)
+{
+ struct mem_range_desc *cand;
+ int i;
+
+ for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++)
+ if ((cand->mr_base == mrd->mr_base) &&
+ (cand->mr_len == mrd->mr_len))
+ return (cand);
+ return (NULL);
+}
+
+/*
+ * Ensure that the direct map region does not contain any mappings
+ * that span MTRRs of different types. However, the fixed MTRRs can
+ * be ignored, because a large page mapping the first 1 MB of physical
+ * memory is a special case that the processor handles. Invalidate
+ * any old TLB entries that might hold inconsistent memory type
+ * information.
+ */
+static void
+x86_mr_split_dmap(struct mem_range_softc *sc __unused)
+{
+#ifdef __amd64__
+ struct mem_range_desc *mrd;
+ int i;
+
+ i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+ mrd = sc->mr_desc + i;
+ for (; i < sc->mr_ndesc; i++, mrd++) {
+ if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE)
+ pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, TRUE);
+ }
+#endif
+}
+
+/*
+ * Fetch the current mtrr settings from the current CPU (assumed to
+ * all be in sync in the SMP case). Note that if we are here, we
+ * assume that MTRRs are enabled, and we may or may not have fixed
+ * MTRRs.
+ */
+static void
+x86_mrfetch(struct mem_range_softc *sc)
+{
+ struct mem_range_desc *mrd;
+ u_int64_t msrv;
+ int i, j, msr;
+
+ mrd = sc->mr_desc;
+
+ /* Get fixed-range MTRRs. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ msr = MSR_MTRR64kBase;
+ for (i = 0; i < (MTRR_N64K / 8); i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ msr = MSR_MTRR16kBase;
+ for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ msr = MSR_MTRR4kBase;
+ for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+ msrv = rdmsr(msr);
+ for (j = 0; j < 8; j++, mrd++) {
+ mrd->mr_flags =
+ (mrd->mr_flags & ~MDF_ATTRMASK) |
+ x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+ if (mrd->mr_owner[0] == 0)
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ msrv = msrv >> 8;
+ }
+ }
+ }
+
+ /* Get remainder which must be variable MTRRs. */
+ msr = MSR_MTRRVarBase;
+ for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+ msrv = rdmsr(msr);
+ mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) |
+ x86_mtrr2mrt(msrv & MTRR_PHYSBASE_TYPE);
+ mrd->mr_base = msrv & mtrr_physmask;
+ msrv = rdmsr(msr + 1);
+ mrd->mr_flags = (msrv & MTRR_PHYSMASK_VALID) ?
+ (mrd->mr_flags | MDF_ACTIVE) :
+ (mrd->mr_flags & ~MDF_ACTIVE);
+
+ /* Compute the range from the mask. Ick. */
+ mrd->mr_len = (~(msrv & mtrr_physmask) &
+ (mtrr_physmask | 0xfff)) + 1;
+ if (!mrvalid(mrd->mr_base, mrd->mr_len))
+ mrd->mr_flags |= MDF_BOGUS;
+
+ /* If unclaimed and active, must be the BIOS. */
+ if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0))
+ strcpy(mrd->mr_owner, mem_owner_bios);
+ }
+}
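
For illustration of the mask-to-length computation above, assuming a CPU with 36 physical address bits: mtrr_physmask covers bits 35:12 (0xFFFFFF000), so a variable PhysMask register with the mask bits for 35:28 set (0xFF0000000) yields mr_len = (~0xFF0000000 & (0xFFFFFF000 | 0xFFF)) + 1 = 0x0FFFFFFF + 1 = 0x10000000, i.e. a 256 MiB range.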
+
+/*
+ * Return the MTRR memory type matching a region's flags
+ */
+static int
+x86_mtrrtype(int flags)
+{
+ int i;
+
+ flags &= MDF_ATTRMASK;
+
+ for (i = 0; i < MTRRTOMRTLEN; i++) {
+ if (x86_mtrrtomrt[i] == MDF_UNKNOWN)
+ continue;
+ if (flags == x86_mtrrtomrt[i])
+ return (i);
+ }
+ return (-1);
+}
+
+static int
+x86_mrt2mtrr(int flags, int oldval)
+{
+ int val;
+
+ if ((val = x86_mtrrtype(flags)) == -1)
+ return (oldval & 0xff);
+ return (val & 0xff);
+}
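
For reference, x86_mtrrtype() and x86_mrt2mtrr() translate between the MDF_* attribute flags and the architectural MTRR type codes kept in x86_mtrrtomrt[]. A small hedged sketch listing the standard encodings (architectural values, not taken from this diff):

#include <stdio.h>

int
main(void)
{
	/* Architectural MTRR memory type encodings; 2 and 3 are reserved. */
	static const char *mtrr_types[] = {
		[0] = "UC (uncacheable)",
		[1] = "WC (write-combining)",
		[4] = "WT (write-through)",
		[5] = "WP (write-protect)",
		[6] = "WB (write-back)",
	};
	unsigned int t;

	for (t = 0; t < sizeof(mtrr_types) / sizeof(mtrr_types[0]); t++)
		if (mtrr_types[t] != NULL)
			printf("type %u: %s\n", t, mtrr_types[t]);
	return (0);
}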
+
+/*
+ * Update running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrstore(struct mem_range_softc *sc)
+{
+
+ smp_rendezvous(NULL, x86_mrstoreone, NULL, sc);
+}
+
+/*
+ * Update the current CPU's MTRRs with those represented in the
+ * descriptor list. Note that we do this wholesale rather than just
+ * stuffing one entry; this is simpler (but slower, of course).
+ */
+static void
+x86_mrstoreone(void *arg)
+{
+ struct mem_range_softc *sc = arg;
+ struct mem_range_desc *mrd;
+ u_int64_t omsrv, msrv;
+ int i, j, msr;
+ u_long cr0, cr4;
+
+ mrd = sc->mr_desc;
+
+ critical_enter();
+
+ /* Disable PGE. */
+ cr4 = rcr4();
+ load_cr4(cr4 & ~CR4_PGE);
+
+ /* Disable caches (CD = 1, NW = 0). */
+ cr0 = rcr0();
+ load_cr0((cr0 & ~CR0_NW) | CR0_CD);
+
+ /* Flushes caches and TLBs. */
+ wbinvd();
+ invltlb();
+
+ /* Disable MTRRs (E = 0). */
+ wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~MTRR_DEF_ENABLE);
+
+ /* Set fixed-range MTRRs. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ msr = MSR_MTRR64kBase;
+ for (i = 0; i < MTRR_N64K / 8; i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ msr = MSR_MTRR16kBase;
+ for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ msr = MSR_MTRR4kBase;
+ for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+ msrv = 0;
+ omsrv = rdmsr(msr);
+ for (j = 7; j >= 0; j--) {
+ msrv = msrv << 8;
+ msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+ omsrv >> (j * 8));
+ }
+ wrmsr(msr, msrv);
+ mrd += 8;
+ }
+ }
+
+ /* Set remainder which must be variable MTRRs. */
+ msr = MSR_MTRRVarBase;
+ for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+ /* base/type register */
+ omsrv = rdmsr(msr);
+ if (mrd->mr_flags & MDF_ACTIVE) {
+ msrv = mrd->mr_base & mtrr_physmask;
+ msrv |= x86_mrt2mtrr(mrd->mr_flags, omsrv);
+ } else {
+ msrv = 0;
+ }
+ wrmsr(msr, msrv);
+
+ /* mask/active register */
+ if (mrd->mr_flags & MDF_ACTIVE) {
+ msrv = MTRR_PHYSMASK_VALID |
+ rounddown2(mtrr_physmask, mrd->mr_len);
+ } else {
+ msrv = 0;
+ }
+ wrmsr(msr + 1, msrv);
+ }
+
+ /* Flush caches and TLBs. */
+ wbinvd();
+ invltlb();
+
+ /* Enable MTRRs. */
+ wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) | MTRR_DEF_ENABLE);
+
+ /* Restore caches and PGE. */
+ load_cr0(cr0);
+ load_cr4(cr4);
+
+ critical_exit();
+}
+
+/*
+ * Hunt for the fixed MTRR referencing (addr)
+ */
+static struct mem_range_desc *
+x86_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr)
+{
+ struct mem_range_desc *mrd;
+ int i;
+
+ for (i = 0, mrd = sc->mr_desc; i < MTRR_N64K + MTRR_N16K + MTRR_N4K;
+ i++, mrd++)
+ if (addr >= mrd->mr_base &&
+ addr < mrd->mr_base + mrd->mr_len)
+ return (mrd);
+ return (NULL);
+}
+
+/*
+ * Try to satisfy the given range request by manipulating the fixed
+ * MTRRs that cover low memory.
+ *
+ * Note that we try to be generous here; we'll bloat the range out to
+ * the next higher/lower boundary to avoid the consumer having to know
+ * too much about the mechanisms here.
+ *
+ * XXX note that this will have to be updated when we start supporting
+ * "busy" ranges.
+ */
+static int
+x86_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+ struct mem_range_desc *first_md, *last_md, *curr_md;
+
+ /* Range check. */
+ if ((first_md = x86_mtrrfixsearch(sc, mrd->mr_base)) == NULL ||
+ (last_md = x86_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1))
+ == NULL)
+ return (EINVAL);
+
+ /* Check that we aren't doing something risky. */
+ if ((mrd->mr_flags & MDF_FORCE) == 0) {
+ for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+ if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN)
+ return (EACCES);
+ }
+ }
+
+ /* Set flags, clear set-by-firmware flag. */
+ for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+ curr_md->mr_flags = mrcopyflags(curr_md->mr_flags &
+ ~MDF_FIRMWARE, mrd->mr_flags);
+ bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner));
+ }
+
+ return (0);
+}
+
+/*
+ * Modify/add a variable MTRR to satisfy the request.
+ *
+ * XXX needs to be updated to properly support "busy" ranges.
+ */
+static int
+x86_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd,
+ int *arg)
+{
+ struct mem_range_desc *curr_md, *free_md;
+ int i;
+
+ /*
+ * Scan the currently active variable descriptors, look for
+ * one we exactly match (straight takeover) and for possible
+ * accidental overlaps.
+ *
+ * Keep track of the first empty variable descriptor in case
+ * we can't perform a takeover.
+ */
+ i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+ curr_md = sc->mr_desc + i;
+ free_md = NULL;
+ for (; i < sc->mr_ndesc; i++, curr_md++) {
+ if (curr_md->mr_flags & MDF_ACTIVE) {
+ /* Exact match? */
+ if (curr_md->mr_base == mrd->mr_base &&
+ curr_md->mr_len == mrd->mr_len) {
+
+ /* Whoops, owned by someone. */
+ if (curr_md->mr_flags & MDF_BUSY)
+ return (EBUSY);
+
+ /* Check that we aren't doing something risky */
+ if (!(mrd->mr_flags & MDF_FORCE) &&
+ (curr_md->mr_flags & MDF_ATTRMASK) ==
+ MDF_UNKNOWN)
+ return (EACCES);
+
+ /* Ok, just hijack this entry. */
+ free_md = curr_md;
+ break;
+ }
+
+ /* Non-exact overlap? */
+ if (mroverlap(curr_md, mrd)) {
+ /* Between conflicting region types? */
+ if (x86_mtrrconflict(curr_md->mr_flags,
+ mrd->mr_flags))
+ return (EINVAL);
+ }
+ } else if (free_md == NULL) {
+ free_md = curr_md;
+ }
+ }
+
+ /* Got somewhere to put it? */
+ if (free_md == NULL)
+ return (ENOSPC);
+
+ /* Set up new descriptor. */
+ free_md->mr_base = mrd->mr_base;
+ free_md->mr_len = mrd->mr_len;
+ free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags);
+ bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner));
+ return (0);
+}
+
+/*
+ * Handle requests to set memory range attributes by manipulating MTRRs.
+ */
+static int
+x86_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+ struct mem_range_desc *targ;
+ int error;
+
+ switch (*arg) {
+ case MEMRANGE_SET_UPDATE:
+ /*
+ * Make sure that what's being asked for is even
+ * possible at all.
+ */
+ if (!mrvalid(mrd->mr_base, mrd->mr_len) ||
+ x86_mtrrtype(mrd->mr_flags) == -1)
+ return (EINVAL);
+
+#define FIXTOP \
+ ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000))
+
+ /* Are the "low memory" conditions applicable? */
+ if ((sc->mr_cap & MR686_FIXMTRR) != 0 &&
+ mrd->mr_base + mrd->mr_len <= FIXTOP) {
+ if ((error = x86_mrsetlow(sc, mrd, arg)) != 0)
+ return (error);
+ } else {
+ /* It's time to play with variable MTRRs. */
+ if ((error = x86_mrsetvariable(sc, mrd, arg)) != 0)
+ return (error);
+ }
+ break;
+
+ case MEMRANGE_SET_REMOVE:
+ if ((targ = mem_range_match(sc, mrd)) == NULL)
+ return (ENOENT);
+ if (targ->mr_flags & MDF_FIXACTIVE)
+ return (EPERM);
+ if (targ->mr_flags & MDF_BUSY)
+ return (EBUSY);
+ targ->mr_flags &= ~MDF_ACTIVE;
+ targ->mr_owner[0] = 0;
+ break;
+
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ x86_mr_split_dmap(sc);
+
+ /* Update the hardware. */
+ x86_mrstore(sc);
+
+ /* Refetch to see where we're at. */
+ x86_mrfetch(sc);
+ return (0);
+}
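
The FIXTOP macro above marks the top of the region covered by the fixed-range MTRRs. A quick hedged sanity check, assuming the usual MTRR_N64K = 8, MTRR_N16K = 16 and MTRR_N4K = 64 counts:

#include <assert.h>

int
main(void)
{
	/* Assumed counts: 8 x 64 KB, 16 x 16 KB and 64 x 4 KB fixed ranges. */
	const unsigned long fixtop =
	    (8 * 0x10000UL) +	/* 0x00000-0x7ffff */
	    (16 * 0x4000UL) +	/* 0x80000-0xbffff */
	    (64 * 0x1000UL);	/* 0xc0000-0xfffff */

	assert(fixtop == 0x100000UL);	/* exactly 1 MB */
	return (0);
}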
+
+/*
+ * Work out how many ranges we support, initialise storage for them,
+ * and fetch the initial settings.
+ */
+static void
+x86_mrinit(struct mem_range_softc *sc)
+{
+ struct mem_range_desc *mrd;
+ int i, nmdesc;
+
+ if (sc->mr_desc != NULL)
+ /* Already initialized. */
+ return;
+
+ nmdesc = 0;
+ mtrrcap = rdmsr(MSR_MTRRcap);
+ mtrrdef = rdmsr(MSR_MTRRdefType);
+
+ /* For now, bail out if MTRRs are not enabled. */
+ if (!(mtrrdef & MTRR_DEF_ENABLE)) {
+ if (bootverbose)
+ printf("CPU supports MTRRs but not enabled\n");
+ return;
+ }
+ nmdesc = mtrrcap & MTRR_CAP_VCNT;
+ if (bootverbose)
+ printf("Pentium Pro MTRR support enabled\n");
+
+ /*
+ * Determine the size of the PhysMask and PhysBase fields in
+ * the variable range MTRRs.
+ */
+ mtrr_physmask = (((uint64_t)1 << cpu_maxphyaddr) - 1) &
+ ~(uint64_t)0xfff;
+
+ /* If fixed MTRRs supported and enabled. */
+ if ((mtrrcap & MTRR_CAP_FIXED) && (mtrrdef & MTRR_DEF_FIXED_ENABLE)) {
+ sc->mr_cap = MR686_FIXMTRR;
+ nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K;
+ }
+
+ sc->mr_desc = malloc(nmdesc * sizeof(struct mem_range_desc), M_MEMDESC,
+ M_WAITOK | M_ZERO);
+ sc->mr_ndesc = nmdesc;
+
+ mrd = sc->mr_desc;
+
+ /* Populate the fixed MTRR entries' base/length. */
+ if (sc->mr_cap & MR686_FIXMTRR) {
+ for (i = 0; i < MTRR_N64K; i++, mrd++) {
+ mrd->mr_base = i * 0x10000;
+ mrd->mr_len = 0x10000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ for (i = 0; i < MTRR_N16K; i++, mrd++) {
+ mrd->mr_base = i * 0x4000 + 0x80000;
+ mrd->mr_len = 0x4000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ for (i = 0; i < MTRR_N4K; i++, mrd++) {
+ mrd->mr_base = i * 0x1000 + 0xc0000;
+ mrd->mr_len = 0x1000;
+ mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+ MDF_FIXACTIVE;
+ }
+ }
+
+ /*
+ * Get the current settings; anything set now is considered to
+ * have been set by the firmware. (XXX has something already
+ * played here?)
+ */
+ x86_mrfetch(sc);
+ mrd = sc->mr_desc;
+ for (i = 0; i < sc->mr_ndesc; i++, mrd++) {
+ if (mrd->mr_flags & MDF_ACTIVE)
+ mrd->mr_flags |= MDF_FIRMWARE;
+ }
+
+ x86_mr_split_dmap(sc);
+}
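
x86_mrinit() sizes its descriptor array from IA32_MTRRCAP: the low byte is the variable-range count and bit 8 indicates fixed-range support. A hedged decode sketch using a made-up but plausible capability value:

#include <stdint.h>
#include <stdio.h>

#define MTRR_CAP_VCNT	0x00000000000000ffULL	/* number of variable MTRRs */
#define MTRR_CAP_FIXED	0x0000000000000100ULL	/* fixed MTRRs implemented */

int
main(void)
{
	uint64_t mtrrcap = 0x50a;	/* hypothetical: 10 variable + fixed */

	printf("variable MTRRs: %ju\n", (uintmax_t)(mtrrcap & MTRR_CAP_VCNT));
	printf("fixed MTRRs:    %s\n",
	    (mtrrcap & MTRR_CAP_FIXED) ? "implemented" : "absent");
	return (0);
}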
+
+/*
+ * Initialise MTRRs on an AP after the BSP has run the init code.
+ */
+static void
+x86_mrAPinit(struct mem_range_softc *sc)
+{
+
+ x86_mrstoreone(sc);
+ wrmsr(MSR_MTRRdefType, mtrrdef);
+}
+
+/*
+ * Re-initialise running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrreinit(struct mem_range_softc *sc)
+{
+
+ smp_rendezvous(NULL, (void (*)(void *))x86_mrAPinit, NULL, sc);
+}
+
+static void
+x86_mem_drvinit(void *unused)
+{
+
+ if (mtrrs_disabled)
+ return;
+ if (!(cpu_feature & CPUID_MTRR))
+ return;
+ mem_range_softc.mr_op = &x86_mrops;
+ x86_mrinit(&mem_range_softc);
+}
+SYSINIT(x86memdev, SI_SUB_CPU, SI_ORDER_ANY, x86_mem_drvinit, NULL);
Property changes on: trunk/sys/x86/x86/x86_mem.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/xen/hvm.c
===================================================================
--- trunk/sys/x86/xen/hvm.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/hvm.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -59,34 +59,8 @@
#include <xen/interface/vcpu.h>
/*--------------------------- Forward Declarations ---------------------------*/
-#ifdef SMP
-static driver_filter_t xen_smp_rendezvous_action;
-static driver_filter_t xen_invltlb;
-static driver_filter_t xen_invlpg;
-static driver_filter_t xen_invlrng;
-static driver_filter_t xen_invlcache;
-#ifdef __i386__
-static driver_filter_t xen_lazypmap;
-#endif
-static driver_filter_t xen_ipi_bitmap_handler;
-static driver_filter_t xen_cpustop_handler;
-static driver_filter_t xen_cpususpend_handler;
-static driver_filter_t xen_cpustophard_handler;
-static void xen_ipi_vectored(u_int vector, int dest);
-#endif
static void xen_hvm_cpu_init(void);
-/*---------------------------- Extern Declarations ---------------------------*/
-#ifdef __i386__
-extern void pmap_lazyfix_action(void);
-#endif
-#ifdef __amd64__
-extern int pmap_pcid_enabled;
-#endif
-
-/*---------------------------------- Macros ----------------------------------*/
-#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
-
/*-------------------------------- Local Types -------------------------------*/
enum xen_hvm_init_type {
XEN_HVM_INIT_COLD,
@@ -94,18 +68,11 @@
XEN_HVM_INIT_RESUME
};
-struct xen_ipi_handler
-{
- driver_filter_t *filter;
- const char *description;
-};
-
/*-------------------------------- Global Data -------------------------------*/
enum xen_domain_type xen_domain_type = XEN_NATIVE;
#ifdef SMP
struct cpu_ops xen_hvm_cpu_ops = {
- .ipi_vectored = lapic_ipi_vectored,
.cpu_init = xen_hvm_cpu_init,
.cpu_resume = xen_hvm_cpu_init
};
@@ -113,24 +80,6 @@
static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support");
-#ifdef SMP
-static struct xen_ipi_handler xen_ipis[] =
-{
- [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" },
- [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"},
- [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" },
- [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" },
- [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" },
-#ifdef __i386__
- [IPI_TO_IDX(IPI_LAZYPMAP)] = { xen_lazypmap, "lp" },
-#endif
- [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" },
- [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" },
- [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" },
- [IPI_TO_IDX(IPI_STOP_HARD)] = { xen_cpustophard_handler, "sth" },
-};
-#endif
-
/**
* If non-zero, the hypervisor has been configured to use a direct
* IDT event callback for interrupt injection.
@@ -140,14 +89,10 @@
/*------------------------------- Per-CPU Data -------------------------------*/
DPCPU_DEFINE(struct vcpu_info, vcpu_local_info);
DPCPU_DEFINE(struct vcpu_info *, vcpu_info);
-#ifdef SMP
-DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
-#endif
/*------------------ Hypervisor Access Shared Memory Regions -----------------*/
-/** Hypercall table accessed via HYPERVISOR_*_op() methods. */
-char *hypercall_stubs;
shared_info_t *HYPERVISOR_shared_info;
+start_info_t *HYPERVISOR_start_info;
/*------------------------------ Sysctl tunables -----------------------------*/
@@ -156,207 +101,6 @@
TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks);
TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics);
-#ifdef SMP
-/*---------------------------- XEN PV IPI Handlers ---------------------------*/
-/*
- * This are C clones of the ASM functions found in apic_vector.s
- */
-static int
-xen_ipi_bitmap_handler(void *arg)
-{
- struct trapframe *frame;
-
- frame = arg;
- ipi_bitmap_handler(*frame);
- return (FILTER_HANDLED);
-}
-
-static int
-xen_smp_rendezvous_action(void *arg)
-{
-#ifdef COUNT_IPIS
- (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
- smp_rendezvous_action();
- return (FILTER_HANDLED);
-}
-
-static int
-xen_invltlb(void *arg)
-{
-
- invltlb_handler();
- return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invltlb_pcid(void *arg)
-{
-
- invltlb_pcid_handler();
- return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlpg(void *arg)
-{
-
- invlpg_handler();
- return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invlpg_pcid(void *arg)
-{
-
- invlpg_pcid_handler();
- return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlrng(void *arg)
-{
-
- invlrng_handler();
- return (FILTER_HANDLED);
-}
-
-static int
-xen_invlcache(void *arg)
-{
-
- invlcache_handler();
- return (FILTER_HANDLED);
-}
-
-#ifdef __i386__
-static int
-xen_lazypmap(void *arg)
-{
-
- pmap_lazyfix_action();
- return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_cpustop_handler(void *arg)
-{
-
- cpustop_handler();
- return (FILTER_HANDLED);
-}
-
-static int
-xen_cpususpend_handler(void *arg)
-{
-
- cpususpend_handler();
- return (FILTER_HANDLED);
-}
-
-static int
-xen_cpustophard_handler(void *arg)
-{
-
- ipi_nmi_handler();
- return (FILTER_HANDLED);
-}
-
-/* Xen PV IPI sender */
-static void
-xen_ipi_vectored(u_int vector, int dest)
-{
- xen_intr_handle_t *ipi_handle;
- int ipi_idx, to_cpu, self;
-
- ipi_idx = IPI_TO_IDX(vector);
- if (ipi_idx > nitems(xen_ipis))
- panic("IPI out of range");
-
- switch(dest) {
- case APIC_IPI_DEST_SELF:
- ipi_handle = DPCPU_GET(ipi_handle);
- xen_intr_signal(ipi_handle[ipi_idx]);
- break;
- case APIC_IPI_DEST_ALL:
- CPU_FOREACH(to_cpu) {
- ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
- xen_intr_signal(ipi_handle[ipi_idx]);
- }
- break;
- case APIC_IPI_DEST_OTHERS:
- self = PCPU_GET(cpuid);
- CPU_FOREACH(to_cpu) {
- if (to_cpu != self) {
- ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
- xen_intr_signal(ipi_handle[ipi_idx]);
- }
- }
- break;
- default:
- to_cpu = apic_cpuid(dest);
- ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
- xen_intr_signal(ipi_handle[ipi_idx]);
- break;
- }
-}
-
-/*---------------------- XEN diverged cpu operations -------------------------*/
-static void
-xen_cpu_ipi_init(int cpu)
-{
- xen_intr_handle_t *ipi_handle;
- const struct xen_ipi_handler *ipi;
- device_t dev;
- int idx, rc;
-
- ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
- dev = pcpu_find(cpu)->pc_device;
- KASSERT((dev != NULL), ("NULL pcpu device_t"));
-
- for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
-
- if (ipi->filter == NULL) {
- ipi_handle[idx] = NULL;
- continue;
- }
-
- rc = xen_intr_alloc_and_bind_ipi(dev, cpu, ipi->filter,
- INTR_TYPE_TTY, &ipi_handle[idx]);
- if (rc != 0)
- panic("Unable to allocate a XEN IPI port");
- xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
- }
-}
-
-static void
-xen_setup_cpus(void)
-{
- int i;
-
- if (!xen_hvm_domain() || !xen_vector_callback_enabled)
- return;
-
-#ifdef __amd64__
- if (pmap_pcid_enabled) {
- xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = xen_invltlb_pcid;
- xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = xen_invlpg_pcid;
- }
-#endif
- CPU_FOREACH(i)
- xen_cpu_ipi_init(i);
-
- /* Set the xen pv ipi ops to replace the native ones */
- cpu_ops.ipi_vectored = xen_ipi_vectored;
-}
-#endif
-
/*---------------------- XEN Hypervisor Probe and Setup ----------------------*/
static uint32_t
xen_hvm_cpuid_base(void)
@@ -376,16 +120,21 @@
 * Allocate and fill in the hypercall page.
*/
static int
-xen_hvm_init_hypercall_stubs(void)
+xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type)
{
uint32_t base, regs[4];
int i;
+ if (xen_pv_domain()) {
+ /* hypercall page is already set in the PV case */
+ return (0);
+ }
+
base = xen_hvm_cpuid_base();
if (base == 0)
return (ENXIO);
- if (hypercall_stubs == NULL) {
+ if (init_type == XEN_HVM_INIT_COLD) {
int major, minor;
do_cpuid(base + 1, regs);
@@ -417,18 +166,9 @@
* Find the hypercall pages.
*/
do_cpuid(base + 2, regs);
-
- if (hypercall_stubs == NULL) {
- size_t call_region_size;
- call_region_size = regs[0] * PAGE_SIZE;
- hypercall_stubs = malloc(call_region_size, M_XENHVM, M_NOWAIT);
- if (hypercall_stubs == NULL)
- panic("Unable to allocate Xen hypercall region");
- }
-
for (i = 0; i < regs[0]; i++)
- wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i);
+ wrmsr(regs[1], vtophys(&hypercall_page + i * PAGE_SIZE) + i);
return (0);
}
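
The hypercall-page setup depends on the Xen CPUID leaves located by xen_hvm_cpuid_base(). A hedged userland sketch of the same kind of probe, using GCC/Clang's <cpuid.h>; the 0x40000000-0x40010000 search range and the "XenVMMXenVMM" signature are the conventional Xen values and should be treated as illustrative here:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned int base, eax, ebx, ecx, edx;
	char sig[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);	/* signature spans EBX/ECX/EDX */
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';
		if (strcmp(sig, "XenVMMXenVMM") == 0) {
			printf("Xen CPUID base %#x, max leaf %#x\n", base, eax);
			return (0);
		}
	}
	printf("not running on Xen\n");
	return (1);
}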
@@ -438,6 +178,14 @@
{
struct xen_add_to_physmap xatp;
+ if (xen_pv_domain()) {
+ /*
+ * Already setup in the PV case, shared_info is passed inside
+ * of the start_info struct at start of day.
+ */
+ return;
+ }
+
if (HYPERVISOR_shared_info == NULL) {
HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT);
if (HYPERVISOR_shared_info == NULL)
@@ -516,6 +264,16 @@
{
u_short disable_devs = 0;
+ if (xen_pv_domain()) {
+ /*
+ * No emulated devices in the PV case, so no need to unplug
+ * anything.
+ */
+ if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0)
+ printf("PV devices cannot be disabled in PV guests\n");
+ return;
+ }
+
if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC)
return;
@@ -543,7 +301,7 @@
if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND)
return;
- error = xen_hvm_init_hypercall_stubs();
+ error = xen_hvm_init_hypercall_stubs(init_type);
switch (init_type) {
case XEN_HVM_INIT_COLD:
@@ -550,11 +308,21 @@
if (error != 0)
return;
+ /*
+ * If xen_domain_type is not set at this point
+ * it means we are inside a (PV)HVM guest, because
+ * for PVH the guest type is set much earlier
+ * (see hammer_time_xen).
+ */
+ if (!xen_domain()) {
+ xen_domain_type = XEN_HVM_DOMAIN;
+ vm_guest = VM_GUEST_XEN;
+ }
+
setup_xen_features();
#ifdef SMP
cpu_ops = xen_hvm_cpu_ops;
#endif
- vm_guest = VM_GUEST_XEN;
break;
case XEN_HVM_INIT_RESUME:
if (error != 0)
@@ -569,9 +337,15 @@
}
xen_vector_callback_enabled = 0;
- xen_domain_type = XEN_HVM_DOMAIN;
+ xen_hvm_set_callback(NULL);
+
+ /*
+ * On (PV)HVM domains we need to request that the hypervisor
+ * fill the shared info page; for PVH guests the shared_info page
+ * is passed inside the start_info struct and is already set, so
+ * these functions are no-ops.
+ */
xen_hvm_init_shared_info_page();
- xen_hvm_set_callback(NULL);
xen_hvm_disable_emulated_devices();
}
@@ -603,6 +377,9 @@
struct pcpu *pc;
int i;
+ if (!xen_hvm_domain())
+ return;
+
/* Set vcpu_id to acpi_id */
CPU_FOREACH(i) {
pc = pcpu_find(i);
@@ -645,8 +422,5 @@
}
SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
-#ifdef SMP
-SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL);
-#endif
SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL);
SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);
Added: trunk/sys/x86/xen/pv.c
===================================================================
--- trunk/sys/x86/xen/pv.c (rev 0)
+++ trunk/sys/x86/xen/pv.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,428 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pv.c 344378 2019-02-20 19:19:24Z kevans $");
+
+#include "opt_ddb.h"
+#include "opt_kstack_pages.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/reboot.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/boot.h>
+#include <sys/ctype.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <x86/init.h>
+#include <machine/pc/bios.h>
+#include <machine/smp.h>
+#include <machine/intr_machdep.h>
+#include <machine/metadata.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xen_pv.h>
+#include <xen/xen_msi.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <dev/xen/timer/timer.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/* Native initial function */
+extern u_int64_t hammer_time(u_int64_t, u_int64_t);
+/* Xen initial function */
+uint64_t hammer_time_xen(start_info_t *, uint64_t);
+
+#define MAX_E820_ENTRIES 128
+
+/*--------------------------- Forward Declarations ---------------------------*/
+static caddr_t xen_pv_parse_preload_data(u_int64_t);
+static void xen_pv_parse_memmap(caddr_t, vm_paddr_t *, int *);
+
+#ifdef SMP
+static int xen_pv_start_all_aps(void);
+#endif
+
+/*---------------------------- Extern Declarations ---------------------------*/
+#ifdef SMP
+/* Variables used by amd64 mp_machdep to start APs */
+extern char *doublefault_stack;
+extern char *mce_stack;
+extern char *nmi_stack;
+#endif
+
+/*
+ * Placed by the linker at the end of the bss section, which is the last
+ * section loaded by Xen before loading the symtab and strtab.
+ */
+extern uint32_t end;
+
+/*-------------------------------- Global Data -------------------------------*/
+/* Xen init_ops implementation. */
+struct init_ops xen_init_ops = {
+ .parse_preload_data = xen_pv_parse_preload_data,
+ .early_clock_source_init = xen_clock_init,
+ .early_delay = xen_delay,
+ .parse_memmap = xen_pv_parse_memmap,
+#ifdef SMP
+ .start_all_aps = xen_pv_start_all_aps,
+#endif
+ .msi_init = xen_msi_init,
+};
+
+static struct bios_smap xen_smap[MAX_E820_ENTRIES];
+
+/*-------------------------------- Xen PV init -------------------------------*/
+/*
+ * First function called by the Xen PVH boot sequence.
+ *
+ * Set some Xen global variables and prepare the environment so it is
+ * as similar as possible to what the native FreeBSD init function expects.
+ */
+uint64_t
+hammer_time_xen(start_info_t *si, uint64_t xenstack)
+{
+ uint64_t physfree;
+ uint64_t *PT4 = (u_int64_t *)xenstack;
+ uint64_t *PT3 = (u_int64_t *)(xenstack + PAGE_SIZE);
+ uint64_t *PT2 = (u_int64_t *)(xenstack + 2 * PAGE_SIZE);
+ int i;
+
+ xen_domain_type = XEN_PV_DOMAIN;
+ vm_guest = VM_GUEST_XEN;
+
+ if ((si == NULL) || (xenstack == 0)) {
+ xc_printf("ERROR: invalid start_info or xen stack, halting\n");
+ HYPERVISOR_shutdown(SHUTDOWN_crash);
+ }
+
+ xc_printf("FreeBSD PVH running on %s\n", si->magic);
+
+ /* We use 3 pages of xen stack for the boot pagetables */
+ physfree = xenstack + 3 * PAGE_SIZE - KERNBASE;
+
+ /* Setup Xen global variables */
+ HYPERVISOR_start_info = si;
+ HYPERVISOR_shared_info =
+ (shared_info_t *)(si->shared_info + KERNBASE);
+
+ /*
+ * Setup some misc global variables for Xen devices
+ *
+ * XXX: Devices that need these specific variables should
+ * be rewritten to fetch this info by themselves from the
+ * start_info page.
+ */
+ xen_store = (struct xenstore_domain_interface *)
+ (ptoa(si->store_mfn) + KERNBASE);
+ console_page = (char *)(ptoa(si->console.domU.mfn) + KERNBASE);
+
+ /*
+ * Use the stack Xen gives us to build the page tables
+ * as native FreeBSD expects to find them (created
+ * by the boot trampoline).
+ */
+ for (i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); i++) {
+ /*
+ * Each slot of the level 4 pages points
+ * to the same level 3 page
+ */
+ PT4[i] = ((uint64_t)&PT3[0]) - KERNBASE;
+ PT4[i] |= PG_V | PG_RW | PG_U;
+
+ /*
+ * Each slot of the level 3 pages points
+ * to the same level 2 page
+ */
+ PT3[i] = ((uint64_t)&PT2[0]) - KERNBASE;
+ PT3[i] |= PG_V | PG_RW | PG_U;
+
+ /*
+ * The level 2 page slots are mapped with
+ * 2MB pages for 1GB.
+ */
+ PT2[i] = i * (2 * 1024 * 1024);
+ PT2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+ }
+ load_cr3(((uint64_t)&PT4[0]) - KERNBASE);
+
+ /* Set the hooks for early functions that diverge from bare metal */
+ init_ops = xen_init_ops;
+ apic_ops = xen_apic_ops;
+
+ /* Now we can jump into the native init function */
+ return (hammer_time(0, physfree));
+}
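
The loop above builds deliberately degenerate boot page tables: every PML4 and PDPT slot aliases the same page directory, which maps 1 GB with 2 MB pages, so both the low range and the KERNBASE range resolve until the real pmap takes over. A hedged arithmetic check of that coverage:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t slots = 4096 / sizeof(uint64_t);	/* 512 entries/page */
	const uint64_t pd_covers = slots * (2ULL << 20);	/* 2 MB per slot */

	assert(slots == 512);
	assert(pd_covers == (1ULL << 30));	/* one PD maps exactly 1 GB */
	return (0);
}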
+
+/*-------------------------------- PV specific -------------------------------*/
+#ifdef SMP
+static bool
+start_xen_ap(int cpu)
+{
+ struct vcpu_guest_context *ctxt;
+ int ms, cpus = mp_naps;
+ const size_t stacksize = kstack_pages * PAGE_SIZE;
+
+ /* allocate and set up an idle stack data page */
+ bootstacks[cpu] =
+ (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
+ doublefault_stack =
+ (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+ mce_stack =
+ (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+ nmi_stack =
+ (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+ dpcpu =
+ (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO);
+
+ bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8;
+ bootAP = cpu;
+
+ ctxt = malloc(sizeof(*ctxt), M_TEMP, M_WAITOK | M_ZERO);
+
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.rip = (unsigned long) init_secondary;
+ ctxt->user_regs.rsp = (unsigned long) bootSTK;
+
+ /* Set the AP to use the same page tables */
+ ctxt->ctrlreg[3] = KPML4phys;
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+ panic("unable to initialize AP#%d", cpu);
+
+ free(ctxt, M_TEMP);
+
+ /* Launch the vCPU */
+ if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+ panic("unable to start AP#%d", cpu);
+
+ /* Wait up to 5 seconds for it to start. */
+ for (ms = 0; ms < 5000; ms++) {
+ if (mp_naps > cpus)
+ return (true);
+ DELAY(1000);
+ }
+
+ return (false);
+}
+
+static int
+xen_pv_start_all_aps(void)
+{
+ int cpu;
+
+ mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+
+ for (cpu = 1; cpu < mp_ncpus; cpu++) {
+
+ /* attempt to start the Application Processor */
+ if (!start_xen_ap(cpu))
+ panic("AP #%d failed to start!", cpu);
+
+ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */
+ }
+
+ return (mp_naps);
+}
+#endif /* SMP */
+
+/*
+ * Functions to convert the "extra" parameters passed by Xen
+ * into FreeBSD boot options.
+ */
+static void
+xen_pv_set_env(void)
+{
+ char *cmd_line_next, *cmd_line;
+ size_t env_size;
+
+ cmd_line = HYPERVISOR_start_info->cmd_line;
+ env_size = sizeof(HYPERVISOR_start_info->cmd_line);
+
+ /* Skip leading spaces */
+ for (; isspace(*cmd_line) && (env_size != 0); cmd_line++)
+ env_size--;
+
+ /* Replace ',' with '\0' */
+ for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;)
+ ;
+
+ init_static_kenv(cmd_line, 0);
+}
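
xen_pv_set_env() relies on strsep() rewriting every ',' to '\0' in place, which turns Xen's comma-separated command line into the NUL-separated string list that init_static_kenv() expects. A minimal userland demonstration with a made-up command line:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Hypothetical Xen cmd_line contents, for illustration only. */
	char cmd_line[] = "boot_verbose=1,vfs.root.mountfrom=ufs:/dev/ada0p2";
	char *next = cmd_line, *p;

	while (strsep(&next, ",") != NULL)
		;	/* every ',' is now a '\0' terminator */

	/* Walk the packed strings, as a kenv consumer would. */
	for (p = cmd_line; p < cmd_line + sizeof(cmd_line) - 1;
	    p += strlen(p) + 1)
		printf("env: %s\n", p);
	return (0);
}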
+
+#ifdef DDB
+/*
+ * The way Xen loads the symtab is different from the native boot loader,
+ * because it's tailored for NetBSD. So we have to adapt and use the same
+ * method as NetBSD. Portions of the code below have been picked from NetBSD:
+ * sys/kern/kern_ksyms.c CVS Revision 1.71.
+ */
+static void
+xen_pv_parse_symtab(void)
+{
+ Elf_Ehdr *ehdr;
+ Elf_Shdr *shdr;
+ vm_offset_t sym_end;
+ uint32_t size;
+ int i, j;
+
+ size = end;
+ sym_end = HYPERVISOR_start_info->mod_start != 0 ?
+ HYPERVISOR_start_info->mod_start :
+ HYPERVISOR_start_info->mfn_list;
+
+ /*
+ * Sanity-check the size: sym_end is only an upper bound, but
+ * at least it allows us to fail earlier.
+ */
+ if ((vm_offset_t)&end + size > sym_end) {
+ xc_printf("Unable to load ELF symtab: size mismatch\n");
+ return;
+ }
+
+ ehdr = (Elf_Ehdr *)(&end + 1);
+ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) ||
+ ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ ehdr->e_version > 1) {
+ xc_printf("Unable to load ELF symtab: invalid symbol table\n");
+ return;
+ }
+
+ shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff);
+ /* Find the symbol table and the corresponding string table. */
+ for (i = 1; i < ehdr->e_shnum; i++) {
+ if (shdr[i].sh_type != SHT_SYMTAB)
+ continue;
+ if (shdr[i].sh_offset == 0)
+ continue;
+ ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset);
+ ksymtab_size = shdr[i].sh_size;
+ j = shdr[i].sh_link;
+ if (shdr[j].sh_offset == 0)
+ continue; /* Can this happen? */
+ kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset);
+ break;
+ }
+
+ if (ksymtab == 0 || kstrtab == 0) {
+ xc_printf(
+ "Unable to load ELF symtab: could not find symtab or strtab\n");
+ return;
+ }
+}
+#endif
+
+static caddr_t
+xen_pv_parse_preload_data(u_int64_t modulep)
+{
+ caddr_t kmdp;
+ vm_ooffset_t off;
+ vm_paddr_t metadata;
+ char *envp;
+
+ if (HYPERVISOR_start_info->mod_start != 0) {
+ preload_metadata = (caddr_t)(HYPERVISOR_start_info->mod_start);
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ KASSERT(kmdp != NULL, ("unable to find kernel"));
+
+ /*
+ * Xen has relocated the metadata and the modules, so we
+ * need to recalculate their position. This is done by
+ * saving the original modulep address and then calculating
+ * the offset from mod_start, which contains the relocated
+ * modulep address.
+ */
+ metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t);
+ off = HYPERVISOR_start_info->mod_start - metadata;
+
+ preload_bootstrap_relocate(off);
+
+ boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
+ envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
+ if (envp != NULL)
+ envp += off;
+ init_static_kenv(envp, 0);
+ } else {
+ /* Parse the extra boot information given by Xen */
+ xen_pv_set_env();
+ boothowto |= boot_env_to_howto();
+ kmdp = NULL;
+ }
+
+#ifdef DDB
+ xen_pv_parse_symtab();
+#endif
+ return (kmdp);
+}
+
+static void
+xen_pv_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
+{
+ struct xen_memory_map memmap;
+ u_int32_t size;
+ int rc;
+
+ /* Fetch the E820 map from Xen */
+ memmap.nr_entries = MAX_E820_ENTRIES;
+ set_xen_guest_handle(memmap.buffer, xen_smap);
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc)
+ panic("unable to fetch Xen E820 memory map");
+ size = memmap.nr_entries * sizeof(xen_smap[0]);
+
+ bios_add_smap_entries(xen_smap, size, physmap, physmap_idx);
+}
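
The entries returned by XENMEM_memory_map are consumed by bios_add_smap_entries() as ordinary E820 records (base, length, type), just like those the native loader supplies. A hedged sketch with an invented table and a stand-in struct, only to show the record shape:

#include <stdint.h>
#include <stdio.h>

struct e820_entry {	/* hypothetical stand-in for struct bios_smap */
	uint64_t base;
	uint64_t length;
	uint32_t type;	/* 1 = usable RAM, 2 = reserved */
};

int
main(void)
{
	const struct e820_entry map[] = {
		{ 0x0000000000000000ULL, 0x000000000009fc00ULL, 1 },
		{ 0x0000000000100000ULL, 0x000000003ff00000ULL, 1 },
		{ 0x00000000fc000000ULL, 0x0000000004000000ULL, 2 },
	};
	uint64_t usable = 0;
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (map[i].type == 1)
			usable += map[i].length;
	printf("usable RAM: %ju MB\n", (uintmax_t)(usable >> 20));
	return (0);
}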
Property changes on: trunk/sys/x86/xen/pv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/pvcpu_enum.c
===================================================================
--- trunk/sys/x86/xen/pvcpu_enum.c (rev 0)
+++ trunk/sys/x86/xen/pvcpu_enum.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,267 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org>
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pvcpu_enum.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/smp.h>
+#include <sys/pcpu.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/hypervisor.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/aclocal.h>
+#include <contrib/dev/acpica/include/actables.h>
+
+#include <dev/acpica/acpivar.h>
+
+static int xenpv_probe(void);
+static int xenpv_probe_cpus(void);
+static int xenpv_setup_local(void);
+static int xenpv_setup_io(void);
+
+static ACPI_TABLE_MADT *madt;
+static vm_paddr_t madt_physaddr;
+static vm_offset_t madt_length;
+
+static struct apic_enumerator xenpv_enumerator = {
+ "Xen PV",
+ xenpv_probe,
+ xenpv_probe_cpus,
+ xenpv_setup_local,
+ xenpv_setup_io
+};
+
+/*--------------------- Helper functions to parse MADT -----------------------*/
+
+/*
+ * Parse an interrupt source override for an ISA interrupt.
+ */
+static void
+madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
+{
+ enum intr_trigger trig;
+ enum intr_polarity pol;
+ int ret;
+
+ if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
+ intr->GlobalIrq == 2) {
+ if (bootverbose)
+ printf("MADT: Skipping timer override\n");
+ return;
+ }
+
+ madt_parse_interrupt_values(intr, &trig, &pol);
+
+ /* Remap the IRQ if it is mapped to a different interrupt vector. */
+ if (intr->SourceIrq != intr->GlobalIrq && intr->GlobalIrq > 15 &&
+ intr->SourceIrq == AcpiGbl_FADT.SciInterrupt)
+ /*
+ * If the SCI is remapped to a non-ISA global interrupt,
+ * then override the vector we use to setup.
+ */
+ acpi_OverrideInterruptLevel(intr->GlobalIrq);
+
+ /* Register the IRQ with the polarity and trigger mode found. */
+ ret = xen_register_pirq(intr->GlobalIrq, trig, pol);
+ if (ret != 0)
+ panic("Unable to register interrupt override");
+}
+
+/*
+ * Call the handler routine for each entry in the MADT table.
+ */
+static void
+madt_walk_table(acpi_subtable_handler *handler, void *arg)
+{
+
+ acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length,
+ handler, arg);
+}
+
+/*
+ * Parse interrupt entries.
+ */
+static void
+madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused)
+{
+
+ if (entry->Type == ACPI_MADT_TYPE_INTERRUPT_OVERRIDE)
+ madt_parse_interrupt_override(
+ (ACPI_MADT_INTERRUPT_OVERRIDE *)entry);
+}
+
+/*---------------------------- Xen PV enumerator -----------------------------*/
+
+/*
+ * This enumerator will only be registered on PVH
+ */
+static int
+xenpv_probe(void)
+{
+ return (0);
+}
+
+/*
+ * Test each possible vCPU in order to find the number of vCPUs
+ */
+static int
+xenpv_probe_cpus(void)
+{
+#ifdef SMP
+ int i, ret;
+
+ for (i = 0; i < MAXCPU; i++) {
+ ret = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (ret >= 0)
+ lapic_create((i * 2), (i == 0));
+ }
+#endif
+ return (0);
+}
+
+/*
+ * Initialize the vCPU id of the BSP
+ */
+static int
+xenpv_setup_local(void)
+{
+ PCPU_SET(vcpu_id, 0);
+ lapic_init(0);
+ return (0);
+}
+
+/*
+ * On PVH guests there's no IO APIC
+ */
+static int
+xenpv_setup_io(void)
+{
+
+ if (xen_initial_domain()) {
+ /*
+ * NB: we could iterate over the MADT IOAPIC entries in order
+ * to figure out the exact number of IOAPIC interrupts, but
+ * this is legacy code so just keep using the previous
+ * behaviour and assume a maximum of 256 interrupts.
+ */
+ num_io_irqs = max(MINIMUM_MSI_INT - 1, num_io_irqs);
+
+ acpi_SetDefaultIntrModel(ACPI_INTR_APIC);
+ }
+ return (0);
+}
+
+void
+xenpv_register_pirqs(struct pic *pic __unused)
+{
+ unsigned int i;
+ int ret;
+
+ /* Map MADT */
+ madt_physaddr = acpi_find_table(ACPI_SIG_MADT);
+ madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT);
+ madt_length = madt->Header.Length;
+
+ /* Try to initialize ACPI so that we can access the FADT. */
+ ret = acpi_Startup();
+ if (ACPI_FAILURE(ret)) {
+ printf("MADT: ACPI Startup failed with %s\n",
+ AcpiFormatException(ret));
+ printf("Try disabling either ACPI or apic support.\n");
+ panic("Using MADT but ACPI doesn't work");
+ }
+
+ /* Run through the table to see if there are any overrides. */
+ madt_walk_table(madt_parse_ints, NULL);
+
+ /*
+ * If there was not an explicit override entry for the SCI,
+ * force it to use level trigger and active-low polarity.
+ */
+ if (!madt_found_sci_override) {
+ printf(
+"MADT: Forcing active-low polarity and level trigger for SCI\n");
+ ret = xen_register_pirq(AcpiGbl_FADT.SciInterrupt,
+ INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW);
+ if (ret != 0)
+ panic("Unable to register SCI IRQ");
+ }
+
+ /* Register legacy ISA IRQs */
+ for (i = 1; i < 16; i++) {
+ if (intr_lookup_source(i) != NULL)
+ continue;
+ ret = xen_register_pirq(i, INTR_TRIGGER_EDGE,
+ INTR_POLARITY_LOW);
+ if (ret != 0 && bootverbose)
+ printf("Unable to register legacy IRQ#%u: %d\n", i,
+ ret);
+ }
+}
+
+static void
+xenpv_register(void *dummy __unused)
+{
+ if (xen_pv_domain()) {
+ apic_register_enumerator(&xenpv_enumerator);
+ }
+}
+SYSINIT(xenpv_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, xenpv_register, NULL);
+
+/*
+ * Setup per-CPU vCPU IDs
+ */
+static void
+xenpv_set_ids(void *dummy)
+{
+ struct pcpu *pc;
+ int i;
+
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ pc->pc_vcpu_id = i;
+ }
+}
+SYSINIT(xenpv_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, xenpv_set_ids, NULL);
Property changes on: trunk/sys/x86/xen/pvcpu_enum.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xen_apic.c
===================================================================
--- trunk/sys/x86/xen/xen_apic.c (rev 0)
+++ trunk/sys/x86/xen/xen_apic.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,598 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_apic.c 334047 2018-05-22 14:36:46Z kib $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+
+#include <xen/xen-os.h>
+#include <xen/features.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/hvm.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/vcpu.h>
+
+/*--------------------------------- Macros -----------------------------------*/
+
+#define XEN_APIC_UNSUPPORTED \
+ panic("%s: not available in Xen PV port.", __func__)
+
+
+/*--------------------------- Forward Declarations ---------------------------*/
+#ifdef SMP
+static driver_filter_t xen_smp_rendezvous_action;
+static driver_filter_t xen_invltlb;
+static driver_filter_t xen_invlpg;
+static driver_filter_t xen_invlrng;
+static driver_filter_t xen_invlcache;
+static driver_filter_t xen_ipi_bitmap_handler;
+static driver_filter_t xen_cpustop_handler;
+static driver_filter_t xen_cpususpend_handler;
+static driver_filter_t xen_cpustophard_handler;
+#endif
+
+/*---------------------------------- Macros ----------------------------------*/
+#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
+
+/*--------------------------------- Xen IPIs ---------------------------------*/
+#ifdef SMP
+struct xen_ipi_handler
+{
+ driver_filter_t *filter;
+ const char *description;
+};
+
+static struct xen_ipi_handler xen_ipis[] =
+{
+ [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" },
+ [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"},
+ [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" },
+ [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" },
+ [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" },
+ [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" },
+ [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" },
+ [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" },
+ [IPI_TO_IDX(IPI_STOP_HARD)] = { xen_cpustophard_handler, "sth" },
+};
+#endif
+
+/*------------------------------- Per-CPU Data -------------------------------*/
+#ifdef SMP
+DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
+#endif
+
+/*------------------------------- Xen PV APIC --------------------------------*/
+
+static void
+xen_pv_lapic_create(u_int apic_id, int boot_cpu)
+{
+#ifdef SMP
+ cpu_add(apic_id, boot_cpu);
+#endif
+}
+
+static void
+xen_pv_lapic_init(vm_paddr_t addr)
+{
+
+}
+
+static void
+xen_pv_lapic_setup(int boot)
+{
+
+}
+
+static void
+xen_pv_lapic_dump(const char *str)
+{
+
+ printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
+}
+
+static void
+xen_pv_lapic_disable(void)
+{
+
+}
+
+static bool
+xen_pv_lapic_is_x2apic(void)
+{
+
+ return (false);
+}
+
+static void
+xen_pv_lapic_eoi(void)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_id(void)
+{
+
+ return (PCPU_GET(apic_id));
+}
+
+static int
+xen_pv_lapic_intr_pending(u_int vector)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static u_int
+xen_pv_apic_cpuid(u_int apic_id)
+{
+#ifdef SMP
+ return (apic_cpuids[apic_id]);
+#else
+ return (0);
+#endif
+}
+
+static u_int
+xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static u_int
+xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static void
+xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_enable_pmc(void)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static void
+xen_pv_lapic_disable_pmc(void)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_reenable_pmc(void)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_enable_cmc(void)
+{
+
+}
+
+#ifdef SMP
+static void
+xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_ipi_vectored(u_int vector, int dest)
+{
+ xen_intr_handle_t *ipi_handle;
+ int ipi_idx, to_cpu, self;
+
+ ipi_idx = IPI_TO_IDX(vector);
+ if (ipi_idx >= nitems(xen_ipis))
+ panic("IPI out of range");
+
+ switch(dest) {
+ case APIC_IPI_DEST_SELF:
+ ipi_handle = DPCPU_GET(ipi_handle);
+ xen_intr_signal(ipi_handle[ipi_idx]);
+ break;
+ case APIC_IPI_DEST_ALL:
+ CPU_FOREACH(to_cpu) {
+ ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+ xen_intr_signal(ipi_handle[ipi_idx]);
+ }
+ break;
+ case APIC_IPI_DEST_OTHERS:
+ self = PCPU_GET(cpuid);
+ CPU_FOREACH(to_cpu) {
+ if (to_cpu != self) {
+ ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+ xen_intr_signal(ipi_handle[ipi_idx]);
+ }
+ }
+ break;
+ default:
+ to_cpu = apic_cpuid(dest);
+ ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+ xen_intr_signal(ipi_handle[ipi_idx]);
+ break;
+ }
+}
+
+static int
+xen_pv_lapic_ipi_wait(int delay)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+#endif /* SMP */
+
+static int
+xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (-1);
+}
+
+static void
+xen_pv_lapic_ipi_free(int vector)
+{
+
+ XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+ enum intr_trigger trigger)
+{
+
+ XEN_APIC_UNSUPPORTED;
+ return (0);
+}
+
+/* Xen apic_ops implementation */
+struct apic_ops xen_apic_ops = {
+ .create = xen_pv_lapic_create,
+ .init = xen_pv_lapic_init,
+ .xapic_mode = xen_pv_lapic_disable,
+ .is_x2apic = xen_pv_lapic_is_x2apic,
+ .setup = xen_pv_lapic_setup,
+ .dump = xen_pv_lapic_dump,
+ .disable = xen_pv_lapic_disable,
+ .eoi = xen_pv_lapic_eoi,
+ .id = xen_pv_lapic_id,
+ .intr_pending = xen_pv_lapic_intr_pending,
+ .set_logical_id = xen_pv_lapic_set_logical_id,
+ .cpuid = xen_pv_apic_cpuid,
+ .alloc_vector = xen_pv_apic_alloc_vector,
+ .alloc_vectors = xen_pv_apic_alloc_vectors,
+ .enable_vector = xen_pv_apic_enable_vector,
+ .disable_vector = xen_pv_apic_disable_vector,
+ .free_vector = xen_pv_apic_free_vector,
+ .enable_pmc = xen_pv_lapic_enable_pmc,
+ .disable_pmc = xen_pv_lapic_disable_pmc,
+ .reenable_pmc = xen_pv_lapic_reenable_pmc,
+ .enable_cmc = xen_pv_lapic_enable_cmc,
+#ifdef SMP
+ .ipi_raw = xen_pv_lapic_ipi_raw,
+ .ipi_vectored = xen_pv_lapic_ipi_vectored,
+ .ipi_wait = xen_pv_lapic_ipi_wait,
+#endif
+ .ipi_alloc = xen_pv_lapic_ipi_alloc,
+ .ipi_free = xen_pv_lapic_ipi_free,
+ .set_lvt_mask = xen_pv_lapic_set_lvt_mask,
+ .set_lvt_mode = xen_pv_lapic_set_lvt_mode,
+ .set_lvt_polarity = xen_pv_lapic_set_lvt_polarity,
+ .set_lvt_triggermode = xen_pv_lapic_set_lvt_triggermode,
+};
+
+#ifdef SMP
+/*---------------------------- XEN PV IPI Handlers ---------------------------*/
+/*
+ * These are C clones of the ASM functions found in apic_vector.
+ */
+static int
+xen_ipi_bitmap_handler(void *arg)
+{
+ struct trapframe *frame;
+
+ frame = arg;
+ ipi_bitmap_handler(*frame);
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_smp_rendezvous_action(void *arg)
+{
+#ifdef COUNT_IPIS
+ (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+ smp_rendezvous_action();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb(void *arg)
+{
+
+ invltlb_handler();
+ return (FILTER_HANDLED);
+}
+
+#ifdef __amd64__
+static int
+xen_invltlb_invpcid(void *arg)
+{
+
+ invltlb_invpcid_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_pcid(void *arg)
+{
+
+ invltlb_pcid_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_invpcid_pti(void *arg)
+{
+
+ invltlb_invpcid_pti_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_invpcid_handler(void *arg)
+{
+
+ invlpg_invpcid_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_pcid_handler(void *arg)
+{
+
+ invlpg_pcid_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_invpcid_handler(void *arg)
+{
+
+ invlrng_invpcid_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_pcid_handler(void *arg)
+{
+
+ invlrng_pcid_handler();
+ return (FILTER_HANDLED);
+}
+#endif
+
+static int
+xen_invlpg(void *arg)
+{
+
+ invlpg_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng(void *arg)
+{
+
+ invlrng_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_invlcache(void *arg)
+{
+
+ invlcache_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustop_handler(void *arg)
+{
+
+ cpustop_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_cpususpend_handler(void *arg)
+{
+
+ cpususpend_handler();
+ return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustophard_handler(void *arg)
+{
+
+ ipi_nmi_handler();
+ return (FILTER_HANDLED);
+}
+
+/*----------------------------- XEN PV IPI setup -----------------------------*/
+/*
+ * These functions are provided outside of the Xen PV APIC implementation
+ * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC,
+ * because on PVHVM there's an emulated LAPIC provided by Xen.
+ */
+static void
+xen_cpu_ipi_init(int cpu)
+{
+ xen_intr_handle_t *ipi_handle;
+ const struct xen_ipi_handler *ipi;
+ int idx, rc;
+
+ ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
+
+ for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
+
+ if (ipi->filter == NULL) {
+ ipi_handle[idx] = NULL;
+ continue;
+ }
+
+ rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
+ INTR_TYPE_TTY, &ipi_handle[idx]);
+ if (rc != 0)
+ panic("Unable to allocate a XEN IPI port");
+ xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
+ }
+}
+
+static void
+xen_setup_cpus(void)
+{
+ int i;
+
+ if (!xen_vector_callback_enabled)
+ return;
+
+#ifdef __amd64__
+ if (pmap_pcid_enabled) {
+ if (pti)
+ xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+ invpcid_works ? xen_invltlb_invpcid_pti :
+ xen_invltlb_pcid;
+ else
+ xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+ invpcid_works ? xen_invltlb_invpcid :
+ xen_invltlb_pcid;
+ xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ?
+ xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler;
+ xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ?
+ xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler;
+ }
+#endif
+ CPU_FOREACH(i)
+ xen_cpu_ipi_init(i);
+
+ /* Set the xen pv ipi ops to replace the native ones */
+ if (xen_hvm_domain())
+ apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
+}
+
+/* We need to setup IPIs before APs are started */
+SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL);
+#endif /* SMP */
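
The #ifdef __amd64__ block in xen_setup_cpus() above picks the INVLTLB IPI filter from the (pcid, invpcid, pti) combination. A hedged sketch that mirrors that selection with plain booleans standing in for the kernel globals pmap_pcid_enabled, invpcid_works and pti:

#include <stdbool.h>
#include <stdio.h>

static const char *
invltlb_filter(bool pcid, bool invpcid, bool pti_on)
{
	if (!pcid)
		return ("xen_invltlb");	/* default table entry */
	if (pti_on)
		return (invpcid ? "xen_invltlb_invpcid_pti" : "xen_invltlb_pcid");
	return (invpcid ? "xen_invltlb_invpcid" : "xen_invltlb_pcid");
}

int
main(void)
{
	printf("%s\n", invltlb_filter(false, false, false));
	printf("%s\n", invltlb_filter(true, true, true));
	printf("%s\n", invltlb_filter(true, false, false));
	return (0);
}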
Property changes on: trunk/sys/x86/xen/xen_apic.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/xen/xen_intr.c
===================================================================
--- trunk/sys/x86/xen/xen_intr.c 2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/xen_intr.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -2,7 +2,7 @@
/******************************************************************************
* xen_intr.c
*
- * Xen event and interrupt services for x86 PV and HVM guests.
+ * Xen event and interrupt services for x86 HVM guests.
*
* Copyright (c) 2002-2005, K A Fraser
* Copyright (c) 2005, Intel Corporation <xiaofeng.ling at intel.com>
@@ -31,8 +31,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/xen_intr.c 291647 2015-12-02 12:58:20Z royger $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_intr.c 342656 2018-12-31 22:09:08Z jhb $");
+#include "opt_ddb.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
@@ -49,22 +51,30 @@
#include <vm/pmap.h>
#include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/apicreg.h>
#include <machine/smp.h>
#include <machine/stdarg.h>
#include <machine/xen/synch_bitops.h>
#include <machine/xen/xen-os.h>
-#include <machine/xen/xenvar.h>
+#include <xen/xen-os.h>
#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn/evtchnvar.h>
#include <dev/xen/xenpci/xenpcivar.h>
+#include <dev/pci/pcivar.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services");
+static u_int first_evtchn_irq;
+
/**
* Per-cpu event channel processing state.
*/
@@ -96,7 +106,7 @@
* Start the scan at port 0 by initializing the last scanned
* location as the highest numbered event channel port.
*/
-DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
+static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
.last_processed_l1i = LONG_BIT - 1,
.last_processed_l2i = LONG_BIT - 1
};
@@ -103,8 +113,12 @@
DPCPU_DECLARE(struct vcpu_info *, vcpu_info);
-#define is_valid_evtchn(x) ((x) != 0)
+#define XEN_EEXIST 17 /* Xen "already exists" error */
+#define XEN_ALLOCATE_VECTOR 0 /* Allocate a vector for this event channel */
+#define XEN_INVALID_EVTCHN 0 /* Invalid event channel */
+#define is_valid_evtchn(x) ((x) != XEN_INVALID_EVTCHN)
+
struct xenisrc {
struct intsrc xi_intsrc;
enum evtchn_type xi_type;
@@ -113,13 +127,13 @@
evtchn_port_t xi_port;
int xi_pirq;
int xi_virq;
+ void *xi_cookie;
u_int xi_close:1; /* close on unbind? */
- u_int xi_needs_eoi:1;
- u_int xi_shared:1; /* Shared with other domains. */
+ u_int xi_activehi:1;
+ u_int xi_edgetrigger:1;
+ u_int xi_masked:1;
};
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
-
static void xen_intr_suspend(struct pic *);
static void xen_intr_resume(struct pic *, bool suspend_cancelled);
static void xen_intr_enable_source(struct intsrc *isrc);
@@ -137,6 +151,9 @@
static void xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi);
static void xen_intr_pirq_eoi_source(struct intsrc *isrc);
static void xen_intr_pirq_enable_intr(struct intsrc *isrc);
+static void xen_intr_pirq_disable_intr(struct intsrc *isrc);
+static int xen_intr_pirq_config_intr(struct intsrc *isrc,
+ enum intr_trigger trig, enum intr_polarity pol);
/**
* PIC interface for all event channel port types except physical IRQs.
@@ -160,22 +177,25 @@
* physical interrupt sources.
*/
struct pic xen_intr_pirq_pic = {
+#ifdef __amd64__
+ .pic_register_sources = xenpv_register_pirqs,
+#endif
.pic_enable_source = xen_intr_pirq_enable_source,
.pic_disable_source = xen_intr_pirq_disable_source,
.pic_eoi_source = xen_intr_pirq_eoi_source,
.pic_enable_intr = xen_intr_pirq_enable_intr,
- .pic_disable_intr = xen_intr_disable_intr,
+ .pic_disable_intr = xen_intr_pirq_disable_intr,
.pic_vector = xen_intr_vector,
.pic_source_pending = xen_intr_source_pending,
- .pic_suspend = xen_intr_suspend,
- .pic_resume = xen_intr_resume,
- .pic_config_intr = xen_intr_config_intr,
+ .pic_config_intr = xen_intr_pirq_config_intr,
.pic_assign_cpu = xen_intr_assign_cpu
};
-static struct mtx xen_intr_isrc_lock;
-static int xen_intr_isrc_count;
-static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static struct mtx xen_intr_isrc_lock;
+static u_int xen_intr_auto_vector_count;
+static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static u_long *xen_intr_pirq_eoi_map;
+static boolean_t xen_intr_pirq_eoi_map_enabled;
/*------------------------- Private Functions --------------------------------*/
/**
@@ -197,7 +217,7 @@
struct xen_intr_pcpu_data *pcpu;
pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
- clear_bit(port, pcpu->evtchn_enabled);
+ xen_clear_bit(port, pcpu->evtchn_enabled);
}
/**
@@ -219,7 +239,7 @@
struct xen_intr_pcpu_data *pcpu;
pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
- set_bit(port, pcpu->evtchn_enabled);
+ xen_set_bit(port, pcpu->evtchn_enabled);
}
/**
@@ -257,11 +277,11 @@
KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held"));
- for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx ++) {
+ for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) {
struct xenisrc *isrc;
u_int vector;
- vector = FIRST_EVTCHN_INT + isrc_idx;
+ vector = first_evtchn_irq + isrc_idx;
isrc = (struct xenisrc *)intr_lookup_source(vector);
if (isrc != NULL
&& isrc->xi_type == EVTCHN_TYPE_UNBOUND) {
@@ -283,15 +303,14 @@
* object or NULL.
*/
static struct xenisrc *
-xen_intr_alloc_isrc(enum evtchn_type type)
+xen_intr_alloc_isrc(enum evtchn_type type, int vector)
{
static int warned;
struct xenisrc *isrc;
- int vector;
KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held"));
- if (xen_intr_isrc_count > NR_EVENT_CHANNELS) {
+ if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) {
if (!warned) {
warned = 1;
printf("xen_intr_alloc: Event channels exhausted.\n");
@@ -298,12 +317,19 @@
}
return (NULL);
}
- vector = FIRST_EVTCHN_INT + xen_intr_isrc_count;
- xen_intr_isrc_count++;
+ if (type != EVTCHN_TYPE_PIRQ) {
+ vector = first_evtchn_irq + xen_intr_auto_vector_count;
+ xen_intr_auto_vector_count++;
+ }
+
+ KASSERT((intr_lookup_source(vector) == NULL),
+ ("Trying to use an already allocated vector"));
+
mtx_unlock(&xen_intr_isrc_lock);
isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO);
- isrc->xi_intsrc.is_pic = &xen_intr_pic;
+ isrc->xi_intsrc.is_pic =
+ (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic;
isrc->xi_vector = vector;
isrc->xi_type = type;
intr_register_source(&isrc->xi_intsrc);
@@ -345,6 +371,7 @@
isrc->xi_cpu = 0;
isrc->xi_type = EVTCHN_TYPE_UNBOUND;
isrc->xi_port = 0;
+ isrc->xi_cookie = NULL;
mtx_unlock(&xen_intr_isrc_lock);
return (0);
}
@@ -372,7 +399,7 @@
*/
static int
xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port,
- enum evtchn_type type, device_t intr_owner, driver_filter_t filter,
+ enum evtchn_type type, const char *intr_owner, driver_filter_t filter,
driver_intr_t handler, void *arg, enum intr_type flags,
xen_intr_handle_t *port_handlep)
{
@@ -381,8 +408,8 @@
*isrcp = NULL;
if (port_handlep == NULL) {
- device_printf(intr_owner,
- "xen_intr_bind_isrc: Bad event handle\n");
+ printf("%s: xen_intr_bind_isrc: Bad event handle\n",
+ intr_owner);
return (EINVAL);
}
@@ -389,7 +416,7 @@
mtx_lock(&xen_intr_isrc_lock);
isrc = xen_intr_find_unused_isrc(type);
if (isrc == NULL) {
- isrc = xen_intr_alloc_isrc(type);
+ isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR);
if (isrc == NULL) {
mtx_unlock(&xen_intr_isrc_lock);
return (ENOSPC);
@@ -399,17 +426,37 @@
xen_intr_port_to_isrc[local_port] = isrc;
mtx_unlock(&xen_intr_isrc_lock);
- error = intr_add_handler(device_get_nameunit(intr_owner),
- isrc->xi_vector, filter, handler, arg,
- flags|INTR_EXCL, port_handlep);
+ /* Assign the opaque handler (the event channel port) */
+ *port_handlep = &isrc->xi_vector;
+
+#ifdef SMP
+ if (type == EVTCHN_TYPE_PORT) {
+ /*
+ * By default all interrupts are assigned to vCPU#0
+ * unless specified otherwise, so shuffle them to balance
+ * the interrupt load.
+ */
+ xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu());
+ }
+#endif
+
+ if (filter == NULL && handler == NULL) {
+ /*
+ * No filter/handler provided: leave the event channel
+ * masked and without a valid handler; the caller is
+ * in charge of setting that up.
+ */
+ *isrcp = isrc;
+ return (0);
+ }
+
+ error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags,
+ *port_handlep);
if (error != 0) {
- device_printf(intr_owner,
- "xen_intr_bind_irq: intr_add_handler failed\n");
xen_intr_release_isrc(isrc);
return (error);
}
*isrcp = isrc;
- evtchn_unmask_port(local_port);
return (0);
}
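
The NULL filter/handler path added here is what lets a consumer (such as the event channel user-space device) bind a port now and attach its handler later through xen_intr_add_handler(), which this patch adds further down. A rough sketch of that deferred-handler pattern, assuming the prototypes exported via xen/xen_intr.h; the my_* names are hypothetical:

#include <sys/param.h>
#include <sys/bus.h>

#include <xen/xen-os.h>
#include <xen/xen_intr.h>

static int
my_evtchn_filter(void *arg)
{

	/* Hypothetical filter: just claim the interrupt. */
	return (FILTER_HANDLED);
}

static int
my_bind_deferred(device_t dev, evtchn_port_t port, void *arg,
    xen_intr_handle_t *handlep)
{
	int error;

	/* NULL filter/handler: the port is bound but stays masked. */
	error = xen_intr_bind_local_port(dev, port, NULL, NULL, NULL,
	    INTR_TYPE_MISC, handlep);
	if (error != 0)
		return (error);

	/* Attach the real handler once the consumer is ready. */
	return (xen_intr_add_handler(device_get_nameunit(dev),
	    my_evtchn_filter, NULL, arg, INTR_TYPE_MISC, *handlep));
}
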
@@ -426,13 +473,17 @@
static struct xenisrc *
xen_intr_isrc(xen_intr_handle_t handle)
{
- struct intr_handler *ih;
+ int vector;
- ih = handle;
- if (ih == NULL || ih->ih_event == NULL)
+ if (handle == NULL)
return (NULL);
- return (ih->ih_event->ie_source);
+ vector = *(int *)handle;
+ KASSERT(vector >= first_evtchn_irq &&
+ vector < (first_evtchn_irq + xen_intr_auto_vector_count),
+ ("Xen interrupt vector is out of range"));
+
+ return ((struct xenisrc *)intr_lookup_source(vector));
}
/**
@@ -451,6 +502,11 @@
xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh,
u_int idx)
{
+
+ CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0]));
+ CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0]));
+ CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending));
+ CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled));
return (sh->evtchn_pending[idx]
& ~sh->evtchn_mask[idx]
& pcpu->evtchn_enabled[idx]);
@@ -570,8 +626,10 @@
static int
xen_intr_init(void *dummy __unused)
{
+ shared_info_t *s = HYPERVISOR_shared_info;
struct xen_intr_pcpu_data *pcpu;
- int i;
+ struct physdev_pirq_eoi_gmfn eoi_gmfn;
+ int i, rc;
if (!xen_domain())
return (0);
@@ -579,25 +637,65 @@
mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF);
/*
- * Register interrupt count manually as we aren't
- * guaranteed to see a call to xen_intr_assign_cpu()
- * before our first interrupt. Also set the per-cpu
- * mask of CPU#0 to enable all, since by default
- * all event channels are bound to CPU#0.
+ * Set the per-cpu mask of CPU#0 to enable all, since by default all
+ * event channels are bound to CPU#0.
*/
CPU_FOREACH(i) {
pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
- sizeof(pcpu->evtchn_enabled));
- xen_intr_intrcnt_add(i);
+ sizeof(pcpu->evtchn_enabled));
}
+ for (i = 0; i < nitems(s->evtchn_mask); i++)
+ atomic_store_rel_long(&s->evtchn_mask[i], ~0);
+
+ /* Try to register PIRQ EOI map */
+ xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO);
+ eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map));
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
+ if (rc != 0 && bootverbose)
+ printf("Xen interrupts: unable to register PIRQ EOI map\n");
+ else
+ xen_intr_pirq_eoi_map_enabled = true;
+
intr_register_pic(&xen_intr_pic);
+ if (xen_pv_domain() && xen_initial_domain())
+ intr_register_pic(&xen_intr_pirq_pic);
+ if (bootverbose)
+ printf("Xen interrupt system initialized\n");
+
return (0);
}
-SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intr_init, NULL);
+SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
+static void
+xen_intrcnt_init(void *dummy __unused)
+{
+ unsigned int i;
+
+ if (!xen_domain())
+ return;
+
+ /*
+ * Register interrupt count manually as we aren't guaranteed to see a
+ * call to xen_intr_assign_cpu() before our first interrupt.
+ */
+ CPU_FOREACH(i)
+ xen_intr_intrcnt_add(i);
+}
+SYSINIT(xen_intrcnt_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intrcnt_init, NULL);
+
+void
+xen_intr_alloc_irqs(void)
+{
+
+ if (num_io_irqs > UINT_MAX - NR_EVENT_CHANNELS)
+ panic("IRQ allocation overflow (num_msi_irqs too high?)");
+ first_evtchn_irq = num_io_irqs;
+ num_io_irqs += NR_EVENT_CHANNELS;
+}
+
/*--------------------------- Common PIC Functions ---------------------------*/
/**
* Prepare this PIC for system suspension.
@@ -685,8 +783,8 @@
struct xen_intr_pcpu_data *pcpu;
pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
- memset(pcpu->evtchn_enabled,
- i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled));
+ memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
+ sizeof(pcpu->evtchn_enabled));
}
/* Mask all event channels. */
@@ -697,10 +795,10 @@
memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc));
/* Free unused isrcs and rebind VIRQs and IPIs */
- for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx++) {
+ for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) {
u_int vector;
- vector = FIRST_EVTCHN_INT + isrc_idx;
+ vector = first_evtchn_irq + isrc_idx;
isrc = (struct xenisrc *)intr_lookup_source(vector);
if (isrc != NULL) {
isrc->xi_port = 0;
@@ -712,7 +810,6 @@
xen_rebind_virq(isrc);
break;
default:
- isrc->xi_cpu = 0;
break;
}
}
@@ -798,16 +895,13 @@
struct evtchn_bind_vcpu bind_vcpu;
struct xenisrc *isrc;
u_int to_cpu, vcpu_id;
- int error;
+ int error, masked;
-#ifdef XENHVM
if (xen_vector_callback_enabled == 0)
return (EOPNOTSUPP);
-#endif
to_cpu = apic_cpuid(apic_id);
vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
- xen_intr_intrcnt_add(to_cpu);
mtx_lock(&xen_intr_isrc_lock);
isrc = (struct xenisrc *)base_isrc;
@@ -816,6 +910,11 @@
return (EINVAL);
}
+ /*
+ * Mask the event channel while binding it to prevent interrupt
+ * delivery with an inconsistent state in isrc->xi_cpu.
+ */
+ masked = evtchn_test_and_set_mask(isrc->xi_port);
if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) ||
(isrc->xi_type == EVTCHN_TYPE_IPI)) {
/*
@@ -826,18 +925,12 @@
evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
isrc->xi_cpu = to_cpu;
evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
- mtx_unlock(&xen_intr_isrc_lock);
- return (0);
+ goto out;
}
bind_vcpu.port = isrc->xi_port;
bind_vcpu.vcpu = vcpu_id;
- /*
- * Allow interrupts to be fielded on the new VCPU before
- * we ask the hypervisor to deliver them there.
- */
- evtchn_cpu_unmask_port(to_cpu, isrc->xi_port);
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
if (isrc->xi_cpu != to_cpu) {
if (error == 0) {
@@ -844,11 +937,13 @@
/* Commit to new binding by removing the old one. */
evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
isrc->xi_cpu = to_cpu;
- } else {
- /* Roll-back to previous binding. */
- evtchn_cpu_mask_port(to_cpu, isrc->xi_port);
+ evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
}
}
+
+out:
+ if (masked == 0)
+ evtchn_unmask_port(isrc->xi_port);
mtx_unlock(&xen_intr_isrc_lock);
return (0);
#else
@@ -865,8 +960,21 @@
* acknowledgements.
*/
static void
-xen_intr_disable_source(struct intsrc *isrc, int eoi)
+xen_intr_disable_source(struct intsrc *base_isrc, int eoi)
{
+ struct xenisrc *isrc;
+
+ isrc = (struct xenisrc *)base_isrc;
+
+ /*
+ * NB: checking if the event channel is already masked is
+ * needed because the event channel user-space device
+ * masks event channels on its filter as part of its
+ * normal operation, and those shouldn't be automatically
+ * unmasked by the generic interrupt code. The event channel
+ * device will unmask them when needed.
+ */
+ isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port);
}
/*
@@ -875,8 +983,14 @@
* \param isrc The interrupt source to unmask (if necessary).
*/
static void
-xen_intr_enable_source(struct intsrc *isrc)
+xen_intr_enable_source(struct intsrc *base_isrc)
{
+ struct xenisrc *isrc;
+
+ isrc = (struct xenisrc *)base_isrc;
+
+ if (isrc->xi_masked == 0)
+ evtchn_unmask_port(isrc->xi_port);
}
/*
@@ -885,7 +999,7 @@
* \param isrc The interrupt source to EOI.
*/
static void
-xen_intr_eoi_source(struct intsrc *isrc)
+xen_intr_eoi_source(struct intsrc *base_isrc)
{
}
@@ -916,7 +1030,11 @@
struct xenisrc *isrc;
isrc = (struct xenisrc *)base_isrc;
- evtchn_mask_port(isrc->xi_port);
+
+ if (isrc->xi_edgetrigger == 0)
+ evtchn_mask_port(isrc->xi_port);
+ if (eoi == PIC_EOI)
+ xen_intr_pirq_eoi_source(base_isrc);
}
/*
@@ -930,7 +1048,9 @@
struct xenisrc *isrc;
isrc = (struct xenisrc *)base_isrc;
- evtchn_unmask_port(isrc->xi_port);
+
+ if (isrc->xi_edgetrigger == 0)
+ evtchn_unmask_port(isrc->xi_port);
}
/*
@@ -942,13 +1062,17 @@
xen_intr_pirq_eoi_source(struct intsrc *base_isrc)
{
struct xenisrc *isrc;
+ int error;
- /* XXX Use shared page of flags for this. */
isrc = (struct xenisrc *)base_isrc;
- if (isrc->xi_needs_eoi != 0) {
+
+ if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) {
struct physdev_eoi eoi = { .irq = isrc->xi_pirq };
- (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ if (error != 0)
+ panic("Unable to EOI PIRQ#%d: %d\n",
+ isrc->xi_pirq, error);
}
}
@@ -958,10 +1082,118 @@
* \param isrc The interrupt source to enable.
*/
static void
-xen_intr_pirq_enable_intr(struct intsrc *isrc)
+xen_intr_pirq_enable_intr(struct intsrc *base_isrc)
{
+ struct xenisrc *isrc;
+ struct evtchn_bind_pirq bind_pirq;
+ struct physdev_irq_status_query irq_status;
+ int error;
+
+ isrc = (struct xenisrc *)base_isrc;
+
+ if (!xen_intr_pirq_eoi_map_enabled) {
+ irq_status.irq = isrc->xi_pirq;
+ error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query,
+ &irq_status);
+ if (error)
+ panic("unable to get status of IRQ#%d", isrc->xi_pirq);
+
+ if (irq_status.flags & XENIRQSTAT_needs_eoi) {
+ /*
+ * Since the dynamic PIRQ EOI map is not available,
+ * mark the PIRQ as needing EOI unconditionally.
+ */
+ xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map);
+ }
+ }
+
+ bind_pirq.pirq = isrc->xi_pirq;
+ bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE;
+ error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+ if (error)
+ panic("unable to bind IRQ#%d", isrc->xi_pirq);
+
+ isrc->xi_port = bind_pirq.port;
+
+ mtx_lock(&xen_intr_isrc_lock);
+ KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL),
+ ("trying to override an already setup event channel port"));
+ xen_intr_port_to_isrc[bind_pirq.port] = isrc;
+ mtx_unlock(&xen_intr_isrc_lock);
+
+ evtchn_unmask_port(isrc->xi_port);
}
+/*
+ * Disable an interrupt source.
+ *
+ * \param isrc The interrupt source to disable.
+ */
+static void
+xen_intr_pirq_disable_intr(struct intsrc *base_isrc)
+{
+ struct xenisrc *isrc;
+ struct evtchn_close close;
+ int error;
+
+ isrc = (struct xenisrc *)base_isrc;
+
+ evtchn_mask_port(isrc->xi_port);
+
+ close.port = isrc->xi_port;
+ error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+ if (error)
+ panic("unable to close event channel %d IRQ#%d",
+ isrc->xi_port, isrc->xi_pirq);
+
+ mtx_lock(&xen_intr_isrc_lock);
+ xen_intr_port_to_isrc[isrc->xi_port] = NULL;
+ mtx_unlock(&xen_intr_isrc_lock);
+
+ isrc->xi_port = 0;
+}
+
+/**
+ * Perform configuration of an interrupt source.
+ *
+ * \param isrc The interrupt source to configure.
+ * \param trig Edge or level.
+ * \param pol Active high or low.
+ *
+ * \returns 0 if no events are pending, otherwise non-zero.
+ */
+static int
+xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig,
+ enum intr_polarity pol)
+{
+ struct xenisrc *isrc = (struct xenisrc *)base_isrc;
+ struct physdev_setup_gsi setup_gsi;
+ int error;
+
+ KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
+ ("%s: Conforming trigger or polarity\n", __func__));
+
+ setup_gsi.gsi = isrc->xi_pirq;
+ setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1;
+ setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1;
+
+ error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (error == -XEN_EEXIST) {
+ if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) ||
+ (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH)))
+ panic("unable to reconfigure interrupt IRQ#%d",
+ isrc->xi_pirq);
+ error = 0;
+ }
+ if (error)
+ panic("unable to configure IRQ#%d\n", isrc->xi_pirq);
+
+ isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+ isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+ return (0);
+}
+
/*--------------------------- Public Functions -------------------------------*/
/*------- API comments for these methods can be found in xen/xenintr.h -------*/
int
@@ -972,8 +1204,9 @@
struct xenisrc *isrc;
int error;
- error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, dev,
- filter, handler, arg, flags, port_handlep);
+ error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT,
+ device_get_nameunit(dev), filter, handler, arg, flags,
+ port_handlep);
if (error != 0)
return (error);
@@ -1007,8 +1240,8 @@
}
error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT,
- dev, filter, handler, arg, flags,
- port_handlep);
+ device_get_nameunit(dev), filter, handler, arg, flags,
+ port_handlep);
if (error != 0) {
evtchn_close_t close = { .port = alloc_unbound.port };
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1042,8 +1275,8 @@
}
error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port,
- EVTCHN_TYPE_PORT, dev, filter, handler,
- arg, flags, port_handlep);
+ EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg,
+ flags, port_handlep);
if (error) {
evtchn_close_t close = { .port = bind_interdomain.local_port };
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1069,9 +1302,6 @@
struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
int error;
- /* Ensure the target CPU is ready to handle evtchn interrupts. */
- xen_intr_intrcnt_add(cpu);
-
isrc = NULL;
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
if (error != 0) {
@@ -1082,8 +1312,9 @@
return (-error);
}
- error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, dev,
- filter, handler, arg, flags, port_handlep);
+ error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
+ device_get_nameunit(dev), filter, handler, arg, flags,
+ port_handlep);
#ifdef SMP
if (error == 0)
@@ -1122,19 +1353,17 @@
}
int
-xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
- driver_filter_t filter, enum intr_type flags,
- xen_intr_handle_t *port_handlep)
+xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter,
+ enum intr_type flags, xen_intr_handle_t *port_handlep)
{
#ifdef SMP
int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
struct xenisrc *isrc;
struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
+ /* Same size as the one used by intr_handler->ih_name. */
+ char name[MAXCOMLEN + 1];
int error;
- /* Ensure the target CPU is ready to handle evtchn interrupts. */
- xen_intr_intrcnt_add(cpu);
-
isrc = NULL;
error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
if (error != 0) {
@@ -1145,12 +1374,10 @@
return (-error);
}
+ snprintf(name, sizeof(name), "cpu%u", cpu);
+
error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI,
- dev, filter, NULL, NULL, flags,
- port_handlep);
- if (error == 0)
- error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
-
+ name, filter, NULL, NULL, flags, port_handlep);
if (error != 0) {
evtchn_close_t close = { .port = bind_ipi.port };
@@ -1182,6 +1409,101 @@
}
int
+xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol)
+{
+ struct physdev_map_pirq map_pirq;
+ struct xenisrc *isrc;
+ int error;
+
+ if (vector == 0)
+ return (EINVAL);
+
+ if (bootverbose)
+ printf("xen: register IRQ#%d\n", vector);
+
+ map_pirq.domid = DOMID_SELF;
+ map_pirq.type = MAP_PIRQ_TYPE_GSI;
+ map_pirq.index = vector;
+ map_pirq.pirq = vector;
+
+ error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq);
+ if (error) {
+ printf("xen: unable to map IRQ#%d\n", vector);
+ return (error);
+ }
+
+ mtx_lock(&xen_intr_isrc_lock);
+ isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector);
+ mtx_unlock(&xen_intr_isrc_lock);
+ KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt"));
+ isrc->xi_pirq = vector;
+ isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+ isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+ return (0);
+}
+
+int
+xen_register_msi(device_t dev, int vector, int count)
+{
+ struct physdev_map_pirq msi_irq;
+ struct xenisrc *isrc;
+ int ret;
+
+ memset(&msi_irq, 0, sizeof(msi_irq));
+ msi_irq.domid = DOMID_SELF;
+ msi_irq.type = count == 1 ?
+ MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI;
+ msi_irq.index = -1;
+ msi_irq.pirq = -1;
+ msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16);
+ msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev);
+ msi_irq.entry_nr = count;
+
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq);
+ if (ret != 0)
+ return (ret);
+ if (count != msi_irq.entry_nr) {
+ panic("unable to setup all requested MSI vectors "
+ "(expected %d got %d)", count, msi_irq.entry_nr);
+ }
+
+ mtx_lock(&xen_intr_isrc_lock);
+ for (int i = 0; i < count; i++) {
+ isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i);
+ KASSERT(isrc != NULL,
+ ("xen: unable to allocate isrc for interrupt"));
+ isrc->xi_pirq = msi_irq.pirq + i;
+ /* MSI interrupts are always edge triggered */
+ isrc->xi_edgetrigger = 1;
+ }
+ mtx_unlock(&xen_intr_isrc_lock);
+
+ return (0);
+}
+
+int
+xen_release_msi(int vector)
+{
+ struct physdev_unmap_pirq unmap;
+ struct xenisrc *isrc;
+ int ret;
+
+ isrc = (struct xenisrc *)intr_lookup_source(vector);
+ if (isrc == NULL)
+ return (ENXIO);
+
+ unmap.pirq = isrc->xi_pirq;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
+ if (ret != 0)
+ return (ret);
+
+ xen_intr_release_isrc(isrc);
+
+ return (0);
+}
+
+int
xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...)
{
char descr[MAXCOMLEN + 1];
@@ -1195,22 +1517,24 @@
va_start(ap, fmt);
vsnprintf(descr, sizeof(descr), fmt, ap);
va_end(ap);
- return (intr_describe(isrc->xi_vector, port_handle, descr));
+ return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr));
}
void
xen_intr_unbind(xen_intr_handle_t *port_handlep)
{
- struct intr_handler *handler;
struct xenisrc *isrc;
- handler = *port_handlep;
+ KASSERT(port_handlep != NULL,
+ ("NULL xen_intr_handle_t passed to xen_intr_unbind"));
+
+ isrc = xen_intr_isrc(*port_handlep);
*port_handlep = NULL;
- isrc = xen_intr_isrc(handler);
if (isrc == NULL)
return;
- intr_remove_handler(handler);
+ if (isrc->xi_cookie != NULL)
+ intr_remove_handler(isrc->xi_cookie);
xen_intr_release_isrc(isrc);
}
@@ -1240,3 +1564,96 @@
return (isrc->xi_port);
}
+
+int
+xen_intr_add_handler(const char *name, driver_filter_t filter,
+ driver_intr_t handler, void *arg, enum intr_type flags,
+ xen_intr_handle_t handle)
+{
+ struct xenisrc *isrc;
+ int error;
+
+ isrc = xen_intr_isrc(handle);
+ if (isrc == NULL || isrc->xi_cookie != NULL)
+ return (EINVAL);
+
+ error = intr_add_handler(name, isrc->xi_vector, filter, handler, arg,
+ flags|INTR_EXCL, &isrc->xi_cookie);
+ if (error != 0) {
+ printf(
+ "%s: xen_intr_add_handler: intr_add_handler failed: %d\n",
+ name, error);
+ }
+
+ return (error);
+}
+
+#ifdef DDB
+static const char *
+xen_intr_print_type(enum evtchn_type type)
+{
+ static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = {
+ [EVTCHN_TYPE_UNBOUND] = "UNBOUND",
+ [EVTCHN_TYPE_PIRQ] = "PIRQ",
+ [EVTCHN_TYPE_VIRQ] = "VIRQ",
+ [EVTCHN_TYPE_IPI] = "IPI",
+ [EVTCHN_TYPE_PORT] = "PORT",
+ };
+
+ if (type >= EVTCHN_TYPE_COUNT)
+ return ("UNKNOWN");
+
+ return (evtchn_type_to_string[type]);
+}
+
+static void
+xen_intr_dump_port(struct xenisrc *isrc)
+{
+ struct xen_intr_pcpu_data *pcpu;
+ shared_info_t *s = HYPERVISOR_shared_info;
+ int i;
+
+ db_printf("Port %d Type: %s\n",
+ isrc->xi_port, xen_intr_print_type(isrc->xi_type));
+ if (isrc->xi_type == EVTCHN_TYPE_PIRQ) {
+ db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d "
+ "NeedsEOI: %d\n",
+ isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger,
+ !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map));
+ }
+ if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
+ db_printf("\tVirq: %d\n", isrc->xi_virq);
+
+ db_printf("\tMasked: %d Pending: %d\n",
+ !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]),
+ !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0]));
+
+ db_printf("\tPer-CPU Masks: ");
+ CPU_FOREACH(i) {
+ pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
+ db_printf("cpu#%d: %d ", i,
+ !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled));
+ }
+ db_printf("\n");
+}
+
+DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn)
+{
+ int i;
+
+ if (!xen_domain()) {
+ db_printf("Only available on Xen guests\n");
+ return;
+ }
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+ struct xenisrc *isrc;
+
+ isrc = xen_intr_port_to_isrc[i];
+ if (isrc == NULL)
+ continue;
+
+ xen_intr_dump_port(isrc);
+ }
+}
+#endif /* DDB */
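
For reference, the DB_SHOW_COMMAND() added above is only built when the kernel is configured with DDB and, as the guard shows, only reports anything when running as a Xen guest. From the debugger prompt it is invoked as:

	db> show xen_evtchn

which walks xen_intr_port_to_isrc[] and prints one block per bound event channel in the format produced by xen_intr_dump_port().
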
Added: trunk/sys/x86/xen/xen_msi.c
===================================================================
--- trunk/sys/x86/xen/xen_msi.c (rev 0)
+++ trunk/sys/x86/xen/xen_msi.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,134 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_msi.c 344912 2019-03-08 01:04:19Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <x86/apicreg.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <machine/specialreg.h>
+#include <dev/pci/pcivar.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+static struct mtx msi_lock;
+static u_int msi_last_irq;
+
+void
+xen_msi_init(void)
+{
+
+ MPASS(num_io_irqs > 0);
+ first_msi_irq = min(MINIMUM_MSI_INT, num_io_irqs);
+ if (num_msi_irqs > UINT_MAX - first_msi_irq)
+ panic("num_msi_irqs too high");
+ num_io_irqs = first_msi_irq + num_msi_irqs;
+
+ mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
+}
+
+/*
+ * Try to allocate 'count' interrupt sources with contiguous IDT values.
+ */
+int
+xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs)
+{
+ int i, ret = 0;
+
+ mtx_lock(&msi_lock);
+
+ /* If we would exceed the max, give up. */
+ if (msi_last_irq + count > num_msi_irqs) {
+ mtx_unlock(&msi_lock);
+ return (ENXIO);
+ }
+
+ /* Allocate MSI vectors */
+ for (i = 0; i < count; i++)
+ irqs[i] = first_msi_irq + msi_last_irq++;
+
+ mtx_unlock(&msi_lock);
+
+ ret = xen_register_msi(dev, irqs[0], count);
+ if (ret != 0)
+ return (ret);
+
+ for (i = 0; i < count; i++)
+ nexus_add_irq(irqs[i]);
+
+ return (0);
+}
+
+int
+xen_msi_release(int *irqs, int count)
+{
+ int i, ret;
+
+ for (i = 0; i < count; i++) {
+ ret = xen_release_msi(irqs[i]);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+xen_msi_map(int irq, uint64_t *addr, uint32_t *data)
+{
+
+ return (0);
+}
+
+int
+xen_msix_alloc(device_t dev, int *irq)
+{
+
+ return (ENXIO);
+}
+
+int
+xen_msix_release(int irq)
+{
+
+ return (ENOENT);
+}
Property changes on: trunk/sys/x86/xen/xen_msi.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
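
These xen_msi_* entry points are only reached through the Xen nexus added below; drivers keep using the regular PCI MSI API. A rough driver-side sketch of how an allocation ends up here on a Xen Dom0 kernel (the my_* names are hypothetical and error unwinding is trimmed):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <machine/bus.h>
#include <machine/resource.h>

#include <dev/pci/pcivar.h>

static int
my_intr_filter(void *arg)
{

	/* Hypothetical: acknowledge the device and claim the interrupt. */
	return (FILTER_HANDLED);
}

static int
my_setup_msi(device_t dev)
{
	struct resource *irq;
	void *cookie;	/* would normally live in the softc */
	int count, error, rid;

	count = 1;
	/* On Xen this ends up in xen_msi_alloc() via the nexus. */
	error = pci_alloc_msi(dev, &count);
	if (error != 0)
		return (error);

	rid = 1;	/* MSI IRQ resources start at rid 1 */
	irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE);
	if (irq == NULL) {
		pci_release_msi(dev);
		return (ENXIO);
	}

	return (bus_setup_intr(dev, irq, INTR_TYPE_MISC | INTR_MPSAFE,
	    my_intr_filter, NULL, dev, &cookie));
}
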
Added: trunk/sys/x86/xen/xen_nexus.c
===================================================================
--- trunk/sys/x86/xen/xen_nexus.c (rev 0)
+++ trunk/sys/x86/xen/xen_nexus.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,168 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_nexus.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include <x86/init.h>
+#include <machine/nexusvar.h>
+#include <machine/intr_machdep.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+#include "pcib_if.h"
+
+/*
+ * Xen nexus(4) driver.
+ */
+static int
+nexus_xen_probe(device_t dev)
+{
+
+ if (!xen_pv_domain())
+ return (ENXIO);
+
+ return (BUS_PROBE_SPECIFIC);
+}
+
+static int
+nexus_xen_attach(device_t dev)
+{
+ int error;
+ device_t acpi_dev = NULL;
+
+ nexus_init_resources();
+ bus_generic_probe(dev);
+
+ if (xen_initial_domain()) {
+ /* Disable some ACPI devices that are not usable by Dom0 */
+ acpi_cpu_disabled = true;
+ acpi_hpet_disabled = true;
+ acpi_timer_disabled = true;
+
+ acpi_dev = BUS_ADD_CHILD(dev, 10, "acpi", 0);
+ if (acpi_dev == NULL)
+ panic("Unable to add ACPI bus to Xen Dom0");
+ }
+
+ error = bus_generic_attach(dev);
+ if (xen_initial_domain() && (error == 0))
+ acpi_install_wakeup_handler(device_get_softc(acpi_dev));
+
+ return (error);
+}
+
+static int
+nexus_xen_config_intr(device_t dev, int irq, enum intr_trigger trig,
+ enum intr_polarity pol)
+{
+ int ret;
+
+ /*
+ * ISA and PCI intline IRQs are not preregistered on Xen, so
+ * intercept calls to configure those and register them on the fly.
+ */
+ if ((irq < first_msi_irq) && (intr_lookup_source(irq) == NULL)) {
+ ret = xen_register_pirq(irq, trig, pol);
+ if (ret != 0)
+ return (ret);
+ nexus_add_irq(irq);
+ }
+ return (intr_config_intr(irq, trig, pol));
+}
+
+static int
+nexus_xen_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+
+ return (xen_msix_alloc(dev, irq));
+}
+
+static int
+nexus_xen_release_msix(device_t pcib, device_t dev, int irq)
+{
+
+ return (xen_msix_release(irq));
+}
+
+static int
+nexus_xen_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
+{
+
+ return (xen_msi_alloc(dev, count, maxcount, irqs));
+}
+
+static int
+nexus_xen_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+
+ return (xen_msi_release(irqs, count));
+}
+
+static int
+nexus_xen_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data)
+{
+
+ return (xen_msi_map(irq, addr, data));
+}
+
+static device_method_t nexus_xen_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, nexus_xen_probe),
+ DEVMETHOD(device_attach, nexus_xen_attach),
+
+ /* INTR */
+ DEVMETHOD(bus_config_intr, nexus_xen_config_intr),
+
+ /* MSI */
+ DEVMETHOD(pcib_alloc_msi, nexus_xen_alloc_msi),
+ DEVMETHOD(pcib_release_msi, nexus_xen_release_msi),
+ DEVMETHOD(pcib_alloc_msix, nexus_xen_alloc_msix),
+ DEVMETHOD(pcib_release_msix, nexus_xen_release_msix),
+ DEVMETHOD(pcib_map_msi, nexus_xen_map_msi),
+
+ { 0, 0 }
+};
+
+DEFINE_CLASS_1(nexus, nexus_xen_driver, nexus_xen_methods, 1, nexus_driver);
+static devclass_t nexus_devclass;
+
+DRIVER_MODULE(nexus_xen, root, nexus_xen_driver, nexus_devclass, 0, 0);
Property changes on: trunk/sys/x86/xen/xen_nexus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
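
The DEFINE_CLASS_1() at the end is what lets this driver inherit every nexus method it does not explicitly override. The same pattern in isolation, with hypothetical names (mybase_driver stands for an existing base driver class):

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>

extern driver_t mybase_driver;		/* provided by the base driver */

static int
my_sub_probe(device_t dev)
{

	device_set_desc(dev, "example subclass of mybase");
	return (BUS_PROBE_SPECIFIC);
}

static device_method_t my_sub_methods[] = {
	/* Override only what differs; the rest is inherited from mybase. */
	DEVMETHOD(device_probe, my_sub_probe),

	DEVMETHOD_END
};

/* (class name, new driver, methods, softc size, base driver) */
DEFINE_CLASS_1(mybase, my_sub_driver, my_sub_methods, 0, mybase_driver);
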
Added: trunk/sys/x86/xen/xen_pci_bus.c
===================================================================
--- trunk/sys/x86/xen/xen_pci_bus.c (rev 0)
+++ trunk/sys/x86/xen/xen_pci_bus.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,91 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_pci_bus.c 275649 2014-12-09 18:03:25Z royger $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_pci.h>
+
+#include "pcib_if.h"
+#include "pci_if.h"
+
+void
+xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
+ uint16_t data)
+{
+ struct pci_devinfo *dinfo = device_get_ivars(child);
+ struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+ /* Enable MSI in the control register. */
+ msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE;
+ pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+ msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_disable_msi_method(device_t dev, device_t child)
+{
+ struct pci_devinfo *dinfo = device_get_ivars(child);
+ struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+ msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
+ pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+ msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_child_added_method(device_t dev, device_t child)
+{
+ struct pci_devinfo *dinfo;
+ struct physdev_pci_device_add add_pci;
+ int error;
+
+ dinfo = device_get_ivars(child);
+ KASSERT((dinfo != NULL),
+ ("xen_pci_add_child_method called with NULL dinfo"));
+
+ bzero(&add_pci, sizeof(add_pci));
+ add_pci.seg = dinfo->cfg.domain;
+ add_pci.bus = dinfo->cfg.bus;
+ add_pci.devfn = (dinfo->cfg.slot << 3) | dinfo->cfg.func;
+ error = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add_pci);
+ if (error)
+ panic("unable to add device bus %u devfn %u error: %d\n",
+ add_pci.bus, add_pci.devfn, error);
+}
Property changes on: trunk/sys/x86/xen/xen_pci_bus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
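
Nothing in this file wires the helpers up; that is done by the Xen PCI bus front-end outside this diff. Assuming the pci_enable_msi/pci_disable_msi/pci_child_added kobj methods that were added to pci_if.m for this purpose, the hookup looks roughly like this (my_xen_pci_methods is a hypothetical table):

static device_method_t my_xen_pci_methods[] = {
	/* ... the usual PCI bus methods ... */
	DEVMETHOD(pci_enable_msi, xen_pci_enable_msi_method),
	DEVMETHOD(pci_disable_msi, xen_pci_disable_msi_method),
	DEVMETHOD(pci_child_added, xen_pci_child_added_method),

	DEVMETHOD_END
};
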
Added: trunk/sys/x86/xen/xenpv.c
===================================================================
--- trunk/sys/x86/xen/xenpv.c (rev 0)
+++ trunk/sys/x86/xen/xenpv.c 2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,203 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xenpv.c 331017 2018-03-15 19:08:33Z kevans $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/pcpu.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+#include <sys/limits.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+
+#include <xen/xen-os.h>
+#include <xen/gnttab.h>
+
+#include "xenmem_if.h"
+
+/*
+ * Allocate unused physical memory above 4GB in order to map memory
+ * from foreign domains. We use memory starting at 4GB in order to
+ * prevent clashes with MMIO/ACPI regions.
+ *
+ * Since this is not possible on i386, just use any available memory
+ * chunk and hope we don't clash with anything else.
+ */
+#ifdef __amd64__
+#define LOW_MEM_LIMIT 0x100000000ul
+#else
+#define LOW_MEM_LIMIT 0
+#endif
+
+static devclass_t xenpv_devclass;
+
+static void
+xenpv_identify(driver_t *driver, device_t parent)
+{
+ if (!xen_domain())
+ return;
+
+ /* Make sure there's only one xenpv device. */
+ if (devclass_get_device(xenpv_devclass, 0))
+ return;
+
+ /*
+ * The xenpv bus should be the last to attach in order
+ * to properly detect if an ISA bus has already been added.
+ */
+ if (BUS_ADD_CHILD(parent, UINT_MAX, "xenpv", 0) == NULL)
+ panic("Unable to attach xenpv bus.");
+}
+
+static int
+xenpv_probe(device_t dev)
+{
+
+ device_set_desc(dev, "Xen PV bus");
+ return (BUS_PROBE_NOWILDCARD);
+}
+
+static int
+xenpv_attach(device_t dev)
+{
+ device_t child;
+
+ /*
+ * Let our child drivers identify any child devices that they
+ * can find. Once that is done attach any devices that we
+ * found.
+ */
+ bus_generic_probe(dev);
+ bus_generic_attach(dev);
+
+ if (!devclass_get_device(devclass_find("isa"), 0)) {
+ child = BUS_ADD_CHILD(dev, 0, "isa", 0);
+ if (child == NULL)
+ panic("Failed to attach ISA bus.");
+ device_probe_and_attach(child);
+ }
+
+ return (0);
+}
+
+static struct resource *
+xenpv_alloc_physmem(device_t dev, device_t child, int *res_id, size_t size)
+{
+ struct resource *res;
+ vm_paddr_t phys_addr;
+ int error;
+
+ res = bus_alloc_resource(child, SYS_RES_MEMORY, res_id, LOW_MEM_LIMIT,
+ ~0, size, RF_ACTIVE);
+ if (res == NULL)
+ return (NULL);
+
+ phys_addr = rman_get_start(res);
+ error = vm_phys_fictitious_reg_range(phys_addr, phys_addr + size,
+ VM_MEMATTR_DEFAULT);
+ if (error) {
+ bus_release_resource(child, SYS_RES_MEMORY, *res_id, res);
+ return (NULL);
+ }
+
+ return (res);
+}
+
+static int
+xenpv_free_physmem(device_t dev, device_t child, int res_id, struct resource *res)
+{
+ vm_paddr_t phys_addr;
+ size_t size;
+
+ phys_addr = rman_get_start(res);
+ size = rman_get_size(res);
+
+ vm_phys_fictitious_unreg_range(phys_addr, phys_addr + size);
+ return (bus_release_resource(child, SYS_RES_MEMORY, res_id, res));
+}
+
+static device_method_t xenpv_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, xenpv_identify),
+ DEVMETHOD(device_probe, xenpv_probe),
+ DEVMETHOD(device_attach, xenpv_attach),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_add_child, bus_generic_add_child),
+ DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+
+ /* Interface to allocate memory for foreign mappings */
+ DEVMETHOD(xenmem_alloc, xenpv_alloc_physmem),
+ DEVMETHOD(xenmem_free, xenpv_free_physmem),
+
+ DEVMETHOD_END
+};
+
+static driver_t xenpv_driver = {
+ "xenpv",
+ xenpv_methods,
+ 0,
+};
+
+DRIVER_MODULE(xenpv, nexus, xenpv_driver, xenpv_devclass, 0, 0);
+
+struct resource *
+xenmem_alloc(device_t dev, int *res_id, size_t size)
+{
+ device_t parent;
+
+ parent = device_get_parent(dev);
+ if (parent == NULL)
+ return (NULL);
+ return (XENMEM_ALLOC(parent, dev, res_id, size));
+}
+
+int
+xenmem_free(device_t dev, int res_id, struct resource *res)
+{
+ device_t parent;
+
+ parent = device_get_parent(dev);
+ if (parent == NULL)
+ return (ENXIO);
+ return (XENMEM_FREE(parent, dev, res_id, res));
+}
Property changes on: trunk/sys/x86/xen/xenpv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
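
xenmem_alloc() and xenmem_free() are the consumer-facing wrappers for the XENMEM_ALLOC/XENMEM_FREE methods implemented above (the grant-table code is one in-tree user). A minimal consumer sketch, assuming dev sits below the xenpv bus and the prototypes above are in scope; the my_* names are hypothetical:

static struct resource *my_foreign_res;
static int my_foreign_rid;

static int
my_foreign_setup(device_t dev)
{

	my_foreign_rid = 0;
	my_foreign_res = xenmem_alloc(dev, &my_foreign_rid, PAGE_SIZE);
	if (my_foreign_res == NULL)
		return (ENOMEM);

	/*
	 * rman_get_start(my_foreign_res) is now a hole in the physical
	 * address space suitable for mapping foreign pages.
	 */
	return (0);
}

static void
my_foreign_teardown(device_t dev)
{

	if (my_foreign_res != NULL)
		xenmem_free(dev, my_foreign_rid, my_foreign_res);
}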