From laffer1 at midnightbsd.org  Sat Feb  8 14:26:24 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:26:24 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12301] trunk/sys/xen/interface/io: sync with
 FreeBSD 11-stable
Message-ID: <202002081926.018JQO74060620@stargazer.midnightbsd.org>

Revision: 12301
          http://svnweb.midnightbsd.org/src/?rev=12301
Author:   laffer1
Date:     2020-02-08 14:26:24 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/io/blkif.h
    trunk/sys/xen/interface/io/console.h
    trunk/sys/xen/interface/io/fbif.h
    trunk/sys/xen/interface/io/kbdif.h
    trunk/sys/xen/interface/io/libxenvchan.h
    trunk/sys/xen/interface/io/netif.h
    trunk/sys/xen/interface/io/pciif.h
    trunk/sys/xen/interface/io/protocols.h
    trunk/sys/xen/interface/io/ring.h
    trunk/sys/xen/interface/io/tpmif.h
    trunk/sys/xen/interface/io/usbif.h
    trunk/sys/xen/interface/io/vscsiif.h
    trunk/sys/xen/interface/io/xenbus.h
    trunk/sys/xen/interface/io/xs_wire.h

Modified: trunk/sys/xen/interface/io/blkif.h
===================================================================
--- trunk/sys/xen/interface/io/blkif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/blkif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -60,7 +60,7 @@
  * All data in the XenStore is stored as strings.  Nodes specifying numeric
  * values are encoded in decimal.  Integer value ranges listed below are
  * expressed as fixed sized integer types capable of storing the conversion
- * of a properly formatted node string, without loss of information.
+ * of a properly formated node string, without loss of information.
  *
  * Any specified default value is in effect if the corresponding XenBus node
  * is not present in the XenStore.
@@ -89,10 +89,16 @@
  * params
  *      Values:         string
  *
- *      Data used by the backend driver to locate and configure the backing
- *      device.  The format and semantics of this data vary according to the
- *      backing device in use and are outside the scope of this specification.
+ *      A free formatted string providing sufficient information for the
+ *      backend driver to open the backing device.  (e.g. the path to the
+ *      file or block device representing the backing store.)
  *
+ * physical-device
+ *      Values:         "MAJOR:MINOR"
+ *
+ *      MAJOR and MINOR are the major number and minor number of the
+ *      backing device respectively.
+ *
  * type
  *      Values:         "file", "phy", "tap"
  *
@@ -319,7 +325,7 @@
  *      access (even when it should be read-only). If the frontend hits the
  *      maximum number of allowed persistently mapped grants, it can fallback
  *      to non persistent mode. This will cause a performance degradation,
- *      since the the backend driver will still try to map those grants
+ *      since the backend driver will still try to map those grants
  *      persistently. Since the persistent grants protocol is compatible with
  *      the previous protocol, a frontend driver can choose to work in
  *      persistent mode even when the backend doesn't support it.
@@ -494,7 +500,7 @@
  * discarded region on the device must be rendered unrecoverable before the
  * command returns.
  *
- * This operation is analogous to performing a trim (ATA) or unmap (SCSI),
+ * This operation is analogous to performing a trim (ATA) or unamp (SCSI),
  * command on a native device.
  *
  * More information about trim/unmap operations can be found at:
@@ -559,7 +565,6 @@
     /* @last_sect: last sector in frame to transfer (inclusive).     */
     uint8_t     first_sect, last_sect;
 };
-typedef struct blkif_request_segment blkif_request_segment_t;
 
 /*
  * Starting ring element for any I/O request.
@@ -570,7 +575,7 @@
     blkif_vdev_t   handle;       /* only for read/write requests         */
     uint64_t       id;           /* private guest value, echoed in resp  */
     blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-    blkif_request_segment_t seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 typedef struct blkif_request blkif_request_t;
 

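As a rough illustration of the request layout touched above (the segment array now uses the plain struct tag), a frontend might fill a one-segment read request as sketched below. The helper is hypothetical; grant allocation, ring insertion and the include path are assumptions, not part of the header.

#include <xen/interface/io/blkif.h>

/* Hypothetical helper: populate a single-segment BLKIF_OP_READ request.
 * The caller supplies an already-granted buffer page (gref). */
static void
fill_read_request(blkif_request_t *req, grant_ref_t gref,
                  blkif_vdev_t handle, uint64_t id, blkif_sector_t start)
{
    req->operation     = BLKIF_OP_READ;
    req->nr_segments   = 1;
    req->handle        = handle;
    req->id            = id;          /* echoed back in the response */
    req->sector_number = start;
    req->seg[0].gref       = gref;    /* granted buffer page */
    req->seg[0].first_sect = 0;
    req->seg[0].last_sect  = 7;       /* eight 512-byte sectors: one 4K page */
}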
Modified: trunk/sys/xen/interface/io/console.h
===================================================================
--- trunk/sys/xen/interface/io/console.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/console.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -44,7 +44,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/io/fbif.h
===================================================================
--- trunk/sys/xen/interface/io/fbif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/fbif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -169,7 +169,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/io/kbdif.h
===================================================================
--- trunk/sys/xen/interface/io/kbdif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/kbdif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -125,7 +125,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/io/libxenvchan.h
===================================================================
--- trunk/sys/xen/interface/io/libxenvchan.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/libxenvchan.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -22,8 +22,7 @@
  *  Lesser General Public License for more details.
  *
  *  You should have received a copy of the GNU Lesser General Public
- *  License along with this library; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *  License along with this library; If not, see <http://www.gnu.org/licenses/>.
  *
  * @section DESCRIPTION
  *

Modified: trunk/sys/xen/interface/io/netif.h
===================================================================
--- trunk/sys/xen/interface/io/netif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/netif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -32,6 +32,24 @@
 #include "../grant_table.h"
 
 /*
+ * Older implementation of Xen network frontend / backend has an
+ * implicit dependency on the MAX_SKB_FRAGS as the maximum number of
+ * ring slots a skb can use. Netfront / netback may not work as
+ * expected when frontend and backend have different MAX_SKB_FRAGS.
+ *
+ * A better approach is to add mechanism for netfront / netback to
+ * negotiate this value. However we cannot fix all possible
+ * frontends, so we need to define a value which states the minimum
+ * slots backend must support.
+ *
+ * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS
+ * (18), which is proved to work with most frontends. Any new backend
+ * which doesn't negotiate with frontend should expect frontend to
+ * send a valid packet using slots up to this value.
+ */
+#define XEN_NETIF_NR_SLOTS_MIN 18
+
+/*
  * Notifications after enqueuing any type of message should be conditional on
  * the appropriate req_event or rsp_event field in the shared ring.
  * If the client sends notification for rx requests then it should specify
@@ -40,16 +58,226 @@
  */
 
 /*
+ * "feature-split-event-channels" is introduced to separate guest TX
+ * and RX notification. Backend either doesn't support this feature or
+ * advertises it via xenstore as 0 (disabled) or 1 (enabled).
+ *
+ * To make use of this feature, frontend should allocate two event
+ * channels for TX and RX, advertise them to backend as
+ * "event-channel-tx" and "event-channel-rx" respectively. If frontend
+ * doesn't want to use this feature, it just writes "event-channel"
+ * node as before.
+ */
+
+/*
+ * Multiple transmit and receive queues:
+ * If supported, the backend will write the key "multi-queue-max-queues" to
+ * the directory for that vif, and set its value to the maximum supported
+ * number of queues.
+ * Frontends that are aware of this feature and wish to use it can write the
+ * key "multi-queue-num-queues", set to the number they wish to use, which
+ * must be greater than zero, and no more than the value reported by the backend
+ * in "multi-queue-max-queues".
+ *
+ * Queues replicate the shared rings and event channels.
+ * "feature-split-event-channels" may optionally be used when using
+ * multiple queues, but is not mandatory.
+ *
+ * Each queue consists of one shared ring pair, i.e. there must be the same
+ * number of tx and rx rings.
+ *
+ * For frontends requesting just one queue, the usual event-channel and
+ * ring-ref keys are written as before, simplifying the backend processing
+ * to avoid distinguishing between a frontend that doesn't understand the
+ * multi-queue feature, and one that does, but requested only one queue.
+ *
+ * Frontends requesting two or more queues must not write the toplevel
+ * event-channel (or event-channel-{tx,rx}) and {tx,rx}-ring-ref keys,
+ * instead writing those keys under sub-keys having the name "queue-N" where
+ * N is the integer ID of the queue for which those keys belong. Queues
+ * are indexed from zero. For example, a frontend with two queues and split
+ * event channels must write the following set of queue-related keys:
+ *
+ * /local/domain/1/device/vif/0/multi-queue-num-queues = "2"
+ * /local/domain/1/device/vif/0/queue-0 = ""
+ * /local/domain/1/device/vif/0/queue-0/tx-ring-ref = "<ring-ref-tx0>"
+ * /local/domain/1/device/vif/0/queue-0/rx-ring-ref = "<ring-ref-rx0>"
+ * /local/domain/1/device/vif/0/queue-0/event-channel-tx = "<evtchn-tx0>"
+ * /local/domain/1/device/vif/0/queue-0/event-channel-rx = "<evtchn-rx0>"
+ * /local/domain/1/device/vif/0/queue-1 = ""
+ * /local/domain/1/device/vif/0/queue-1/tx-ring-ref = "<ring-ref-tx1>"
+ * /local/domain/1/device/vif/0/queue-1/rx-ring-ref = "<ring-ref-rx1"
+ * /local/domain/1/device/vif/0/queue-1/event-channel-tx = "<evtchn-tx1>"
+ * /local/domain/1/device/vif/0/queue-1/event-channel-rx = "<evtchn-rx1>"
+ *
+ * If there is any inconsistency in the XenStore data, the backend may
+ * choose not to connect any queues, instead treating the request as an
+ * error. This includes scenarios where more (or fewer) queues were
+ * requested than the frontend provided details for.
+ *
+ * Mapping of packets to queues is considered to be a function of the
+ * transmitting system (backend or frontend) and is not negotiated
+ * between the two. Guests are free to transmit packets on any queue
+ * they choose, provided it has been set up correctly. Guests must be
+ * prepared to receive packets on any queue they have requested be set up.
+ */
+
+/*
+ * "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum
+ * offload off or on. If it is missing then the feature is assumed to be on.
+ * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum
+ * offload on or off. If it is missing then the feature is assumed to be off.
+ */
+
+/*
+ * "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to
+ * handle large TCP packets (in IPv4 or IPv6 form respectively). Neither
+ * frontends nor backends are assumed to be capable unless the flags are
+ * present.
+ */
+
+/*
+ * "feature-multicast-control" advertises the capability to filter ethernet
+ * multicast packets in the backend. To enable use of this capability the
+ * frontend must set "request-multicast-control" before moving into the
+ * connected state.
+ *
+ * If "request-multicast-control" is set then the backend transmit side should
+ * no longer flood multicast packets to the frontend, it should instead drop any
+ * multicast packet that does not match in a filter list. The list is
+ * amended by the frontend by sending dummy transmit requests containing
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} extra-info fragments as specified below.
+ * Once enabled by the frontend, the feature cannot be disabled except by
+ * closing and re-connecting to the backend.
+ */
+
+/*
  * This is the 'wire' format for packets:
- *  Request 1: netif_tx_request -- NETTXF_* (any flags)
- * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
- * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE)
- *  Request 4: netif_tx_request -- NETTXF_more_data
- *  Request 5: netif_tx_request -- NETTXF_more_data
+ *  Request 1: netif_tx_request_t -- NETTXF_* (any flags)
+ * [Request 2: netif_extra_info_t] (only if request 1 has NETTXF_extra_info)
+ * [Request 3: netif_extra_info_t] (only if request 2 has XEN_NETIF_EXTRA_MORE)
+ *  Request 4: netif_tx_request_t -- NETTXF_more_data
+ *  Request 5: netif_tx_request_t -- NETTXF_more_data
  *  ...
- *  Request N: netif_tx_request -- 0
+ *  Request N: netif_tx_request_t -- 0
  */
 
+/*
+ * Guest transmit
+ * ==============
+ *
+ * Ring slot size is 12 octets, however not all request/response
+ * structs use the full size.
+ *
+ * tx request data (netif_tx_request_t)
+ * ------------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | grant ref             | offset    | flags     |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | size      |
+ * +-----+-----+-----+-----+
+ *
+ * grant ref: Reference to buffer page.
+ * offset: Offset within buffer page.
+ * flags: NETTXF_*.
+ * id: request identifier, echoed in response.
+ * size: packet size in bytes.
+ *
+ * tx response (netif_tx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | status    | unused                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | unused                |
+ * +-----+-----+-----+-----+
+ *
+ * id: reflects id in transmit request
+ * status: NETIF_RSP_*
+ *
+ * Guest receive
+ * =============
+ *
+ * Ring slot size is 8 octets.
+ *
+ * rx request (netif_rx_request_t)
+ * -------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | pad       | gref                  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: request identifier, echoed in response.
+ * gref: reference to incoming granted frame.
+ *
+ * rx response (netif_rx_response_t)
+ * ---------------------------------
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | id        | offset    | flags     | status    |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * id: reflects id in receive request
+ * offset: offset in page of start of received packet
+ * flags: NETRXF_*
+ * status: -ve: NETIF_RSP_*; +ve: Rx'ed pkt size.
+ *
+ * Extra Info
+ * ==========
+ *
+ * Can be present if initial request has NET{T,R}XF_extra_info, or
+ * previous extra request has XEN_NETIF_EXTRA_MORE.
+ *
+ * The struct therefore needs to fit into either a tx or rx slot and
+ * is therefore limited to 8 octets.
+ *
+ * extra info (netif_extra_info_t)
+ * -------------------------------
+ *
+ * General format:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| type specific data                |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * | padding for tx        |
+ * +-----+-----+-----+-----+
+ *
+ * type: XEN_NETIF_EXTRA_TYPE_*
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * padding for tx: present only in the tx case due to 8 octet limit
+ *     from rx case. Not shown in type specific entries below.
+ *
+ * XEN_NETIF_EXTRA_TYPE_GSO:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| size      |type | pad | features  |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_GSO
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * size: Maximum payload size of each segment.
+ * type: XEN_NETIF_GSO_TYPE_*
+ * features: XEN_NETIF_GSO_FEAT_*
+ *
+ * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
+ *
+ *    0     1     2     3     4     5     6     7  octet
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * |type |flags| addr                              |
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ *
+ * type: Must be XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}
+ * flags: XEN_NETIF_EXTRA_FLAG_*
+ * addr: address to add/remove
+ */
+
 /* Protocol checksum field is blank in the packet (hardware offload)? */
 #define _NETTXF_csum_blank     (0)
 #define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
@@ -66,14 +294,13 @@
 #define _NETTXF_extra_info     (3)
 #define  NETTXF_extra_info     (1U<<_NETTXF_extra_info)
 
+#define XEN_NETIF_MAX_TX_SIZE 0xFFFF
 struct netif_tx_request {
     grant_ref_t gref;      /* Reference to buffer page */
     uint16_t offset;       /* Offset within buffer page */
     uint16_t flags;        /* NETTXF_* */
     uint16_t id;           /* Echoed in response message. */
-    uint16_t size;         /* For the first request in a packet, the packet 
-			      size in bytes.  For subsequent requests, the 
-			      size of that request's associated data in bytes*/
+    uint16_t size;         /* Packet size in bytes.       */
 };
 typedef struct netif_tx_request netif_tx_request_t;
 
@@ -84,16 +311,18 @@
 #define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3)  /* u.mcast */
 #define XEN_NETIF_EXTRA_TYPE_MAX       (4)
 
-/* netif_extra_info flags. */
+/* netif_extra_info_t flags. */
 #define _XEN_NETIF_EXTRA_FLAG_MORE (0)
 #define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
 
-/* GSO types - only TCPv4 currently supported. */
+/* GSO types */
+#define XEN_NETIF_GSO_TYPE_NONE         (0)
 #define XEN_NETIF_GSO_TYPE_TCPV4        (1)
+#define XEN_NETIF_GSO_TYPE_TCPV6        (2)
 
 /*
- * This structure needs to fit within both netif_tx_request and
- * netif_rx_response for compatibility.
+ * This structure needs to fit within both netif_tx_request_t and
+ * netif_rx_response_t for compatibility.
  */
 struct netif_extra_info {
     uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
@@ -128,14 +357,6 @@
 
         /*
          * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
-         * Backend advertises availability via 'feature-multicast-control'
-         * xenbus node containing value '1'.
-         * Frontend requests this feature by advertising
-         * 'request-multicast-control' xenbus node containing value '1'.
-         * If multicast control is requested then multicast flooding is
-         * disabled and the frontend must explicitly register its interest
-         * in multicast groups using dummy transmit requests containing
-         * MCAST_{ADD,DEL} extra-info fragments.
          */
         struct {
             uint8_t addr[6]; /* Address to add/remove. */
@@ -154,6 +375,7 @@
 
 struct netif_rx_request {
     uint16_t    id;        /* Echoed in response message.        */
+    uint16_t    pad;
     grant_ref_t gref;      /* Reference to incoming granted frame */
 };
 typedef struct netif_rx_request netif_rx_request_t;
@@ -174,15 +396,11 @@
 #define _NETRXF_extra_info     (3)
 #define  NETRXF_extra_info     (1U<<_NETRXF_extra_info)
 
-/* GSO Prefix descriptor. */
-#define _NETRXF_gso_prefix     (4)
-#define  NETRXF_gso_prefix     (1U<<_NETRXF_gso_prefix)
-
 struct netif_rx_response {
     uint16_t id;
     uint16_t offset;       /* Offset in page of start of received packet  */
     uint16_t flags;        /* NETRXF_* */
-    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed response size. */
+    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
 };
 typedef struct netif_rx_response netif_rx_response_t;
 
@@ -196,7 +414,7 @@
 #define NETIF_RSP_DROPPED         -2
 #define NETIF_RSP_ERROR           -1
 #define NETIF_RSP_OKAY             0
-/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
+/* No response: used for auxiliary requests (e.g., netif_extra_info_t). */
 #define NETIF_RSP_NULL             1
 
 #endif
@@ -204,7 +422,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

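To make the wire format described above concrete, the sketch below fills the first transmit slot plus a GSO extra-info slot for a TCPv4 packet. It is illustrative only: the ring handling, grant reference and MSS come from elsewhere, and the flag combination (checksum blank, data validated) is just one plausible choice.

#include <xen/interface/io/netif.h>

/* Hypothetical helper: first tx slot with NETTXF_extra_info, followed by
 * a XEN_NETIF_EXTRA_TYPE_GSO descriptor in the next slot. */
static void
queue_gso_packet(netif_tx_request_t *txreq, netif_extra_info_t *extra,
                 grant_ref_t gref, uint16_t offset, uint16_t pkt_len,
                 uint16_t mss, uint16_t id)
{
    txreq->gref   = gref;
    txreq->offset = offset;
    txreq->flags  = NETTXF_csum_blank | NETTXF_data_validated |
                    NETTXF_extra_info;
    txreq->id     = id;
    txreq->size   = pkt_len;            /* total packet size in the first slot */

    extra->type  = XEN_NETIF_EXTRA_TYPE_GSO;
    extra->flags = 0;                   /* no XEN_NETIF_EXTRA_FLAG_MORE */
    extra->u.gso.size     = mss;        /* maximum payload per segment */
    extra->u.gso.type     = XEN_NETIF_GSO_TYPE_TCPV4;
    extra->u.gso.pad      = 0;
    extra->u.gso.features = 0;
}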
Modified: trunk/sys/xen/interface/io/pciif.h
===================================================================
--- trunk/sys/xen/interface/io/pciif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/pciif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -47,6 +47,7 @@
 #define XEN_PCI_OP_aer_resume		(7)
 #define XEN_PCI_OP_aer_mmio		(8)
 #define XEN_PCI_OP_aer_slotreset	(9)
+#define XEN_PCI_OP_enable_multi_msi	(10)
 
 /* xen_pci_op error numbers */
 #define XEN_PCI_ERR_success          (0)
@@ -117,7 +118,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/io/protocols.h
===================================================================
--- trunk/sys/xen/interface/io/protocols.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/protocols.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -19,6 +19,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2008, Keir Fraser
  */
 
 #ifndef __XEN_PROTOCOLS_H__
@@ -26,7 +28,6 @@
 
 #define XEN_IO_PROTO_ABI_X86_32     "x86_32-abi"
 #define XEN_IO_PROTO_ABI_X86_64     "x86_64-abi"
-#define XEN_IO_PROTO_ABI_IA64       "ia64-abi"
 #define XEN_IO_PROTO_ABI_ARM        "arm-abi"
 
 #if defined(__i386__)
@@ -33,9 +34,7 @@
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
 #elif defined(__x86_64__)
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64
-#elif defined(__ia64__)
-# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_ARM
 #else
 # error arch fixup needed here

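Since the ABI strings above are what appears in the "protocol" XenStore node referenced later in this change (usbif.h, vscsiif.h), a backend check might look like the sketch below. This is a minimal sketch that assumes the node value has already been read; an absent node means the native ABI per the defaults documented in those headers.

#include <string.h>
#include <xen/interface/io/protocols.h>

/* Hypothetical check: does the frontend's advertised protocol match
 * the ABI this backend was built for? */
static int
protocol_matches_native(const char *proto)
{
    if (proto == NULL || proto[0] == '\0')
        return 1;   /* missing node: default is XEN_IO_PROTO_ABI_NATIVE */
    return strcmp(proto, XEN_IO_PROTO_ABI_NATIVE) == 0;
}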
Modified: trunk/sys/xen/interface/io/ring.h
===================================================================
--- trunk/sys/xen/interface/io/ring.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/ring.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -46,15 +46,9 @@
 #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
 
 /*
- * The amount of space reserved in the shared ring for accounting information.
- */
-#define __RING_HEADER_SIZE(_s) \
-    ((intptr_t)(_s)->ring - (intptr_t)(_s))
-
-/*
  * Calculate size of a shared ring, given the total available space for the
  * ring and indexes (_sz), and the name tag of the request/response structure.
- * A ring contains as many entries as will fit, rounded down to the nearest
+ * A ring contains as many entries as will fit, rounded down to the nearest 
  * power of two (so we can mask with (size-1) to loop around).
  */
 #define __CONST_RING_SIZE(_s, _sz) \
@@ -64,19 +58,9 @@
  * The same for passing in an actual pointer instead of a name tag.
  */
 #define __RING_SIZE(_s, _sz) \
-    (__RD32(((_sz) - __RING_HEADER_SIZE(_s)) / sizeof((_s)->ring[0])))
+    (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
 
 /*
- * The number of pages needed to support a given number of request/reponse
- * entries.  The entry count is rounded down to the nearest power of two
- * as required by the ring macros.
- */
-#define __RING_PAGES(_s, _entries)              \
-    ((__RING_HEADER_SIZE(_s)                    \
-   + (__RD32(_entries) * sizeof((_s)->ring[0])) \
-   + PAGE_SIZE - 1) / PAGE_SIZE)
-
-/*
  * Macros to make the correct C datatypes for a new kind of ring.
  * 
  * To make a new ring datatype, you need to have two message structures,
@@ -128,7 +112,7 @@
             uint8_t msg;                                                \
         } tapif_user;                                                   \
         uint8_t pvt_pad[4];                                             \
-    } private;                                                          \
+    } pvt;                                                              \
     uint8_t __pad[44];                                                  \
     union __name##_sring_entry ring[1]; /* variable-length */           \
 };                                                                      \
@@ -173,7 +157,7 @@
 #define SHARED_RING_INIT(_s) do {                                       \
     (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
     (_s)->req_event = (_s)->rsp_event = 1;                              \
-    (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \
+    (void)memset((_s)->pvt.pvt_pad, 0, sizeof((_s)->pvt.pvt_pad));      \
     (void)memset((_s)->__pad, 0, sizeof((_s)->__pad));                  \
 } while(0)
 
@@ -191,21 +175,6 @@
     (_r)->sring = (_s);                                                 \
 } while (0)
 
-/* Initialize to existing shared indexes -- for recovery */
-#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
-    (_r)->sring = (_s);                                                 \
-    (_r)->req_prod_pvt = (_s)->req_prod;                                \
-    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
-    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
-} while (0)
-
-#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
-    (_r)->sring = (_s);                                                 \
-    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
-    (_r)->req_cons = (_s)->req_prod;                                    \
-    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
-} while (0)
-
 /* How big is this ring? */
 #define RING_SIZE(_r)                                                   \
     ((_r)->nr_ents)
@@ -251,6 +220,10 @@
 #define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
     (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
 
+/* Ill-behaved frontend determination: Can there be this many requests? */
+#define RING_REQUEST_PROD_OVERFLOW(_r, _prod)                           \
+    (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
+
 #define RING_PUSH_REQUESTS(_r) do {                                     \
     xen_wmb(); /* back sees requests /before/ updated producer index */ \
     (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
@@ -332,7 +305,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

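The ring macros modified above are normally used as in the sketch below: declare the ring types once, initialise the shared page on the frontend, and push a request. The demo_request/demo_response structures are invented for illustration, and event-channel notification is only indicated by a comment.

#include <xen/interface/io/ring.h>

struct demo_request  { uint64_t id; };
struct demo_response { uint64_t id; int16_t status; };
DEFINE_RING_TYPES(demo, struct demo_request, struct demo_response);

static void
demo_send_one(struct demo_front_ring *front, struct demo_sring *sring,
              unsigned long ring_bytes)
{
    struct demo_request *req;
    int notify;

    SHARED_RING_INIT(sring);                    /* zero indexes and padding */
    FRONT_RING_INIT(front, sring, ring_bytes);  /* usually PAGE_SIZE */

    req = RING_GET_REQUEST(front, front->req_prod_pvt);
    req->id = 1;
    front->req_prod_pvt++;

    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, notify);
    if (notify) {
        /* kick the backend via the event channel (not shown) */
    }
}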
Modified: trunk/sys/xen/interface/io/tpmif.h
===================================================================
--- trunk/sys/xen/interface/io/tpmif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/tpmif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -65,12 +65,78 @@
 };
 typedef struct tpmif_tx_interface tpmif_tx_interface_t;
 
+/******************************************************************************
+ * TPM I/O interface for Xen guest OSes, v2
+ *
+ * Author: Daniel De Graaf <dgdegra at tycho.nsa.gov>
+ *
+ * This protocol emulates the request/response behavior of a TPM using a Xen
+ * shared memory interface. All interaction with the TPM is at the direction
+ * of the frontend, since a TPM (hardware or virtual) is a passive device -
+ * the backend only processes commands as requested by the frontend.
+ *
+ * The frontend sends a request to the TPM by populating the shared page with
+ * the request packet, changing the state to TPMIF_STATE_SUBMIT, and sending
+ * an event channel notification. When the backend is finished, it will set
+ * the state to TPMIF_STATE_FINISH and send an event channel notification.
+ *
+ * In order to allow long-running commands to be canceled, the frontend can
+ * at any time change the state to TPMIF_STATE_CANCEL and send a notification.
+ * The TPM can either finish the command (changing state to TPMIF_STATE_FINISH)
+ * or can cancel the command and change the state to TPMIF_STATE_IDLE. The TPM
+ * can also change the state to TPMIF_STATE_IDLE instead of TPMIF_STATE_FINISH
+ * if another reason for cancellation is required - for example, a physical
+ * TPM may cancel a command if the interface is seized by another locality.
+ *
+ * The TPM command format is defined by the TCG, and is available at
+ * http://www.trustedcomputinggroup.org/resources/tpm_main_specification
+ */
+
+enum tpmif_state {
+    TPMIF_STATE_IDLE,        /* no contents / vTPM idle / cancel complete */
+    TPMIF_STATE_SUBMIT,      /* request ready / vTPM working */
+    TPMIF_STATE_FINISH,      /* response ready / vTPM idle */
+    TPMIF_STATE_CANCEL,      /* cancel requested / vTPM working */
+};
+/* Note: The backend should only change state to IDLE or FINISH, while the
+ * frontend should only change to SUBMIT or CANCEL. Status changes do not need
+ * to use atomic operations.
+ */
+
+
+/* The shared page for vTPM request/response packets looks like:
+ *
+ *  Offset               Contents
+ *  =================================================
+ *  0                    struct tpmif_shared_page
+ *  16                   [optional] List of grant IDs
+ *  16+4*nr_extra_pages  TPM packet data
+ *
+ * If the TPM packet data extends beyond the end of a single page, the grant IDs
+ * defined in extra_pages are used as if they were mapped immediately following
+ * the primary shared page. The grants are allocated by the frontend and mapped
+ * by the backend. Before sending a request spanning multiple pages, the
+ * frontend should verify that the TPM supports such large requests by querying
+ * the TPM_CAP_PROP_INPUT_BUFFER property from the TPM.
+ */
+struct tpmif_shared_page {
+    uint32_t length;         /* request/response length in bytes */
+
+    uint8_t state;           /* enum tpmif_state */
+    uint8_t locality;        /* for the current request */
+    uint8_t pad;             /* should be zero */
+
+    uint8_t nr_extra_pages;  /* extra pages for long packets; may be zero */
+    uint32_t extra_pages[0]; /* grant IDs; length is actually nr_extra_pages */
+};
+typedef struct tpmif_shared_page tpmif_shared_page_t;
+
 #endif
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

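The v2 request flow described in the comment above, seen from the frontend, roughly follows the sketch below. The data-offset arithmetic (header size plus the grant-ID list) mirrors what existing vTPM frontends do and is an assumption here, as are the notification and completion-wait steps left as comments.

#include <string.h>
#include <xen/interface/io/tpmif.h>

/* Hypothetical helper: place a TPM command in the shared page and mark
 * it ready for the backend. */
static void
vtpm_submit(struct tpmif_shared_page *shr, const void *cmd, uint32_t len)
{
    uint8_t *data = (uint8_t *)shr + sizeof(*shr) +
                    sizeof(uint32_t) * shr->nr_extra_pages;

    memcpy(data, cmd, len);          /* assumes len fits the mapped pages */
    shr->length = len;
    shr->state  = TPMIF_STATE_SUBMIT;
    /* notify the backend event channel, then poll/wait until shr->state
     * becomes TPMIF_STATE_FINISH (or TPMIF_STATE_IDLE after a cancel). */
}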
Modified: trunk/sys/xen/interface/io/usbif.h
===================================================================
--- trunk/sys/xen/interface/io/usbif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/usbif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -32,6 +32,76 @@
 #include "ring.h"
 #include "../grant_table.h"
 
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen pvUSB driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters. This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * num-ports
+ *      Values:         unsigned [1...31]
+ *
+ *      Number of ports for this (virtual) USB host connector.
+ *
+ * usb-ver
+ *      Values:         unsigned [1...2]
+ *
+ *      USB version of this host connector: 1 = USB 1.1, 2 = USB 2.0.
+ *
+ * port/[1...31]
+ *      Values:         string
+ *
+ *      Physical USB device connected to the given port, e.g. "3-1.5".
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         unsigned
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * urb-ring-ref
+ *      Values:         unsigned
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer. This is the ring
+ *      buffer for urb requests.
+ *
+ * conn-ring-ref
+ *      Values:         unsigned
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer. This is the ring
+ *      buffer for connection/disconnection requests.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ *
+ */
+
 enum usb_spec_version {
 	USB_VER_UNKNOWN = 0,
 	USB_VER_USB11,
@@ -42,38 +112,65 @@
 /*
  *  USB pipe in usbif_request
  *
- *  bits 0-5 are specific bits for virtual USB driver.
- *  bits 7-31 are standard urb pipe.
+ *  - port number:	bits 0-4
+ *				(USB_MAXCHILDREN is 31)
  *
- *  - port number(NEW):	bits 0-4
- *  				(USB_MAXCHILDREN is 31)
+ *  - operation flag:	bit 5
+ *				(0 = submit urb,
+ *				 1 = unlink urb)
  *
- *  - operation flag(NEW):	bit 5
- *  				(0 = submit urb,
- *  				 1 = unlink urb)
- *
  *  - direction:		bit 7
- *  				(0 = Host-to-Device [Out]
- *                           1 = Device-to-Host [In])
+ *				(0 = Host-to-Device [Out]
+ *				 1 = Device-to-Host [In])
  *
  *  - device address:	bits 8-14
  *
  *  - endpoint:		bits 15-18
  *
- *  - pipe type:		bits 30-31
- *  				(00 = isochronous, 01 = interrupt,
- *                           10 = control, 11 = bulk)
+ *  - pipe type:	bits 30-31
+ *				(00 = isochronous, 01 = interrupt,
+ *				 10 = control, 11 = bulk)
  */
-#define usbif_pipeportnum(pipe) ((pipe) & 0x1f)
-#define usbif_setportnum_pipe(pipe, portnum) \
-	((pipe)|(portnum))
 
-#define usbif_pipeunlink(pipe) ((pipe) & 0x20)
-#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe))
-#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20))
+#define USBIF_PIPE_PORT_MASK	0x0000001f
+#define USBIF_PIPE_UNLINK	0x00000020
+#define USBIF_PIPE_DIR		0x00000080
+#define USBIF_PIPE_DEV_MASK	0x0000007f
+#define USBIF_PIPE_DEV_SHIFT	8
+#define USBIF_PIPE_EP_MASK	0x0000000f
+#define USBIF_PIPE_EP_SHIFT	15
+#define USBIF_PIPE_TYPE_MASK	0x00000003
+#define USBIF_PIPE_TYPE_SHIFT	30
+#define USBIF_PIPE_TYPE_ISOC	0
+#define USBIF_PIPE_TYPE_INT	1
+#define USBIF_PIPE_TYPE_CTRL	2
+#define USBIF_PIPE_TYPE_BULK	3
 
-#define USBIF_BACK_MAX_PENDING_REQS (128)
+#define usbif_pipeportnum(pipe)			((pipe) & USBIF_PIPE_PORT_MASK)
+#define usbif_setportnum_pipe(pipe, portnum)	((pipe) | (portnum))
+
+#define usbif_pipeunlink(pipe)			((pipe) & USBIF_PIPE_UNLINK)
+#define usbif_pipesubmit(pipe)			(!usbif_pipeunlink(pipe))
+#define usbif_setunlink_pipe(pipe)		((pipe) | USBIF_PIPE_UNLINK)
+
+#define usbif_pipein(pipe)			((pipe) & USBIF_PIPE_DIR)
+#define usbif_pipeout(pipe)			(!usbif_pipein(pipe))
+
+#define usbif_pipedevice(pipe)			\
+		(((pipe) >> USBIF_PIPE_DEV_SHIFT) & USBIF_PIPE_DEV_MASK)
+
+#define usbif_pipeendpoint(pipe)		\
+		(((pipe) >> USBIF_PIPE_EP_SHIFT) & USBIF_PIPE_EP_MASK)
+
+#define usbif_pipetype(pipe)			\
+		(((pipe) >> USBIF_PIPE_TYPE_SHIFT) & USBIF_PIPE_TYPE_MASK)
+#define usbif_pipeisoc(pipe)	(usbif_pipetype(pipe) == USBIF_PIPE_TYPE_ISOC)
+#define usbif_pipeint(pipe)	(usbif_pipetype(pipe) == USBIF_PIPE_TYPE_INT)
+#define usbif_pipectrl(pipe)	(usbif_pipetype(pipe) == USBIF_PIPE_TYPE_CTRL)
+#define usbif_pipebulk(pipe)	(usbif_pipetype(pipe) == USBIF_PIPE_TYPE_BULK)
+
 #define USBIF_MAX_SEGMENTS_PER_REQUEST (16)
+#define USBIF_MAX_PORTNR	31
 
 /*
  * RING for transferring urbs.
@@ -143,6 +240,10 @@
 	uint16_t id; /* request id */
 	uint8_t portnum; /* port number */
 	uint8_t speed; /* usb_device_speed */
+#define USBIF_SPEED_NONE	0
+#define USBIF_SPEED_LOW		1
+#define USBIF_SPEED_FULL	2
+#define USBIF_SPEED_HIGH	3
 };
 typedef struct usbif_conn_response usbif_conn_response_t;
 

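The new pipe accessor macros above can be exercised as in the sketch below, which simply decodes a pipe word for logging. Purely illustrative; a real driver would use its own logging rather than <stdio.h>.

#include <stdio.h>
#include <stdint.h>
#include <xen/interface/io/usbif.h>

static void
dump_pipe(uint32_t pipe)
{
    printf("port %u dev %u ep %u %s type %s\n",
           (unsigned)usbif_pipeportnum(pipe),
           (unsigned)usbif_pipedevice(pipe),
           (unsigned)usbif_pipeendpoint(pipe),
           usbif_pipein(pipe) ? "IN" : "OUT",
           usbif_pipebulk(pipe) ? "bulk" :
           usbif_pipectrl(pipe) ? "control" :
           usbif_pipeint(pipe)  ? "interrupt" : "isochronous");
}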
Modified: trunk/sys/xen/interface/io/vscsiif.h
===================================================================
--- trunk/sys/xen/interface/io/vscsiif.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/vscsiif.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -1,9 +1,9 @@
 /* $MidnightBSD$ */
 /******************************************************************************
  * vscsiif.h
- * 
+ *
  * Based on the blkif.h code.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -31,57 +31,212 @@
 #include "ring.h"
 #include "../grant_table.h"
 
-/* command between backend and frontend */
-#define VSCSIIF_ACT_SCSI_CDB         1    /* SCSI CDB command */
-#define VSCSIIF_ACT_SCSI_ABORT       2    /* SCSI Device(Lun) Abort*/
-#define VSCSIIF_ACT_SCSI_RESET       3    /* SCSI Device(Lun) Reset*/
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * p-devname
+ *      Values:         string
+ *
+ *      A free string used to identify the physical device (e.g. a disk name).
+ *
+ * p-dev
+ *      Values:         string
+ *
+ *      A string specifying the backend device: either a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers), or a WWN (e.g.
+ *      "naa.60014054ac780582").
+ *
+ * v-dev
+ *      Values:         string
+ *
+ *      A string specifying the frontend device in form of a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers).
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-sg-grant
+ *      Values:         unsigned [VSCSIIF_SG_TABLESIZE...65535]
+ *      Default Value:  0
+ *
+ *      Specifies the maximum number of scatter/gather elements in grant pages
+ *      supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE
+ *      SG elements specified directly in the request.
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         unsigned
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         unsigned
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ */
 
+/* Requests from the frontend to the backend */
 
-#define VSCSIIF_BACK_MAX_PENDING_REQS    128
+/*
+ * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd.
+ * The target is specified via channel, id and lun.
+ *
+ * The operation to be performed is specified via a CDB in cmnd[], the length
+ * of the CDB is in cmd_len. sc_data_direction specifies the direction of data
+ * (to the device, from the device, or none at all).
+ *
+ * If data is to be transferred to or from the device the buffer(s) in the
+ * guest memory is/are specified via one or multiple scsiif_request_segment
+ * descriptors each specifying a memory page via a grant_ref_t, an offset into
+ * the page and the length of the area in that page. All scsiif_request_segment
+ * areas concatenated form the resulting data buffer used by the operation.
+ * If the number of scsiif_request_segment areas is not too large (less than
+ * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the
+ * seg[] array and the number of valid scsiif_request_segment elements is to be
+ * set in nr_segments.
+ *
+ * If "feature-sg-grant" in the Xenstore is set it is possible to specify more
+ * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection.
+ * The maximum number of allowed scsiif_request_segment elements is the value
+ * of the "feature-sg-grant" entry from Xenstore. When using indirection the
+ * seg[] array doesn't contain specifications of the data buffers, but
+ * references to scsiif_request_segment arrays, which in turn reference the
+ * data buffers. While nr_segments holds the number of populated seg[] entries
+ * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment
+ * elements referencing the target data buffers is calculated from the lengths
+ * of the seg[] elements (the sum of all valid seg[].length divided by the
+ * size of one scsiif_request_segment structure). The frontend may use a mix of
+ * direct and indirect requests.
+ */
+#define VSCSIIF_ACT_SCSI_CDB         1
 
 /*
+ * Request abort of a running operation for the specified target given by
+ * channel, id, lun and the operation's rqid in ref_rqid.
+ */
+#define VSCSIIF_ACT_SCSI_ABORT       2
+
+/*
+ * Request a device reset of the specified target (channel and id).
+ */
+#define VSCSIIF_ACT_SCSI_RESET       3
+
+/*
+ * Preset scatter/gather elements for a following request. Deprecated.
+ * Keeping the define only to avoid usage of the value "4" for other actions.
+ */
+#define VSCSIIF_ACT_SCSI_SG_PRESET   4
+
+/*
  * Maximum scatter/gather segments per request.
  *
- * Considering balance between allocating al least 16 "vscsiif_request"
- * structures on one page (4096bytes) and number of scatter gather 
- * needed, we decided to use 26 as a magic number.
+ * Considering balance between allocating at least 16 "vscsiif_request"
+ * structures on one page (4096 bytes) and the number of scatter/gather
+ * elements needed, we decided to use 26 as a magic number.
+ *
+ * If "feature-sg-grant" is set, more scatter/gather elements can be specified
+ * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages.
+ * In this case the vscsiif_request seg elements don't contain references to
+ * the user data, but to the SG elements referencing the user data.
  */
 #define VSCSIIF_SG_TABLESIZE             26
 
 /*
- * base on linux kernel 2.6.18
+ * based on Linux kernel 2.6.18, still valid
+ *
+ * Changing these values requires support of multiple protocols via the rings
+ * as "old clients" will blindly use these values and the resulting structure
+ * sizes.
  */
 #define VSCSIIF_MAX_COMMAND_SIZE         16
 #define VSCSIIF_SENSE_BUFFERSIZE         96
 
+struct scsiif_request_segment {
+    grant_ref_t gref;
+    uint16_t offset;
+    uint16_t length;
+};
+typedef struct scsiif_request_segment vscsiif_segment_t;
 
+#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment))
+
+/* Size of one request is 252 bytes */
 struct vscsiif_request {
     uint16_t rqid;          /* private guest value, echoed in resp  */
     uint8_t act;            /* command between backend and frontend */
-    uint8_t cmd_len;
+    uint8_t cmd_len;        /* valid CDB bytes */
 
-    uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
-    uint16_t timeout_per_command;     /* The command is issued by twice 
-                                         the value in Backend. */
-    uint16_t channel, id, lun;
-    uint16_t padding;
-    uint8_t sc_data_direction;        /* for DMA_TO_DEVICE(1)
-                                         DMA_FROM_DEVICE(2)
-                                         DMA_NONE(3) requests  */
-    uint8_t nr_segments;              /* Number of pieces of scatter-gather */
+    uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */
+    uint16_t timeout_per_command;   /* deprecated: timeout in secs, 0=default */
+    uint16_t channel, id, lun;      /* (virtual) device specification */
+    uint16_t ref_rqid;              /* command abort reference */
+    uint8_t sc_data_direction;      /* for DMA_TO_DEVICE(1)
+                                       DMA_FROM_DEVICE(2)
+                                       DMA_NONE(3) requests  */
+    uint8_t nr_segments;            /* Number of pieces of scatter-gather */
+/*
+ * flag in nr_segments: SG elements via grant page
+ *
+ * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number
+ * of grant pages containing SG elements. Usable if "feature-sg-grant" set.
+ */
+#define VSCSIIF_SG_GRANT    0x80
 
-    struct scsiif_request_segment {
-        grant_ref_t gref;
-        uint16_t offset;
-        uint16_t length;
-    } seg[VSCSIIF_SG_TABLESIZE];
+    vscsiif_segment_t seg[VSCSIIF_SG_TABLESIZE];
     uint32_t reserved[3];
 };
 typedef struct vscsiif_request vscsiif_request_t;
 
+/*
+ * The following interface is deprecated!
+ */
+#define VSCSIIF_SG_LIST_SIZE ((sizeof(vscsiif_request_t) - 4) \
+                              / sizeof(vscsiif_segment_t))
+
+struct vscsiif_sg_list {
+    /* First two fields must match struct vscsiif_request! */
+    uint16_t rqid;          /* private guest value, must match main req */
+    uint8_t act;            /* VSCSIIF_ACT_SCSI_SG_PRESET */
+    uint8_t nr_segments;    /* Number of pieces of scatter-gather */
+    vscsiif_segment_t seg[VSCSIIF_SG_LIST_SIZE];
+};
+typedef struct vscsiif_sg_list vscsiif_sg_list_t;
+/* End of deprecated interface */
+
+/* Size of one response is 252 bytes */
 struct vscsiif_response {
-    uint16_t rqid;
-    uint8_t padding;
+    uint16_t rqid;          /* identifies request */
+    uint8_t act;            /* deprecated: valid only if SG_PRESET supported */
     uint8_t sense_len;
     uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
     int32_t rslt;
@@ -98,7 +253,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

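A direct (non-indirect) request as described in the VSCSIIF_ACT_SCSI_CDB comment above might be filled as sketched here: a 6-byte INQUIRY CDB with one data segment. The helper name, grant reference and direction value are illustrative assumptions; ring insertion and response handling happen elsewhere.

#include <string.h>
#include <xen/interface/io/vscsiif.h>

static void
fill_inquiry(vscsiif_request_t *req, uint16_t rqid, grant_ref_t gref)
{
    static const uint8_t cdb[6] = { 0x12, 0, 0, 0, 96, 0 }; /* INQUIRY, 96 bytes */

    memset(req, 0, sizeof(*req));
    req->rqid    = rqid;
    req->act     = VSCSIIF_ACT_SCSI_CDB;
    req->cmd_len = sizeof(cdb);
    memcpy(req->cmnd, cdb, sizeof(cdb));
    req->channel = 0;
    req->id      = 0;
    req->lun     = 0;
    req->sc_data_direction = 2;   /* DMA_FROM_DEVICE: data flows to the guest */
    req->nr_segments = 1;         /* direct segment, VSCSIIF_SG_GRANT not set */
    req->seg[0].gref   = gref;    /* granted page receiving the INQUIRY data */
    req->seg[0].offset = 0;
    req->seg[0].length = 96;
}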
Modified: trunk/sys/xen/interface/io/xenbus.h
===================================================================
--- trunk/sys/xen/interface/io/xenbus.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/xenbus.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -37,9 +37,6 @@
 enum xenbus_state {
     XenbusStateUnknown       = 0,
 
-    /*
-     * Initializing: Back-end is initializing.
-     */
     XenbusStateInitialising  = 1,
 
     /*
@@ -53,9 +50,6 @@
      */
     XenbusStateInitialised   = 3,
 
-    /*
-     * Connected: The normal state for a front to backend connection.
-     */
     XenbusStateConnected     = 4,
 
     /*
@@ -63,18 +57,6 @@
      */
     XenbusStateClosing       = 5,
 
-    /*
-     * Closed: No connection exists between front and back end.
-     *
-     * For backend devices with the "online" attribute, the front can
-     * request a reconnect at any time.  To handle this transition
-     * gracefully, backend devices must reinitialize any XenStore data
-     * used to negotiate features with a peer before transitioning to
-     * the closed state.  When a reconnect request occurs, the
-     * XenBus backend support code will automatically transition the
-     * backend device from Closed to InitWait, kicking off the ring
-     * and feature negotiation process.
-     */
     XenbusStateClosed        = 6,
 
     /*
@@ -91,7 +73,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

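A typical consumer of the state enum above is an "otherend changed" callback; a minimal frontend-side sketch is shown below, with the actual connect/teardown work left as comments since it is device specific.

#include <xen/interface/io/xenbus.h>

static void
backend_state_changed(enum xenbus_state backend_state)
{
    switch (backend_state) {
    case XenbusStateInitialised:
    case XenbusStateConnected:
        /* backend is ready: map rings, then switch to Connected */
        break;
    case XenbusStateClosing:
    case XenbusStateClosed:
        /* tear down rings and move toward Closed ourselves */
        break;
    default:
        break;
    }
}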
Modified: trunk/sys/xen/interface/io/xs_wire.h
===================================================================
--- trunk/sys/xen/interface/io/xs_wire.h	2020-02-02 21:31:28 UTC (rev 12300)
+++ trunk/sys/xen/interface/io/xs_wire.h	2020-02-08 19:26:24 UTC (rev 12301)
@@ -50,7 +50,9 @@
     XS_RESUME,
     XS_SET_TARGET,
     XS_RESTRICT,
-    XS_RESET_WATCHES
+    XS_RESET_WATCHES,
+
+    XS_INVALID = 0xffff /* Guaranteed to remain an invalid type */
 };
 
 #define XS_WRITE_NONE "NONE"
@@ -84,7 +86,8 @@
     XSD_ERROR(EROFS),
     XSD_ERROR(EBUSY),
     XSD_ERROR(EAGAIN),
-    XSD_ERROR(EISCONN)
+    XSD_ERROR(EISCONN),
+    XSD_ERROR(E2BIG)
 };
 #endif
 
@@ -104,7 +107,10 @@
     XS_WATCH_TOKEN
 };
 
-/* Inter-domain shared memory communications. */
+/*
+ * `incontents 150 xenstore_struct XenStore wire protocol.
+ *
+ * Inter-domain shared memory communications. */
 #define XENSTORE_RING_SIZE 1024
 typedef uint32_t XENSTORE_RING_IDX;
 #define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
@@ -113,6 +119,8 @@
     char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
     XENSTORE_RING_IDX req_cons, req_prod;
     XENSTORE_RING_IDX rsp_cons, rsp_prod;
+    uint32_t server_features; /* Bitmap of features supported by the server */
+    uint32_t connection;
 };
 
 /* Violating this is very bad.  See docs/misc/xenstore.txt. */
@@ -122,12 +130,19 @@
 #define XENSTORE_ABS_PATH_MAX 3072
 #define XENSTORE_REL_PATH_MAX 2048
 
+/* The ability to reconnect a ring */
+#define XENSTORE_SERVER_FEATURE_RECONNECTION 1
+
+/* Valid values for the connection field */
+#define XENSTORE_CONNECTED 0 /* the steady-state */
+#define XENSTORE_RECONNECT 1 /* guest has initiated a reconnect */
+
 #endif /* _XS_WIRE_H */
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

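For the shared-page layout above, a client-side producer loop typically looks like the sketch below. Memory barriers and the event-channel kick are only indicated in comments, and the helper name is made up.

#include <xen/interface/io/xs_wire.h>

/* Hypothetical helper: copy up to len request bytes into the ring,
 * returning how many were written. */
static uint32_t
xs_ring_write(struct xenstore_domain_interface *intf,
              const char *data, uint32_t len)
{
    XENSTORE_RING_IDX cons = intf->req_cons;   /* snapshot once */
    XENSTORE_RING_IDX prod = intf->req_prod;
    uint32_t written = 0;

    while (written < len && (prod - cons) < XENSTORE_RING_SIZE) {
        intf->req[MASK_XENSTORE_IDX(prod)] = data[written];
        prod++;
        written++;
    }
    /* write barrier here, then publish the new producer index */
    intf->req_prod = prod;
    /* notify the XenStore event channel so the server sees the data */
    return written;
}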

From laffer1 at midnightbsd.org  Sat Feb  8 14:26:43 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:26:43 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12302]
 trunk/sys/xen/interface/xsm/flask_op.h: sync with FreeBSD 11-stable
Message-ID: <202002081926.018JQhQG060682@stargazer.midnightbsd.org>

Revision: 12302
          http://svnweb.midnightbsd.org/src/?rev=12302
Author:   laffer1
Date:     2020-02-08 14:26:42 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/xsm/flask_op.h

Modified: trunk/sys/xen/interface/xsm/flask_op.h
===================================================================
--- trunk/sys/xen/interface/xsm/flask_op.h	2020-02-08 19:26:24 UTC (rev 12301)
+++ trunk/sys/xen/interface/xsm/flask_op.h	2020-02-08 19:26:42 UTC (rev 12302)
@@ -26,6 +26,8 @@
 #ifndef __FLASK_OP_H__
 #define __FLASK_OP_H__
 
+#include "../event_channel.h"
+
 #define XEN_FLASK_INTERFACE_VERSION 1
 
 struct xen_flask_load {
@@ -143,6 +145,19 @@
     uint32_t sid;
 };
 
+struct xen_flask_relabel {
+    /* IN */
+    uint32_t domid;
+    uint32_t sid;
+};
+
+struct xen_flask_devicetree_label {
+    /* IN */
+    uint32_t sid;
+    uint32_t length;
+    XEN_GUEST_HANDLE(char) path;
+};
+
 struct xen_flask_op {
     uint32_t cmd;
 #define FLASK_LOAD              1
@@ -168,6 +183,8 @@
 #define FLASK_ADD_OCONTEXT      21
 #define FLASK_DEL_OCONTEXT      22
 #define FLASK_GET_PEER_SID      23
+#define FLASK_RELABEL_DOMAIN    24
+#define FLASK_DEVICETREE_LABEL  25
     uint32_t interface_version; /* XEN_FLASK_INTERFACE_VERSION */
     union {
         struct xen_flask_load load;
@@ -186,6 +203,8 @@
         /* FLASK_ADD_OCONTEXT, FLASK_DEL_OCONTEXT */
         struct xen_flask_ocontext ocontext;
         struct xen_flask_peersid peersid;
+        struct xen_flask_relabel relabel;
+        struct xen_flask_devicetree_label devicetree_label;
     } u;
 };
 typedef struct xen_flask_op xen_flask_op_t;

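Using the newly added union member, a FLASK_RELABEL_DOMAIN call would be prepared roughly as below; actually issuing the op (the XSM hypercall or a privcmd ioctl) is outside this header and is not shown.

#include <string.h>
#include <xen/interface/xsm/flask_op.h>

/* Hypothetical helper: fill a relabel sub-op for the given domain/SID. */
static void
prepare_relabel(xen_flask_op_t *op, uint32_t domid, uint32_t sid)
{
    memset(op, 0, sizeof(*op));
    op->cmd = FLASK_RELABEL_DOMAIN;
    op->interface_version = XEN_FLASK_INTERFACE_VERSION;
    op->u.relabel.domid = domid;
    op->u.relabel.sid   = sid;
}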

From laffer1 at midnightbsd.org  Sat Feb  8 14:27:19 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:27:19 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12303] trunk/sys/xen/interface/hvm: sync
 with FreeBSD 11-stable
Message-ID: <202002081927.018JRJKC060748@stargazer.midnightbsd.org>

Revision: 12303
          http://svnweb.midnightbsd.org/src/?rev=12303
Author:   laffer1
Date:     2020-02-08 14:27:19 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/hvm/e820.h
    trunk/sys/xen/interface/hvm/hvm_info_table.h
    trunk/sys/xen/interface/hvm/hvm_op.h
    trunk/sys/xen/interface/hvm/ioreq.h
    trunk/sys/xen/interface/hvm/params.h
    trunk/sys/xen/interface/hvm/save.h

Added Paths:
-----------
    trunk/sys/xen/interface/hvm/hvm_xs_strings.h
    trunk/sys/xen/interface/hvm/pvdrivers.h

Modified: trunk/sys/xen/interface/hvm/e820.h
===================================================================
--- trunk/sys/xen/interface/hvm/e820.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/e820.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -1,5 +1,4 @@
 /* $MidnightBSD$ */
-
 /*
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
@@ -18,6 +17,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
  */
 
 #ifndef __XEN_PUBLIC_HVM_E820_H__

Modified: trunk/sys/xen/interface/hvm/hvm_info_table.h
===================================================================
--- trunk/sys/xen/interface/hvm/hvm_info_table.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/hvm_info_table.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -21,6 +21,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
  */
 
 #ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__

Modified: trunk/sys/xen/interface/hvm/hvm_op.h
===================================================================
--- trunk/sys/xen/interface/hvm/hvm_op.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/hvm_op.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -17,6 +17,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Keir Fraser
  */
 
 #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
@@ -24,6 +26,7 @@
 
 #include "../xen.h"
 #include "../trace.h"
+#include "../event_channel.h"
 
 /* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
 #define HVMOP_set_param           0
@@ -81,6 +84,7 @@
     HVMMEM_ram_rw,             /* Normal read/write guest RAM */
     HVMMEM_ram_ro,             /* Read-only; writes are discarded */
     HVMMEM_mmio_dm,            /* Reads and write go to the device model */
+    HVMMEM_mmio_write_dm       /* Read-only; writes go to the device model */
 } hvmmem_type_t;
 
 /* Following tools-only interfaces may change in future. */
@@ -91,10 +95,10 @@
 struct xen_hvm_track_dirty_vram {
     /* Domain to be tracked. */
     domid_t  domid;
+    /* Number of pages to track. */
+    uint32_t nr;
     /* First pfn to track. */
     uint64_aligned_t first_pfn;
-    /* Number of pages to track. */
-    uint64_aligned_t nr;
     /* OUT variable. */
     /* Dirty bitmap buffer. */
     XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
@@ -107,10 +111,10 @@
 struct xen_hvm_modified_memory {
     /* Domain to be updated. */
     domid_t  domid;
+    /* Number of pages. */
+    uint32_t nr;
     /* First pfn. */
     uint64_aligned_t first_pfn;
-    /* Number of pages. */
-    uint64_aligned_t nr;
 };
 typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
@@ -163,49 +167,11 @@
 /* Following tools-only interfaces may change in future. */
 #if defined(__XEN__) || defined(__XEN_TOOLS__)
 
+/* Deprecated by XENMEM_access_op_set_access */
 #define HVMOP_set_mem_access        12
-typedef enum {
-    HVMMEM_access_n,
-    HVMMEM_access_r,
-    HVMMEM_access_w,
-    HVMMEM_access_rw,
-    HVMMEM_access_x,
-    HVMMEM_access_rx,
-    HVMMEM_access_wx,
-    HVMMEM_access_rwx,
-    HVMMEM_access_rx2rw,       /* Page starts off as r-x, but automatically
-                                * change to r-w on a write */
-    HVMMEM_access_n2rwx,       /* Log access: starts off as n, automatically 
-                                * goes to rwx, generating an event without
-                                * pausing the vcpu */
-    HVMMEM_access_default      /* Take the domain default */
-} hvmmem_access_t;
-/* Notify that a region of memory is to have specific access types */
-struct xen_hvm_set_mem_access {
-    /* Domain to be updated. */
-    domid_t domid;
-    /* Memory type */
-    uint16_t hvmmem_access; /* hvm_access_t */
-    /* Number of pages, ignored on setting default access */
-    uint32_t nr;
-    /* First pfn, or ~0ull to set the default access for new pages */
-    uint64_aligned_t first_pfn;
-};
-typedef struct xen_hvm_set_mem_access xen_hvm_set_mem_access_t;
-DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_access_t);
 
+/* Deprecated by XENMEM_access_op_get_access */
 #define HVMOP_get_mem_access        13
-/* Get the specific access type for that region of memory */
-struct xen_hvm_get_mem_access {
-    /* Domain to be queried. */
-    domid_t domid;
-    /* Memory type: OUT */
-    uint16_t hvmmem_access; /* hvm_access_t */
-    /* pfn, or ~0ull for default access for new pages.  IN */
-    uint64_aligned_t pfn;
-};
-typedef struct xen_hvm_get_mem_access xen_hvm_get_mem_access_t;
-DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_access_t);
 
 #define HVMOP_inject_trap            14
 /* Inject a trap into a VCPU, which will get taken up on the next
@@ -271,6 +237,267 @@
 typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_msi_t);
 
+/*
+ * IOREQ Servers
+ *
+ * The interface between an I/O emulator and Xen is called an IOREQ Server.
+ * A domain supports a single 'legacy' IOREQ Server which is instantiated if
+ * parameter...
+ *
+ * HVM_PARAM_IOREQ_PFN is read (to get the gmfn containing the synchronous
+ * ioreq structures), or...
+ * HVM_PARAM_BUFIOREQ_PFN is read (to get the gmfn containing the buffered
+ * ioreq ring), or...
+ * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that Xen uses
+ * to request buffered I/O emulation).
+ * 
+ * The following hypercalls facilitate the creation of IOREQ Servers for
+ * 'secondary' emulators which are invoked to implement port I/O, memory, or
+ * PCI config space ranges which they explicitly register.
+ */
+
+typedef uint16_t ioservid_t;
+
+/*
+ * HVMOP_create_ioreq_server: Instantiate a new IOREQ Server for a secondary
+ *                            emulator servicing domain <domid>.
+ *
+ * The <id> handed back is unique for <domid>. If <handle_bufioreq> is zero
+ * the buffered ioreq ring will not be allocated and hence all emulation
+ * requests to this server will be synchronous.
+ */
+#define HVMOP_create_ioreq_server 17
+struct xen_hvm_create_ioreq_server {
+    domid_t domid;           /* IN - domain to be serviced */
+#define HVM_IOREQSRV_BUFIOREQ_OFF    0
+#define HVM_IOREQSRV_BUFIOREQ_LEGACY 1
+/*
+ * Use this when read_pointer gets updated atomically and
+ * the pointer pair gets read atomically:
+ */
+#define HVM_IOREQSRV_BUFIOREQ_ATOMIC 2
+    uint8_t handle_bufioreq; /* IN - should server handle buffered ioreqs */
+    ioservid_t id;           /* OUT - server id */
+};
+typedef struct xen_hvm_create_ioreq_server xen_hvm_create_ioreq_server_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_create_ioreq_server_t);
+
+/*
+ * HVMOP_get_ioreq_server_info: Get all the information necessary to access
+ *                              IOREQ Server <id>. 
+ *
+ * The emulator needs to map the synchronous ioreq structures and buffered
+ * ioreq ring (if it exists) that Xen uses to request emulation. These are
+ * hosted in domain <domid>'s gmfns <ioreq_pfn> and <bufioreq_pfn>
+ * respectively. In addition, if the IOREQ Server is handling buffered
+ * emulation requests, the emulator needs to bind to event channel
+ * <bufioreq_port> to listen for them. (The event channels used for
+ * synchronous emulation requests are specified in the per-CPU ioreq
+ * structures in <ioreq_pfn>).
+ * If the IOREQ Server is not handling buffered emulation requests then the
+ * values handed back in <bufioreq_pfn> and <bufioreq_port> will both be 0.
+ */
+#define HVMOP_get_ioreq_server_info 18
+struct xen_hvm_get_ioreq_server_info {
+    domid_t domid;                 /* IN - domain to be serviced */
+    ioservid_t id;                 /* IN - server id */
+    evtchn_port_t bufioreq_port;   /* OUT - buffered ioreq port */
+    uint64_aligned_t ioreq_pfn;    /* OUT - sync ioreq pfn */
+    uint64_aligned_t bufioreq_pfn; /* OUT - buffered ioreq pfn */
+};
+typedef struct xen_hvm_get_ioreq_server_info xen_hvm_get_ioreq_server_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_ioreq_server_info_t);
+
+/*
+ * HVM_map_io_range_to_ioreq_server: Register an I/O range of domain <domid>
+ *                                   for emulation by the client of IOREQ
+ *                                   Server <id>
+ * HVM_unmap_io_range_from_ioreq_server: Deregister an I/O range of <domid>
+ *                                       for emulation by the client of IOREQ
+ *                                       Server <id>
+ *
+ * There are three types of I/O that can be emulated: port I/O, memory accesses
+ * and PCI config space accesses. The <type> field denotes which type of range
+ * the <start> and <end> (inclusive) fields are specifying.
+ * PCI config space ranges are specified by segment/bus/device/function values
+ * which should be encoded using the HVMOP_PCI_SBDF helper macro below.
+ *
+ * NOTE: unless an emulation request falls entirely within a range mapped
+ * by a secondary emulator, it will not be passed to that emulator.
+ */
+#define HVMOP_map_io_range_to_ioreq_server 19
+#define HVMOP_unmap_io_range_from_ioreq_server 20
+struct xen_hvm_io_range {
+    domid_t domid;               /* IN - domain to be serviced */
+    ioservid_t id;               /* IN - server id */
+    uint32_t type;               /* IN - type of range */
+# define HVMOP_IO_RANGE_PORT   0 /* I/O port range */
+# define HVMOP_IO_RANGE_MEMORY 1 /* MMIO range */
+# define HVMOP_IO_RANGE_PCI    2 /* PCI segment/bus/dev/func range */
+    uint64_aligned_t start, end; /* IN - inclusive start and end of range */
+};
+typedef struct xen_hvm_io_range xen_hvm_io_range_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_io_range_t);
+
+#define HVMOP_PCI_SBDF(s,b,d,f)                 \
+	((((s) & 0xffff) << 16) |                   \
+	 (((b) & 0xff) << 8) |                      \
+	 (((d) & 0x1f) << 3) |                      \
+	 ((f) & 0x07))
+
+/*
+ * HVMOP_destroy_ioreq_server: Destroy the IOREQ Server <id> servicing domain
+ *                             <domid>.
+ *
+ * Any registered I/O ranges will be automatically deregistered.
+ */
+#define HVMOP_destroy_ioreq_server 21
+struct xen_hvm_destroy_ioreq_server {
+    domid_t domid; /* IN - domain to be serviced */
+    ioservid_t id; /* IN - server id */
+};
+typedef struct xen_hvm_destroy_ioreq_server xen_hvm_destroy_ioreq_server_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_destroy_ioreq_server_t);
+
+/*
+ * HVMOP_set_ioreq_server_state: Enable or disable the IOREQ Server <id> servicing
+ *                               domain <domid>.
+ *
+ * The IOREQ Server will not be passed any emulation requests until it is in the
+ * enabled state.
+ * Note that the contents of the ioreq_pfn and bufioreq_pfn (see
+ * HVMOP_get_ioreq_server_info) are not meaningful until the IOREQ Server is in
+ * the enabled state.
+ */
+#define HVMOP_set_ioreq_server_state 22
+struct xen_hvm_set_ioreq_server_state {
+    domid_t domid;   /* IN - domain to be serviced */
+    ioservid_t id;   /* IN - server id */
+    uint8_t enabled; /* IN - enabled? */    
+};
+typedef struct xen_hvm_set_ioreq_server_state xen_hvm_set_ioreq_server_state_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_ioreq_server_state_t);
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
+#if defined(__i386__) || defined(__x86_64__)
+
+/*
+ * HVMOP_set_evtchn_upcall_vector: Set a <vector> that should be used for event
+ *                                 channel upcalls on the specified <vcpu>. If set,
+ *                                 this vector will be used in preference to the
+ *                                 domain global callback (see
+ *                                 HVM_PARAM_CALLBACK_IRQ).
+ */
+#define HVMOP_set_evtchn_upcall_vector 23
+struct xen_hvm_evtchn_upcall_vector {
+    uint32_t vcpu;
+    uint8_t vector;
+};
+typedef struct xen_hvm_evtchn_upcall_vector xen_hvm_evtchn_upcall_vector_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_evtchn_upcall_vector_t);
+
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+#define HVMOP_guest_request_vm_event 24
+
+/* HVMOP_altp2m: perform altp2m state operations */
+#define HVMOP_altp2m 25
+
+#define HVMOP_ALTP2M_INTERFACE_VERSION 0x00000001
+
+struct xen_hvm_altp2m_domain_state {
+    /* IN or OUT variable on/off */
+    uint8_t state;
+};
+typedef struct xen_hvm_altp2m_domain_state xen_hvm_altp2m_domain_state_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_domain_state_t);
+
+struct xen_hvm_altp2m_vcpu_enable_notify {
+    uint32_t vcpu_id;
+    uint32_t pad;
+    /* #VE info area gfn */
+    uint64_t gfn;
+};
+typedef struct xen_hvm_altp2m_vcpu_enable_notify xen_hvm_altp2m_vcpu_enable_notify_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_vcpu_enable_notify_t);
+
+struct xen_hvm_altp2m_view {
+    /* IN/OUT variable */
+    uint16_t view;
+    /* Create view only: default access type
+     * NOTE: currently ignored */
+    uint16_t hvmmem_default_access; /* xenmem_access_t */
+};
+typedef struct xen_hvm_altp2m_view xen_hvm_altp2m_view_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_view_t);
+
+struct xen_hvm_altp2m_set_mem_access {
+    /* view */
+    uint16_t view;
+    /* Memory type */
+    uint16_t hvmmem_access; /* xenmem_access_t */
+    uint32_t pad;
+    /* gfn */
+    uint64_t gfn;
+};
+typedef struct xen_hvm_altp2m_set_mem_access xen_hvm_altp2m_set_mem_access_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_set_mem_access_t);
+
+struct xen_hvm_altp2m_change_gfn {
+    /* view */
+    uint16_t view;
+    uint16_t pad1;
+    uint32_t pad2;
+    /* old gfn */
+    uint64_t old_gfn;
+    /* new gfn, INVALID_GFN (~0UL) means revert */
+    uint64_t new_gfn;
+};
+typedef struct xen_hvm_altp2m_change_gfn xen_hvm_altp2m_change_gfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_change_gfn_t);
+
+struct xen_hvm_altp2m_op {
+    uint32_t version;   /* HVMOP_ALTP2M_INTERFACE_VERSION */
+    uint32_t cmd;
+/* Get/set the altp2m state for a domain */
+#define HVMOP_altp2m_get_domain_state     1
+#define HVMOP_altp2m_set_domain_state     2
+/* Set the current VCPU to receive altp2m event notifications */
+#define HVMOP_altp2m_vcpu_enable_notify   3
+/* Create a new view */
+#define HVMOP_altp2m_create_p2m           4
+/* Destroy a view */
+#define HVMOP_altp2m_destroy_p2m          5
+/* Switch view for an entire domain */
+#define HVMOP_altp2m_switch_p2m           6
+/* Notify that a page of memory is to have specific access types */
+#define HVMOP_altp2m_set_mem_access       7
+/* Change a p2m entry to have a different gfn->mfn mapping */
+#define HVMOP_altp2m_change_gfn           8
+    domid_t domain;
+    uint16_t pad1;
+    uint32_t pad2;
+    union {
+        struct xen_hvm_altp2m_domain_state       domain_state;
+        struct xen_hvm_altp2m_vcpu_enable_notify enable_notify;
+        struct xen_hvm_altp2m_view               view;
+        struct xen_hvm_altp2m_set_mem_access     set_mem_access;
+        struct xen_hvm_altp2m_change_gfn         change_gfn;
+        uint8_t pad[64];
+    } u;
+};
+typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
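
To make the flow of the new IOREQ Server hypercalls above concrete, here is a
minimal sketch of a secondary emulator creating, querying and enabling a
server. It assumes a HYPERVISOR_hvm_op(op, arg) wrapper as found in Xen guest
support code; error handling is abbreviated, and the mapping of the returned
pfns and binding of the buffered-ioreq event channel are elided.

    /* Minimal sketch, assuming a HYPERVISOR_hvm_op() wrapper; not part
     * of this header.  Error paths and pfn/event-channel setup elided. */
    static int setup_ioreq_server(domid_t domid, ioservid_t *id_out)
    {
        struct xen_hvm_create_ioreq_server create = {
            .domid = domid,
            .handle_bufioreq = HVM_IOREQSRV_BUFIOREQ_ATOMIC,
        };
        struct xen_hvm_get_ioreq_server_info info = { .domid = domid };
        struct xen_hvm_set_ioreq_server_state state = { .domid = domid };
        int rc;

        rc = HYPERVISOR_hvm_op(HVMOP_create_ioreq_server, &create);
        if (rc)
            return rc;

        info.id = create.id;
        rc = HYPERVISOR_hvm_op(HVMOP_get_ioreq_server_info, &info);
        if (rc)
            return rc;
        /* info.ioreq_pfn/info.bufioreq_pfn would be mapped here and
         * info.bufioreq_port bound before enabling the server. */

        state.id = create.id;
        state.enabled = 1;
        rc = HYPERVISOR_hvm_op(HVMOP_set_ioreq_server_state, &state);
        if (rc == 0)
            *id_out = create.id;
        return rc;
    }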

Added: trunk/sys/xen/interface/hvm/hvm_xs_strings.h
===================================================================
--- trunk/sys/xen/interface/hvm/hvm_xs_strings.h	                        (rev 0)
+++ trunk/sys/xen/interface/hvm/hvm_xs_strings.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -0,0 +1,83 @@
+/* $MidnightBSD$ */
+/******************************************************************************
+ * hvm/hvm_xs_strings.h
+ *
+ * HVM xenstore strings used in HVMLOADER.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2013, Citrix Systems
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__
+#define __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__
+
+#define HVM_XS_HVMLOADER               "hvmloader"
+#define HVM_XS_BIOS                    "hvmloader/bios"
+#define HVM_XS_GENERATION_ID_ADDRESS   "hvmloader/generation-id-address"
+#define HVM_XS_ALLOW_MEMORY_RELOCATE   "hvmloader/allow-memory-relocate"
+
+/* The following values allow additional ACPI tables to be added to the
+ * virtual ACPI BIOS that hvmloader constructs. The values specify the guest
+ * physical address and length of a block of ACPI tables to add. The format of
+ * the block is simply concatenated raw tables (which specify their own length
+ * in the ACPI header).
+ */
+#define HVM_XS_ACPI_PT_ADDRESS         "hvmloader/acpi/address"
+#define HVM_XS_ACPI_PT_LENGTH          "hvmloader/acpi/length"
+
+/* Any number of SMBIOS types can be passed through to an HVM guest using
+ * the following xenstore values. The values specify the guest physical
+ * address and length of a block of SMBIOS structures for hvmloader to use.
+ * The block is formatted in the following way:
+ *
+ * <length><struct><length><struct>...
+ *
+ * Each length separator is a 32b integer indicating the length of the next
+ * SMBIOS structure. For DMTF defined types (0 - 121), the passed in struct
+ * will replace the default structure in hvmloader. In addition, any
+ * OEM/vendortypes (128 - 255) will all be added.
+ */
+#define HVM_XS_SMBIOS_PT_ADDRESS       "hvmloader/smbios/address"
+#define HVM_XS_SMBIOS_PT_LENGTH        "hvmloader/smbios/length"
+
+/* Set to 1 to enable SMBIOS default portable battery (type 22) values. */
+#define HVM_XS_SMBIOS_DEFAULT_BATTERY  "hvmloader/smbios/default_battery"
+
+/* The following xenstore values are used to override some of the default
+ * string values in the SMBIOS table constructed in hvmloader.
+ */
+#define HVM_XS_BIOS_STRINGS            "bios-strings"
+#define HVM_XS_BIOS_VENDOR             "bios-strings/bios-vendor"
+#define HVM_XS_BIOS_VERSION            "bios-strings/bios-version"
+#define HVM_XS_SYSTEM_MANUFACTURER     "bios-strings/system-manufacturer"
+#define HVM_XS_SYSTEM_PRODUCT_NAME     "bios-strings/system-product-name"
+#define HVM_XS_SYSTEM_VERSION          "bios-strings/system-version"
+#define HVM_XS_SYSTEM_SERIAL_NUMBER    "bios-strings/system-serial-number"
+#define HVM_XS_ENCLOSURE_MANUFACTURER  "bios-strings/enclosure-manufacturer"
+#define HVM_XS_ENCLOSURE_SERIAL_NUMBER "bios-strings/enclosure-serial-number"
+#define HVM_XS_BATTERY_MANUFACTURER    "bios-strings/battery-manufacturer"
+#define HVM_XS_BATTERY_DEVICE_NAME     "bios-strings/battery-device-name"
+
+/* 1 to 99 OEM strings can be set in xenstore using values of the form
+ * below. These strings will be loaded into the SMBIOS type 11 structure.
+ */
+#define HVM_XS_OEM_STRINGS             "bios-strings/oem-%d"
+
+#endif /* __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ */


Property changes on: trunk/sys/xen/interface/hvm/hvm_xs_strings.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
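
Since HVM_XS_OEM_STRINGS above is a printf-style key template, a toolstack
would expand it once per string. A small sketch follows; the write_key
callback is only a stand-in for a real xenstore write routine.

    /* Illustrative only: expand HVM_XS_OEM_STRINGS into concrete keys
     * bios-strings/oem-1 .. bios-strings/oem-99. */
    #include <stdio.h>

    static void set_oem_strings(const char **strings, int count,
                                void (*write_key)(const char *key,
                                                  const char *val))
    {
        char key[64];
        int i;

        for (i = 0; i < count && i < 99; i++) {
            snprintf(key, sizeof(key), HVM_XS_OEM_STRINGS, i + 1);
            write_key(key, strings[i]);
        }
    }
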
Modified: trunk/sys/xen/interface/hvm/ioreq.h
===================================================================
--- trunk/sys/xen/interface/hvm/ioreq.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/ioreq.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -35,6 +35,7 @@
 
 #define IOREQ_TYPE_PIO          0 /* pio */
 #define IOREQ_TYPE_COPY         1 /* mmio ops */
+#define IOREQ_TYPE_PCI_CONFIG   2
 #define IOREQ_TYPE_TIMEOFFSET   7
 #define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
 
@@ -41,7 +42,13 @@
 /*
  * VMExit dispatcher should cooperate with instruction decoder to
  * prepare this structure and notify service OS and DM by sending
- * virq
+ * virq.
+ *
+ * For I/O type IOREQ_TYPE_PCI_CONFIG, the physical address is formatted
+ * as follows:
+ * 
+ * 63....48|47..40|39..35|34..32|31........0
+ * SEGMENT |BUS   |DEV   |FN    |OFFSET
  */
 struct ioreq {
     uint64_t addr;          /* physical address */
@@ -77,30 +84,21 @@
 
 #define IOREQ_BUFFER_SLOT_NUM     511 /* 8 bytes each, plus 2 4-byte indexes */
 struct buffered_iopage {
-    unsigned int read_pointer;
-    unsigned int write_pointer;
+#ifdef __XEN__
+    union bufioreq_pointers {
+        struct {
+#endif
+            uint32_t read_pointer;
+            uint32_t write_pointer;
+#ifdef __XEN__
+        };
+        uint64_t full;
+    } ptrs;
+#endif
     buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
 }; /* NB. Size of this structure must be no greater than one page. */
 typedef struct buffered_iopage buffered_iopage_t;
 
-#if defined(__ia64__)
-struct pio_buffer {
-    uint32_t page_offset;
-    uint32_t pointer;
-    uint32_t data_end;
-    uint32_t buf_size;
-    void *opaque;
-};
-
-#define PIO_BUFFER_IDE_PRIMARY   0 /* I/O port = 0x1F0 */
-#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
-#define PIO_BUFFER_ENTRY_NUM     2
-struct buffered_piopage {
-    struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
-    uint8_t buffer[1];
-};
-#endif /* defined(__ia64__) */
-
 /*
  * ACPI Control/Event register locations. Location is controlled by a 
  * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION.
@@ -133,7 +131,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
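
The IOREQ_TYPE_PCI_CONFIG address layout documented in the hunk above
(63:48 segment, 47:40 bus, 39:35 device, 34:32 function, 31:0 offset) can be
unpacked with a few helpers; a sketch with illustrative names:

    #include <stdint.h>

    /* Decode the IOREQ_TYPE_PCI_CONFIG physical-address encoding. */
    static inline uint16_t pci_cfg_seg(uint64_t addr)  { return addr >> 48; }
    static inline uint8_t  pci_cfg_bus(uint64_t addr)  { return (addr >> 40) & 0xff; }
    static inline uint8_t  pci_cfg_dev(uint64_t addr)  { return (addr >> 35) & 0x1f; }
    static inline uint8_t  pci_cfg_func(uint64_t addr) { return (addr >> 32) & 0x7; }
    static inline uint32_t pci_cfg_off(uint64_t addr)  { return addr & 0xffffffff; }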

Modified: trunk/sys/xen/interface/hvm/params.h
===================================================================
--- trunk/sys/xen/interface/hvm/params.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/params.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -17,6 +17,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Keir Fraser
  */
 
 #ifndef __XEN_PUBLIC_HVM_PARAMS_H__
@@ -55,17 +57,54 @@
 #define HVM_PARAM_BUFIOREQ_PFN 6
 #define HVM_PARAM_BUFIOREQ_EVTCHN 26
 
-#ifdef __ia64__
+#if defined(__i386__) || defined(__x86_64__)
 
-#define HVM_PARAM_NVRAM_FD     7
-#define HVM_PARAM_VHPT_SIZE    8
-#define HVM_PARAM_BUFPIOREQ_PFN	9
+/*
+ * Viridian enlightenments
+ *
+ * (See http://download.microsoft.com/download/A/B/4/AB43A34E-BDD0-4FA6-BDEF-79EEF16E880B/Hypervisor%20Top%20Level%20Functional%20Specification%20v4.0.docx)
+ *
+ * To expose viridian enlightenments to the guest set this parameter
+ * to the desired feature mask. The base feature set must be present
+ * in any valid feature mask.
+ */
+#define HVM_PARAM_VIRIDIAN     9
 
-#elif defined(__i386__) || defined(__x86_64__)
+/* Base+Freq viridian feature sets:
+ *
+ * - Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL)
+ * - APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * - Virtual Processor index MSR (HV_X64_MSR_VP_INDEX)
+ * - Timer frequency MSRs (HV_X64_MSR_TSC_FREQUENCY and
+ *   HV_X64_MSR_APIC_FREQUENCY)
+ */
+#define _HVMPV_base_freq 0
+#define HVMPV_base_freq  (1 << _HVMPV_base_freq)
 
-/* Expose Viridian interfaces to this HVM guest? */
-#define HVM_PARAM_VIRIDIAN     9
+/* Feature set modifications */
 
+/* Disable timer frequency MSRs (HV_X64_MSR_TSC_FREQUENCY and
+ * HV_X64_MSR_APIC_FREQUENCY).
+ * This modification restores the viridian feature set to the
+ * original 'base' set exposed in releases prior to Xen 4.4.
+ */
+#define _HVMPV_no_freq 1
+#define HVMPV_no_freq  (1 << _HVMPV_no_freq)
+
+/* Enable Partition Time Reference Counter (HV_X64_MSR_TIME_REF_COUNT) */
+#define _HVMPV_time_ref_count 2
+#define HVMPV_time_ref_count  (1 << _HVMPV_time_ref_count)
+
+/* Enable Reference TSC Page (HV_X64_MSR_REFERENCE_TSC) */
+#define _HVMPV_reference_tsc 3
+#define HVMPV_reference_tsc  (1 << _HVMPV_reference_tsc)
+
+#define HVMPV_feature_mask \
+	(HVMPV_base_freq | \
+	 HVMPV_no_freq | \
+	 HVMPV_time_ref_count | \
+	 HVMPV_reference_tsc)
+
 #endif
 
 /*
@@ -126,28 +165,34 @@
  */
 #define HVM_PARAM_ACPI_IOPORTS_LOCATION 19
 
-/* Enable blocking memory events, async or sync (pause vcpu until response) 
- * onchangeonly indicates messages only on a change of value */
+/* Deprecated */
 #define HVM_PARAM_MEMORY_EVENT_CR0          20
 #define HVM_PARAM_MEMORY_EVENT_CR3          21
 #define HVM_PARAM_MEMORY_EVENT_CR4          22
 #define HVM_PARAM_MEMORY_EVENT_INT3         23
 #define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP  25
+#define HVM_PARAM_MEMORY_EVENT_MSR          30
 
-#define HVMPME_MODE_MASK       (3 << 0)
-#define HVMPME_mode_disabled   0
-#define HVMPME_mode_async      1
-#define HVMPME_mode_sync       2
-#define HVMPME_onchangeonly    (1 << 2)
-
 /* Boolean: Enable nestedhvm (hvm only) */
 #define HVM_PARAM_NESTEDHVM    24
 
 /* Params for the mem event rings */
 #define HVM_PARAM_PAGING_RING_PFN   27
-#define HVM_PARAM_ACCESS_RING_PFN   28
+#define HVM_PARAM_MONITOR_RING_PFN  28
 #define HVM_PARAM_SHARING_RING_PFN  29
 
-#define HVM_NR_PARAMS          30
+/* SHUTDOWN_* action in case of a triple fault */
+#define HVM_PARAM_TRIPLE_FAULT_REASON 31
 
+#define HVM_PARAM_IOREQ_SERVER_PFN 32
+#define HVM_PARAM_NR_IOREQ_SERVER_PAGES 33
+
+/* Location of the VM Generation ID in guest physical address space. */
+#define HVM_PARAM_VM_GENERATION_ID_ADDR 34
+
+/* Boolean: Enable altp2m */
+#define HVM_PARAM_ALTP2M       35
+
+#define HVM_NR_PARAMS          36
+
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
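
A toolstack enables the enlightenments above by writing a mask of the HVMPV_*
bits to HVM_PARAM_VIRIDIAN; a rough sketch, where set_param stands in for
whatever HVMOP_set_param path the caller actually uses:

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative only: request the base set plus the reference TSC page. */
    static void enable_viridian(domid_t domid,
                                void (*set_param)(domid_t d, uint32_t idx,
                                                  uint64_t val))
    {
        uint64_t mask = HVMPV_base_freq | HVMPV_reference_tsc;

        /* Only advertised feature bits may be set. */
        assert((mask & ~(uint64_t)HVMPV_feature_mask) == 0);
        set_param(domid, HVM_PARAM_VIRIDIAN, mask);
    }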

Added: trunk/sys/xen/interface/hvm/pvdrivers.h
===================================================================
--- trunk/sys/xen/interface/hvm/pvdrivers.h	                        (rev 0)
+++ trunk/sys/xen/interface/hvm/pvdrivers.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -0,0 +1,50 @@
+/* $MidnightBSD$ */
+/*
+ * pvdrivers.h: Register of PV drivers product numbers.
+ * Copyright (c) 2012, Citrix Systems Inc.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_PUBLIC_PVDRIVERS_H_
+#define _XEN_PUBLIC_PVDRIVERS_H_
+
+/*
+ * This is the master registry of product numbers for
+ * PV drivers. 
+ * If you need a new product number allocating, please
+ * post to xen-devel at lists.xensource.com.  You should NOT use
+ * a product number without allocating one.
+ * If you maintain a separate versioning and distribution path
+ * for PV drivers you should have a separate product number so
+ * that your drivers can be separated from others.
+ *
+ * During development, you may use the product ID to
+ * indicate a driver which is yet to be released.
+ */
+
+#define PVDRIVERS_PRODUCT_LIST(EACH)                               \
+        EACH("xensource-windows",       0x0001) /* Citrix */       \
+        EACH("gplpv-windows",           0x0002) /* James Harper */ \
+        EACH("linux",                   0x0003)                    \
+        EACH("xenserver-windows-v7.0+", 0x0004) /* Citrix */       \
+        EACH("xenserver-windows-v7.2+", 0x0005) /* Citrix */       \
+        EACH("experimental",            0xffff)
+
+#endif /* _XEN_PUBLIC_PVDRIVERS_H_ */


Property changes on: trunk/sys/xen/interface/hvm/pvdrivers.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
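
PVDRIVERS_PRODUCT_LIST above is an X-macro: the consumer supplies the
per-entry expansion. A sketch turning it into a name/number table (the entry
macro and table are illustrative, not part of the header):

    struct pvdriver_product {
        const char   *name;
        unsigned int  number;
    };

    /* Expand each EACH(name, number) entry into one table row. */
    #define PVDRIVER_ENTRY(name, num) { name, num },
    static const struct pvdriver_product pvdriver_products[] = {
        PVDRIVERS_PRODUCT_LIST(PVDRIVER_ENTRY)
    };
    #undef PVDRIVER_ENTRY
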
Modified: trunk/sys/xen/interface/hvm/save.h
===================================================================
--- trunk/sys/xen/interface/hvm/save.h	2020-02-08 19:26:42 UTC (rev 12302)
+++ trunk/sys/xen/interface/hvm/save.h	2020-02-08 19:27:19 UTC (rev 12303)
@@ -103,9 +103,7 @@
 
 #if defined(__i386__) || defined(__x86_64__)
 #include "../arch-x86/hvm/save.h"
-#elif defined(__ia64__)
-#include "../arch-ia64/hvm/save.h"
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 #include "../arch-arm/hvm/save.h"
 #else
 #error "unsupported architecture"


From laffer1 at midnightbsd.org  Sat Feb  8 14:27:35 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:27:35 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12304] trunk/sys/xen/interface/arch-x86:
 sync with FreeBSD 11-stable
Message-ID: <202002081927.018JRZsB060802@stargazer.midnightbsd.org>

Revision: 12304
          http://svnweb.midnightbsd.org/src/?rev=12304
Author:   laffer1
Date:     2020-02-08 14:27:35 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/arch-x86/cpuid.h
    trunk/sys/xen/interface/arch-x86/hvm/save.h
    trunk/sys/xen/interface/arch-x86/xen-mca.h
    trunk/sys/xen/interface/arch-x86/xen-x86_32.h
    trunk/sys/xen/interface/arch-x86/xen-x86_64.h
    trunk/sys/xen/interface/arch-x86/xen.h

Added Paths:
-----------
    trunk/sys/xen/interface/arch-x86/pmu.h

Modified: trunk/sys/xen/interface/arch-x86/cpuid.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/cpuid.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/cpuid.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -31,12 +31,20 @@
 #ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
 #define __XEN_PUBLIC_ARCH_X86_CPUID_H__
 
-/* Xen identification leaves start at 0x40000000. */
+/*
+ * For compatibility with other hypervisor interfaces, the Xen cpuid leaves
+ * can be found at the first otherwise unused 0x100 aligned boundary starting
+ * from 0x40000000.
+ *
+ * e.g. If viridian extensions are enabled for an HVM domain, the Xen cpuid
+ * leaves will start at 0x40000100
+ */
+
 #define XEN_CPUID_FIRST_LEAF 0x40000000
 #define XEN_CPUID_LEAF(i)    (XEN_CPUID_FIRST_LEAF + (i))
 
 /*
- * Leaf 1 (0x40000000)
+ * Leaf 1 (0x40000x00)
  * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
  *      are supported by the Xen host.
  * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
@@ -47,7 +55,7 @@
 #define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
 
 /*
- * Leaf 2 (0x40000001)
+ * Leaf 2 (0x40000x01)
  * EAX[31:16]: Xen major version.
  * EAX[15: 0]: Xen minor version.
  * EBX-EDX: Reserved (currently all zeroes).
@@ -54,7 +62,7 @@
  */
 
 /*
- * Leaf 3 (0x40000002)
+ * Leaf 3 (0x40000x02)
  * EAX: Number of hypercall transfer pages. This register is always guaranteed
  *      to specify one hypercall page.
  * EBX: Base address of Xen-specific MSRs.
@@ -66,4 +74,18 @@
 #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
 
+/*
+ * Leaf 5 (0x40000x04)
+ * HVM-specific features
+ * EAX: Features
+ * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ */
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT      (1u << 1) /* Virtualized x2APIC accesses */
+/* Memory mapped from other domains has valid IOMMU entries */
+#define XEN_HVM_CPUID_IOMMU_MAPPINGS   (1u << 2)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT  (1u << 3) /* vcpu id is present in EBX */
+
+#define XEN_CPUID_MAX_NUM_LEAVES 4
+
 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
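
Given the relocation described in the hunk above, guests typically locate the
Xen leaves by probing each 0x100-aligned boundary for the "XenVMMXenVMM"
signature. A sketch, assuming a cpuid() helper and the companion
XEN_CPUID_SIGNATURE_EBX/ECX defines from the same header:

    #include <stdint.h>

    /* Probe 0x40000000, 0x40000100, ... for the Xen signature leaves. */
    static uint32_t find_xen_cpuid_base(void (*cpuid)(uint32_t leaf,
                                                      uint32_t regs[4]))
    {
        uint32_t base, regs[4];

        for (base = XEN_CPUID_FIRST_LEAF; base < 0x40010000; base += 0x100) {
            cpuid(base, regs);
            if (regs[1] == XEN_CPUID_SIGNATURE_EBX &&
                regs[2] == XEN_CPUID_SIGNATURE_ECX &&
                regs[3] == XEN_CPUID_SIGNATURE_EDX &&
                regs[0] - base >= 2)    /* leaves 1..3 are present */
                return base;
        }
        return 0;   /* not running on Xen */
    }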

Modified: trunk/sys/xen/interface/arch-x86/hvm/save.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/hvm/save.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/hvm/save.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -270,15 +270,18 @@
 };
 
 static inline int _hvm_hw_fix_cpu(void *h) {
-    struct hvm_hw_cpu *new=h;
-    struct hvm_hw_cpu_compat *old=h;
 
+    union hvm_hw_cpu_union {
+        struct hvm_hw_cpu nat;
+        struct hvm_hw_cpu_compat cmp;
+    } *ucpu = (union hvm_hw_cpu_union *)h;
+
     /* If we copy from the end backwards, we should
      * be able to do the modification in-place */
-    new->error_code=old->error_code;
-    new->pending_event=old->pending_event;
-    new->tsc=old->tsc;
-    new->msr_tsc_aux=0;
+    ucpu->nat.error_code = ucpu->cmp.error_code;
+    ucpu->nat.pending_event = ucpu->cmp.pending_event;
+    ucpu->nat.tsc = ucpu->cmp.tsc;
+    ucpu->nat.msr_tsc_aux = 0;
 
     return 0;
 }
@@ -542,7 +545,7 @@
  */
 
 struct hvm_hw_cpu_xsave {
-    uint64_t xfeature_mask;
+    uint64_t xfeature_mask;        /* Ignored */
     uint64_t xcr0;                 /* Updated by XSETBV */
     uint64_t xcr0_accum;           /* Updated by XSETBV */
     struct {
@@ -566,6 +569,8 @@
 struct hvm_viridian_domain_context {
     uint64_t hypercall_gpa;
     uint64_t guest_os_id;
+    uint64_t time_ref_count;
+    uint64_t reference_tsc;
 };
 
 DECLARE_HVM_SAVE_TYPE(VIRIDIAN_DOMAIN, 15, struct hvm_viridian_domain_context);
@@ -578,13 +583,49 @@
 
 struct hvm_vmce_vcpu {
     uint64_t caps;
+    uint64_t mci_ctl2_bank0;
+    uint64_t mci_ctl2_bank1;
 };
 
 DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
 
+struct hvm_tsc_adjust {
+    uint64_t tsc_adjust;
+};
+
+DECLARE_HVM_SAVE_TYPE(TSC_ADJUST, 19, struct hvm_tsc_adjust);
+
+
+struct hvm_msr {
+    uint32_t count;
+    struct hvm_one_msr {
+        uint32_t index;
+        uint32_t _rsvd;
+        uint64_t val;
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    } msr[];
+#elif defined(__GNUC__)
+    } msr[0];
+#else
+    } msr[1 /* variable size */];
+#endif
+};
+
+#define CPU_MSR_CODE  20
+
 /* 
  * Largest type-code in use
  */
-#define HVM_SAVE_CODE_MAX 18
+#define HVM_SAVE_CODE_MAX 20
 
 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
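
struct hvm_msr above ends in a flexible array member, so the save-record size
is determined by count; a minimal allocation sketch under the C99 form:

    #include <stdlib.h>

    /* Illustrative only: allocate an hvm_msr record with 'count' slots. */
    static struct hvm_msr *alloc_hvm_msr(uint32_t count)
    {
        struct hvm_msr *m;

        m = calloc(1, sizeof(*m) + count * sizeof(m->msr[0]));
        if (m != NULL)
            m->count = count;
        return m;
    }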

Added: trunk/sys/xen/interface/arch-x86/pmu.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/pmu.h	                        (rev 0)
+++ trunk/sys/xen/interface/arch-x86/pmu.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -0,0 +1,168 @@
+/* $MidnightBSD$ */
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_PMU_H__
+#define __XEN_PUBLIC_ARCH_X86_PMU_H__
+
+/* x86-specific PMU definitions */
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+    /*
+     * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+     * For PV(H) guests these fields are RO.
+     */
+    uint32_t counters;
+    uint32_t ctrls;
+
+    /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    uint64_t regs[];
+#elif defined(__GNUC__)
+    uint64_t regs[0];
+#endif
+};
+typedef struct xen_pmu_amd_ctxt xen_pmu_amd_ctxt_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_amd_ctxt_t);
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+    uint64_t counter;
+    uint64_t control;
+};
+typedef struct xen_pmu_cntr_pair xen_pmu_cntr_pair_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_cntr_pair_t);
+
+struct xen_pmu_intel_ctxt {
+   /*
+    * Offsets to fixed and architectural counter MSRs (relative to
+    * xen_pmu_arch.c.intel).
+    * For PV(H) guests these fields are RO.
+    */
+    uint32_t fixed_counters;
+    uint32_t arch_counters;
+
+    /* PMU registers */
+    uint64_t global_ctrl;
+    uint64_t global_ovf_ctrl;
+    uint64_t global_status;
+    uint64_t fixed_ctrl;
+    uint64_t ds_area;
+    uint64_t pebs_enable;
+    uint64_t debugctl;
+
+    /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    uint64_t regs[];
+#elif defined(__GNUC__)
+    uint64_t regs[0];
+#endif
+};
+typedef struct xen_pmu_intel_ctxt xen_pmu_intel_ctxt_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_intel_ctxt_t);
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+    uint64_t ip;
+    uint64_t sp;
+    uint64_t flags;
+    uint16_t cs;
+    uint16_t ss;
+    uint8_t cpl;
+    uint8_t pad[3];
+};
+typedef struct xen_pmu_regs xen_pmu_regs_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_regs_t);
+
+/* PMU flags */
+#define PMU_CACHED         (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER    (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL    (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV      (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+    union {
+        /*
+         * Processor's registers at the time of interrupt.
+         * WO for hypervisor, RO for guests.
+         */
+        struct xen_pmu_regs regs;
+        /* Padding for adding new registers to xen_pmu_regs in the future */
+#define XENPMU_REGS_PAD_SZ  64
+        uint8_t pad[XENPMU_REGS_PAD_SZ];
+    } r;
+
+    /* WO for hypervisor, RO for guest */
+    uint64_t pmu_flags;
+
+    /*
+     * APIC LVTPC register.
+     * RW for both hypervisor and guest.
+     * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+     * during XENPMU_flush or XENPMU_lvtpc_set.
+     */
+    union {
+        uint32_t lapic_lvtpc;
+        uint64_t pad;
+    } l;
+
+    /*
+     * Vendor-specific PMU registers.
+     * RW for both hypervisor and guest (see exceptions above).
+     * Guest's updates to this field are verified and then loaded by the
+     * hypervisor into hardware during XENPMU_flush
+     */
+    union {
+        struct xen_pmu_amd_ctxt amd;
+        struct xen_pmu_intel_ctxt intel;
+
+        /*
+         * Padding for contexts (fixed parts only, does not include MSR banks
+         * that are specified by offsets)
+         */
+#define XENPMU_CTXT_PAD_SZ  128
+        uint8_t pad[XENPMU_CTXT_PAD_SZ];
+    } c;
+};
+typedef struct xen_pmu_arch xen_pmu_arch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_arch_t);
+
+#endif /* __XEN_PUBLIC_ARCH_X86_PMU_H__ */
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+


Property changes on: trunk/sys/xen/interface/arch-x86/pmu.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
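
Because the counters/ctrls fields in xen_pmu_amd_ctxt above are byte offsets
relative to xen_pmu_arch.c.amd rather than pointers, a consumer locates the
MSR arrays roughly as below (a sketch; the helper names are illustrative):

    #include <stdint.h>

    /* Resolve the AMD counter/control MSR arrays from their offsets. */
    static uint64_t *amd_counter_regs(struct xen_pmu_arch *arch)
    {
        struct xen_pmu_amd_ctxt *amd = &arch->c.amd;

        return (uint64_t *)((uint8_t *)amd + amd->counters);
    }

    static uint64_t *amd_ctrl_regs(struct xen_pmu_arch *arch)
    {
        struct xen_pmu_amd_ctxt *amd = &arch->c.amd;

        return (uint64_t *)((uint8_t *)amd + amd->ctrls);
    }
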
Modified: trunk/sys/xen/interface/arch-x86/xen-mca.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/xen-mca.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/xen-mca.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -415,7 +415,7 @@
 
 struct xen_mc_inject_v2 {
 	uint32_t flags;
-	struct xenctl_cpumap cpumap;
+	struct xenctl_bitmap cpumap;
 };
 #endif
 

Modified: trunk/sys/xen/interface/arch-x86/xen-x86_32.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/xen-x86_32.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/xen-x86_32.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -105,6 +105,7 @@
     do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0;   \
          (hnd).p = val;                                     \
     } while ( 0 )
+#define  int64_aligned_t  int64_t __attribute__((aligned(8)))
 #define uint64_aligned_t uint64_t __attribute__((aligned(8)))
 #define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name
 #define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name)
@@ -164,7 +165,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
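
On 32-bit x86 a plain uint64_t is only 4-byte aligned, so the
int64_aligned_t/uint64_aligned_t variants above exist to keep shared
structures laid out as the 64-bit hypervisor expects. A small sketch of the
effect; the struct and assertion are purely illustrative:

    #include <stddef.h>
    #include <stdint.h>

    struct layout_example {
        uint32_t         a;
        uint64_aligned_t b;   /* forced to offset 8 even on i386 */
    };

    /* With plain uint64_t the i386 ABI would place 'b' at offset 4. */
    _Static_assert(offsetof(struct layout_example, b) == 8,
                   "shared ABI requires 8-byte alignment");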

Modified: trunk/sys/xen/interface/arch-x86/xen-x86_64.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/xen-x86_64.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/xen-x86_64.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -195,7 +195,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/arch-x86/xen.h
===================================================================
--- trunk/sys/xen/interface/arch-x86/xen.h	2020-02-08 19:27:19 UTC (rev 12303)
+++ trunk/sys/xen/interface/arch-x86/xen.h	2020-02-08 19:27:35 UTC (rev 12304)
@@ -39,6 +39,14 @@
     typedef type * __guest_handle_ ## name
 #endif
 
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
 #define __DEFINE_XEN_GUEST_HANDLE(name, type) \
     ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
     ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
@@ -45,6 +53,7 @@
 #define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
 #define __XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
 #define XEN_GUEST_HANDLE(name)          __XEN_GUEST_HANDLE(name)
+#define XEN_GUEST_HANDLE_PARAM(name)    XEN_GUEST_HANDLE(name)
 #define set_xen_guest_handle_raw(hnd, val)  do { (hnd).p = val; } while (0)
 #ifdef __XEN_TOOLS__
 #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
@@ -62,8 +71,12 @@
 #define PRI_xen_pfn "lx"
 #endif
 
+#define XEN_HAVE_PV_GUEST_ENTRY 1
+
+#define XEN_HAVE_PV_UPCALL_MASK 1
+
 /*
- * SEGMENT DESCRIPTOR TABLES
+ * `incontents 200 segdesc Segment Descriptor Tables
  */
 /*
  * ` enum neg_errnoval
@@ -75,11 +88,24 @@
  * start of the GDT because some stupid OSes export hard-coded selector values
  * in their ABI. These hard-coded values are always near the start of the GDT,
  * so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
  */
 #define FIRST_RESERVED_GDT_PAGE  14
 #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
 #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
 
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_update_descriptor(u64 pa, u64 desc);
+ * `
+ * ` @pa   The machine physical address of the descriptor to
+ * `       update. Must be either a descriptor page or writable.
+ * ` @desc The descriptor value to update, in the same format as a
+ * `       native descriptor table entry.
+ */
+
 /* Maximum number of virtual CPUs in legacy multi-processor guests. */
 #define XEN_LEGACY_MAX_VCPUS 32
 
@@ -86,6 +112,7 @@
 #ifndef __ASSEMBLY__
 
 typedef unsigned long xen_ulong_t;
+#define PRI_xen_ulong "lx"
 
 /*
  * ` enum neg_errnoval
@@ -128,6 +155,15 @@
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
  */
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
@@ -185,14 +221,58 @@
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
 
 struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
+    /*
+     * Number of valid entries in the p2m table(s) anchored at
+     * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+     */
+    unsigned long max_pfn;
+    /*
+     * Frame containing list of mfns containing list of mfns containing p2m.
+     * A value of 0 indicates it has not yet been set up, ~0 indicates it has
+     * been set to invalid e.g. due to the p2m being too large for the 3-level
+     * p2m tree. In this case the linear mapped p2m list anchored at p2m_vaddr
+     * is to be used.
+     */
     xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
-    uint64_t pad[32];
+    /*
+     * Following three fields are valid if p2m_cr3 contains a value different
+     * from 0.
+     * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+     * p2m_cr3 is in the same format as a cr3 value in the vcpu register state
+     * and holds the folded machine frame number (via xen_pfn_to_cr3) of a
+     * L3 or L4 page table.
+     * p2m_vaddr holds the virtual address of the linear p2m list. All entries
+     * in the range [0...max_pfn[ are accessible via this pointer.
+     * p2m_generation will be incremented by the guest before and after each
+     * change of the mappings of the p2m list. p2m_generation starts at 0 and
+     * a value with the least significant bit set indicates that a mapping
+     * update is in progress. This allows guest external software (e.g. in Dom0)
+     * to verify that read mappings are consistent and whether they have changed
+     * since the last check.
+     * Modifying a p2m element in the linear p2m list is allowed via an atomic
+     * write only.
+     */
+    unsigned long p2m_cr3;         /* cr3 value of the p2m address space */
+    unsigned long p2m_vaddr;       /* virtual address of the p2m list */
+    unsigned long p2m_generation;  /* generation count of p2m mapping */
+#ifdef __i386__
+    /* There's no room for this field in the generic structure. */
+    uint32_t wc_sec_hi;
+#endif
 };
 typedef struct arch_shared_info arch_shared_info_t;
 
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+/*
+ * struct xen_arch_domainconfig's ABI is covered by
+ * XEN_DOMCTL_INTERFACE_VERSION.
+ */
+struct xen_arch_domainconfig {
+    char dummy;
+};
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 /*
@@ -230,7 +310,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
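
The p2m_generation protocol documented in the hunk above is effectively a
sequence counter; an external reader (e.g. in Dom0) would retry around it
roughly as follows. This is a sketch only: the p2m mapping itself and the
memory barriers real code needs are omitted.

    /* Consistent read of one linear-p2m entry, per the comment above.
     * 'shared' and 'p2m' stand for mappings the reader already holds. */
    static unsigned long read_p2m_entry(volatile struct arch_shared_info *shared,
                                        const volatile unsigned long *p2m,
                                        unsigned long pfn)
    {
        unsigned long gen, entry;

        for (;;) {
            gen = shared->p2m_generation;
            if (gen & 1)                  /* update in progress, retry */
                continue;
            entry = p2m[pfn];
            if (shared->p2m_generation == gen)
                return entry;             /* mapping unchanged, value good */
        }
    }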


From laffer1 at midnightbsd.org  Sat Feb  8 14:27:58 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:27:58 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12305]
 trunk/sys/xen/interface/arch-arm/hvm/save.h: sync with FreeBSD 11-stable
Message-ID: <202002081927.018JRwMm060861@stargazer.midnightbsd.org>

Revision: 12305
          http://svnweb.midnightbsd.org/src/?rev=12305
Author:   laffer1
Date:     2020-02-08 14:27:58 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/arch-arm/hvm/save.h

Modified: trunk/sys/xen/interface/arch-arm/hvm/save.h
===================================================================
--- trunk/sys/xen/interface/arch-arm/hvm/save.h	2020-02-08 19:27:35 UTC (rev 12304)
+++ trunk/sys/xen/interface/arch-arm/hvm/save.h	2020-02-08 19:27:58 UTC (rev 12305)
@@ -32,7 +32,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil


From laffer1 at midnightbsd.org  Sat Feb  8 14:28:09 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:28:09 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12306] trunk/sys/xen/interface: sync with
 FreeBSD 11-stable
Message-ID: <202002081928.018JS90k060912@stargazer.midnightbsd.org>

Revision: 12306
          http://svnweb.midnightbsd.org/src/?rev=12306
Author:   laffer1
Date:     2020-02-08 14:28:08 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/interface/arch-arm.h
    trunk/sys/xen/interface/callback.h
    trunk/sys/xen/interface/dom0_ops.h
    trunk/sys/xen/interface/domctl.h
    trunk/sys/xen/interface/elfnote.h
    trunk/sys/xen/interface/event_channel.h
    trunk/sys/xen/interface/features.h
    trunk/sys/xen/interface/grant_table.h
    trunk/sys/xen/interface/kexec.h
    trunk/sys/xen/interface/memory.h
    trunk/sys/xen/interface/nmi.h
    trunk/sys/xen/interface/physdev.h
    trunk/sys/xen/interface/platform.h
    trunk/sys/xen/interface/sched.h
    trunk/sys/xen/interface/sysctl.h
    trunk/sys/xen/interface/tmem.h
    trunk/sys/xen/interface/trace.h
    trunk/sys/xen/interface/vcpu.h
    trunk/sys/xen/interface/version.h
    trunk/sys/xen/interface/xen-compat.h
    trunk/sys/xen/interface/xen.h
    trunk/sys/xen/interface/xenoprof.h

Added Paths:
-----------
    trunk/sys/xen/interface/errno.h
    trunk/sys/xen/interface/gcov.h
    trunk/sys/xen/interface/pmu.h
    trunk/sys/xen/interface/vm_event.h

Modified: trunk/sys/xen/interface/arch-arm.h
===================================================================
--- trunk/sys/xen/interface/arch-arm.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/arch-arm.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -28,94 +28,254 @@
 #ifndef __XEN_PUBLIC_ARCH_ARM_H__
 #define __XEN_PUBLIC_ARCH_ARM_H__
 
-/* hypercall calling convention
- * ----------------------------
+/*
+ * `incontents 50 arm_abi Hypercall Calling Convention
  *
  * A hypercall is issued using the ARM HVC instruction.
  *
  * A hypercall can take up to 5 arguments. These are passed in
- * registers, the first argument in r0, the second argument in r1, the
- * third in r2, the forth in r3 and the fifth in r4.
+ * registers, the first argument in x0/r0 (for arm64/arm32 guests
+ * respectively irrespective of whether the underlying hypervisor is
+ * 32- or 64-bit), the second argument in x1/r1, the third in x2/r2,
+ * the fourth in x3/r3 and the fifth in x4/r4.
  *
- * The hypercall number is passed in r12.
+ * The hypercall number is passed in r12 (arm) or x16 (arm64). In both
+ * cases the relevant ARM procedure calling convention specifies this
+ * is an inter-procedure-call scratch register (e.g. for use in linker
+ * stubs). This use does not conflict with use during a hypercall.
  *
  * The HVC ISS must contain a Xen specific TAG: XEN_HYPERCALL_TAG.
  *
- * The return value is in r0.
+ * The return value is in x0/r0.
  *
- * The hypercall will clobber r12 and the argument registers used by
- * that hypercall (except r0 which is the return value) i.e. a 2
- * argument hypercall will clobber r1 and a 4 argument hypercall will
- * clobber r1, r2 and r3.
+ * The hypercall will clobber x16/r12 and the argument registers used
+ * by that hypercall (except r0 which is the return value) i.e. in
+ * addition to x16/r12 a 2 argument hypercall will clobber x1/r1 and a
+ * 4 argument hypercall will clobber x1/r1, x2/r2 and x3/r3.
  *
+ * Parameter structs passed to hypercalls are laid out according to
+ * the Procedure Call Standard for the ARM Architecture (AAPCS, AKA
+ * EABI) and Procedure Call Standard for the ARM 64-bit Architecture
+ * (AAPCS64). Where there is a conflict the 64-bit standard should be
+ * used regardless of guest type. Structures which are passed as
+ * hypercall arguments are always little endian.
+ *
+ * All memory which is shared with other entities in the system
+ * (including the hypervisor and other guests) must reside in memory
+ * which is mapped as Normal Inner-cacheable. This applies to:
+ *  - hypercall arguments passed via a pointer to guest memory.
+ *  - memory shared via the grant table mechanism (including PV I/O
+ *    rings etc).
+ *  - memory shared with the hypervisor (struct shared_info, struct
+ *    vcpu_info, the grant table, etc).
+ *
+ * Any Inner cache allocation strategy (Write-Back, Write-Through etc)
+ * is acceptable. There is no restriction on the Outer-cacheability.
  */
 
+/*
+ * `incontents 55 arm_hcall Supported Hypercalls
+ *
+ * Xen on ARM makes extensive use of hardware facilities and therefore
+ * only a subset of the potential hypercalls are required.
+ *
+ * Since ARM uses second stage paging any machine/physical addresses
+ * passed to hypercalls are Guest Physical Addresses (Intermediate
+ * Physical Addresses) unless otherwise noted.
+ *
+ * The following hypercalls (and sub operations) are supported on the
+ * ARM platform. Other hypercalls should be considered
+ * unavailable/unsupported.
+ *
+ *  HYPERVISOR_memory_op
+ *   All generic sub-operations
+ *
+ *  HYPERVISOR_domctl
+ *   All generic sub-operations, with the exception of:
+ *    * XEN_DOMCTL_irq_permission (not yet implemented)
+ *
+ *  HYPERVISOR_sched_op
+ *   All generic sub-operations, with the exception of:
+ *    * SCHEDOP_block -- prefer wfi hardware instruction
+ *
+ *  HYPERVISOR_console_io
+ *   All generic sub-operations
+ *
+ *  HYPERVISOR_xen_version
+ *   All generic sub-operations
+ *
+ *  HYPERVISOR_event_channel_op
+ *   All generic sub-operations
+ *
+ *  HYPERVISOR_physdev_op
+ *   No sub-operations are currently supported
+ *
+ *  HYPERVISOR_sysctl
+ *   All generic sub-operations, with the exception of:
+ *    * XEN_SYSCTL_page_offline_op
+ *    * XEN_SYSCTL_get_pmstat
+ *    * XEN_SYSCTL_pm_op
+ *
+ *  HYPERVISOR_hvm_op
+ *   Exactly these sub-operations are supported:
+ *    * HVMOP_set_param
+ *    * HVMOP_get_param
+ *
+ *  HYPERVISOR_grant_table_op
+ *   All generic sub-operations
+ *
+ *  HYPERVISOR_vcpu_op
+ *   Exactly these sub-operations are supported:
+ *    * VCPUOP_register_vcpu_info
+ *    * VCPUOP_register_runstate_memory_area
+ *
+ *
+ * Other notes on the ARM ABI:
+ *
+ * - struct start_info is not exported to ARM guests.
+ *
+ * - struct shared_info is mapped by ARM guests using the
+ *   HYPERVISOR_memory_op sub-op XENMEM_add_to_physmap, passing
+ *   XENMAPSPACE_shared_info as space parameter.
+ *
+ * - All the per-cpu struct vcpu_info are mapped by ARM guests using the
+ *   HYPERVISOR_vcpu_op sub-op VCPUOP_register_vcpu_info, including cpu0
+ *   struct vcpu_info.
+ *
+ * - The grant table is mapped using the HYPERVISOR_memory_op sub-op
+ *   XENMEM_add_to_physmap, passing XENMAPSPACE_grant_table as space
+ *   parameter. The memory range specified under the Xen compatible
+ *   hypervisor node on device tree can be used as target gpfn for the
+ *   mapping.
+ *
+ * - Xenstore is initialized by using the two hvm_params
+ *   HVM_PARAM_STORE_PFN and HVM_PARAM_STORE_EVTCHN. They can be read
+ *   with the HYPERVISOR_hvm_op sub-op HVMOP_get_param.
+ *
+ * - The paravirtualized console is initialized by using the two
+ *   hvm_params HVM_PARAM_CONSOLE_PFN and HVM_PARAM_CONSOLE_EVTCHN. They
+ *   can be read with the HYPERVISOR_hvm_op sub-op HVMOP_get_param.
+ *
+ * - Event channel notifications are delivered using the percpu GIC
+ *   interrupt specified under the Xen compatible hypervisor node on
+ *   device tree.
+ *
+ * - The device tree Xen compatible node is fully described under Linux
+ *   at Documentation/devicetree/bindings/arm/xen.txt.
+ */
+
 #define XEN_HYPERCALL_TAG   0XEA1
 
+#define  int64_aligned_t  int64_t __attribute__((aligned(8)))
+#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
 
 #ifndef __ASSEMBLY__
-#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
-    typedef struct { type *p; } __guest_handle_ ## name
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type)                  \
+    typedef union { type *p; unsigned long q; }                 \
+        __guest_handle_ ## name;                                \
+    typedef union { type *p; uint64_aligned_t q; }              \
+        __guest_handle_64_ ## name;
 
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory. On ARM it is always 8 bytes in size and
+ * 8 bytes aligned.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument. It is 4 bytes on aarch32 and 8 bytes on aarch64.
+ */
 #define __DEFINE_XEN_GUEST_HANDLE(name, type) \
     ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
     ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
 #define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
-#define __XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
+#define __XEN_GUEST_HANDLE(name)        __guest_handle_64_ ## name
 #define XEN_GUEST_HANDLE(name)          __XEN_GUEST_HANDLE(name)
-#define set_xen_guest_handle_raw(hnd, val)  do { (hnd).p = val; } while (0)
+#define XEN_GUEST_HANDLE_PARAM(name)    __guest_handle_ ## name
+#define set_xen_guest_handle_raw(hnd, val)                  \
+    do {                                                    \
+        typeof(&(hnd)) _sxghr_tmp = &(hnd);                 \
+        _sxghr_tmp->q = 0;                                  \
+        _sxghr_tmp->p = val;                                \
+    } while ( 0 )
 #ifdef __XEN_TOOLS__
 #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
 #endif
 #define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val)
 
-struct cpu_user_regs
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., r0/x0). */
+# define __DECL_REG(n64, n32) union {          \
+        uint64_t n64;                          \
+        uint32_t n32;                          \
+    }
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., x0). */
+#define __DECL_REG(n64, n32) uint64_t n64
+#endif
+
+struct vcpu_guest_core_regs
 {
-    uint32_t r0;
-    uint32_t r1;
-    uint32_t r2;
-    uint32_t r3;
-    uint32_t r4;
-    uint32_t r5;
-    uint32_t r6;
-    uint32_t r7;
-    uint32_t r8;
-    uint32_t r9;
-    uint32_t r10;
-    union {
-        uint32_t r11;
-        uint32_t fp;
-    };
-    uint32_t r12;
+    /*         Aarch64       Aarch32 */
+    __DECL_REG(x0,           r0_usr);
+    __DECL_REG(x1,           r1_usr);
+    __DECL_REG(x2,           r2_usr);
+    __DECL_REG(x3,           r3_usr);
+    __DECL_REG(x4,           r4_usr);
+    __DECL_REG(x5,           r5_usr);
+    __DECL_REG(x6,           r6_usr);
+    __DECL_REG(x7,           r7_usr);
+    __DECL_REG(x8,           r8_usr);
+    __DECL_REG(x9,           r9_usr);
+    __DECL_REG(x10,          r10_usr);
+    __DECL_REG(x11,          r11_usr);
+    __DECL_REG(x12,          r12_usr);
 
-    uint32_t sp; /* r13 - SP: Valid for Hyp. frames only, o/w banked (see below) */
+    __DECL_REG(x13,          sp_usr);
+    __DECL_REG(x14,          lr_usr);
 
-    /* r14 - LR: is the same physical register as LR_usr */
-    union {
-        uint32_t lr; /* r14 - LR: Valid for Hyp. Same physical register as lr_usr. */
-        uint32_t lr_usr;
-    };
+    __DECL_REG(x15,          __unused_sp_hyp);
 
-    uint32_t pc; /* Return IP */
-    uint32_t cpsr; /* Return mode */
-    uint32_t pad0; /* Doubleword-align the kernel half of the frame */
+    __DECL_REG(x16,          lr_irq);
+    __DECL_REG(x17,          sp_irq);
 
-    /* Outer guest frame only from here on... */
+    __DECL_REG(x18,          lr_svc);
+    __DECL_REG(x19,          sp_svc);
 
-    uint32_t r8_fiq, r9_fiq, r10_fiq, r11_fiq, r12_fiq;
+    __DECL_REG(x20,          lr_abt);
+    __DECL_REG(x21,          sp_abt);
 
-    uint32_t sp_usr; /* LR_usr is the same register as LR, see above */
+    __DECL_REG(x22,          lr_und);
+    __DECL_REG(x23,          sp_und);
 
-    uint32_t sp_svc, sp_abt, sp_und, sp_irq, sp_fiq;
-    uint32_t lr_svc, lr_abt, lr_und, lr_irq, lr_fiq;
+    __DECL_REG(x24,          r8_fiq);
+    __DECL_REG(x25,          r9_fiq);
+    __DECL_REG(x26,          r10_fiq);
+    __DECL_REG(x27,          r11_fiq);
+    __DECL_REG(x28,          r12_fiq);
 
-    uint32_t spsr_svc, spsr_abt, spsr_und, spsr_irq, spsr_fiq;
+    __DECL_REG(x29,          sp_fiq);
+    __DECL_REG(x30,          lr_fiq);
 
-    uint32_t pad1; /* Doubleword-align the user half of the frame */
+    /* Return address and mode */
+    __DECL_REG(pc64,         pc32);             /* ELR_EL2 */
+    uint32_t cpsr;                              /* SPSR_EL2 */
+
+    union {
+        uint32_t spsr_el1;       /* AArch64 */
+        uint32_t spsr_svc;       /* AArch32 */
+    };
+
+    /* AArch32 guests only */
+    uint32_t spsr_fiq, spsr_irq, spsr_und, spsr_abt;
+
+    /* AArch64 guests only */
+    uint64_t sp_el0;
+    uint64_t sp_el1, elr_el1;
 };
-typedef struct cpu_user_regs cpu_user_regs_t;
-DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
+typedef struct vcpu_guest_core_regs vcpu_guest_core_regs_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_core_regs_t);
 
+#undef __DECL_REG
+
 typedef uint64_t xen_pfn_t;
 #define PRI_xen_pfn PRIx64
 
@@ -123,30 +283,77 @@
 /* Only one. All other VCPUS must use VCPUOP_register_vcpu_info */
 #define XEN_LEGACY_MAX_VCPUS 1
 
-typedef uint32_t xen_ulong_t;
+typedef uint64_t xen_ulong_t;
+#define PRI_xen_ulong PRIx64
 
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
 struct vcpu_guest_context {
-    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
+#define _VGCF_online                   0
+#define VGCF_online                    (1<<_VGCF_online)
+    uint32_t flags;                         /* VGCF_* */
 
+    struct vcpu_guest_core_regs user_regs;  /* Core CPU registers */
+
     uint32_t sctlr;
-    uint32_t ttbr0, ttbr1, ttbcr;
+    uint64_t ttbcr, ttbr0, ttbr1;
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
 
-struct arch_vcpu_info { };
+/*
+ * struct xen_arch_domainconfig's ABI is covered by
+ * XEN_DOMCTL_INTERFACE_VERSION.
+ */
+#define XEN_DOMCTL_CONFIG_GIC_NATIVE    0
+#define XEN_DOMCTL_CONFIG_GIC_V2        1
+#define XEN_DOMCTL_CONFIG_GIC_V3        2
+struct xen_arch_domainconfig {
+    /* IN/OUT */
+    uint8_t gic_version;
+    /* IN */
+    uint32_t nr_spis;
+    /*
+     * OUT
+     * Based on the property clock-frequency in the DT timer node.
+     * The property may be present when the bootloader/firmware doesn't
+     * set CNTFRQ, which holds the timer frequency, correctly.
+     *
+     * As it's not possible to trap this register, we have to replicate
+     * the value in the guest DT.
+     *
+     * = 0 => property not present
+     * > 0 => Value of the property
+     *
+     */
+    uint32_t clock_frequency;
+};
+#endif /* __XEN__ || __XEN_TOOLS__ */
+
+struct arch_vcpu_info {
+};
 typedef struct arch_vcpu_info arch_vcpu_info_t;
 
-struct arch_shared_info { };
+struct arch_shared_info {
+};
 typedef struct arch_shared_info arch_shared_info_t;
 typedef uint64_t xen_callback_t;
 
-#endif /* ifndef __ASSEMBLY __ */
+#endif
 
-/* PSR bits (CPSR, SPSR)*/
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
 
-/* 0-4: Mode */
-#define PSR_MODE_MASK 0x1f
+/* PSR bits (CPSR, SPSR) */
+
+#define PSR_THUMB       (1<<5)        /* Thumb Mode enable */
+#define PSR_FIQ_MASK    (1<<6)        /* Fast Interrupt mask */
+#define PSR_IRQ_MASK    (1<<7)        /* Interrupt mask */
+#define PSR_ABT_MASK    (1<<8)        /* Asynchronous Abort mask */
+#define PSR_BIG_ENDIAN  (1<<9)        /* arm32: Big Endian Mode */
+#define PSR_DBG_MASK    (1<<9)        /* arm64: Debug Exception mask */
+#define PSR_IT_MASK     (0x0600fc00)  /* Thumb If-Then Mask */
+#define PSR_JAZELLE     (1<<24)       /* Jazelle Mode */
+
+/* 32 bit modes */
 #define PSR_MODE_USR 0x10
 #define PSR_MODE_FIQ 0x11
 #define PSR_MODE_IRQ 0x12
@@ -157,19 +364,102 @@
 #define PSR_MODE_UND 0x1b
 #define PSR_MODE_SYS 0x1f
 
-#define PSR_THUMB       (1<<5)        /* Thumb Mode enable */
-#define PSR_FIQ_MASK    (1<<6)        /* Fast Interrupt mask */
-#define PSR_IRQ_MASK    (1<<7)        /* Interrupt mask */
-#define PSR_ABT_MASK    (1<<8)        /* Asynchronous Abort mask */
-#define PSR_BIG_ENDIAN  (1<<9)        /* Big Endian Mode */
-#define PSR_JAZELLE     (1<<24)       /* Jazelle Mode */
+/* 64 bit modes */
+#define PSR_MODE_BIT  0x10 /* Set iff AArch32 */
+#define PSR_MODE_EL3h 0x0d
+#define PSR_MODE_EL3t 0x0c
+#define PSR_MODE_EL2h 0x09
+#define PSR_MODE_EL2t 0x08
+#define PSR_MODE_EL1h 0x05
+#define PSR_MODE_EL1t 0x04
+#define PSR_MODE_EL0t 0x00
 
+#define PSR_GUEST32_INIT  (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_SVC)
+#define PSR_GUEST64_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_EL1h)
+
+#define SCTLR_GUEST_INIT    0x00c50078
+
+/*
+ * Virtual machine platform (memory layout, interrupts)
+ *
+ * These are defined for consistency between the tools and the
+ * hypervisor. Guests must not rely on these hardcoded values but
+ * should instead use the FDT.
+ */
+
+/* Physical Address Space */
+
+/*
+ * vGIC mappings: Only one set of mappings is used by the guest.
+ * Therefore they can overlap.
+ */
+
+/* vGIC v2 mappings */
+#define GUEST_GICD_BASE   0x03001000ULL
+#define GUEST_GICD_SIZE   0x00001000ULL
+#define GUEST_GICC_BASE   0x03002000ULL
+#define GUEST_GICC_SIZE   0x00000100ULL
+
+/* vGIC v3 mappings */
+#define GUEST_GICV3_GICD_BASE      0x03001000ULL
+#define GUEST_GICV3_GICD_SIZE      0x00010000ULL
+
+#define GUEST_GICV3_RDIST_STRIDE   0x20000ULL
+#define GUEST_GICV3_RDIST_REGIONS  1
+
+#define GUEST_GICV3_GICR0_BASE     0x03020000ULL    /* vCPU0 - vCPU127 */
+#define GUEST_GICV3_GICR0_SIZE     0x01000000ULL
+
+/*
+ * 16MB == 4096 pages reserved for guest to use as a region to map its
+ * grant table in.
+ */
+#define GUEST_GNTTAB_BASE 0x38000000ULL
+#define GUEST_GNTTAB_SIZE 0x01000000ULL
+
+#define GUEST_MAGIC_BASE  0x39000000ULL
+#define GUEST_MAGIC_SIZE  0x01000000ULL
+
+#define GUEST_RAM_BANKS   2
+
+#define GUEST_RAM0_BASE   0x40000000ULL /* 3GB of low RAM @ 1GB */
+#define GUEST_RAM0_SIZE   0xc0000000ULL
+
+#define GUEST_RAM1_BASE   0x0200000000ULL /* 1016GB of RAM @ 8GB */
+#define GUEST_RAM1_SIZE   0xfe00000000ULL
+
+#define GUEST_RAM_BASE    GUEST_RAM0_BASE /* Lowest RAM address */
+/* Largest amount of actual RAM, not including holes */
+#define GUEST_RAM_MAX     (GUEST_RAM0_SIZE + GUEST_RAM1_SIZE)
+/* Suitable for e.g. const uint64_t ramfoo[] = GUEST_RAM_BANK_FOOS; */
+#define GUEST_RAM_BANK_BASES   { GUEST_RAM0_BASE, GUEST_RAM1_BASE }
+#define GUEST_RAM_BANK_SIZES   { GUEST_RAM0_SIZE, GUEST_RAM1_SIZE }
+
+/* Interrupts */
+#define GUEST_TIMER_VIRT_PPI    27
+#define GUEST_TIMER_PHYS_S_PPI  29
+#define GUEST_TIMER_PHYS_NS_PPI 30
+#define GUEST_EVTCHN_PPI        31
+
+/* PSCI functions */
+#define PSCI_cpu_suspend 0
+#define PSCI_cpu_off     1
+#define PSCI_cpu_on      2
+#define PSCI_migrate     3
+
+#endif
+
+#ifndef __ASSEMBLY__
+/* Stub definition of PMU structure */
+typedef struct xen_pmu_arch { uint8_t dummy; } xen_pmu_arch_t;
+#endif
+
 #endif /*  __XEN_PUBLIC_ARCH_ARM_H__ */
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
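
For illustration of the "map shared_info via XENMEM_add_to_physmap"
note above, a guest's setup code could look roughly like the sketch
below.  This is only a sketch: the field names of struct
xen_add_to_physmap come from memory.h (not part of this hunk), and the
hypercall stub hypervisor_memory_op() as well as the chosen frame are
placeholders.

    #include <xen/interface/xen.h>
    #include <xen/interface/memory.h>

    /* Placeholder for the guest's HYPERVISOR_memory_op hypercall stub. */
    int hypervisor_memory_op(unsigned int cmd, void *arg);

    static int map_shared_info_at(xen_pfn_t free_gpfn)
    {
        struct xen_add_to_physmap xatp = {
            .domid = DOMID_SELF,
            .space = XENMAPSPACE_shared_info,
            .idx   = 0,
            .gpfn  = free_gpfn,  /* a free guest physical frame, guest's choice */
        };

        /* On success the shared info page appears at free_gpfn; giving it
         * a virtual mapping is the guest's own business (not shown). */
        return hypervisor_memory_op(XENMEM_add_to_physmap, &xatp);
    }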

Modified: trunk/sys/xen/interface/callback.h
===================================================================
--- trunk/sys/xen/interface/callback.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/callback.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -37,7 +37,7 @@
  * @extra_args == Operation-specific extra arguments (NULL if none).
  */
 
-/* ia64, x86: Callback for event delivery. */
+/* x86: Callback for event delivery. */
 #define CALLBACKTYPE_event                 0
 
 /* x86: Failsafe callback when guest state cannot be restored by Xen. */
@@ -114,7 +114,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/dom0_ops.h
===================================================================
--- trunk/sys/xen/interface/dom0_ops.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/dom0_ops.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -113,7 +113,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/domctl.h
===================================================================
--- trunk/sys/xen/interface/domctl.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/domctl.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -35,8 +35,10 @@
 
 #include "xen.h"
 #include "grant_table.h"
+#include "hvm/save.h"
+#include "memory.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000008
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000b
 
 /*
  * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -47,7 +49,7 @@
     /* IN parameters */
     uint32_t ssidref;
     xen_domain_handle_t handle;
- /* Is this an HVM guest (as opposed to a PV guest)? */
+ /* Is this an HVM guest (as opposed to a PVH or PV guest)? */
 #define _XEN_DOMCTL_CDF_hvm_guest     0
 #define XEN_DOMCTL_CDF_hvm_guest      (1U<<_XEN_DOMCTL_CDF_hvm_guest)
  /* Use hardware-assisted paging if available? */
@@ -59,7 +61,11 @@
  /* Disable out-of-sync shadow page tables? */
 #define _XEN_DOMCTL_CDF_oos_off       3
 #define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+ /* Is this a PVH guest (as opposed to an HVM or PV guest)? */
+#define _XEN_DOMCTL_CDF_pvh_guest     4
+#define XEN_DOMCTL_CDF_pvh_guest      (1U<<_XEN_DOMCTL_CDF_pvh_guest)
     uint32_t flags;
+    struct xen_arch_domainconfig config;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
@@ -89,6 +95,9 @@
  /* Being debugged.  */
 #define _XEN_DOMINF_debugged  6
 #define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
+/* domain is PVH */
+#define _XEN_DOMINF_pvh_guest 7
+#define XEN_DOMINF_pvh_guest  (1U<<_XEN_DOMINF_pvh_guest)
  /* XEN_DOMINF_shutdown guest-supplied code.  */
 #define XEN_DOMINF_shutdownmask 255
 #define XEN_DOMINF_shutdownshift 16
@@ -95,11 +104,13 @@
     uint32_t flags;              /* XEN_DOMINF_* */
     uint64_aligned_t tot_pages;
     uint64_aligned_t max_pages;
+    uint64_aligned_t outstanding_pages;
     uint64_aligned_t shr_pages;
     uint64_aligned_t paged_pages;
     uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */
     uint64_aligned_t cpu_time;
     uint32_t nr_online_vcpus;    /* Number of VCPUs currently online. */
+#define XEN_INVALID_MAX_VCPU_ID (~0U) /* Domain has no vcpus? */
     uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
     uint32_t ssidref;
     xen_domain_handle_t handle;
@@ -136,30 +147,9 @@
 #define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31)
 #define XEN_DOMCTL_PFINFO_XTAB    (0xfU<<28) /* invalid page */
 #define XEN_DOMCTL_PFINFO_XALLOC  (0xeU<<28) /* allocate-only page */
-#define XEN_DOMCTL_PFINFO_PAGEDTAB (0x8U<<28)
+#define XEN_DOMCTL_PFINFO_BROKEN  (0xdU<<28) /* broken page */
 #define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28)
 
-struct xen_domctl_getpageframeinfo {
-    /* IN variables. */
-    uint64_aligned_t gmfn; /* GMFN to query */
-    /* OUT variables. */
-    /* Is the page PINNED to a type? */
-    uint32_t type;         /* see above type defs */
-};
-typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
-
-
-/* XEN_DOMCTL_getpageframeinfo2 */
-struct xen_domctl_getpageframeinfo2 {
-    /* IN variables. */
-    uint64_aligned_t num;
-    /* IN/OUT variables. */
-    XEN_GUEST_HANDLE_64(uint32) array;
-};
-typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
-
 /* XEN_DOMCTL_getpageframeinfo3 */
 struct xen_domctl_getpageframeinfo3 {
     /* IN variables. */
@@ -279,12 +269,47 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
 
 
+/* Get/set the NUMA node(s) with which the guest has affinity. */
+/* XEN_DOMCTL_setnodeaffinity */
+/* XEN_DOMCTL_getnodeaffinity */
+struct xen_domctl_nodeaffinity {
+    struct xenctl_bitmap nodemap;/* IN */
+};
+typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t);
+
+
 /* Get/set which physical cpus a vcpu can execute on. */
 /* XEN_DOMCTL_setvcpuaffinity */
 /* XEN_DOMCTL_getvcpuaffinity */
 struct xen_domctl_vcpuaffinity {
-    uint32_t  vcpu;              /* IN */
-    struct xenctl_cpumap cpumap; /* IN/OUT */
+    /* IN variables. */
+    uint32_t  vcpu;
+ /* Set/get the hard affinity for vcpu */
+#define _XEN_VCPUAFFINITY_HARD  0
+#define XEN_VCPUAFFINITY_HARD   (1U<<_XEN_VCPUAFFINITY_HARD)
+ /* Set/get the soft affinity for vcpu */
+#define _XEN_VCPUAFFINITY_SOFT  1
+#define XEN_VCPUAFFINITY_SOFT   (1U<<_XEN_VCPUAFFINITY_SOFT)
+    uint32_t flags;
+    /*
+     * IN/OUT variables.
+     *
+     * Both are IN/OUT for XEN_DOMCTL_setvcpuaffinity, in which case they
+     * contain effective hard or/and soft affinity. That is, upon successful
+     * return, cpumap_soft, contains the intersection of the soft affinity,
+     * hard affinity and the cpupool's online CPUs for the domain (if
+     * XEN_VCPUAFFINITY_SOFT was set in flags). cpumap_hard contains the
+     * intersection between hard affinity and the cpupool's online CPUs (if
+     * XEN_VCPUAFFINITY_HARD was set in flags).
+     *
+     * Both are OUT-only for XEN_DOMCTL_getvcpuaffinity, in which case they
+     * contain the plain hard and/or soft affinity masks that were set during
+     * previous successful calls to XEN_DOMCTL_setvcpuaffinity (or the
+     * default values), without intersecting or altering them in any way.
+     */
+    struct xenctl_bitmap cpumap_hard;
+    struct xenctl_bitmap cpumap_soft;
 };
 typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
@@ -300,10 +325,12 @@
 
 /* XEN_DOMCTL_scheduler_op */
 /* Scheduler types. */
-#define XEN_SCHEDULER_SEDF     4
+/* #define XEN_SCHEDULER_SEDF  4 (Removed) */
 #define XEN_SCHEDULER_CREDIT   5
 #define XEN_SCHEDULER_CREDIT2  6
 #define XEN_SCHEDULER_ARINC653 7
+#define XEN_SCHEDULER_RTDS     8
+
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -311,13 +338,6 @@
     uint32_t sched_id;  /* XEN_SCHEDULER_* */
     uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
     union {
-        struct xen_domctl_sched_sedf {
-            uint64_aligned_t period;
-            uint64_aligned_t slice;
-            uint64_aligned_t latency;
-            uint32_t extratime;
-            uint32_t weight;
-        } sedf;
         struct xen_domctl_sched_credit {
             uint16_t weight;
             uint16_t cap;
@@ -325,6 +345,10 @@
         struct xen_domctl_sched_credit2 {
             uint16_t weight;
         } credit2;
+        struct xen_domctl_sched_rtds {
+            uint32_t period;
+            uint32_t budget;
+        } rtds;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
@@ -384,29 +408,9 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
 
 
-/* XEN_DOMCTL_arch_setup */
-#define _XEN_DOMAINSETUP_hvm_guest 0
-#define XEN_DOMAINSETUP_hvm_guest  (1UL<<_XEN_DOMAINSETUP_hvm_guest)
-#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save)  */
-#define XEN_DOMAINSETUP_query  (1UL<<_XEN_DOMAINSETUP_query)
-#define _XEN_DOMAINSETUP_sioemu_guest 2
-#define XEN_DOMAINSETUP_sioemu_guest  (1UL<<_XEN_DOMAINSETUP_sioemu_guest)
-typedef struct xen_domctl_arch_setup {
-    uint64_aligned_t flags;  /* XEN_DOMAINSETUP_* */
-#ifdef __ia64__
-    uint64_aligned_t bp;     /* mpaddr of boot param area */
-    uint64_aligned_t maxmem; /* Highest memory address for MDT.  */
-    uint64_aligned_t xsi_va; /* Xen shared_info area virtual address.  */
-    uint32_t hypercall_imm;  /* Break imm for Xen hypercalls.  */
-    int8_t vhpt_size_log2;   /* Log2 of VHPT size. */
-#endif
-} xen_domctl_arch_setup_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
-
-
 /* XEN_DOMCTL_settimeoffset */
 struct xen_domctl_settimeoffset {
-    int32_t  time_offset_seconds; /* applied to domain wallclock time */
+    int64_aligned_t time_offset_seconds; /* applied to domain wallclock time */
 };
 typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
@@ -430,14 +434,6 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t);
 
 
-/* XEN_DOMCTL_real_mode_area */
-struct xen_domctl_real_mode_area {
-    uint32_t log; /* log2 of Real Mode Area size */
-};
-typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
-
-
 /* XEN_DOMCTL_sendtrigger */
 #define XEN_DOMCTL_SENDTRIGGER_NMI    0
 #define XEN_DOMCTL_SENDTRIGGER_RESET  1
@@ -452,12 +448,33 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
 
 
-/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
+/* Assign a device to a guest. Sets up IOMMU structures. */
 /* XEN_DOMCTL_assign_device */
 /* XEN_DOMCTL_test_assign_device */
-/* XEN_DOMCTL_deassign_device */
+/*
+ * XEN_DOMCTL_deassign_device: The behavior of this DOMCTL differs
+ * between the different types of device:
+ *  - PCI device (XEN_DOMCTL_DEV_PCI) will be reassigned to DOM0
+ *  - DT device (XEN_DOMCTL_DEV_DT) will be left unassigned. DOM0
+ *  will have to call XEN_DOMCTL_assign_device in order to use the
+ *  device.
+ */
+#define XEN_DOMCTL_DEV_PCI      0
+#define XEN_DOMCTL_DEV_DT       1
 struct xen_domctl_assign_device {
-    uint32_t  machine_sbdf;   /* machine PCI ID of assigned device */
+    uint32_t dev;   /* XEN_DOMCTL_DEV_* */
+    union {
+        struct {
+            uint32_t machine_sbdf;   /* machine PCI ID of assigned device */
+        } pci;
+        struct {
+            uint32_t size; /* Length of the path */
+            XEN_GUEST_HANDLE_64(char) path; /* path to the device tree node */
+        } dt;
+    } u;
+    /* IN */
+#define XEN_DOMCTL_DEV_RDM_RELAXED      1
+    uint32_t  flag;   /* flag of assigned device */
 };
 typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
@@ -481,6 +498,7 @@
     PT_IRQ_TYPE_ISA,
     PT_IRQ_TYPE_MSI,
     PT_IRQ_TYPE_MSI_TRANSLATE,
+    PT_IRQ_TYPE_SPI,    /* ARM: valid range 32-1019 */
 } pt_irq_type_t;
 struct xen_domctl_bind_pt_irq {
     uint32_t machine_irq;
@@ -501,6 +519,9 @@
             uint32_t gflags;
             uint64_aligned_t gtable;
         } msi;
+        struct {
+            uint16_t spi;
+        } spi;
     } u;
 };
 typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
@@ -508,6 +529,7 @@
 
 
 /* Bind machine I/O address range -> HVM address range. */
+/* If this returns -E2BIG, lower the nr_mfns value. */
 /* XEN_DOMCTL_memory_mapping */
 #define DPCI_ADD_MAPPING         1
 #define DPCI_REMOVE_MAPPING      0
@@ -545,6 +567,7 @@
 #define XEN_DOMCTL_MEM_CACHEATTR_WP  5
 #define XEN_DOMCTL_MEM_CACHEATTR_WB  6
 #define XEN_DOMCTL_MEM_CACHEATTR_UCM 7
+#define XEN_DOMCTL_DELETE_MEM_CACHEATTR (~(uint32_t)0)
 struct xen_domctl_pin_mem_cacheattr {
     uint64_aligned_t start, end;
     uint32_t type; /* XEN_DOMCTL_MEM_CACHEATTR_* */
@@ -572,28 +595,20 @@
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
-    uint64_aligned_t mcg_cap;
+#if defined(__GNUC__)
+    union {
+        uint64_aligned_t mcg_cap;
+        struct hvm_vmce_vcpu vmce;
+    };
+#else
+    struct hvm_vmce_vcpu vmce;
 #endif
+#endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t);
 
 /*
- * Set optimizaton features for a domain
- */
-/* XEN_DOMCTL_set_opt_feature */
-struct xen_domctl_set_opt_feature {
-#if defined(__ia64__)
-    struct xen_ia64_opt_feature optf;
-#else
-    /* Make struct non-empty: do not depend on this field name! */
-    uint64_t dummy;
-#endif
-};
-typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t);
-
-/*
  * Set the target domain for a domain
  */
 /* XEN_DOMCTL_set_target */
@@ -617,6 +632,22 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t);
 #endif
 
+/*
+ * Arranges that if the domain suspends (specifically, if it shuts
+ * down with code SHUTDOWN_suspend), this event channel will be
+ * notified.
+ *
+ * This is _instead of_ the usual notification to the global
+ * VIRQ_DOM_EXC.  (In most systems that virq is owned by xenstored.)
+ *
+ * Only one subscription per domain is possible.  Last subscriber
+ * wins; others are silently displaced.
+ *
+ * NB that contrary to the rather general name, it only applies to
+ * domain shutdown with code suspend.  Shutdown for other reasons
+ * (including crash), and domain death, are notified to VIRQ_DOM_EXC
+ * regardless.
+ */
 /* XEN_DOMCTL_subscribe */
 struct xen_domctl_subscribe {
     uint32_t port; /* IN */
@@ -665,18 +696,13 @@
 
 /* XEN_DOMCTL_gettscinfo */
 /* XEN_DOMCTL_settscinfo */
-struct xen_guest_tsc_info {
+typedef struct xen_domctl_tsc_info {
+    /* IN/OUT */
     uint32_t tsc_mode;
     uint32_t gtsc_khz;
     uint32_t incarnation;
     uint32_t pad;
     uint64_aligned_t elapsed_nsec;
-};
-typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
-DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
-typedef struct xen_domctl_tsc_info {
-    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
-    xen_guest_tsc_info_t info; /* IN */
 } xen_domctl_tsc_info_t;
 
 /* XEN_DOMCTL_gdbsx_guestmemio      guest mem io */
@@ -706,12 +732,23 @@
 };
 
 /*
- * Memory event operations
+ * VM event operations
  */
 
-/* XEN_DOMCTL_mem_event_op */
+/* XEN_DOMCTL_vm_event_op */
 
 /*
+ * There are currently three rings available for VM events:
+ * sharing, monitor and paging. This hypercall allows one to
+ * control these rings (enable/disable), as well as to signal
+ * to the hypervisor to pull responses (resume) from the given
+ * ring.
+ */
+#define XEN_VM_EVENT_ENABLE               0
+#define XEN_VM_EVENT_DISABLE              1
+#define XEN_VM_EVENT_RESUME               2
+
+/*
  * Domain memory paging
  * Page memory in and out.
  * Domctl interface to set up and tear down the 
@@ -718,7 +755,7 @@
  * pager<->hypervisor interface. Use XENMEM_paging_op*
  * to perform per-page operations.
  *
- * The XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE domctl returns several
+ * The XEN_VM_EVENT_PAGING_ENABLE domctl returns several
  * non-standard error codes to indicate why paging could not be enabled:
  * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest
  * EMLINK - guest has iommu passthrough enabled
@@ -725,35 +762,32 @@
  * EXDEV  - guest has PoD enabled
  * EBUSY  - guest has or had paging enabled, ring buffer still active
  */
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING            1
+#define XEN_DOMCTL_VM_EVENT_OP_PAGING            1
 
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE     0
-#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE    1
-
 /*
- * Access permissions.
+ * Monitor helper.
  *
  * As with paging, use the domctl for teardown/setup of the
  * helper<->hypervisor interface.
  *
- * There are HVM hypercalls to set the per-page access permissions of every
- * page in a domain.  When one of these permissions--independent, read, 
- * write, and execute--is violated, the VCPU is paused and a memory event 
- * is sent with what happened.  (See public/mem_event.h) .
+ * The monitor interface can be used to register for various VM events. For
+ * example, there are HVM hypercalls to set the per-page access permissions
+ * of every page in a domain.  When one of these permissions--independent,
+ * read, write, and execute--is violated, the VCPU is paused and a memory event
+ * is sent with what happened. The memory event handler can then resume the
+ * VCPU and redo the access with a XEN_VM_EVENT_RESUME option.
  *
- * The memory event handler can then resume the VCPU and redo the access 
- * with a XENMEM_access_op_resume hypercall.
+ * See public/vm_event.h for the list of available events that can be
+ * subscribed to via the monitor interface.
  *
- * The XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE domctl returns several
+ * The XEN_VM_EVENT_MONITOR_* domctls return
  * non-standard error codes to indicate why access could not be enabled:
  * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest
  * EBUSY  - guest has or had access enabled, ring buffer still active
+ *
  */
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS            2
+#define XEN_DOMCTL_VM_EVENT_OP_MONITOR           2
 
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE     0
-#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE    1
-
 /*
  * Sharing ENOMEM helper.
  *
@@ -767,21 +801,18 @@
 * Note that sharing can be turned on (as per the domctl below)
 * *without* this ring being set up.
  */
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING           3
+#define XEN_DOMCTL_VM_EVENT_OP_SHARING           3
 
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE    0
-#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE   1
-
 /* Use for teardown/setup of helper<->hypervisor interface for paging, 
  * access and sharing.*/
-struct xen_domctl_mem_event_op {
-    uint32_t       op;           /* XEN_DOMCTL_MEM_EVENT_OP_*_* */
-    uint32_t       mode;         /* XEN_DOMCTL_MEM_EVENT_OP_* */
+struct xen_domctl_vm_event_op {
+    uint32_t       op;           /* XEN_VM_EVENT_* */
+    uint32_t       mode;         /* XEN_DOMCTL_VM_EVENT_OP_* */
 
     uint32_t port;              /* OUT: event channel for ring */
 };
-typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t;
-DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t);
+typedef struct xen_domctl_vm_event_op xen_domctl_vm_event_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vm_event_op_t);
 
 /*
  * Memory sharing operations
@@ -822,7 +853,7 @@
     /* IN: VCPU that this call applies to. */
     uint32_t         vcpu;
     /*
-     * SET: xfeature support mask of struct (IN)
+     * SET: Ignored.
      * GET: xfeature support mask of struct (IN/OUT)
      * xfeature mask is served as identifications of the saving format
      * so that compatible CPUs can have a check on format to decide
@@ -850,6 +881,189 @@
 typedef struct xen_domctl_set_access_required xen_domctl_set_access_required_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_access_required_t);
 
+struct xen_domctl_set_broken_page_p2m {
+    uint64_aligned_t pfn;
+};
+typedef struct xen_domctl_set_broken_page_p2m xen_domctl_set_broken_page_p2m_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_broken_page_p2m_t);
+
+/*
+ * XEN_DOMCTL_set_max_evtchn: sets the maximum event channel port
+ * number the guest may use.  Use this to limit the amount of resources
+ * (global mapping space, xenheap) a guest may use for event channels.
+ */
+struct xen_domctl_set_max_evtchn {
+    uint32_t max_port;
+};
+typedef struct xen_domctl_set_max_evtchn xen_domctl_set_max_evtchn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_max_evtchn_t);
+
+/*
+ * ARM: Clean and invalidate caches associated with given region of
+ * guest memory.
+ */
+struct xen_domctl_cacheflush {
+    /* IN: page range to flush. */
+    xen_pfn_t start_pfn, nr_pfns;
+};
+typedef struct xen_domctl_cacheflush xen_domctl_cacheflush_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cacheflush_t);
+
+#if defined(__i386__) || defined(__x86_64__)
+struct xen_domctl_vcpu_msr {
+    uint32_t         index;
+    uint32_t         reserved;
+    uint64_aligned_t value;
+};
+typedef struct xen_domctl_vcpu_msr xen_domctl_vcpu_msr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msr_t);
+
+/*
+ * XEN_DOMCTL_set_vcpu_msrs / XEN_DOMCTL_get_vcpu_msrs.
+ *
+ * Input:
+ * - A NULL 'msrs' guest handle is a request for the maximum 'msr_count'.
+ * - Otherwise, 'msr_count' is the number of entries in 'msrs'.
+ *
+ * Output for get:
+ * - If 'msr_count' is less than the number Xen needs to write, -ENOBUFS shall
+ *   be returned and 'msr_count' updated to reflect the intended number.
+ * - On success, 'msr_count' shall indicate the number of MSRs written, which
+ *   may be less than the maximum if some are not currently used by the vcpu.
+ *
+ * Output for set:
+ * - If Xen encounters an error with a specific MSR, -EINVAL shall be returned
+ *   and 'msr_count' shall be set to the offending index, to aid debugging.
+ */
+struct xen_domctl_vcpu_msrs {
+    uint32_t vcpu;                                   /* IN     */
+    uint32_t msr_count;                              /* IN/OUT */
+    XEN_GUEST_HANDLE_64(xen_domctl_vcpu_msr_t) msrs; /* IN/OUT */
+};
+typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
+#endif
+
+/* XEN_DOMCTL_setvnumainfo: specifies a virtual NUMA topology for the guest */
+struct xen_domctl_vnuma {
+    /* IN: number of vNUMA nodes to setup. Shall be greater than 0 */
+    uint32_t nr_vnodes;
+    /* IN: number of memory ranges to setup */
+    uint32_t nr_vmemranges;
+    /*
+     * IN: number of vCPUs of the domain (used as size of the vcpu_to_vnode
+     * array declared below). Shall be equal to the domain's max_vcpus.
+     */
+    uint32_t nr_vcpus;
+    uint32_t pad;                                  /* must be zero */
+
+    /*
+     * IN: array for specifying the distances of the vNUMA nodes
+     * between each other. Shall have nr_vnodes*nr_vnodes elements.
+     */
+    XEN_GUEST_HANDLE_64(uint) vdistance;
+    /*
+     * IN: array for specifying to what vNUMA node each vCPU belongs.
+     * Shall have nr_vcpus elements.
+     */
+    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
+    /*
+     * IN: array for specifying on what physical NUMA node each vNUMA
+     * node is placed. Shall have nr_vnodes elements.
+     */
+    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
+    /*
+     * IN: array for specifying the memory ranges. Shall have
+     * nr_vmemranges elements.
+     */
+    XEN_GUEST_HANDLE_64(xen_vmemrange_t) vmemrange;
+};
+typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
+
+struct xen_domctl_psr_cmt_op {
+#define XEN_DOMCTL_PSR_CMT_OP_DETACH         0
+#define XEN_DOMCTL_PSR_CMT_OP_ATTACH         1
+#define XEN_DOMCTL_PSR_CMT_OP_QUERY_RMID     2
+    uint32_t cmd;
+    uint32_t data;
+};
+typedef struct xen_domctl_psr_cmt_op xen_domctl_psr_cmt_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cmt_op_t);
+
+/*  XEN_DOMCTL_MONITOR_*
+ *
+ * Enable/disable monitoring various VM events.
+ * This domctl configures what events will be reported to helper apps
+ * via the ring buffer "MONITOR". The ring has to be first enabled
+ * with the domctl XEN_DOMCTL_VM_EVENT_OP_MONITOR.
+ *
+ * GET_CAPABILITIES can be used to determine which of these features is
+ * available on a given platform.
+ *
+ * NOTICE: mem_access events are also delivered via the "MONITOR" ring buffer;
+ * however, enabling/disabling those events is performed with the use of
+ * memory_op hypercalls!
+ */
+#define XEN_DOMCTL_MONITOR_OP_ENABLE            0
+#define XEN_DOMCTL_MONITOR_OP_DISABLE           1
+#define XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES  2
+
+#define XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG         0
+#define XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR            1
+#define XEN_DOMCTL_MONITOR_EVENT_SINGLESTEP            2
+#define XEN_DOMCTL_MONITOR_EVENT_SOFTWARE_BREAKPOINT   3
+#define XEN_DOMCTL_MONITOR_EVENT_GUEST_REQUEST         4
+
+struct xen_domctl_monitor_op {
+    uint32_t op; /* XEN_DOMCTL_MONITOR_OP_* */
+
+    /*
+     * When used with ENABLE/DISABLE this has to be set to
+     * the requested XEN_DOMCTL_MONITOR_EVENT_* value.
+     * With GET_CAPABILITIES this field returns a bitmap of
+     * events supported by the platform, in the format
+     * (1 << XEN_DOMCTL_MONITOR_EVENT_*).
+     */
+    uint32_t event;
+
+    /*
+     * Further options when issuing XEN_DOMCTL_MONITOR_OP_ENABLE.
+     */
+    union {
+        struct {
+            /* Which control register */
+            uint8_t index;
+            /* Pause vCPU until response */
+            uint8_t sync;
+            /* Send event only on a change of value */
+            uint8_t onchangeonly;
+        } mov_to_cr;
+
+        struct {
+            /* Enable the capture of an extended set of MSRs */
+            uint8_t extended_capture;
+        } mov_to_msr;
+
+        struct {
+            /* Pause vCPU until response */
+            uint8_t sync;
+        } guest_request;
+    } u;
+};
+typedef struct xen_domctl_monitor_op xen_domctl_monitor_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_monitor_op_t);
+
+struct xen_domctl_psr_cat_op {
+#define XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM     0
+#define XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM     1
+    uint32_t cmd;       /* IN: XEN_DOMCTL_PSR_CAT_OP_* */
+    uint32_t target;    /* IN */
+    uint64_t data;      /* IN/OUT */
+};
+typedef struct xen_domctl_psr_cat_op xen_domctl_psr_cat_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cat_op_t);
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -858,8 +1072,8 @@
 #define XEN_DOMCTL_unpausedomain                  4
 #define XEN_DOMCTL_getdomaininfo                  5
 #define XEN_DOMCTL_getmemlist                     6
-#define XEN_DOMCTL_getpageframeinfo               7
-#define XEN_DOMCTL_getpageframeinfo2              8
+/* #define XEN_DOMCTL_getpageframeinfo            7 Obsolete - use getpageframeinfo3 */
+/* #define XEN_DOMCTL_getpageframeinfo2           8 Obsolete - use getpageframeinfo3 */
 #define XEN_DOMCTL_setvcpuaffinity                9
 #define XEN_DOMCTL_shadow_op                     10
 #define XEN_DOMCTL_max_mem                       11
@@ -874,10 +1088,10 @@
 #define XEN_DOMCTL_iomem_permission              20
 #define XEN_DOMCTL_ioport_permission             21
 #define XEN_DOMCTL_hypercall_init                22
-#define XEN_DOMCTL_arch_setup                    23
+#define XEN_DOMCTL_arch_setup                    23 /* Obsolete IA64 only */
 #define XEN_DOMCTL_settimeoffset                 24
 #define XEN_DOMCTL_getvcpuaffinity               25
-#define XEN_DOMCTL_real_mode_area                26
+#define XEN_DOMCTL_real_mode_area                26 /* Obsolete PPC only */
 #define XEN_DOMCTL_resumedomain                  27
 #define XEN_DOMCTL_sendtrigger                   28
 #define XEN_DOMCTL_subscribe                     29
@@ -892,7 +1106,7 @@
 #define XEN_DOMCTL_pin_mem_cacheattr             41
 #define XEN_DOMCTL_set_ext_vcpucontext           42
 #define XEN_DOMCTL_get_ext_vcpucontext           43
-#define XEN_DOMCTL_set_opt_feature               44
+#define XEN_DOMCTL_set_opt_feature               44 /* Obsolete IA64 only */
 #define XEN_DOMCTL_test_assign_device            45
 #define XEN_DOMCTL_set_target                    46
 #define XEN_DOMCTL_deassign_device               47
@@ -904,7 +1118,7 @@
 #define XEN_DOMCTL_suppress_spurious_page_faults 53
 #define XEN_DOMCTL_debug_op                      54
 #define XEN_DOMCTL_gethvmcontext_partial         55
-#define XEN_DOMCTL_mem_event_op                  56
+#define XEN_DOMCTL_vm_event_op                   56
 #define XEN_DOMCTL_mem_sharing_op                57
 #define XEN_DOMCTL_disable_migrate               58
 #define XEN_DOMCTL_gettscinfo                    59
@@ -915,6 +1129,17 @@
 #define XEN_DOMCTL_set_access_required           64
 #define XEN_DOMCTL_audit_p2m                     65
 #define XEN_DOMCTL_set_virq_handler              66
+#define XEN_DOMCTL_set_broken_page_p2m           67
+#define XEN_DOMCTL_setnodeaffinity               68
+#define XEN_DOMCTL_getnodeaffinity               69
+#define XEN_DOMCTL_set_max_evtchn                70
+#define XEN_DOMCTL_cacheflush                    71
+#define XEN_DOMCTL_get_vcpu_msrs                 72
+#define XEN_DOMCTL_set_vcpu_msrs                 73
+#define XEN_DOMCTL_setvnumainfo                  74
+#define XEN_DOMCTL_psr_cmt_op                    75
+#define XEN_DOMCTL_monitor_op                    77
+#define XEN_DOMCTL_psr_cat_op                    78
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -925,9 +1150,8 @@
         struct xen_domctl_createdomain      createdomain;
         struct xen_domctl_getdomaininfo     getdomaininfo;
         struct xen_domctl_getmemlist        getmemlist;
-        struct xen_domctl_getpageframeinfo  getpageframeinfo;
-        struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
         struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
+        struct xen_domctl_nodeaffinity      nodeaffinity;
         struct xen_domctl_vcpuaffinity      vcpuaffinity;
         struct xen_domctl_shadow_op         shadow_op;
         struct xen_domctl_max_mem           max_mem;
@@ -941,11 +1165,9 @@
         struct xen_domctl_iomem_permission  iomem_permission;
         struct xen_domctl_ioport_permission ioport_permission;
         struct xen_domctl_hypercall_init    hypercall_init;
-        struct xen_domctl_arch_setup        arch_setup;
         struct xen_domctl_settimeoffset     settimeoffset;
         struct xen_domctl_disable_migrate   disable_migrate;
         struct xen_domctl_tsc_info          tsc_info;
-        struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
         struct xen_domctl_hvmcontext_partial hvmcontext_partial;
         struct xen_domctl_address_size      address_size;
@@ -957,22 +1179,29 @@
         struct xen_domctl_ioport_mapping    ioport_mapping;
         struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr;
         struct xen_domctl_ext_vcpucontext   ext_vcpucontext;
-        struct xen_domctl_set_opt_feature   set_opt_feature;
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
         struct xen_domctl_debug_op          debug_op;
-        struct xen_domctl_mem_event_op      mem_event_op;
+        struct xen_domctl_vm_event_op       vm_event_op;
         struct xen_domctl_mem_sharing_op    mem_sharing_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
         struct xen_domctl_vcpuextstate      vcpuextstate;
+        struct xen_domctl_vcpu_msrs         vcpu_msrs;
 #endif
         struct xen_domctl_set_access_required access_required;
         struct xen_domctl_audit_p2m         audit_p2m;
         struct xen_domctl_set_virq_handler  set_virq_handler;
+        struct xen_domctl_set_max_evtchn    set_max_evtchn;
         struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
+        struct xen_domctl_set_broken_page_p2m set_broken_page_p2m;
+        struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_vnuma             vnuma;
+        struct xen_domctl_psr_cmt_op        psr_cmt_op;
+        struct xen_domctl_monitor_op        monitor_op;
+        struct xen_domctl_psr_cat_op        psr_cat_op;
         uint8_t                             pad[128];
     } u;
 };
@@ -984,7 +1213,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
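
The XEN_DOMCTL_get_vcpu_msrs comment above describes a two step
pattern: pass a NULL 'msrs' handle to learn the maximum count, then
retry with a buffer of that size.  A rough toolstack-side sketch,
assuming a do_domctl() helper (placeholder for the real privcmd/libxc
path) and eliding error handling and cleanup:

    #include <stdlib.h>
    #include <xen/interface/domctl.h>

    int do_domctl(struct xen_domctl *dc);   /* placeholder issuer */

    static int get_vcpu_msrs(domid_t dom, uint32_t vcpu,
                             xen_domctl_vcpu_msr_t **out, uint32_t *count)
    {
        struct xen_domctl dc = {
            .cmd = XEN_DOMCTL_get_vcpu_msrs,
            .interface_version = XEN_DOMCTL_INTERFACE_VERSION,
            .domain = dom,
        };

        dc.u.vcpu_msrs.vcpu = vcpu;
        dc.u.vcpu_msrs.msr_count = 0;
        set_xen_guest_handle_raw(dc.u.vcpu_msrs.msrs, NULL); /* query max */
        if (do_domctl(&dc))
            return -1;

        *count = dc.u.vcpu_msrs.msr_count;       /* maximum Xen may write */
        *out = calloc(*count, sizeof(**out));
        set_xen_guest_handle_raw(dc.u.vcpu_msrs.msrs, *out);
        if (do_domctl(&dc))
            return -1;

        *count = dc.u.vcpu_msrs.msr_count;       /* MSRs actually written */
        return 0;
    }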

Modified: trunk/sys/xen/interface/elfnote.h
===================================================================
--- trunk/sys/xen/interface/elfnote.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/elfnote.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -29,6 +29,8 @@
 #define __XEN_PUBLIC_ELFNOTE_H__
 
 /*
+ * `incontents 200 elfnotes ELF notes
+ *
  * The notes should live in a PT_NOTE segment and have "Xen" in the
  * name field.
  *
@@ -37,6 +39,9 @@
  *
 * LEGACY indicates the fields in the legacy __xen_guest string which
 * this note type replaces.
+ *
+ * String values (for non-legacy) are NULL terminated ASCII, also known
+ * as ASCIZ type.
  */
 
 /*
@@ -67,8 +72,8 @@
 #define XEN_ELFNOTE_VIRT_BASE      3
 
 /*
- * The offset of the ELF paddr field from the acutal required
- * psuedo-physical address (numeric).
+ * The offset of the ELF paddr field from the actual required
+ * pseudo-physical address (numeric).
  *
  * This is used to maintain backwards compatibility with older kernels
  * which wrote __PAGE_OFFSET into that field. This field defaults to 0
@@ -159,6 +164,9 @@
 
 /*
  * Whether or not the guest supports cooperative suspend cancellation.
+ * This is a numeric value.
+ *
+ * Default is 0
  */
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
@@ -256,7 +264,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
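
As a concrete (hypothetical) example of the note format described
above: a guest kernel can emit a numeric Xen ELF note from C roughly
as follows.  The namesz/descsz/type layout and 4-byte padding are the
generic ELF note rules; the section name ".note.Xen" and the variable
name are assumptions, only XEN_ELFNOTE_SUSPEND_CANCEL comes from this
header.

    #include <stdint.h>

    #define XEN_ELFNOTE_SUSPEND_CANCEL 14

    static const struct {
        uint32_t namesz;        /* sizeof("Xen") including the NUL */
        uint32_t descsz;        /* sizeof(uint32_t) */
        uint32_t type;          /* XEN_ELFNOTE_* */
        char     name[4];       /* "Xen", padded to a 4-byte boundary */
        uint32_t desc;          /* the numeric value */
    } xen_suspend_cancel_note
    __attribute__((used, section(".note.Xen"), aligned(4))) = {
        .namesz = 4,
        .descsz = sizeof(uint32_t),
        .type   = XEN_ELFNOTE_SUSPEND_CANCEL,
        .name   = "Xen",
        .desc   = 1,    /* supports cooperative suspend cancellation */
    };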

Added: trunk/sys/xen/interface/errno.h
===================================================================
--- trunk/sys/xen/interface/errno.h	                        (rev 0)
+++ trunk/sys/xen/interface/errno.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -0,0 +1,96 @@
+/* $MidnightBSD$ */
+#ifndef __XEN_PUBLIC_ERRNO_H__
+
+#ifndef __ASSEMBLY__
+
+#define XEN_ERRNO(name, value) XEN_##name = value,
+enum xen_errno {
+
+#else /* !__ASSEMBLY__ */
+
+#define XEN_ERRNO(name, value) .equ XEN_##name, value
+
+#endif /* __ASSEMBLY__ */
+
+/* ` enum neg_errnoval {  [ -Efoo for each Efoo in the list below ]  } */
+/* ` enum errnoval { */
+
+#endif /* __XEN_PUBLIC_ERRNO_H__ */
+
+#ifdef XEN_ERRNO
+
+/*
+ * Values originating from x86 Linux. Please consider using respective
+ * values when adding new definitions here.
+ *
+ * The set of identifiers to be added here shouldn't extend beyond what
+ * POSIX mandates (see e.g.
+ * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html)
+ * with the exception that we support some optional (XSR) values
+ * specified there (but no new ones should be added).
+ */
+
+XEN_ERRNO(EPERM,	 1)	/* Operation not permitted */
+XEN_ERRNO(ENOENT,	 2)	/* No such file or directory */
+XEN_ERRNO(ESRCH,	 3)	/* No such process */
+#ifdef __XEN__ /* Internal only, should never be exposed to the guest. */
+XEN_ERRNO(EINTR,	 4)	/* Interrupted system call */
+#endif
+XEN_ERRNO(EIO,		 5)	/* I/O error */
+XEN_ERRNO(ENXIO,	 6)	/* No such device or address */
+XEN_ERRNO(E2BIG,	 7)	/* Arg list too long */
+XEN_ERRNO(ENOEXEC,	 8)	/* Exec format error */
+XEN_ERRNO(EBADF,	 9)	/* Bad file number */
+XEN_ERRNO(ECHILD,	10)	/* No child processes */
+XEN_ERRNO(EAGAIN,	11)	/* Try again */
+XEN_ERRNO(ENOMEM,	12)	/* Out of memory */
+XEN_ERRNO(EACCES,	13)	/* Permission denied */
+XEN_ERRNO(EFAULT,	14)	/* Bad address */
+XEN_ERRNO(EBUSY,	16)	/* Device or resource busy */
+XEN_ERRNO(EEXIST,	17)	/* File exists */
+XEN_ERRNO(EXDEV,	18)	/* Cross-device link */
+XEN_ERRNO(ENODEV,	19)	/* No such device */
+XEN_ERRNO(EINVAL,	22)	/* Invalid argument */
+XEN_ERRNO(ENFILE,	23)	/* File table overflow */
+XEN_ERRNO(EMFILE,	24)	/* Too many open files */
+XEN_ERRNO(ENOSPC,	28)	/* No space left on device */
+XEN_ERRNO(EMLINK,	31)	/* Too many links */
+XEN_ERRNO(EDOM,		33)	/* Math argument out of domain of func */
+XEN_ERRNO(ERANGE,	34)	/* Math result not representable */
+XEN_ERRNO(EDEADLK,	35)	/* Resource deadlock would occur */
+XEN_ERRNO(ENAMETOOLONG,	36)	/* File name too long */
+XEN_ERRNO(ENOLCK,	37)	/* No record locks available */
+XEN_ERRNO(ENOSYS,	38)	/* Function not implemented */
+XEN_ERRNO(ENODATA,	61)	/* No data available */
+XEN_ERRNO(ETIME,	62)	/* Timer expired */
+XEN_ERRNO(EBADMSG,	74)	/* Not a data message */
+XEN_ERRNO(EOVERFLOW,	75)	/* Value too large for defined data type */
+XEN_ERRNO(EILSEQ,	84)	/* Illegal byte sequence */
+#ifdef __XEN__ /* Internal only, should never be exposed to the guest. */
+XEN_ERRNO(ERESTART,	85)	/* Interrupted system call should be restarted */
+#endif
+XEN_ERRNO(ENOTSOCK,	88)	/* Socket operation on non-socket */
+XEN_ERRNO(EOPNOTSUPP,	95)	/* Operation not supported on transport endpoint */
+XEN_ERRNO(EADDRINUSE,	98)	/* Address already in use */
+XEN_ERRNO(EADDRNOTAVAIL, 99)	/* Cannot assign requested address */
+XEN_ERRNO(ENOBUFS,	105)	/* No buffer space available */
+XEN_ERRNO(EISCONN,	106)	/* Transport endpoint is already connected */
+XEN_ERRNO(ENOTCONN,	107)	/* Transport endpoint is not connected */
+XEN_ERRNO(ETIMEDOUT,	110)	/* Connection timed out */
+
+#undef XEN_ERRNO
+#endif /* XEN_ERRNO */
+
+#ifndef __XEN_PUBLIC_ERRNO_H__
+#define __XEN_PUBLIC_ERRNO_H__
+
+/* ` } */
+
+#ifndef __ASSEMBLY__
+};
+#endif
+
+#define	XEN_EWOULDBLOCK	XEN_EAGAIN	/* Operation would block */
+#define	XEN_EDEADLOCK	XEN_EDEADLK	/* Resource deadlock would occur */
+
+#endif /*  __XEN_PUBLIC_ERRNO_H__ */


Property changes on: trunk/sys/xen/interface/errno.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
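
For C consumers (no __ASSEMBLY__), including the header added above
expands the XEN_ERRNO() x-macro into an enum, so XEN_EPERM is 1,
XEN_EAGAIN is 11, and so on, and hypercalls return these values
negated ("enum neg_errnoval").  A tiny usage sketch; the include path
and the helper name are illustrative only:

    #include <xen/interface/errno.h>

    static int xen_rc_is_retry(int rc)  /* rc: value returned by a hypercall */
    {
        return rc == -XEN_EAGAIN || rc == -XEN_EBUSY;
    }
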
Modified: trunk/sys/xen/interface/event_channel.h
===================================================================
--- trunk/sys/xen/interface/event_channel.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/event_channel.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -72,13 +72,13 @@
 #define EVTCHNOP_bind_vcpu        8
 #define EVTCHNOP_unmask           9
 #define EVTCHNOP_reset           10
+#define EVTCHNOP_init_control    11
+#define EVTCHNOP_expand_array    12
+#define EVTCHNOP_set_priority    13
 /* ` } */
 
-#ifndef __XEN_EVTCHN_PORT_DEFINED__
 typedef uint32_t evtchn_port_t;
 DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
-#define __XEN_EVTCHN_PORT_DEFINED__ 1
-#endif
 
 /*
  * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
@@ -102,6 +102,17 @@
  * a port that is unbound and marked as accepting bindings from the calling
  * domain. A fresh port is allocated in the calling domain and returned as
  * <local_port>.
+ *
+ * In case the peer domain has already tried to set our event channel
+ * pending, before it was bound, EVTCHNOP_bind_interdomain always sets
+ * the local event channel pending.
+ *
+ * The usual pattern of use, in the guest's upcall (or subsequent
+ * handler) is as follows: (Re-enable the event channel for subsequent
+ * signalling and then) check for the existence of whatever condition
+ * is being waited for by other means, and take whatever action is
+ * needed (if any).
+ *
  * NOTES:
  *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
  */
@@ -254,6 +265,10 @@
  * NOTES:
  *  1. <dom> may be specified as DOMID_SELF.
  *  2. Only a sufficiently-privileged domain may specify other than DOMID_SELF.
+ *  3. Destroys all control blocks and event array, resets event channel
+ *     operations to 2-level ABI if called with <dom> == DOMID_SELF and FIFO
+ *     ABI was used. Guests should not bind events during EVTCHNOP_reset call
+ *     as these events are likely to be lost.
  */
 struct evtchn_reset {
     /* IN parameters. */
@@ -262,6 +277,43 @@
 typedef struct evtchn_reset evtchn_reset_t;
 
 /*
+ * EVTCHNOP_init_control: initialize the control block for the FIFO ABI.
+ *
+ * Note: any events that are currently pending will not be resent and
+ * will be lost.  Guests should call this before binding any event to
+ * avoid losing any events.
+ */
+struct evtchn_init_control {
+    /* IN parameters. */
+    uint64_t control_gfn;
+    uint32_t offset;
+    uint32_t vcpu;
+    /* OUT parameters. */
+    uint8_t link_bits;
+    uint8_t _pad[7];
+};
+typedef struct evtchn_init_control evtchn_init_control_t;
+
+/*
+ * EVTCHNOP_expand_array: add an additional page to the event array.
+ */
+struct evtchn_expand_array {
+    /* IN parameters. */
+    uint64_t array_gfn;
+};
+typedef struct evtchn_expand_array evtchn_expand_array_t;
+
+/*
+ * EVTCHNOP_set_priority: set the priority for an event channel.
+ */
+struct evtchn_set_priority {
+    /* IN parameters. */
+    uint32_t port;
+    uint32_t priority;
+};
+typedef struct evtchn_set_priority evtchn_set_priority_t;
+
+/*
  * ` enum neg_errnoval
  * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
  * `
@@ -285,12 +337,48 @@
 typedef struct evtchn_op evtchn_op_t;
 DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
 
+/*
+ * 2-level ABI
+ */
+
+#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
+
+/*
+ * FIFO ABI
+ */
+
+/* Events may have priorities from 0 (highest) to 15 (lowest). */
+#define EVTCHN_FIFO_PRIORITY_MAX     0
+#define EVTCHN_FIFO_PRIORITY_DEFAULT 7
+#define EVTCHN_FIFO_PRIORITY_MIN     15
+
+#define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1)
+
+typedef uint32_t event_word_t;
+
+#define EVTCHN_FIFO_PENDING 31
+#define EVTCHN_FIFO_MASKED  30
+#define EVTCHN_FIFO_LINKED  29
+#define EVTCHN_FIFO_BUSY    28
+
+#define EVTCHN_FIFO_LINK_BITS 17
+#define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1)
+
+#define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS)
+
+struct evtchn_fifo_control_block {
+    uint32_t ready;
+    uint32_t _rsvd;
+    uint32_t head[EVTCHN_FIFO_MAX_QUEUES];
+};
+typedef struct evtchn_fifo_control_block evtchn_fifo_control_block_t;
+
 #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
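
A minimal sketch of how the FIFO ABI bit positions above are used to
test an event word.  The constants are repeated so the fragment is
self-contained; a real handler operates on the shared event array with
atomic bit operations, which are omitted here.

    #include <stdint.h>

    typedef uint32_t event_word_t;

    #define EVTCHN_FIFO_PENDING   31
    #define EVTCHN_FIFO_MASKED    30
    #define EVTCHN_FIFO_LINK_BITS 17
    #define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1)

    static int evtchn_fifo_deliverable(event_word_t w)
    {
        return (w & (1u << EVTCHN_FIFO_PENDING)) &&
              !(w & (1u << EVTCHN_FIFO_MASKED));
    }

    static uint32_t evtchn_fifo_link(event_word_t w)
    {
        return w & EVTCHN_FIFO_LINK_MASK;  /* next port in the queue */
    }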

Modified: trunk/sys/xen/interface/features.h
===================================================================
--- trunk/sys/xen/interface/features.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/features.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -29,6 +29,20 @@
 #define __XEN_PUBLIC_FEATURES_H__
 
 /*
+ * `incontents 200 elfnotes_features XEN_ELFNOTE_FEATURES
+ *
+ * The list of all the features the guest supports. They are set by
+ * parsing the XEN_ELFNOTE_FEATURES and XEN_ELFNOTE_SUPPORTED_FEATURES
+ * string. The format is the  feature names (as given here without the
+ * "XENFEAT_" prefix) separated by '|' characters.
+ * If a feature is required for the kernel to function then the feature name
+ * must be preceded by a '!' character.
+ *
+ * Note that if XEN_ELFNOTE_SUPPORTED_FEATURES is used, then
+ * XENFEAT_dom0 MUST be set if the guest is to be booted as dom0.
+ */
+
+/*
  * If set, the guest does not need to write-protect its pagetables, and can
  * update them via direct writes.
  */
@@ -81,6 +95,14 @@
 /* operation as Dom0 is supported */
 #define XENFEAT_dom0                      11
 
+/* Xen also maps grant references at pfn = mfn.
+ * This feature flag is deprecated and should not be used.
+#define XENFEAT_grant_map_identity        12
+ */
+
+/* Guest can use XENMEMF_vnode to specify virtual node for memory op. */
+#define XENFEAT_memory_op_vnode_supported 13
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
@@ -88,7 +110,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
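
For illustration, a feature string as parsed from XEN_ELFNOTE_FEATURES
might look like the following; the two names are taken from the full
XENFEAT_* list (outside this hunk) and are only an example, '!' marks
a feature the kernel requires and '|' separates entries.

    static const char xen_features_string[] =
        "!writable_page_tables|pae_pgdir_above_4gb";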

Added: trunk/sys/xen/interface/gcov.h
===================================================================
--- trunk/sys/xen/interface/gcov.h	                        (rev 0)
+++ trunk/sys/xen/interface/gcov.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -0,0 +1,116 @@
+/* $MidnightBSD$ */
+/******************************************************************************
+ * gcov.h
+ *
+ * Coverage structures exported by Xen.
+ * Structure is different from Gcc one.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2013, Citrix Systems R&D Ltd.
+ */
+
+#ifndef __XEN_PUBLIC_GCOV_H__
+#define __XEN_PUBLIC_GCOV_H__ __XEN_PUBLIC_GCOV_H__
+
+#define XENCOV_COUNTERS         5
+#define XENCOV_TAG_BASE         0x58544300u
+#define XENCOV_TAG_FILE         (XENCOV_TAG_BASE+0x46u)
+#define XENCOV_TAG_FUNC         (XENCOV_TAG_BASE+0x66u)
+#define XENCOV_TAG_COUNTER(n)   (XENCOV_TAG_BASE+0x30u+((n)&0xfu))
+#define XENCOV_TAG_END          (XENCOV_TAG_BASE+0x2eu)
+#define XENCOV_IS_TAG_COUNTER(n) \
+    ((n) >= XENCOV_TAG_COUNTER(0) && (n) < XENCOV_TAG_COUNTER(XENCOV_COUNTERS))
+#define XENCOV_COUNTER_NUM(n) ((n)-XENCOV_TAG_COUNTER(0))
+
+/*
+ * The main structure for the blob is
+ * BLOB := FILE.. END
+ * FILE := TAG_FILE VERSION STAMP FILENAME COUNTERS FUNCTIONS
+ * FILENAME := LEN characters
+ *   characters are padded to 32 bit
+ * LEN := 32 bit value
+ * COUNTERS := TAG_COUNTER(n) NUM COUNTER..
+ * NUM := 32 bit value
+ * COUNTER := 64 bit value
+ * FUNCTIONS := TAG_FUNC NUM FUNCTION..
+ * FUNCTION := IDENT CHECKSUM NUM_COUNTERS
+ *
+ * All tagged structures are aligned to 8 bytes
+ */
+
+/**
+ * File information
+ * Prefixed with XENCOV_TAG_FILE and a string with filename
+ * Aligned to 8 bytes
+ */
+struct xencov_file
+{
+    uint32_t tag; /* XENCOV_TAG_FILE */
+    uint32_t version;
+    uint32_t stamp;
+    uint32_t fn_len;
+    char filename[1];
+};
+
+
+/**
+ * Counters information
+ * Prefixed with XENCOV_TAG_COUNTER(n) where n is 0..(XENCOV_COUNTERS-1)
+ * Aligned to 8 bytes
+ */
+struct xencov_counter
+{
+    uint32_t tag; /* XENCOV_TAG_COUNTER(n) */
+    uint32_t num;
+    uint64_t values[1];
+};
+
+/**
+ * Information for each function
+ * The number of counters is equal to the number of counter structures seen before
+ */
+struct xencov_function
+{
+    uint32_t ident;
+    uint32_t checksum;
+    uint32_t num_counters[1];
+};
+
+/**
+ * Information for all functions
+ * Aligned to 8 bytes
+ */
+struct xencov_functions
+{
+    uint32_t tag; /* XENCOV_TAG_FUNC */
+    uint32_t num;
+    struct xencov_function xencov_function[1];
+};
+
+/**
+ * Terminator
+ */
+struct xencov_end
+{
+    uint32_t tag; /* XENCOV_TAG_END */
+};
+
+#endif /* __XEN_PUBLIC_GCOV_H__ */
+


Property changes on: trunk/sys/xen/interface/gcov.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
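
The BLOB grammar in the comment above describes what a coverage read hands back. A sketch of peeking at the leading record follows; the include path and the origin of the buffer are assumptions, and a real consumer must walk every 8-byte-aligned record until it reaches XENCOV_TAG_END:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #include <xen/interface/gcov.h>   /* include path is an assumption */

    /* Sketch: report the first record of a coverage blob. */
    static void xencov_peek(const void *buf, size_t len)
    {
        const struct xencov_file *f = buf;

        if (len < sizeof(*f))
            return;
        if (f->tag == XENCOV_TAG_FILE)
            printf("first file: %.*s (version %#x, stamp %#x)\n",
                   (int)f->fn_len, f->filename, f->version, f->stamp);
        else if (f->tag == XENCOV_TAG_END)
            printf("empty blob\n");
    }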
Modified: trunk/sys/xen/interface/grant_table.h
===================================================================
--- trunk/sys/xen/interface/grant_table.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/grant_table.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -135,8 +135,10 @@
     /* The domain being granted foreign privileges. [GST] */
     domid_t  domid;
     /*
-     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
-     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
+     * GTF_permit_access: GFN that @domid is allowed to map and access. [GST]
+     * GTF_accept_transfer: GFN that @domid is allowed to transfer into. [GST]
+     * GTF_transfer_completed: MFN whose ownership transferred by @domid
+     *                         (non-translated guests only). [XEN]
      */
     uint32_t frame;
 };
@@ -310,6 +312,7 @@
 #define GNTTABOP_get_status_frames    9
 #define GNTTABOP_get_version          10
 #define GNTTABOP_swap_grant_ref	      11
+#define GNTTABOP_cache_flush	      12
 #endif /* __XEN_INTERFACE_VERSION__ */
 /* ` } */
 
@@ -321,7 +324,7 @@
 /*
  * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
  * by devices and/or host CPUs. If successful, <handle> is a tracking number
- * that must be presented later to destroy the mapping(s). On error, <handle>
+ * that must be presented later to destroy the mapping(s). On error, <status>
  * is a negative status code.
  * NOTES:
  *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
@@ -386,7 +389,11 @@
     uint32_t nr_frames;
     /* OUT parameters. */
     int16_t  status;              /* => enum grant_status */
+#if __XEN_INTERFACE_VERSION__ < 0x00040300
     XEN_GUEST_HANDLE(ulong) frame_list;
+#else
+    XEN_GUEST_HANDLE(xen_pfn_t) frame_list;
+#endif
 };
 typedef struct gnttab_setup_table gnttab_setup_table_t;
 DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
@@ -446,12 +453,10 @@
 #define GNTCOPY_source_gref       (1<<_GNTCOPY_source_gref)
 #define _GNTCOPY_dest_gref        (1)
 #define GNTCOPY_dest_gref         (1<<_GNTCOPY_dest_gref)
-#define _GNTCOPY_can_fail         (2)
-#define GNTCOPY_can_fail          (1<<_GNTCOPY_can_fail)
 
 struct gnttab_copy {
     /* IN parameters. */
-    struct {
+    struct gnttab_copy_ptr {
         union {
             grant_ref_t ref;
             xen_pfn_t   gmfn;
@@ -573,6 +578,25 @@
 typedef struct gnttab_swap_grant_ref gnttab_swap_grant_ref_t;
 DEFINE_XEN_GUEST_HANDLE(gnttab_swap_grant_ref_t);
 
+/*
+ * Issue one or more cache maintenance operations on a portion of a
+ * page granted to the calling domain by a foreign domain.
+ */
+struct gnttab_cache_flush {
+    union {
+        uint64_t dev_bus_addr;
+        grant_ref_t ref;
+    } a;
+    uint16_t offset; /* offset from start of grant */
+    uint16_t length; /* size within the grant */
+#define GNTTAB_CACHE_CLEAN          (1<<0)
+#define GNTTAB_CACHE_INVAL          (1<<1)
+#define GNTTAB_CACHE_SOURCE_GREF    (1<<31)
+    uint32_t op;
+};
+typedef struct gnttab_cache_flush gnttab_cache_flush_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_cache_flush_t);
+
 #endif /* __XEN_INTERFACE_VERSION__ */
 
 /*
@@ -653,7 +677,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
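
A minimal sketch of using the new GNTTABOP_cache_flush on a region addressed by grant reference, assuming (as in upstream Xen) that GNTTAB_CACHE_SOURCE_GREF selects the grant-reference arm of the union; HYPERVISOR_grant_table_op() is the usual guest-side wrapper and is assumed rather than defined by this header:

    #include <xen/interface/grant_table.h>   /* include path is an assumption */

    /* Sketch: clean and invalidate 'len' bytes at offset 'off' within a
     * page granted to us, naming the page by grant reference. */
    static int flush_granted_region(grant_ref_t ref, uint16_t off, uint16_t len)
    {
        struct gnttab_cache_flush op;

        op.a.ref  = ref;
        op.offset = off;
        op.length = len;
        op.op     = GNTTAB_CACHE_CLEAN | GNTTAB_CACHE_INVAL |
                    GNTTAB_CACHE_SOURCE_GREF;

        return HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &op, 1);
    }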

Modified: trunk/sys/xen/interface/kexec.h
===================================================================
--- trunk/sys/xen/interface/kexec.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/kexec.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -98,9 +98,6 @@
 #if defined(__i386__) || defined(__x86_64__)
     unsigned long page_list[KEXEC_XEN_NO_PAGES];
 #endif
-#if defined(__ia64__)
-    unsigned long reboot_code_buffer;
-#endif
     unsigned long indirection_page;
     unsigned long start_address;
 } xen_kexec_image_t;
@@ -109,6 +106,20 @@
  * Perform kexec having previously loaded a kexec or kdump kernel
  * as appropriate.
  * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ *
+ * Control is transferred to the image entry point with the host in
+ * the following state.
+ *
+ * - The image may be executed on any PCPU and all other PCPUs are
+ *   stopped.
+ *
+ * - Local interrupts are disabled.
+ *
+ * - Register values are undefined.
+ *
+ * - The image segments have writeable 1:1 virtual to machine
+ *   mappings.  The location of any page tables is undefined and these
+ *   page table frames are not mapped.
  */
 #define KEXEC_CMD_kexec                 0
 typedef struct xen_kexec_exec {
@@ -120,12 +131,12 @@
  * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
  * image == relocation information for kexec (ignored for unload) [in]
  */
-#define KEXEC_CMD_kexec_load            1
-#define KEXEC_CMD_kexec_unload          2
-typedef struct xen_kexec_load {
+#define KEXEC_CMD_kexec_load_v1         1 /* obsolete since 0x00040400 */
+#define KEXEC_CMD_kexec_unload_v1       2 /* obsolete since 0x00040400 */
+typedef struct xen_kexec_load_v1 {
     int type;
     xen_kexec_image_t image;
-} xen_kexec_load_t;
+} xen_kexec_load_v1_t;
 
 #define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area */
 #define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself */
@@ -135,7 +146,7 @@
                                      * to Xen it exists in a separate EFI
                                      * region on ia64, and thus needs to be
                                      * inserted into iomem_machine separately */
-#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* Obsolete: machine address and size of
                                      * the ia64_boot_param */
 #define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
                                      * the EFI Memory Map */
@@ -156,12 +167,82 @@
     unsigned long start;
 } xen_kexec_range_t;
 
+#if __XEN_INTERFACE_VERSION__ >= 0x00040400
+/*
+ * A contiguous chunk of a kexec image and its destination machine
+ * address.
+ */
+typedef struct xen_kexec_segment {
+    union {
+        XEN_GUEST_HANDLE(const_void) h;
+        uint64_t _pad;
+    } buf;
+    uint64_t buf_size;
+    uint64_t dest_maddr;
+    uint64_t dest_size;
+} xen_kexec_segment_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_segment_t);
+
+/*
+ * Load a kexec image into memory.
+ *
+ * For KEXEC_TYPE_DEFAULT images, the segments may be anywhere in RAM.
+ * The image is relocated prior to being executed.
+ *
+ * For KEXEC_TYPE_CRASH images, each segment of the image must reside
+ * in the memory region reserved for kexec (KEXEC_RANGE_MA_CRASH) and
+ * the entry point must be within the image. The caller is responsible
+ * for ensuring that multiple images do not overlap.
+ *
+ * All image segments will be loaded to their destination machine
+ * addresses prior to being executed.  The trailing portion of any
+ * segments with a source buffer (from dest_maddr + buf_size to
+ * dest_maddr + dest_size) will be zeroed.
+ *
+ * Segments with no source buffer will be accessible to the image when
+ * it is executed.
+ */
+
+#define KEXEC_CMD_kexec_load 4
+typedef struct xen_kexec_load {
+    uint8_t  type;        /* One of KEXEC_TYPE_* */
+    uint8_t  _pad;
+    uint16_t arch;        /* ELF machine type (EM_*). */
+    uint32_t nr_segments;
+    union {
+        XEN_GUEST_HANDLE(xen_kexec_segment_t) h;
+        uint64_t _pad;
+    } segments;
+    uint64_t entry_maddr; /* image entry point machine address. */
+} xen_kexec_load_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_load_t);
+
+/*
+ * Unload a kexec image.
+ *
+ * Type must be one of KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH.
+ */
+#define KEXEC_CMD_kexec_unload 5
+typedef struct xen_kexec_unload {
+    uint8_t type;
+} xen_kexec_unload_t;
+DEFINE_XEN_GUEST_HANDLE(xen_kexec_unload_t);
+
+#else /* __XEN_INTERFACE_VERSION__ < 0x00040400 */
+
+#define KEXEC_CMD_kexec_load KEXEC_CMD_kexec_load_v1
+#define KEXEC_CMD_kexec_unload KEXEC_CMD_kexec_unload_v1
+#define xen_kexec_load xen_kexec_load_v1
+#define xen_kexec_load_t xen_kexec_load_v1_t
+
+#endif
+
 #endif /* _XEN_PUBLIC_KEXEC_H */
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
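
For the interface-version >= 0x00040400 layout added above, a sketch of loading a single-segment default image; HYPERVISOR_kexec_op(), set_xen_guest_handle() and leaving the 'arch' field zeroed are assumptions of the surrounding guest code, not requirements spelled out by this header:

    #include <string.h>

    #include <xen/interface/kexec.h>   /* include path is an assumption */

    /* Sketch: describe one segment and hand the image to Xen. */
    static int load_kexec_image(void *image, uint64_t size,
                                uint64_t dest_maddr, uint64_t entry_maddr,
                                xen_kexec_segment_t *seg)
    {
        xen_kexec_load_t load;

        memset(seg, 0, sizeof(*seg));
        set_xen_guest_handle(seg->buf.h, image);
        seg->buf_size   = size;
        seg->dest_maddr = dest_maddr;
        seg->dest_size  = size;   /* no trailing region to be zeroed */

        memset(&load, 0, sizeof(load));
        load.type        = KEXEC_TYPE_DEFAULT;
        load.nr_segments = 1;
        set_xen_guest_handle(load.segments.h, seg);
        load.entry_maddr = entry_maddr;

        return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &load);
    }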

Modified: trunk/sys/xen/interface/memory.h
===================================================================
--- trunk/sys/xen/interface/memory.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/memory.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -29,6 +29,7 @@
 #define __XEN_PUBLIC_MEMORY_H__
 
 #include "xen.h"
+#include "physdev.h"
 
 /*
  * Increase or decrease the specified domain's memory reservation. Returns the
@@ -56,6 +57,8 @@
 /* Flag to request allocation only from the node specified */
 #define XENMEMF_exact_node_request  (1<<17)
 #define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request)
+/* Flag to indicate the node specified is virtual node */
+#define XENMEMF_vnode  (1<<18)
 #endif
 
 struct xen_memory_reservation {
@@ -69,6 +72,8 @@
      *   IN:  GPFN bases of extents to populate with memory
      *   OUT: GMFN bases of extents that were allocated
      *   (NB. This command also updates the mach_to_phys translation table)
+     * XENMEM_claim_pages:
+     *   IN: must be zero
      */
     XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
 
@@ -186,6 +191,15 @@
 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
 
 /*
+ * For a compat caller, this is identical to XENMEM_machphys_mfn_list.
+ *
+ * For a non-compat caller, this functions similarly to
+ * XENMEM_machphys_mfn_list, but returns the mfns making up the compatibility
+ * m2p table.
+ */
+#define XENMEM_machphys_compat_mfn_list     25
+
+/*
  * Returns the location in virtual address space of the machine_to_phys
  * mapping table. Architectures which do not have a m2p table, or which do not
  * map it by default into guest address space, do not implement this command.
@@ -199,6 +213,16 @@
 typedef struct xen_machphys_mapping xen_machphys_mapping_t;
 DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
 
+/* Source mapping space. */
+/* ` enum phys_map_space { */
+#define XENMAPSPACE_shared_info  0 /* shared info page */
+#define XENMAPSPACE_grant_table  1 /* grant table page */
+#define XENMAPSPACE_gmfn         2 /* GMFN */
+#define XENMAPSPACE_gmfn_range   3 /* GMFN range, XENMEM_add_to_physmap only. */
+#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another dom,
+                                    * XENMEM_add_to_physmap_batch only. */
+/* ` } */
+
 /*
  * Sets the GPFN at which a particular page appears in the specified guest's
  * pseudophysical address space.
@@ -212,24 +236,52 @@
     /* Number of pages to go through for gmfn_range */
     uint16_t    size;
 
-    /* Source mapping space. */
-#define XENMAPSPACE_shared_info 0 /* shared info page */
-#define XENMAPSPACE_grant_table 1 /* grant table page */
-#define XENMAPSPACE_gmfn        2 /* GMFN */
-#define XENMAPSPACE_gmfn_range  3 /* GMFN range */
-    unsigned int space;
+    unsigned int space; /* => enum phys_map_space */
 
 #define XENMAPIDX_grant_table_status 0x80000000
 
-    /* Index into source mapping space. */
+    /* Index into space being mapped. */
     xen_ulong_t idx;
 
-    /* GPFN where the source mapping page should appear. */
+    /* GPFN in domid where the source mapping page should appear. */
     xen_pfn_t     gpfn;
 };
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
 
+/* A batched version of add_to_physmap. */
+#define XENMEM_add_to_physmap_batch 23
+struct xen_add_to_physmap_batch {
+    /* IN */
+    /* Which domain to change the mapping for. */
+    domid_t domid;
+    uint16_t space; /* => enum phys_map_space */
+
+    /* Number of pages to go through */
+    uint16_t size;
+    domid_t foreign_domid; /* IFF gmfn_foreign */
+
+    /* Indexes into space being mapped. */
+    XEN_GUEST_HANDLE(xen_ulong_t) idxs;
+
+    /* GPFN in domid where the source mapping page should appear. */
+    XEN_GUEST_HANDLE(xen_pfn_t) gpfns;
+
+    /* OUT */
+
+    /* Per index error code. */
+    XEN_GUEST_HANDLE(int) errs;
+};
+typedef struct xen_add_to_physmap_batch xen_add_to_physmap_batch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_batch_t);
+
+#if __XEN_INTERFACE_VERSION__ < 0x00040400
+#define XENMEM_add_to_physmap_range XENMEM_add_to_physmap_batch
+#define xen_add_to_physmap_range xen_add_to_physmap_batch
+typedef struct xen_add_to_physmap_batch xen_add_to_physmap_range_t;
+DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_range_t);
+#endif
+
 /*
  * Unmaps the page appearing at a particular GPFN from the specified guest's
  * pseudophysical address space.
@@ -324,13 +376,9 @@
 #define XENMEM_paging_op_evict              1
 #define XENMEM_paging_op_prep               2
 
-#define XENMEM_access_op                    21
-#define XENMEM_access_op_resume             0
-
-struct xen_mem_event_op {
-    uint8_t     op;         /* XENMEM_*_op_* */
+struct xen_mem_paging_op {
+    uint8_t     op;         /* XENMEM_paging_op_* */
     domid_t     domain;
-    
 
     /* PAGING_PREP IN: buffer to immediately fill page in */
     uint64_aligned_t    buffer;
@@ -337,19 +385,69 @@
     /* Other OPs */
     uint64_aligned_t    gfn;           /* IN:  gfn of page being operated on */
 };
-typedef struct xen_mem_event_op xen_mem_event_op_t;
-DEFINE_XEN_GUEST_HANDLE(xen_mem_event_op_t);
+typedef struct xen_mem_paging_op xen_mem_paging_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mem_paging_op_t);
 
+#define XENMEM_access_op                    21
+#define XENMEM_access_op_set_access         0
+#define XENMEM_access_op_get_access         1
+#define XENMEM_access_op_enable_emulate     2
+#define XENMEM_access_op_disable_emulate    3
+
+typedef enum {
+    XENMEM_access_n,
+    XENMEM_access_r,
+    XENMEM_access_w,
+    XENMEM_access_rw,
+    XENMEM_access_x,
+    XENMEM_access_rx,
+    XENMEM_access_wx,
+    XENMEM_access_rwx,
+    /*
+     * Page starts off as r-x, but automatically
+     * changes to r-w on a write
+     */
+    XENMEM_access_rx2rw,
+    /*
+     * Log access: starts off as n, automatically
+     * goes to rwx, generating an event without
+     * pausing the vcpu
+     */
+    XENMEM_access_n2rwx,
+    /* Take the domain default */
+    XENMEM_access_default
+} xenmem_access_t;
+
+struct xen_mem_access_op {
+    /* XENMEM_access_op_* */
+    uint8_t op;
+    /* xenmem_access_t */
+    uint8_t access;
+    domid_t domid;
+    /*
+     * Number of pages for set op
+     * Ignored on setting default access and other ops
+     */
+    uint32_t nr;
+    /*
+     * First pfn for set op
+     * pfn for get op
+     * ~0ull is used to set and get the default access for pages
+     */
+    uint64_aligned_t pfn;
+};
+typedef struct xen_mem_access_op xen_mem_access_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
+
 #define XENMEM_sharing_op                   22
 #define XENMEM_sharing_op_nominate_gfn      0
 #define XENMEM_sharing_op_nominate_gref     1
 #define XENMEM_sharing_op_share             2
-#define XENMEM_sharing_op_resume            3
-#define XENMEM_sharing_op_debug_gfn         4
-#define XENMEM_sharing_op_debug_mfn         5
-#define XENMEM_sharing_op_debug_gref        6
-#define XENMEM_sharing_op_add_physmap       7
-#define XENMEM_sharing_op_audit             8
+#define XENMEM_sharing_op_debug_gfn         3
+#define XENMEM_sharing_op_debug_mfn         4
+#define XENMEM_sharing_op_debug_gref        5
+#define XENMEM_sharing_op_add_physmap       6
+#define XENMEM_sharing_op_audit             7
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -398,14 +496,127 @@
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
 
+/*
+ * Attempt to stake a claim for a domain on a quantity of pages
+ * of system RAM, but _not_ assign specific pageframes.  Only
+ * arithmetic is performed so the hypercall is very fast and need
+ * not be preemptible, thus sidestepping time-of-check-time-of-use
+ * races for memory allocation.  Returns 0 if the hypervisor page
+ * allocator has atomically and successfully claimed the requested
+ * number of pages, else non-zero.
+ *
+ * Any domain may have only one active claim.  When sufficient memory
+ * has been allocated to resolve the claim, the claim silently expires.
+ * Claiming zero pages effectively resets any outstanding claim and
+ * is always successful.
+ *
+ * Note that a valid claim may be staked even after memory has been
+ * allocated for a domain.  In this case, the claim is not incremental,
+ * i.e. if the domain's tot_pages is 3, and a claim is staked for 10,
+ * only 7 additional pages are claimed.
+ *
+ * Caller must be privileged or the hypercall fails.
+ */
+#define XENMEM_claim_pages                  24
+
+/*
+ * XENMEM_claim_pages flags - there are no flags at this time.
+ * The zero value is appropriate.
+ */
+
+/*
+ * With some legacy devices, certain guest-physical addresses cannot safely
+ * be used for other purposes, e.g. to map guest RAM.  This hypercall
+ * enumerates those regions so the toolstack can avoid using them.
+ */
+#define XENMEM_reserved_device_memory_map   27
+struct xen_reserved_device_memory {
+    xen_pfn_t start_pfn;
+    xen_ulong_t nr_pages;
+};
+typedef struct xen_reserved_device_memory xen_reserved_device_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_t);
+
+struct xen_reserved_device_memory_map {
+#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
+    /* IN */
+    uint32_t flags;
+    /*
+     * IN/OUT
+     *
+     * Gets set to the required number of entries when too low,
+     * signaled by error code -ERANGE.
+     */
+    unsigned int nr_entries;
+    /* OUT */
+    XEN_GUEST_HANDLE(xen_reserved_device_memory_t) buffer;
+    /* IN */
+    union {
+        struct physdev_pci_device pci;
+    } dev;
+};
+typedef struct xen_reserved_device_memory_map xen_reserved_device_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_reserved_device_memory_map_t);
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
+/*
+ * XENMEM_get_vnumainfo used by guest to get
+ * vNUMA topology from hypervisor.
+ */
+#define XENMEM_get_vnumainfo                26
+
+/* vNUMA node memory ranges */
+struct xen_vmemrange {
+    uint64_t start, end;
+    unsigned int flags;
+    unsigned int nid;
+};
+typedef struct xen_vmemrange xen_vmemrange_t;
+DEFINE_XEN_GUEST_HANDLE(xen_vmemrange_t);
+
+/*
+ * vNUMA topology specifies vNUMA node number, distance table,
+ * memory ranges and vcpu mapping provided for guests.
+ * The XENMEM_get_vnumainfo hypercall expects the guest to supply
+ * nr_vnodes, nr_vmemranges and nr_vcpus to indicate the sizes of its
+ * buffers.  After filling the guest's structures, nr_vnodes,
+ * nr_vmemranges and nr_vcpus are copied back to the guest.  If the
+ * supplied values were too small, the hypervisor returns the expected
+ * values of nr_vnodes, nr_vmemranges and nr_vcpus instead.
+ */
+struct xen_vnuma_topology_info {
+    /* IN */
+    domid_t domid;
+    uint16_t pad;
+    /* IN/OUT */
+    unsigned int nr_vnodes;
+    unsigned int nr_vcpus;
+    unsigned int nr_vmemranges;
+    /* OUT */
+    union {
+        XEN_GUEST_HANDLE(uint) h;
+        uint64_t pad;
+    } vdistance;
+    union {
+        XEN_GUEST_HANDLE(uint) h;
+        uint64_t pad;
+    } vcpu_to_vnode;
+    union {
+        XEN_GUEST_HANDLE(xen_vmemrange_t) h;
+        uint64_t pad;
+    } vmemrange;
+};
+typedef struct xen_vnuma_topology_info xen_vnuma_topology_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_vnuma_topology_info_t);
+
+/* Next available subop number is 28 */
+
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
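
A sketch of the XENMEM_get_vnumainfo handshake described above: the guest reports how many entries its buffers hold and retries with the sizes handed back if they were too small. HYPERVISOR_memory_op() and set_xen_guest_handle() are assumed guest-side helpers:

    #include <xen/interface/memory.h>   /* include path is an assumption */

    /* Sketch: one round of the vNUMA query.  If the buffers were too
     * small the call fails and the nr_* fields report the needed sizes,
     * so the caller reallocates and tries again. */
    static int query_vnuma(struct xen_vnuma_topology_info *topo,
                           unsigned int *distance, unsigned int *vcpu_to_vnode,
                           xen_vmemrange_t *ranges, unsigned int nr_vnodes,
                           unsigned int nr_vcpus, unsigned int nr_vmemranges)
    {
        topo->domid         = DOMID_SELF;
        topo->pad           = 0;
        topo->nr_vnodes     = nr_vnodes;
        topo->nr_vcpus      = nr_vcpus;
        topo->nr_vmemranges = nr_vmemranges;
        set_xen_guest_handle(topo->vdistance.h, distance);
        set_xen_guest_handle(topo->vcpu_to_vnode.h, vcpu_to_vnode);
        set_xen_guest_handle(topo->vmemrange.h, ranges);

        return HYPERVISOR_memory_op(XENMEM_get_vnumainfo, topo);
    }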

Modified: trunk/sys/xen/interface/nmi.h
===================================================================
--- trunk/sys/xen/interface/nmi.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/nmi.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -37,9 +37,14 @@
  /* I/O-check error reported via ISA port 0x61, bit 6. */
 #define _XEN_NMIREASON_io_error     0
 #define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
+ /* PCI SERR reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_pci_serr     1
+#define XEN_NMIREASON_pci_serr      (1UL << _XEN_NMIREASON_pci_serr)
+#if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */
  /* Parity error reported via ISA port 0x61, bit 7. */
 #define _XEN_NMIREASON_parity_error 1
 #define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
+#endif
  /* Unknown hardware-generated NMI. */
 #define _XEN_NMIREASON_unknown      2
 #define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
@@ -73,7 +78,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/physdev.h
===================================================================
--- trunk/sys/xen/interface/physdev.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/physdev.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -17,6 +17,8 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Keir Fraser
  */
 
 #ifndef __XEN_PUBLIC_PHYSDEV_H__
@@ -152,6 +154,7 @@
 #define MAP_PIRQ_TYPE_GSI               0x1
 #define MAP_PIRQ_TYPE_UNKNOWN           0x2
 #define MAP_PIRQ_TYPE_MSI_SEG           0x3
+#define MAP_PIRQ_TYPE_MULTI_MSI         0x4
 
 #define PHYSDEVOP_map_pirq               13
 struct physdev_map_pirq {
@@ -158,15 +161,15 @@
     domid_t domid;
     /* IN */
     int type;
-    /* IN */
+    /* IN (ignored for ..._MULTI_MSI) */
     int index;
     /* IN or OUT */
     int pirq;
-    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
+    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
     int bus;
     /* IN */
     int devfn;
-    /* IN */
+    /* IN (also OUT for ..._MULTI_MSI) */
     int entry_nr;
     /* IN */
     uint64_t table_base;
@@ -293,6 +296,11 @@
         uint8_t bus;
         uint8_t devfn;
     } physfn;
+    /*
+     * Optional parameters array.
+     * First element ([0]) is PXM domain associated with the device (if
+     * XEN_PCI_DEV_PXM is set)
+     */
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
     uint32_t optarr[];
 #elif defined(__GNUC__)
@@ -304,6 +312,12 @@
 
 #define PHYSDEVOP_pci_device_remove     26
 #define PHYSDEVOP_restore_msi_ext       27
+/*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix          30
+#define PHYSDEVOP_release_msix          31
 struct physdev_pci_device {
     /* IN */
     uint16_t seg;
@@ -313,6 +327,24 @@
 typedef struct physdev_pci_device physdev_pci_device_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t);
 
+#define PHYSDEVOP_DBGP_RESET_PREPARE    1
+#define PHYSDEVOP_DBGP_RESET_DONE       2
+
+#define PHYSDEVOP_DBGP_BUS_UNKNOWN      0
+#define PHYSDEVOP_DBGP_BUS_PCI          1
+
+#define PHYSDEVOP_dbgp_op               29
+struct physdev_dbgp_op {
+    /* IN */
+    uint8_t op;
+    uint8_t bus;
+    union {
+        struct physdev_pci_device pci;
+    } u;
+};
+typedef struct physdev_dbgp_op physdev_dbgp_op_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t);
+
 /*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
@@ -320,9 +352,11 @@
  */
 #define PHYSDEVOP_IRQ_UNMASK_NOTIFY      4
 
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
 /*
 * These all-capitals physdev operation names are superseded by the new names
- * (defined above) since interface version 0x00030202.
+ * (defined above) since interface version 0x00030202. The guard above was
+ * added post-4.5 only though and hence shouldn't check for 0x00030202.
  */
 #define PHYSDEVOP_IRQ_STATUS_QUERY       PHYSDEVOP_irq_status_query
 #define PHYSDEVOP_SET_IOPL               PHYSDEVOP_set_iopl
@@ -333,6 +367,7 @@
 #define PHYSDEVOP_FREE_VECTOR            PHYSDEVOP_free_irq_vector
 #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
 #define PHYSDEVOP_IRQ_SHARED             XENIRQSTAT_shared
+#endif
 
 #if __XEN_INTERFACE_VERSION__ < 0x00040200
 #define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v1
@@ -345,7 +380,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
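
A sketch of the new MAP_PIRQ_TYPE_MULTI_MSI usage: entry_nr carries the requested vector count in and the granted count out, and the segment rides in the high 16 bits of 'bus'. HYPERVISOR_physdev_op() is an assumed guest wrapper, and pirq = -1 follows the common convention of letting Xen pick the first pirq:

    #include <string.h>

    #include <xen/interface/physdev.h>   /* include path is an assumption */

    /* Sketch: request a block of MSI vectors for one device. */
    static int map_multi_msi(int seg, int bus, int devfn, int nr_wanted,
                             int *first_pirq, int *nr_granted)
    {
        struct physdev_map_pirq map;
        int rc;

        memset(&map, 0, sizeof(map));
        map.domid    = DOMID_SELF;
        map.type     = MAP_PIRQ_TYPE_MULTI_MSI;
        map.index    = -1;                  /* ignored for ..._MULTI_MSI */
        map.pirq     = -1;                  /* let Xen pick */
        map.bus      = (seg << 16) | bus;   /* high 16 bits hold the segment */
        map.devfn    = devfn;
        map.entry_nr = nr_wanted;           /* IN: requested, OUT: granted */

        rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map);
        if (rc == 0) {
            *first_pirq = map.pirq;
            *nr_granted = map.entry_nr;
        }
        return rc;
    }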

Modified: trunk/sys/xen/interface/platform.h
===================================================================
--- trunk/sys/xen/interface/platform.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/platform.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -36,13 +36,28 @@
  * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
  * 1 January, 1970 if the current system time was <system_time>.
  */
-#define XENPF_settime             17
-struct xenpf_settime {
+#define XENPF_settime32           17
+struct xenpf_settime32 {
     /* IN variables. */
     uint32_t secs;
     uint32_t nsecs;
     uint64_t system_time;
 };
+#define XENPF_settime64           62
+struct xenpf_settime64 {
+    /* IN variables. */
+    uint64_t secs;
+    uint32_t nsecs;
+    uint32_t mbz;
+    uint64_t system_time;
+};
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
+#define XENPF_settime XENPF_settime32
+#define xenpf_settime xenpf_settime32
+#else
+#define XENPF_settime XENPF_settime64
+#define xenpf_settime xenpf_settime64
+#endif
 typedef struct xenpf_settime xenpf_settime_t;
 DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
 
@@ -127,6 +142,26 @@
 #define XEN_EFI_query_variable_info           9
 #define XEN_EFI_query_capsule_capabilities   10
 #define XEN_EFI_update_capsule               11
+
+struct xenpf_efi_time {
+    uint16_t year;
+    uint8_t month;
+    uint8_t day;
+    uint8_t hour;
+    uint8_t min;
+    uint8_t sec;
+    uint32_t ns;
+    int16_t tz;
+    uint8_t daylight;
+};
+
+struct xenpf_efi_guid {
+    uint32_t data1;
+    uint16_t data2;
+    uint16_t data3;
+    uint8_t data4[8];
+};
+
 struct xenpf_efi_runtime_call {
     uint32_t function;
     /*
@@ -135,21 +170,11 @@
      * where it holds the single returned value.
      */
     uint32_t misc;
-    unsigned long status;
+    xen_ulong_t status;
     union {
 #define XEN_EFI_GET_TIME_SET_CLEARS_NS 0x00000001
         struct {
-            struct xenpf_efi_time {
-                uint16_t year;
-                uint8_t month;
-                uint8_t day;
-                uint8_t hour;
-                uint8_t min;
-                uint8_t sec;
-                uint32_t ns;
-                int16_t tz;
-                uint8_t daylight;
-            } time;
+            struct xenpf_efi_time time;
             uint32_t resolution;
             uint32_t accuracy;
         } get_time;
@@ -169,22 +194,18 @@
 #define XEN_EFI_VARIABLE_RUNTIME_ACCESS     0x00000004
         struct {
             XEN_GUEST_HANDLE(void) name;  /* UCS-2/UTF-16 string */
-            unsigned long size;
+            xen_ulong_t size;
             XEN_GUEST_HANDLE(void) data;
-            struct xenpf_efi_guid {
-                uint32_t data1;
-                uint16_t data2;
-                uint16_t data3;
-                uint8_t data4[8];
-            } vendor_guid;
+            struct xenpf_efi_guid vendor_guid;
         } get_variable, set_variable;
 
         struct {
-            unsigned long size;
+            xen_ulong_t size;
             XEN_GUEST_HANDLE(void) name;  /* UCS-2/UTF-16 string */
             struct xenpf_efi_guid vendor_guid;
         } get_next_variable_name;
 
+#define XEN_EFI_VARINFO_BOOT_SNAPSHOT       0x00000001
         struct {
             uint32_t attr;
             uint64_t max_store_size;
@@ -194,14 +215,14 @@
 
         struct {
             XEN_GUEST_HANDLE(void) capsule_header_array;
-            unsigned long capsule_count;
+            xen_ulong_t capsule_count;
             uint64_t max_capsule_size;
-            unsigned int reset_type;
+            uint32_t reset_type;
         } query_capsule_capabilities;
 
         struct {
             XEN_GUEST_HANDLE(void) capsule_header_array;
-            unsigned long capsule_count;
+            xen_ulong_t capsule_count;
             uint64_t sg_list; /* machine address */
         } update_capsule;
     } u;
@@ -219,6 +240,8 @@
 #define  XEN_FW_EFI_VENDOR         2
 #define  XEN_FW_EFI_MEM_INFO       3
 #define  XEN_FW_EFI_RT_VERSION     4
+#define  XEN_FW_EFI_PCI_ROM        5
+#define XEN_FW_KBD_SHIFT_FLAGS    5
 struct xenpf_firmware_info {
     /* IN variables. */
     uint32_t type;
@@ -266,7 +289,21 @@
                 uint64_t attr;
                 uint32_t type;
             } mem;
+            struct {
+                /* IN variables */
+                uint16_t segment;
+                uint8_t bus;
+                uint8_t devfn;
+                uint16_t vendor;
+                uint16_t devid;
+                /* OUT variables */
+                uint64_t address;
+                xen_ulong_t size;
+            } pci_rom;
         } efi_info; /* XEN_FW_EFI_INFO */
+
+        /* Int16, Fn02: Get keyboard shift flags. */
+        uint8_t kbd_shift_flags; /* XEN_FW_KBD_SHIFT_FLAGS */
     } u;
 };
 typedef struct xenpf_firmware_info xenpf_firmware_info_t;
@@ -275,10 +312,16 @@
 #define XENPF_enter_acpi_sleep    51
 struct xenpf_enter_acpi_sleep {
     /* IN variables */
+#if __XEN_INTERFACE_VERSION__ < 0x00040300
     uint16_t pm1a_cnt_val;      /* PM1a control value. */
     uint16_t pm1b_cnt_val;      /* PM1b control value. */
+#else
+    uint16_t val_a;             /* PM1a control / sleep type A. */
+    uint16_t val_b;             /* PM1b control / sleep type B. */
+#endif
     uint32_t sleep_state;       /* Which state to enter (Sn). */
-    uint32_t flags;             /* Must be zero. */
+#define XENPF_ACPI_SLEEP_EXTENDED 0x00000001
+    uint32_t flags;             /* XENPF_ACPI_SLEEP_*. */
 };
 typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
 DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t);
@@ -506,6 +549,67 @@
 DEFINE_XEN_GUEST_HANDLE(xenpf_core_parking_t);
 
 /*
+ * Access generic platform resources (e.g., MSRs, port I/O, etc.)
+ * in a unified way. Batch resource operations in one call are supported and
+ * they are always non-preemptible and executed in their original order.
+ * The batch itself returns a negative integer for general errors, or a
+ * non-negative integer for the number of successful operations. For the latter
+ * case, the @ret in the failed entry (if any) indicates the exact error.
+ */
+#define XENPF_resource_op   61
+
+#define XEN_RESOURCE_OP_MSR_READ  0
+#define XEN_RESOURCE_OP_MSR_WRITE 1
+
+/*
+ * Specially handled MSRs:
+ * - MSR_IA32_TSC
+ * READ: Returns the scaled system time (ns) instead of the raw timestamp. In
+ *       the multiple-entry case, if another MSR read is followed by a MSR_IA32_TSC
+ *       read, then both reads are guaranteed to be performed atomically (with
+ *       IRQ disabled). The return time indicates the point of reading that MSR.
+ * WRITE: Not supported.
+ */
+
+struct xenpf_resource_entry {
+    union {
+        uint32_t cmd;   /* IN: XEN_RESOURCE_OP_* */
+        int32_t  ret;   /* OUT: return value for failed entry */
+    } u;
+    uint32_t rsvd;      /* IN: padding and must be zero */
+    uint64_t idx;       /* IN: resource address to access */
+    uint64_t val;       /* IN/OUT: resource value to set/get */
+};
+typedef struct xenpf_resource_entry xenpf_resource_entry_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_resource_entry_t);
+
+struct xenpf_resource_op {
+    uint32_t nr_entries;    /* number of resource entries */
+    uint32_t cpu;           /* which cpu to run on */
+    XEN_GUEST_HANDLE(xenpf_resource_entry_t) entries;
+};
+typedef struct xenpf_resource_op xenpf_resource_op_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_resource_op_t);
+
+#define XENPF_get_symbol   63
+struct xenpf_symdata {
+    /* IN/OUT variables */
+    uint32_t namelen; /* IN:  size of name buffer                       */
+                      /* OUT: strlen(name) of hypervisor symbol (may be */
+                      /*      larger than what's been copied to guest)  */
+    uint32_t symnum;  /* IN:  Symbol to read                            */
+                      /* OUT: Next available symbol. If same as IN then */
+                      /*      we reached the end                        */
+
+    /* OUT variables */
+    XEN_GUEST_HANDLE(char) name;
+    uint64_t address;
+    char type;
+};
+typedef struct xenpf_symdata xenpf_symdata_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_symdata_t);
+
+/*
  * ` enum neg_errnoval
  * ` HYPERVISOR_platform_op(const struct xen_platform_op*);
  */
@@ -514,6 +618,8 @@
     uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
     union {
         struct xenpf_settime           settime;
+        struct xenpf_settime32         settime32;
+        struct xenpf_settime64         settime64;
         struct xenpf_add_memtype       add_memtype;
         struct xenpf_del_memtype       del_memtype;
         struct xenpf_read_memtype      read_memtype;
@@ -531,6 +637,8 @@
         struct xenpf_cpu_hotadd        cpu_add;
         struct xenpf_mem_hotadd        mem_add;
         struct xenpf_core_parking      core_parking;
+        struct xenpf_resource_op       resource_op;
+        struct xenpf_symdata           symdata;
         uint8_t                        pad[128];
     } u;
 };
@@ -542,7 +650,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
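
A sketch of a single-entry XENPF_resource_op MSR read; per the comment above, a non-negative return is the number of entries processed and a failed entry reports its own error in u.ret. HYPERVISOR_platform_op() and set_xen_guest_handle() are assumed guest-side helpers:

    #include <string.h>

    #include <xen/interface/platform.h>   /* include path is an assumption */

    /* Sketch: read one MSR on a given physical CPU. */
    static int read_msr(uint32_t cpu, uint64_t msr, uint64_t *val)
    {
        struct xen_platform_op op;
        struct xenpf_resource_entry entry;
        int rc;

        memset(&entry, 0, sizeof(entry));
        entry.u.cmd = XEN_RESOURCE_OP_MSR_READ;
        entry.idx   = msr;

        memset(&op, 0, sizeof(op));
        op.cmd = XENPF_resource_op;
        op.interface_version = XENPF_INTERFACE_VERSION;
        op.u.resource_op.nr_entries = 1;
        op.u.resource_op.cpu = cpu;
        set_xen_guest_handle(op.u.resource_op.entries, &entry);

        rc = HYPERVISOR_platform_op(&op);
        if (rc == 1)            /* one entry submitted, one succeeded */
            *val = entry.val;
        return rc;
    }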

Added: trunk/sys/xen/interface/pmu.h
===================================================================
--- trunk/sys/xen/interface/pmu.h	                        (rev 0)
+++ trunk/sys/xen/interface/pmu.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -0,0 +1,134 @@
+/* $MidnightBSD$ */
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2015 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef __XEN_PUBLIC_PMU_H__
+#define __XEN_PUBLIC_PMU_H__
+
+#include "xen.h"
+#if defined(__i386__) || defined(__x86_64__)
+#include "arch-x86/pmu.h"
+#elif defined (__arm__) || defined (__aarch64__)
+#include "arch-arm.h"
+#else
+#error "Unsupported architecture"
+#endif
+
+#define XENPMU_VER_MAJ    0
+#define XENPMU_VER_MIN    1
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args);
+ *
+ * @cmd  == XENPMU_* (PMU operation)
+ * @args == struct xenpmu_params
+ */
+/* ` enum xenpmu_op { */
+#define XENPMU_mode_get        0 /* Also used for getting PMU version */
+#define XENPMU_mode_set        1
+#define XENPMU_feature_get     2
+#define XENPMU_feature_set     3
+#define XENPMU_init            4
+#define XENPMU_finish          5
+#define XENPMU_lvtpc_set       6
+#define XENPMU_flush           7 /* Write cached MSR values to HW     */
+/* ` } */
+
+/* Parameters structure for HYPERVISOR_xenpmu_op call */
+struct xen_pmu_params {
+    /* IN/OUT parameters */
+    struct {
+        uint32_t maj;
+        uint32_t min;
+    } version;
+    uint64_t val;
+
+    /* IN parameters */
+    uint32_t vcpu;
+    uint32_t pad;
+};
+typedef struct xen_pmu_params xen_pmu_params_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pmu_params_t);
+
+/* PMU modes:
+ * - XENPMU_MODE_OFF:   No PMU virtualization
+ * - XENPMU_MODE_SELF:  Guests can profile themselves
+ * - XENPMU_MODE_HV:    Guests can profile themselves, dom0 profiles
+ *                      itself and Xen
+ * - XENPMU_MODE_ALL:   Only dom0 has access to VPMU and it profiles
+ *                      everyone: itself, the hypervisor and the guests.
+ */
+#define XENPMU_MODE_OFF           0
+#define XENPMU_MODE_SELF          (1<<0)
+#define XENPMU_MODE_HV            (1<<1)
+#define XENPMU_MODE_ALL           (1<<2)
+
+/*
+ * PMU features:
+ * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
+ */
+#define XENPMU_FEATURE_INTEL_BTS  1
+
+/*
+ * Shared PMU data between hypervisor and PV(H) domains.
+ *
+ * The hypervisor fills out this structure during a PMU interrupt and sends an
+ * interrupt to the appropriate VCPU.
+ * Architecture-independent fields of xen_pmu_data are WO for the hypervisor
+ * and RO for the guest but some fields in xen_pmu_arch can be writable
+ * by both the hypervisor and the guest (see arch-$arch/pmu.h).
+ */
+struct xen_pmu_data {
+    /* Interrupted VCPU */
+    uint32_t vcpu_id;
+
+    /*
+     * Physical processor on which the interrupt occurred. On non-privileged
+     * guests set to vcpu_id;
+     */
+    uint32_t pcpu_id;
+
+    /*
+     * Domain that was interrupted. On non-privileged guests set to DOMID_SELF.
+     * On privileged guests it can be DOMID_SELF, DOMID_XEN, or, when in
+     * XENPMU_MODE_ALL mode, the domain ID of another domain.
+     */
+    domid_t  domain_id;
+
+    uint8_t pad[6];
+
+    /* Architecture-specific information */
+    struct xen_pmu_arch pmu;
+};
+
+#endif /* __XEN_PUBLIC_PMU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */


Property changes on: trunk/sys/xen/interface/pmu.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
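
A sketch of putting VPMU into self-profiling mode through the parameter block above; HYPERVISOR_xenpmu_op() is the assumed guest wrapper for the hypercall named in the header comment:

    #include <string.h>

    #include <xen/interface/pmu.h>   /* include path is an assumption */

    /* Sketch: request XENPMU_MODE_SELF from the hypervisor. */
    static int enable_self_profiling(void)
    {
        xen_pmu_params_t p;

        memset(&p, 0, sizeof(p));
        p.version.maj = XENPMU_VER_MAJ;
        p.version.min = XENPMU_VER_MIN;
        p.val = XENPMU_MODE_SELF;

        return HYPERVISOR_xenpmu_op(XENPMU_mode_set, &p);
    }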
Modified: trunk/sys/xen/interface/sched.h
===================================================================
--- trunk/sys/xen/interface/sched.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/sched.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -1,9 +1,9 @@
 /* $MidnightBSD$ */
 /******************************************************************************
  * sched.h
- * 
+ *
  * Scheduler state interactions
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -31,11 +31,21 @@
 #include "event_channel.h"
 
 /*
+ * `incontents 150 sched Guest Scheduler Operations
+ *
+ * The SCHEDOP interface provides mechanisms for a guest to interact
+ * with the scheduler, including yield, blocking and shutting itself
+ * down.
+ */
+
+/*
  * The prototype for this hypercall is:
- *  long sched_op(int cmd, void *arg)
+ * ` long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...)
+ *
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == Operation-specific extra argument(s), as described below.
- * 
+ * ...  == Additional Operation-specific extra arguments, described below.
+ *
  * Versions of Xen prior to 3.0.2 provided only the following legacy version
  * of this hypercall, supporting only the commands yield, block and shutdown:
  *  long sched_op(int cmd, unsigned long arg)
@@ -42,9 +52,12 @@
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
  *      == SHUTDOWN_* code (SCHEDOP_shutdown)
- * This legacy version is available to new guests as sched_op_compat().
+ *
+ * This legacy version is available to new guests as:
+ * ` long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg)
  */
 
+/* ` enum sched_op { // SCHEDOP_* => struct sched_* */
 /*
  * Voluntarily yield the CPU.
  * @arg == NULL.
@@ -62,53 +75,44 @@
 
 /*
  * Halt execution of this domain (all VCPUs) and notify the system controller.
- * @arg == pointer to sched_shutdown structure.
+ * @arg == pointer to sched_shutdown_t structure.
+ *
+ * If the sched_shutdown_t reason is SHUTDOWN_suspend then
+ * x86 PV guests must also set RDX (EDX for 32-bit guests) to the MFN
+ * of the guest's start info page.  RDX/EDX is the third hypercall
+ * argument.
+ *
+ * In addition, when the reason is SHUTDOWN_suspend this hypercall
+ * returns 1 if suspend was cancelled or the domain was merely
+ * checkpointed, and 0 if it is resuming in a new domain.
  */
 #define SCHEDOP_shutdown    2
-struct sched_shutdown {
-    unsigned int reason; /* SHUTDOWN_* */
-};
-typedef struct sched_shutdown sched_shutdown_t;
-DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
 
 /*
  * Poll a set of event-channel ports. Return when one or more are pending. An
  * optional timeout may be specified.
- * @arg == pointer to sched_poll structure.
+ * @arg == pointer to sched_poll_t structure.
  */
 #define SCHEDOP_poll        3
-struct sched_poll {
-    XEN_GUEST_HANDLE(evtchn_port_t) ports;
-    unsigned int nr_ports;
-    uint64_t timeout;
-};
-typedef struct sched_poll sched_poll_t;
-DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
 
 /*
  * Declare a shutdown for another domain. The main use of this function is
  * in interpreting shutdown requests and reasons for fully-virtualized
  * domains.  A para-virtualized domain may use SCHEDOP_shutdown directly.
- * @arg == pointer to sched_remote_shutdown structure.
+ * @arg == pointer to sched_remote_shutdown_t structure.
  */
 #define SCHEDOP_remote_shutdown        4
-struct sched_remote_shutdown {
-    domid_t domain_id;         /* Remote domain ID */
-    unsigned int reason;       /* SHUTDOWN_xxx reason */
-};
-typedef struct sched_remote_shutdown sched_remote_shutdown_t;
-DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
 
 /*
  * Latch a shutdown code, so that when the domain later shuts down it
  * reports this code to the control tools.
- * @arg == as for SCHEDOP_shutdown.
+ * @arg == sched_shutdown_t, as for SCHEDOP_shutdown.
  */
 #define SCHEDOP_shutdown_code 5
 
 /*
  * Setup, poke and destroy a domain watchdog timer.
- * @arg == pointer to sched_watchdog structure.
+ * @arg == pointer to sched_watchdog_t structure.
  * With id == 0, setup a domain watchdog timer to cause domain shutdown
  *               after timeout, returns watchdog id.
  * With id != 0 and timeout == 0, destroy domain watchdog timer.
@@ -115,6 +119,29 @@
  * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
  */
 #define SCHEDOP_watchdog    6
+/* ` } */
+
+struct sched_shutdown {
+    unsigned int reason; /* SHUTDOWN_* => enum sched_shutdown_reason */
+};
+typedef struct sched_shutdown sched_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
+
+struct sched_poll {
+    XEN_GUEST_HANDLE(evtchn_port_t) ports;
+    unsigned int nr_ports;
+    uint64_t timeout;
+};
+typedef struct sched_poll sched_poll_t;
+DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
+
+struct sched_remote_shutdown {
+    domid_t domain_id;         /* Remote domain ID */
+    unsigned int reason;       /* SHUTDOWN_* => enum sched_shutdown_reason */
+};
+typedef struct sched_remote_shutdown sched_remote_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
+
 struct sched_watchdog {
     uint32_t id;                /* watchdog ID */
     uint32_t timeout;           /* timeout */
@@ -127,11 +154,14 @@
  * software to determine the appropriate action. For the most part, Xen does
  * not care about the shutdown code.
  */
+/* ` enum sched_shutdown_reason { */
 #define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
 #define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
 #define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
 #define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
 #define SHUTDOWN_watchdog   4  /* Restart because watchdog time expired.     */
+#define SHUTDOWN_MAX        4  /* Maximum valid shutdown reason.             */
+/* ` } */
 
 #endif /* __XEN_PUBLIC_SCHED_H__ */
 
@@ -138,7 +168,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil
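
A sketch of SCHEDOP_poll with a single port, using the sched_poll layout above; HYPERVISOR_sched_op() and set_xen_guest_handle() are assumed guest-side helpers, and 'timeout' follows whatever semantics the scheduler documents for that field:

    #include <xen/interface/sched.h>   /* include path is an assumption */

    /* Sketch: block until the polled port is pending (or the optional
     * timeout fires). */
    static int poll_one_port(evtchn_port_t *port, uint64_t timeout)
    {
        struct sched_poll poll;

        set_xen_guest_handle(poll.ports, port);
        poll.nr_ports = 1;
        poll.timeout  = timeout;

        return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
    }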

Modified: trunk/sys/xen/interface/sysctl.h
===================================================================
--- trunk/sys/xen/interface/sysctl.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/sysctl.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -34,8 +34,10 @@
 
 #include "xen.h"
 #include "domctl.h"
+#include "physdev.h"
+#include "tmem.h"
 
-#define XEN_SYSCTL_INTERFACE_VERSION 0x00000009
+#define XEN_SYSCTL_INTERFACE_VERSION 0x0000000C
 
 /*
  * Read console content from Xen buffer ring.
@@ -72,7 +74,7 @@
 #define XEN_SYSCTL_TBUFOP_disable      5
     uint32_t cmd;
     /* IN/OUT variables */
-    struct xenctl_cpumap cpu_mask;
+    struct xenctl_bitmap cpu_mask;
     uint32_t             evt_mask;
     /* OUT variables */
     uint64_aligned_t buffer_mfn;
@@ -102,6 +104,7 @@
     uint64_aligned_t total_pages;
     uint64_aligned_t free_pages;
     uint64_aligned_t scrub_pages;
+    uint64_aligned_t outstanding_pages;
     uint32_t hw_cap[8];
 
     /* XEN_SYSCTL_PHYSCAP_??? */
@@ -226,13 +229,17 @@
     uint64_aligned_t idle_time;                 /* idle time from boot */
     XEN_GUEST_HANDLE_64(uint64) triggers;    /* Cx trigger counts */
     XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */
-    uint64_aligned_t pc2;
-    uint64_aligned_t pc3;
-    uint64_aligned_t pc6;
-    uint64_aligned_t pc7;
-    uint64_aligned_t cc3;
-    uint64_aligned_t cc6;
-    uint64_aligned_t cc7;
+    uint32_t nr_pc;                          /* entry nr in pc[] */
+    uint32_t nr_cc;                          /* entry nr in cc[] */
+    /*
+     * These two arrays may (and generally will) have unused slots; slots not
+     * having a corresponding hardware register will not be written by the
+     * hypervisor. It is therefore up to the caller to put a suitable sentinel
+     * into all slots before invoking the function.
+     * Indexing is 1-biased (PC1/CC1 being at index 0).
+     */
+    XEN_GUEST_HANDLE_64(uint64) pc;
+    XEN_GUEST_HANDLE_64(uint64) cc;
 };
 
 struct xen_sysctl_get_pmstat {
@@ -458,61 +465,76 @@
 typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);
 
-/* XEN_SYSCTL_topologyinfo */
-#define INVALID_TOPOLOGY_ID  (~0U)
-struct xen_sysctl_topologyinfo {
-    /*
-     * IN: maximum addressable entry in the caller-provided arrays.
-     * OUT: largest cpu identifier in the system.
-     * If OUT is greater than IN then the arrays are truncated!
-     * If OUT is leass than IN then the array tails are not written by sysctl.
-     */
-    uint32_t max_cpu_index;
+/* XEN_SYSCTL_cputopoinfo */
+#define XEN_INVALID_CORE_ID     (~0U)
+#define XEN_INVALID_SOCKET_ID   (~0U)
+#define XEN_INVALID_NODE_ID     (~0U)
 
-    /*
-     * If not NULL, these arrays are filled with core/socket/node identifier
-     * for each cpu.
-     * If a cpu has no core/socket/node information (e.g., cpu not present) 
-     * then the sentinel value ~0u is written to each array.
-     * The number of array elements written by the sysctl is:
-     *   min(@max_cpu_index_IN, at max_cpu_index_OUT)+1
-     */
-    XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
-    XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
-    XEN_GUEST_HANDLE_64(uint32) cpu_to_node;
+struct xen_sysctl_cputopo {
+    uint32_t core;
+    uint32_t socket;
+    uint32_t node;
 };
-typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t;
-DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t);
+typedef struct xen_sysctl_cputopo xen_sysctl_cputopo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopo_t);
 
+/*
+ * IN:
+ *  - a NULL 'cputopo' handle is a request for the maximum 'num_cpus'.
+ *  - otherwise it's the number of entries in 'cputopo'
+ *
+ * OUT:
+ *  - If 'num_cpus' is less than the number Xen wants to write but the
+ *    handle is not a NULL one, partial data gets returned and 'num_cpus' gets
+ *    updated to reflect the intended number.
+ *  - Otherwise, 'num_cpus' shall indicate the number of entries written, which
+ *    may be less than the input value.
+ */
+struct xen_sysctl_cputopoinfo {
+    uint32_t num_cpus;
+    XEN_GUEST_HANDLE_64(xen_sysctl_cputopo_t) cputopo;
+};
+typedef struct xen_sysctl_cputopoinfo xen_sysctl_cputopoinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cputopoinfo_t);
+
 /* XEN_SYSCTL_numainfo */
-#define INVALID_NUMAINFO_ID (~0U)
+#define XEN_INVALID_MEM_SZ     (~0U)
+#define XEN_INVALID_NODE_DIST  (~0U)
+
+struct xen_sysctl_meminfo {
+    uint64_t memsize;
+    uint64_t memfree;
+};
+typedef struct xen_sysctl_meminfo xen_sysctl_meminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_meminfo_t);
+
+/*
+ * IN:
+ *  - Both 'meminfo' and 'distance' handles being null is a request
+ *    for maximum value of 'num_nodes'.
+ *  - Otherwise it's the number of entries in 'meminfo' and square root
+ *    of number of entries in 'distance' (when corresponding handle is
+ *    non-null)
+ *
+ * OUT:
+ *  - If 'num_nodes' is less than the number Xen wants to write but either
+ *    handle is not a NULL one, partial data gets returned and 'num_nodes'
+ *    gets updated to reflect the intended number.
+ *  - Otherwise, 'num_nodes' shall indicate the number of entries written, which
+ *    may be less than the input value.
+ */
+
 struct xen_sysctl_numainfo {
-    /*
-     * IN: maximum addressable entry in the caller-provided arrays.
-     * OUT: largest node identifier in the system.
-     * If OUT is greater than IN then the arrays are truncated!
-     */
-    uint32_t max_node_index;
+    uint32_t num_nodes;
 
-    /* NB. Entries are 0 if node is not present. */
-    XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
-    XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
+    XEN_GUEST_HANDLE_64(xen_sysctl_meminfo_t) meminfo;
 
     /*
-     * Array, of size (max_node_index+1)^2, listing memory access distances
-     * between nodes. If an entry has no node distance information (e.g., node 
-     * not present) then the value ~0u is written.
-     * 
-     * Note that the array rows must be indexed by multiplying by the minimum 
-     * of the caller-provided max_node_index and the returned value of
-     * max_node_index. That is, if the largest node index in the system is
-     * smaller than the caller can handle, a smaller 2-d array is constructed
-     * within the space provided by the caller. When this occurs, trailing
-     * space provided by the caller is not modified. If the largest node index
-     * in the system is larger than the caller can handle, then a 2-d array of
-     * the maximum size handleable by the caller is constructed.
+     * Distance between nodes 'i' and 'j' is stored in index 'i*N + j',
+     * where N is the number of nodes that will be returned in 'num_nodes'
+     * (i.e. not 'num_nodes' provided by the caller)
      */
-    XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
+    XEN_GUEST_HANDLE_64(uint32) distance;
 };
 typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
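
A sketch of the two-step XEN_SYSCTL_cputopoinfo pattern described above: leave the handle NULL to learn the CPU count, then repeat with a buffer of that size. Issuing the sysctl itself is left to whatever privileged wrapper the caller has (libxc, kernel code), and set_xen_guest_handle() is likewise an assumption:

    #include <stdint.h>

    #include <xen/interface/sysctl.h>   /* include path is an assumption */

    /* Sketch: fill the cputopoinfo request.  With buf == NULL the
     * hypervisor only reports the CPU count in num_cpus; with a buffer it
     * writes up to 'nr' xen_sysctl_cputopo_t entries. */
    static void prepare_cputopo_query(xen_sysctl_cputopoinfo_t *info,
                                      xen_sysctl_cputopo_t *buf, uint32_t nr)
    {
        set_xen_guest_handle(info->cputopo, buf);
        info->num_cpus = (buf != NULL) ? nr : 0;
    }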
@@ -533,7 +555,7 @@
     uint32_t domid;       /* IN: M              */
     uint32_t cpu;         /* IN: AR             */
     uint32_t n_dom;       /*            OUT: I  */
-    struct xenctl_cpumap cpumap; /*     OUT: IF */
+    struct xenctl_bitmap cpumap; /*     OUT: IF */
 };
 typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t);
@@ -597,6 +619,152 @@
 typedef struct xen_sysctl_scheduler_op xen_sysctl_scheduler_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_scheduler_op_t);
 
+/* XEN_SYSCTL_coverage_op */
+/*
+ * Get total size of information, to help allocate
+ * the buffer. The pointer points to a 32 bit value.
+ */
+#define XEN_SYSCTL_COVERAGE_get_total_size 0
+
+/*
+ * Read coverage information in a single run
+ * You must use a tool to split them.
+ */
+#define XEN_SYSCTL_COVERAGE_read           1
+
+/*
+ * Reset all the coverage counters to 0
+ * No parameters.
+ */
+#define XEN_SYSCTL_COVERAGE_reset          2
+
+/*
+ * Like XEN_SYSCTL_COVERAGE_read but reset also
+ * counters to 0 in a single call.
+ */
+#define XEN_SYSCTL_COVERAGE_read_and_reset 3
+
+struct xen_sysctl_coverage_op {
+    uint32_t cmd;        /* XEN_SYSCTL_COVERAGE_* */
+    union {
+        uint32_t total_size; /* OUT */
+        XEN_GUEST_HANDLE_64(uint8)  raw_info;   /* OUT */
+    } u;
+};
+typedef struct xen_sysctl_coverage_op xen_sysctl_coverage_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t);
+
+#define XEN_SYSCTL_PSR_CMT_get_total_rmid            0
+#define XEN_SYSCTL_PSR_CMT_get_l3_upscaling_factor   1
+/* The L3 cache size is returned in KB unit */
+#define XEN_SYSCTL_PSR_CMT_get_l3_cache_size         2
+#define XEN_SYSCTL_PSR_CMT_enabled                   3
+#define XEN_SYSCTL_PSR_CMT_get_l3_event_mask         4
+struct xen_sysctl_psr_cmt_op {
+    uint32_t cmd;       /* IN: XEN_SYSCTL_PSR_CMT_* */
+    uint32_t flags;     /* padding variable, may be extended for future use */
+    union {
+        uint64_t data;  /* OUT */
+        struct {
+            uint32_t cpu;   /* IN */
+            uint32_t rsvd;
+        } l3_cache;
+    } u;
+};
+typedef struct xen_sysctl_psr_cmt_op xen_sysctl_psr_cmt_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cmt_op_t);
+
+/* XEN_SYSCTL_pcitopoinfo */
+#define XEN_INVALID_DEV (XEN_INVALID_NODE_ID - 1)
+struct xen_sysctl_pcitopoinfo {
+    /*
+     * IN: Number of elements in 'pcitopo' and 'nodes' arrays.
+     * OUT: Number of processed elements of those arrays.
+     */
+    uint32_t num_devs;
+
+    /* IN: list of devices for which node IDs are requested. */
+    XEN_GUEST_HANDLE_64(physdev_pci_device_t) devs;
+
+    /*
+     * OUT: node identifier for each device.
+     * If information for a particular device is not available then
+     * corresponding entry will be set to XEN_INVALID_NODE_ID. If
+     * device is not known to the hypervisor then XEN_INVALID_DEV
+     * will be provided.
+     */
+    XEN_GUEST_HANDLE_64(uint32) nodes;
+};
+typedef struct xen_sysctl_pcitopoinfo xen_sysctl_pcitopoinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_pcitopoinfo_t);
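
To make the OUT semantics concrete, a small interpretation helper could look
like the sketch below (not from this commit; it assumes XEN_INVALID_NODE_ID is
defined alongside the other topology constants and 'nodes' is the caller's
array after the sysctl completed):

    /* Sketch only: classify one entry written by XEN_SYSCTL_pcitopoinfo. */
    static int
    pci_dev_node(const uint32_t *nodes, uint32_t idx)
    {
            if (nodes[idx] == XEN_INVALID_DEV)
                    return (-1);    /* device unknown to the hypervisor */
            if (nodes[idx] == XEN_INVALID_NODE_ID)
                    return (-2);    /* no node information available    */
            return ((int)nodes[idx]);
    }
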
+
+#define XEN_SYSCTL_PSR_CAT_get_l3_info               0
+struct xen_sysctl_psr_cat_op {
+    uint32_t cmd;       /* IN: XEN_SYSCTL_PSR_CAT_* */
+    uint32_t target;    /* IN */
+    union {
+        struct {
+            uint32_t cbm_len;   /* OUT: CBM length */
+            uint32_t cos_max;   /* OUT: Maximum COS */
+        } l3_info;
+    } u;
+};
+typedef struct xen_sysctl_psr_cat_op xen_sysctl_psr_cat_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cat_op_t);
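
For example, querying the L3 CAT capabilities of one socket would fill the
structure as in this sketch (not from the commit; the helper name and the
sysctl submission are assumptions):

    /* Sketch only: set up an L3 CAT capability query. */
    static void
    psr_cat_l3_query_init(struct xen_sysctl_psr_cat_op *op, uint32_t socket)
    {
            *op = (struct xen_sysctl_psr_cat_op) {
                    .cmd    = XEN_SYSCTL_PSR_CAT_get_l3_info,
                    .target = socket,
            };
            /* After submission, op->u.l3_info.cbm_len and
             * op->u.l3_info.cos_max describe the L3 allocation limits. */
    }
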
+
+#define XEN_SYSCTL_TMEM_OP_ALL_CLIENTS 0xFFFFU
+
+#define XEN_SYSCTL_TMEM_OP_THAW                   0
+#define XEN_SYSCTL_TMEM_OP_FREEZE                 1
+#define XEN_SYSCTL_TMEM_OP_FLUSH                  2
+#define XEN_SYSCTL_TMEM_OP_DESTROY                3
+#define XEN_SYSCTL_TMEM_OP_LIST                   4
+#define XEN_SYSCTL_TMEM_OP_SET_WEIGHT             5
+#define XEN_SYSCTL_TMEM_OP_SET_CAP                6
+#define XEN_SYSCTL_TMEM_OP_SET_COMPRESS           7
+#define XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB      8
+#define XEN_SYSCTL_TMEM_OP_SAVE_BEGIN             10
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_VERSION       11
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_MAXPOOLS      12
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_WEIGHT 13
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_CAP    14
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_CLIENT_FLAGS  15
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_FLAGS    16
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_NPAGES   17
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_POOL_UUID     18
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE     19
+#define XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV      20
+#define XEN_SYSCTL_TMEM_OP_SAVE_END               21
+#define XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN          30
+#define XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE       32
+#define XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE     33
+
+/*
+ * XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_[PAGE|INV] override the 'buf' in
+ * xen_sysctl_tmem_op with this structure - sometimes with an extra
+ * page tacked on.
+ */
+struct tmem_handle {
+    uint32_t pool_id;
+    uint32_t index;
+    xen_tmem_oid_t oid;
+};
+
+struct xen_sysctl_tmem_op {
+    uint32_t cmd;       /* IN: XEN_SYSCTL_TMEM_OP_* . */
+    int32_t pool_id;    /* IN: 0 by default unless _SAVE_*, RESTORE_* .*/
+    uint32_t cli_id;    /* IN: client id; 0 for XEN_SYSCTL_TMEM_OP_QUERY_FREEABLE_MB;
+                           for all others it can be a domain id or
+                           XEN_SYSCTL_TMEM_OP_ALL_CLIENTS for all clients. */
+    uint32_t arg1;      /* IN: If not applicable to command use 0. */
+    uint32_t arg2;      /* IN: If not applicable to command use 0. */
+    uint32_t pad;       /* Padding so structure is the same under 32 and 64. */
+    xen_tmem_oid_t oid; /* IN: If not applicable to command use 0s. */
+    XEN_GUEST_HANDLE_64(char) buf; /* IN/OUT: Buffer to save and restore ops. */
+};
+typedef struct xen_sysctl_tmem_op xen_sysctl_tmem_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tmem_op_t);
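
As a hedged example of the new sysctl encoding (the submission itself is not
shown and the helper name is an assumption), freezing every tmem client would
be set up as follows:

    /* Sketch only: freeze all tmem clients. */
    static void
    tmem_freeze_all_init(struct xen_sysctl_tmem_op *op)
    {
            *op = (struct xen_sysctl_tmem_op) {
                    .cmd    = XEN_SYSCTL_TMEM_OP_FREEZE,
                    .cli_id = XEN_SYSCTL_TMEM_OP_ALL_CLIENTS,
            };
            /* pool_id, arg1, arg2, pad and oid stay zero: unused here. */
    }
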
+
 struct xen_sysctl {
     uint32_t cmd;
 #define XEN_SYSCTL_readconsole                    1
@@ -613,16 +781,22 @@
 #define XEN_SYSCTL_pm_op                         12
 #define XEN_SYSCTL_page_offline_op               14
 #define XEN_SYSCTL_lockprof_op                   15
-#define XEN_SYSCTL_topologyinfo                  16 
+#define XEN_SYSCTL_cputopoinfo                   16
 #define XEN_SYSCTL_numainfo                      17
 #define XEN_SYSCTL_cpupool_op                    18
 #define XEN_SYSCTL_scheduler_op                  19
+#define XEN_SYSCTL_coverage_op                   20
+#define XEN_SYSCTL_psr_cmt_op                    21
+#define XEN_SYSCTL_pcitopoinfo                   22
+#define XEN_SYSCTL_psr_cat_op                    23
+#define XEN_SYSCTL_tmem_op                       24
     uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
     union {
         struct xen_sysctl_readconsole       readconsole;
         struct xen_sysctl_tbuf_op           tbuf_op;
         struct xen_sysctl_physinfo          physinfo;
-        struct xen_sysctl_topologyinfo      topologyinfo;
+        struct xen_sysctl_cputopoinfo       cputopoinfo;
+        struct xen_sysctl_pcitopoinfo       pcitopoinfo;
         struct xen_sysctl_numainfo          numainfo;
         struct xen_sysctl_sched_id          sched_id;
         struct xen_sysctl_perfc_op          perfc_op;
@@ -637,6 +811,10 @@
         struct xen_sysctl_lockprof_op       lockprof_op;
         struct xen_sysctl_cpupool_op        cpupool_op;
         struct xen_sysctl_scheduler_op      scheduler_op;
+        struct xen_sysctl_coverage_op       coverage_op;
+        struct xen_sysctl_psr_cmt_op        psr_cmt_op;
+        struct xen_sysctl_psr_cat_op        psr_cat_op;
+        struct xen_sysctl_tmem_op           tmem_op;
         uint8_t                             pad[128];
     } u;
 };
@@ -648,7 +826,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/tmem.h
===================================================================
--- trunk/sys/xen/interface/tmem.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/tmem.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -34,48 +34,28 @@
 #define TMEM_SPEC_VERSION          1
 
 /* Commands to HYPERVISOR_tmem_op() */
-#define TMEM_CONTROL               0
+#ifdef __XEN__
+#define TMEM_CONTROL               0 /* Now called XEN_SYSCTL_tmem_op */
+#else
+#undef TMEM_CONTROL
+#endif
 #define TMEM_NEW_POOL              1
 #define TMEM_DESTROY_POOL          2
-#define TMEM_NEW_PAGE              3
 #define TMEM_PUT_PAGE              4
 #define TMEM_GET_PAGE              5
 #define TMEM_FLUSH_PAGE            6
 #define TMEM_FLUSH_OBJECT          7
+#if __XEN_INTERFACE_VERSION__ < 0x00040400
+#define TMEM_NEW_PAGE              3
 #define TMEM_READ                  8
 #define TMEM_WRITE                 9
 #define TMEM_XCHG                 10
+#endif
 
 /* Privileged commands to HYPERVISOR_tmem_op() */
-#define TMEM_AUTH                 101 
+#define TMEM_AUTH                 101
 #define TMEM_RESTORE_NEW          102
 
-/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
-#define TMEMC_THAW                   0
-#define TMEMC_FREEZE                 1
-#define TMEMC_FLUSH                  2
-#define TMEMC_DESTROY                3
-#define TMEMC_LIST                   4
-#define TMEMC_SET_WEIGHT             5
-#define TMEMC_SET_CAP                6
-#define TMEMC_SET_COMPRESS           7
-#define TMEMC_QUERY_FREEABLE_MB      8
-#define TMEMC_SAVE_BEGIN             10
-#define TMEMC_SAVE_GET_VERSION       11
-#define TMEMC_SAVE_GET_MAXPOOLS      12
-#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13
-#define TMEMC_SAVE_GET_CLIENT_CAP    14
-#define TMEMC_SAVE_GET_CLIENT_FLAGS  15
-#define TMEMC_SAVE_GET_POOL_FLAGS    16
-#define TMEMC_SAVE_GET_POOL_NPAGES   17
-#define TMEMC_SAVE_GET_POOL_UUID     18
-#define TMEMC_SAVE_GET_NEXT_PAGE     19
-#define TMEMC_SAVE_GET_NEXT_INV      20
-#define TMEMC_SAVE_END               21
-#define TMEMC_RESTORE_BEGIN          30
-#define TMEMC_RESTORE_PUT_PAGE       32
-#define TMEMC_RESTORE_FLUSH_PAGE     33
-
 /* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
 #define TMEM_POOL_PERSIST          1
 #define TMEM_POOL_SHARED           2
@@ -94,9 +74,16 @@
 #define EFROZEN                 1000
 #define EEMPTY                  1001
 
+struct xen_tmem_oid {
+    uint64_t oid[3];
+};
+typedef struct xen_tmem_oid xen_tmem_oid_t;
+DEFINE_XEN_GUEST_HANDLE(xen_tmem_oid_t);
 
 #ifndef __ASSEMBLY__
+#if __XEN_INTERFACE_VERSION__ < 0x00040400
 typedef xen_pfn_t tmem_cli_mfn_t;
+#endif
 typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
 struct tmem_op {
     uint32_t cmd;
@@ -107,33 +94,22 @@
             uint32_t flags;
             uint32_t arg1;
         } creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */
-        struct { 
-            uint32_t subop;
-            uint32_t cli_id;
-            uint32_t arg1;
-            uint32_t arg2;
-            uint64_t oid[3];
-            tmem_cli_va_t buf;
-        } ctrl; /* for cmd == TMEM_CONTROL */
         struct {
-            
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
             uint64_t oid[3];
+#else
+            xen_tmem_oid_t oid;
+#endif
             uint32_t index;
             uint32_t tmem_offset;
             uint32_t pfn_offset;
             uint32_t len;
-            tmem_cli_mfn_t cmfn; /* client machine page frame */
+            xen_pfn_t cmfn; /* client machine page frame */
         } gen; /* for all other cmd ("generic") */
     } u;
 };
 typedef struct tmem_op tmem_op_t;
 DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
-
-struct tmem_handle {
-    uint32_t pool_id;
-    uint32_t index;
-    uint64_t oid[3];
-};
 #endif
 
 #endif /* __XEN_PUBLIC_TMEM_H__ */
@@ -141,7 +117,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/trace.h
===================================================================
--- trunk/sys/xen/interface/trace.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/trace.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -51,13 +51,41 @@
 #define TRC_SUBCLS_SHIFT 12
 
 /* trace subclasses for SVM */
-#define TRC_HVM_ENTRYEXIT 0x00081000   /* VMENTRY and #VMEXIT       */
-#define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
+#define TRC_HVM_ENTRYEXIT   0x00081000   /* VMENTRY and #VMEXIT       */
+#define TRC_HVM_HANDLER     0x00082000   /* various HVM handlers      */
+#define TRC_HVM_EMUL        0x00084000   /* emulated devices */
 
 #define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
 #define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific    */
 #define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
 
+/*
+ * The highest 3 bits of the last 12 bits of TRC_SCHED_CLASS above are
+ * reserved for encoding what scheduler produced the information. The
+ * actual event is encoded in the last 9 bits.
+ *
+ * This means we have 8 scheduling IDs available (which means at most 8
+ * schedulers generating events) and, in each scheduler, up to 512
+ * different events.
+ */
+#define TRC_SCHED_ID_BITS 3
+#define TRC_SCHED_ID_SHIFT (TRC_SUBCLS_SHIFT - TRC_SCHED_ID_BITS)
+#define TRC_SCHED_ID_MASK (((1UL<<TRC_SCHED_ID_BITS) - 1) << TRC_SCHED_ID_SHIFT)
+#define TRC_SCHED_EVT_MASK (~(TRC_SCHED_ID_MASK))
+
+/* Per-scheduler IDs, to identify scheduler specific events */
+#define TRC_SCHED_CSCHED   0
+#define TRC_SCHED_CSCHED2  1
+/* #define XEN_SCHEDULER_SEDF 2 (Removed) */
+#define TRC_SCHED_ARINC653 3
+#define TRC_SCHED_RTDS     4
+
+/* Per-scheduler tracing */
+#define TRC_SCHED_CLASS_EVT(_c, _e) \
+  ( ( TRC_SCHED_CLASS | \
+      ((TRC_SCHED_##_c << TRC_SCHED_ID_SHIFT) & TRC_SCHED_ID_MASK) ) + \
+    (_e & TRC_SCHED_EVT_MASK) )
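
A minimal sketch of the encoding in use (the event number is a made-up example,
not taken from this commit):

    /* Sketch only: event 1 of the credit scheduler (TRC_SCHED_CSCHED == 0). */
    #define TRC_CSCHED_EXAMPLE_EVT  TRC_SCHED_CLASS_EVT(CSCHED, 1)
    /* Expands to TRC_SCHED_CLASS | (0 << 9) | 1 == 0x00022001:
     * subclass "scheduler-specific", scheduler id 0, event 1. */
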
+
 /* Trace classes for Hardware */
 #define TRC_HW_PM           0x00801000   /* Power management traces */
 #define TRC_HW_IRQ          0x00802000   /* Traces relating to the handling of IRQs */
@@ -95,21 +123,52 @@
 #define TRC_MEM_POD_ZERO_RECLAIM    (TRC_MEM + 17)
 #define TRC_MEM_POD_SUPERPAGE_SPLINTER (TRC_MEM + 18)
 
+#define TRC_PV_ENTRY   0x00201000 /* Hypervisor entry points for PV guests. */
+#define TRC_PV_SUBCALL 0x00202000 /* Sub-call in a multicall hypercall */
 
-#define TRC_PV_HYPERCALL             (TRC_PV +  1)
-#define TRC_PV_TRAP                  (TRC_PV +  3)
-#define TRC_PV_PAGE_FAULT            (TRC_PV +  4)
-#define TRC_PV_FORCED_INVALID_OP     (TRC_PV +  5)
-#define TRC_PV_EMULATE_PRIVOP        (TRC_PV +  6)
-#define TRC_PV_EMULATE_4GB           (TRC_PV +  7)
-#define TRC_PV_MATH_STATE_RESTORE    (TRC_PV +  8)
-#define TRC_PV_PAGING_FIXUP          (TRC_PV +  9)
-#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10)
-#define TRC_PV_PTWR_EMULATION        (TRC_PV + 11)
-#define TRC_PV_PTWR_EMULATION_PAE    (TRC_PV + 12)
-  /* Indicates that addresses in trace record are 64 bits */
-#define TRC_64_FLAG               (0x100) 
+#define TRC_PV_HYPERCALL             (TRC_PV_ENTRY +  1)
+#define TRC_PV_TRAP                  (TRC_PV_ENTRY +  3)
+#define TRC_PV_PAGE_FAULT            (TRC_PV_ENTRY +  4)
+#define TRC_PV_FORCED_INVALID_OP     (TRC_PV_ENTRY +  5)
+#define TRC_PV_EMULATE_PRIVOP        (TRC_PV_ENTRY +  6)
+#define TRC_PV_EMULATE_4GB           (TRC_PV_ENTRY +  7)
+#define TRC_PV_MATH_STATE_RESTORE    (TRC_PV_ENTRY +  8)
+#define TRC_PV_PAGING_FIXUP          (TRC_PV_ENTRY +  9)
+#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV_ENTRY + 10)
+#define TRC_PV_PTWR_EMULATION        (TRC_PV_ENTRY + 11)
+#define TRC_PV_PTWR_EMULATION_PAE    (TRC_PV_ENTRY + 12)
+#define TRC_PV_HYPERCALL_V2          (TRC_PV_ENTRY + 13)
+#define TRC_PV_HYPERCALL_SUBCALL     (TRC_PV_SUBCALL + 14)
 
+/*
+ * TRC_PV_HYPERCALL_V2 format
+ *
+ * Only some of the hypercall arguments are recorded. Bit fields A0 to
+ * A5 in the first extra word are set if the argument is present and
+ * the arguments themselves are packed sequentially in the following
+ * words.
+ *
+ * The TRC_64_FLAG bit is not set for these events (even if there are
+ * 64-bit arguments in the record).
+ *
+ * Word
+ * 0    bit 31 30|29 28|27 26|25 24|23 22|21 20|19 ... 0
+ *          A5   |A4   |A3   |A2   |A1   |A0   |Hypercall op
+ * 1    First 32 bit (or low word of first 64 bit) arg in record
+ * 2    Second 32 bit (or high word of first 64 bit) arg in record
+ * ...
+ *
+ * A0-A5 bitfield values:
+ *
+ *   00b  Argument not present
+ *   01b  32-bit argument present
+ *   10b  64-bit argument present
+ *   11b  Reserved
+ */
+#define TRC_PV_HYPERCALL_V2_ARG_32(i) (0x1 << (20 + 2*(i)))
+#define TRC_PV_HYPERCALL_V2_ARG_64(i) (0x2 << (20 + 2*(i)))
+#define TRC_PV_HYPERCALL_V2_ARG_MASK  (0xfff00000)
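
A decoder for the A0-A5 presence bits could look like the sketch below (not
from this commit; 'word0' is assumed to be the first extra word of the record):

    /* Sketch only: returns 0 (absent), 1 (32-bit) or 2 (64-bit) for arg i. */
    static inline unsigned int
    trc_hcall_v2_arg_kind(uint32_t word0, unsigned int i)
    {
            return ((word0 >> (20 + 2 * i)) & 0x3);
    }
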
+
 #define TRC_SHADOW_NOT_SHADOW                 (TRC_SHADOW +  1)
 #define TRC_SHADOW_FAST_PROPAGATE             (TRC_SHADOW +  2)
 #define TRC_SHADOW_FAST_MMIO                  (TRC_SHADOW +  3)
@@ -173,6 +232,25 @@
 #define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
 #define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)
 
+/* Trace events for emulated devices */
+#define TRC_HVM_EMUL_HPET_START_TIMER  (TRC_HVM_EMUL + 0x1)
+#define TRC_HVM_EMUL_PIT_START_TIMER   (TRC_HVM_EMUL + 0x2)
+#define TRC_HVM_EMUL_RTC_START_TIMER   (TRC_HVM_EMUL + 0x3)
+#define TRC_HVM_EMUL_LAPIC_START_TIMER (TRC_HVM_EMUL + 0x4)
+#define TRC_HVM_EMUL_HPET_STOP_TIMER   (TRC_HVM_EMUL + 0x5)
+#define TRC_HVM_EMUL_PIT_STOP_TIMER    (TRC_HVM_EMUL + 0x6)
+#define TRC_HVM_EMUL_RTC_STOP_TIMER    (TRC_HVM_EMUL + 0x7)
+#define TRC_HVM_EMUL_LAPIC_STOP_TIMER  (TRC_HVM_EMUL + 0x8)
+#define TRC_HVM_EMUL_PIT_TIMER_CB      (TRC_HVM_EMUL + 0x9)
+#define TRC_HVM_EMUL_LAPIC_TIMER_CB    (TRC_HVM_EMUL + 0xA)
+#define TRC_HVM_EMUL_PIC_INT_OUTPUT    (TRC_HVM_EMUL + 0xB)
+#define TRC_HVM_EMUL_PIC_KICK          (TRC_HVM_EMUL + 0xC)
+#define TRC_HVM_EMUL_PIC_INTACK        (TRC_HVM_EMUL + 0xD)
+#define TRC_HVM_EMUL_PIC_POSEDGE       (TRC_HVM_EMUL + 0xE)
+#define TRC_HVM_EMUL_PIC_NEGEDGE       (TRC_HVM_EMUL + 0xF)
+#define TRC_HVM_EMUL_PIC_PEND_IRQ_CALL (TRC_HVM_EMUL + 0x10)
+#define TRC_HVM_EMUL_LAPIC_PIC_INTR    (TRC_HVM_EMUL + 0x11)
+
 /* trace events for per class */
 #define TRC_PM_FREQ_CHANGE      (TRC_HW_PM + 0x01)
 #define TRC_PM_IDLE_ENTRY       (TRC_HW_PM + 0x02)
@@ -188,6 +266,14 @@
 #define TRC_HW_IRQ_UNMAPPED_VECTOR    (TRC_HW_IRQ + 0x7)
 #define TRC_HW_IRQ_HANDLED            (TRC_HW_IRQ + 0x8)
 
+/*
+ * Event Flags
+ *
+ * Some events (e.g, TRC_PV_TRAP and TRC_HVM_IOMEM_READ) have multiple
+ * record formats.  These event flags distinguish between the
+ * different formats.
+ */
+#define TRC_64_FLAG 0x100 /* Addresses are 64 bits (instead of 32 bits) */
 
 /* This structure represents a single trace buffer record. */
 struct t_rec {
@@ -238,7 +324,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/vcpu.h
===================================================================
--- trunk/sys/xen/interface/vcpu.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/vcpu.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -32,7 +32,7 @@
 
 /*
  * Prototype for this hypercall is:
- *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
+ *  long vcpu_op(int cmd, unsigned int vcpuid, void *extra_args)
  * @cmd        == VCPUOP_??? (VCPU operation).
  * @vcpuid     == VCPU to operate on.
  * @extra_args == Operation-specific extra arguments (NULL if none).
@@ -233,7 +233,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/version.h
===================================================================
--- trunk/sys/xen/interface/version.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/version.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -29,6 +29,8 @@
 #ifndef __XEN_PUBLIC_VERSION_H__
 #define __XEN_PUBLIC_VERSION_H__
 
+#include "xen.h"
+
 /* NB. All ops return zero on success, except XENVER_{version,pagesize} */
 
 /* arg == NULL; returns major:minor (16:16). */
@@ -59,7 +61,7 @@
 
 #define XENVER_platform_parameters 5
 struct xen_platform_parameters {
-    unsigned long virt_start;
+    xen_ulong_t virt_start;
 };
 typedef struct xen_platform_parameters xen_platform_parameters_t;
 
@@ -87,7 +89,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Added: trunk/sys/xen/interface/vm_event.h
===================================================================
--- trunk/sys/xen/interface/vm_event.h	                        (rev 0)
+++ trunk/sys/xen/interface/vm_event.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -0,0 +1,270 @@
+/* $MidnightBSD$ */
+/******************************************************************************
+ * vm_event.h
+ *
+ * Memory event common structures.
+ *
+ * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_PUBLIC_VM_EVENT_H
+#define _XEN_PUBLIC_VM_EVENT_H
+
+#include "xen.h"
+
+#define VM_EVENT_INTERFACE_VERSION 0x00000001
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#include "io/ring.h"
+
+/*
+ * Memory event flags
+ */
+
+/*
+ * VCPU_PAUSED in a request signals that the vCPU triggering the event has been
+ *  paused
+ * VCPU_PAUSED in a response signals to unpause the vCPU
+ */
+#define VM_EVENT_FLAG_VCPU_PAUSED        (1 << 0)
+/* Flags to aid debugging vm_event */
+#define VM_EVENT_FLAG_FOREIGN            (1 << 1)
+/*
+ * The following flags can be set in response to a mem_access event.
+ *
+ * Emulate the fault-causing instruction (if set in the event response flags).
+ * This will allow the guest to continue execution without lifting the page
+ * access restrictions.
+ */
+#define VM_EVENT_FLAG_EMULATE            (1 << 2)
+/*
+ * Same as VM_EVENT_FLAG_EMULATE, but with write operations or operations
+ * potentially having side effects (like memory mapped or port I/O) disabled.
+ */
+#define VM_EVENT_FLAG_EMULATE_NOWRITE    (1 << 3)
+/*
+ * Toggle singlestepping on vm_event response.
+ * Requires the vCPU to be paused already (synchronous events only).
+ */
+#define VM_EVENT_FLAG_TOGGLE_SINGLESTEP  (1 << 4)
+/*
+ * Data is being sent back to the hypervisor in the event response, to be
+ * returned by the read function when emulating an instruction.
+ * This flag is only useful when combined with VM_EVENT_FLAG_EMULATE
+ * and takes precedence if combined with VM_EVENT_FLAG_EMULATE_NOWRITE
+ * (i.e. if both VM_EVENT_FLAG_EMULATE_NOWRITE and
+ * VM_EVENT_FLAG_SET_EMUL_READ_DATA are set, only the latter will be honored).
+ */
+#define VM_EVENT_FLAG_SET_EMUL_READ_DATA (1 << 5)
+ /*
+  * Deny completion of the operation that triggered the event.
+  * Currently only useful for MSR, CR0, CR3 and CR4 write events.
+  */
+#define VM_EVENT_FLAG_DENY               (1 << 6)
+/*
+ * This flag can be set in a request or a response
+ *
+ * On a request, indicates that the event occurred in the alternate p2m specified by
+ * the altp2m_idx request field.
+ *
+ * On a response, indicates that the VCPU should resume in the alternate p2m specified
+ * by the altp2m_idx response field if possible.
+ */
+#define VM_EVENT_FLAG_ALTERNATE_P2M      (1 << 7)
+
+/*
+ * Reasons for the vm event request
+ */
+
+/* Default case */
+#define VM_EVENT_REASON_UNKNOWN                 0
+/* Memory access violation */
+#define VM_EVENT_REASON_MEM_ACCESS              1
+/* Memory sharing event */
+#define VM_EVENT_REASON_MEM_SHARING             2
+/* Memory paging event */
+#define VM_EVENT_REASON_MEM_PAGING              3
+/* A control register was updated */
+#define VM_EVENT_REASON_WRITE_CTRLREG           4
+/* An MSR was updated. */
+#define VM_EVENT_REASON_MOV_TO_MSR              5
+/* Debug operation executed (e.g. int3) */
+#define VM_EVENT_REASON_SOFTWARE_BREAKPOINT     6
+/* Single-step (e.g. MTF) */
+#define VM_EVENT_REASON_SINGLESTEP              7
+/* An event has been requested via HVMOP_guest_request_vm_event. */
+#define VM_EVENT_REASON_GUEST_REQUEST           8
+
+/* Supported values for the vm_event_write_ctrlreg index. */
+#define VM_EVENT_X86_CR0    0
+#define VM_EVENT_X86_CR3    1
+#define VM_EVENT_X86_CR4    2
+#define VM_EVENT_X86_XCR0   3
+
+/*
+ * Using a custom struct (not hvm_hw_cpu) so as to not fill
+ * the vm_event ring buffer too quickly.
+ */
+struct vm_event_regs_x86 {
+    uint64_t rax;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbx;
+    uint64_t rsp;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+    uint64_t rflags;
+    uint64_t dr7;
+    uint64_t rip;
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+    uint64_t msr_efer;
+    uint64_t msr_star;
+    uint64_t msr_lstar;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint32_t cs_arbytes;
+    uint32_t _pad;
+};
+
+/*
+ * mem_access flag definitions
+ *
+ * These flags are set only as part of a mem_event request.
+ *
+ * R/W/X: Defines the type of violation that has triggered the event
+ *        Multiple types can be set in a single violation!
+ * GLA_VALID: If the gla field holds a guest VA associated with the event
+ * FAULT_WITH_GLA: If the violation was triggered by accessing gla
+ * FAULT_IN_GPT: If the violation was triggered during translating gla
+ */
+#define MEM_ACCESS_R                    (1 << 0)
+#define MEM_ACCESS_W                    (1 << 1)
+#define MEM_ACCESS_X                    (1 << 2)
+#define MEM_ACCESS_RWX                  (MEM_ACCESS_R | MEM_ACCESS_W | MEM_ACCESS_X)
+#define MEM_ACCESS_RW                   (MEM_ACCESS_R | MEM_ACCESS_W)
+#define MEM_ACCESS_RX                   (MEM_ACCESS_R | MEM_ACCESS_X)
+#define MEM_ACCESS_WX                   (MEM_ACCESS_W | MEM_ACCESS_X)
+#define MEM_ACCESS_GLA_VALID            (1 << 3)
+#define MEM_ACCESS_FAULT_WITH_GLA       (1 << 4)
+#define MEM_ACCESS_FAULT_IN_GPT         (1 << 5)
+
+struct vm_event_mem_access {
+    uint64_t gfn;
+    uint64_t offset;
+    uint64_t gla;   /* if flags has MEM_ACCESS_GLA_VALID set */
+    uint32_t flags; /* MEM_ACCESS_* */
+    uint32_t _pad;
+};
+
+struct vm_event_write_ctrlreg {
+    uint32_t index;
+    uint32_t _pad;
+    uint64_t new_value;
+    uint64_t old_value;
+};
+
+struct vm_event_debug {
+    uint64_t gfn;
+};
+
+struct vm_event_mov_to_msr {
+    uint64_t msr;
+    uint64_t value;
+};
+
+#define MEM_PAGING_DROP_PAGE       (1 << 0)
+#define MEM_PAGING_EVICT_FAIL      (1 << 1)
+
+struct vm_event_paging {
+    uint64_t gfn;
+    uint32_t p2mt;
+    uint32_t flags;
+};
+
+struct vm_event_sharing {
+    uint64_t gfn;
+    uint32_t p2mt;
+    uint32_t _pad;
+};
+
+struct vm_event_emul_read_data {
+    uint32_t size;
+    /* The struct is used in a union with vm_event_regs_x86. */
+    uint8_t  data[sizeof(struct vm_event_regs_x86) - sizeof(uint32_t)];
+};
+
+typedef struct vm_event_st {
+    uint32_t version;   /* VM_EVENT_INTERFACE_VERSION */
+    uint32_t flags;     /* VM_EVENT_FLAG_* */
+    uint32_t reason;    /* VM_EVENT_REASON_* */
+    uint32_t vcpu_id;
+    uint16_t altp2m_idx; /* may be used during request and response */
+    uint16_t _pad[3];
+
+    union {
+        struct vm_event_paging                mem_paging;
+        struct vm_event_sharing               mem_sharing;
+        struct vm_event_mem_access            mem_access;
+        struct vm_event_write_ctrlreg         write_ctrlreg;
+        struct vm_event_mov_to_msr            mov_to_msr;
+        struct vm_event_debug                 software_breakpoint;
+        struct vm_event_debug                 singlestep;
+    } u;
+
+    union {
+        union {
+            struct vm_event_regs_x86 x86;
+        } regs;
+
+        struct vm_event_emul_read_data emul_read_data;
+    } data;
+} vm_event_request_t, vm_event_response_t;
+
+DEFINE_RING_TYPES(vm_event, vm_event_request_t, vm_event_response_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+#endif /* _XEN_PUBLIC_VM_EVENT_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
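
As a hedged sketch of how a monitor might answer one of these requests (the
ring handling provided by DEFINE_RING_TYPES is elided and the helper name is
an assumption), only fields defined in this header are touched:

    /* Sketch only: build a response that unpauses the reporting vCPU. */
    static void
    vm_event_prepare_resume(const vm_event_request_t *req,
        vm_event_response_t *rsp)
    {
            *rsp = (vm_event_response_t) {
                    .version = VM_EVENT_INTERFACE_VERSION,
                    .flags   = VM_EVENT_FLAG_VCPU_PAUSED,
                    .reason  = req->reason,
                    .vcpu_id = req->vcpu_id,
            };
    }
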


Property changes on: trunk/sys/xen/interface/vm_event.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/xen/interface/xen-compat.h
===================================================================
--- trunk/sys/xen/interface/xen-compat.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/xen-compat.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -28,14 +28,17 @@
 #ifndef __XEN_PUBLIC_XEN_COMPAT_H__
 #define __XEN_PUBLIC_XEN_COMPAT_H__
 
-#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040200
+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040600
 
 #if defined(__XEN__) || defined(__XEN_TOOLS__)
 /* Xen is built with matching headers and implements the latest interface. */
 #define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
 #elif !defined(__XEN_INTERFACE_VERSION__)
-/* Guests which do not specify a version get the legacy interface. */
-#define __XEN_INTERFACE_VERSION__ 0x00000000
+/*
+ * The interface version is not set if and only if xen/xen-os.h is not
+ * included.
+ */
+#error "Please include xen/xen-os.h"
 #endif
 
 #if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__

Modified: trunk/sys/xen/interface/xen.h
===================================================================
--- trunk/sys/xen/interface/xen.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/xen.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -32,9 +32,7 @@
 
 #if defined(__i386__) || defined(__x86_64__)
 #include "arch-x86/xen.h"
-#elif defined(__ia64__)
-#include "arch-ia64.h"
-#elif defined(__arm__)
+#elif defined(__arm__) || defined (__aarch64__)
 #include "arch-arm.h"
 #else
 #error "Unsupported architecture"
@@ -46,12 +44,15 @@
 __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
 DEFINE_XEN_GUEST_HANDLE(int);
 __DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
+#if __XEN_INTERFACE_VERSION__ < 0x00040300
 DEFINE_XEN_GUEST_HANDLE(long);
 __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+#endif
 DEFINE_XEN_GUEST_HANDLE(void);
 
 DEFINE_XEN_GUEST_HANDLE(uint64_t);
 DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
+DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
 #endif
 
 /*
@@ -101,6 +102,7 @@
 #define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
+#define __HYPERVISOR_xenpmu_op            40
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -160,6 +162,7 @@
 #define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occurred          */
 #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
 #define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
+#define VIRQ_XENPMU     13 /* V.  PMC interrupt                              */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
@@ -277,15 +280,15 @@
  *  refer to Intel SDM 10.12. The PAT allows to set the caching attributes of
  *  pages instead of using MTRRs.
  *
- *  The PAT MSR is as follow (it is a 64-bit value, each entry is 8 bits):
- *             PAT4                 PAT0
- *   +---+----+----+----+-----+----+----+
- *    WC | WC | WB | UC | UC- | WC | WB |  <= Linux
- *   +---+----+----+----+-----+----+----+
- *    WC | WT | WB | UC | UC- | WT | WB |  <= BIOS (default when machine boots)
- *   +---+----+----+----+-----+----+----+
- *    WC | WP | WC | UC | UC- | WT | WB |  <= Xen
- *   +---+----+----+----+-----+----+----+
+ *  The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits):
+ *                    PAT4                 PAT0
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WC | WB | UC | UC- | WC | WB |  <= Linux
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WT | WB | UC | UC- | WT | WB |  <= BIOS (default when machine boots)
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | rsv | rsv | WP | WC | UC | UC- | WT | WB |  <= Xen
+ *  +-----+-----+----+----+----+-----+----+----+
  *
  *  The lookup of this index table translates to looking up
  *  Bit 7, Bit 4, and Bit 3 of val entry:
@@ -319,41 +322,47 @@
 
 /*
  * MMU EXTENDED OPERATIONS
- * 
- * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
+ *
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_mmuext_op(mmuext_op_t uops[],
+ * `                      unsigned int count,
+ * `                      unsigned int *pdone,
+ * `                      unsigned int foreigndom)
+ */
+/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
  * A foreigndom (FD) can be specified (or DOMID_SELF for none).
  * Where the FD has some effect, it is described below.
- * 
+ *
  * cmd: MMUEXT_(UN)PIN_*_TABLE
  * mfn: Machine frame number to be (un)pinned as a p.t. page.
  *      The frame must belong to the FD, if one is specified.
- * 
+ *
  * cmd: MMUEXT_NEW_BASEPTR
  * mfn: Machine frame number of new page-table base to install in MMU.
- * 
+ *
  * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
  * mfn: Machine frame number of new page-table base to install in MMU
  *      when in user space.
- * 
+ *
  * cmd: MMUEXT_TLB_FLUSH_LOCAL
  * No additional arguments. Flushes local TLB.
- * 
+ *
  * cmd: MMUEXT_INVLPG_LOCAL
  * linear_addr: Linear address to be flushed from the local TLB.
- * 
+ *
  * cmd: MMUEXT_TLB_FLUSH_MULTI
  * vcpumask: Pointer to bitmap of VCPUs to be flushed.
- * 
+ *
  * cmd: MMUEXT_INVLPG_MULTI
  * linear_addr: Linear address to be flushed.
  * vcpumask: Pointer to bitmap of VCPUs to be flushed.
- * 
+ *
  * cmd: MMUEXT_TLB_FLUSH_ALL
  * No additional arguments. Flushes all VCPUs' TLBs.
- * 
+ *
  * cmd: MMUEXT_INVLPG_ALL
  * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
- * 
+ *
  * cmd: MMUEXT_FLUSH_CACHE
  * No additional arguments. Writes back and flushes cache contents.
  *
@@ -360,7 +369,7 @@
  * cmd: MMUEXT_FLUSH_CACHE_GLOBAL
  * No additional arguments. Writes back and flushes cache contents
  * on all CPUs in the system.
- * 
+ *
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
@@ -375,6 +384,7 @@
  * cmd: MMUEXT_[UN]MARK_SUPER
  * mfn: Machine frame number of head of superpage to be [un]marked.
  */
+/* ` enum mmuext_cmd { */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
 #define MMUEXT_PIN_L3_TABLE      2
@@ -395,10 +405,11 @@
 #define MMUEXT_FLUSH_CACHE_GLOBAL 18
 #define MMUEXT_MARK_SUPER       19
 #define MMUEXT_UNMARK_SUPER     20
+/* ` } */
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
-    unsigned int cmd;
+    unsigned int cmd; /* => enum mmuext_cmd */
     union {
         /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
          * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
@@ -423,9 +434,24 @@
 DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #endif
 
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_update_va_mapping(unsigned long va, u64 val,
+ * `                              enum uvm_flags flags)
+ * `
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, u64 val,
+ * `                                          enum uvm_flags flags,
+ * `                                          domid_t domid)
+ * `
+ * ` @va: The virtual address whose mapping we want to change
+ * ` @val: The new page table entry, must contain a machine address
+ * ` @flags: Control TLB flushes
+ */
 /* These are passed as 'flags' to update_va_mapping. They can be ORed. */
 /* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
 /* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
+/* ` enum uvm_flags { */
 #define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
 #define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
 #define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
@@ -433,6 +459,7 @@
 #define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
 #define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
 #define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
+/* ` } */
 
 /*
  * Commands to HYPERVISOR_console_io().
@@ -462,7 +489,21 @@
 /* x86/PAE guests: support PDPTs above 4GB. */
 #define VMASST_TYPE_pae_extended_cr3     3
 
+/*
+ * x86/64 guests: strictly hide M2P from user mode.
+ * This allows the guest to control respective hypervisor behavior:
+ * - when not set, L4 tables get created with the respective slot blank,
+ *   and whenever the L4 table gets used as a kernel one the missing
+ *   mapping gets inserted,
+ * - when set, L4 tables get created with the respective slot initialized
+ *   as before, and whenever the L4 table gets used as a user one the
+ *   mapping gets zapped.
+ */
+#define VMASST_TYPE_m2p_strict           32
+
+#if __XEN_INTERFACE_VERSION__ < 0x00040600
 #define MAX_VMASST_TYPE                  3
+#endif
 
 #ifndef __ASSEMBLY__
 
@@ -515,21 +556,28 @@
 DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
 
 /*
- * Send an array of these to HYPERVISOR_multicall().
- * NB. The fields are natural register size for this architecture.
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_multicall(multicall_entry_t call_list[],
+ * `                      uint32_t nr_calls);
+ *
+ * NB. The fields are logically the natural register size for this
+ * architecture. If xen_ulong_t is larger than that, any unused
+ * bits in the upper portion must be zero.
  */
 struct multicall_entry {
-    unsigned long op, result;
-    unsigned long args[6];
+    xen_ulong_t op, result;
+    xen_ulong_t args[6];
 };
 typedef struct multicall_entry multicall_entry_t;
 DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
 
+#if __XEN_INTERFACE_VERSION__ < 0x00040400
 /*
- * Event channel endpoints per domain:
+ * Event channel endpoints per domain (when using the 2-level ABI):
  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
  */
-#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS EVTCHN_2L_NR_CHANNELS
+#endif
 
 struct vcpu_time_info {
     /*
@@ -585,8 +633,12 @@
      * to block: this avoids wakeup-waiting races.
      */
     uint8_t evtchn_upcall_pending;
+#ifdef XEN_HAVE_PV_UPCALL_MASK
     uint8_t evtchn_upcall_mask;
-    unsigned long evtchn_pending_sel;
+#else /* XEN_HAVE_PV_UPCALL_MASK */
+    uint8_t pad0;
+#endif /* XEN_HAVE_PV_UPCALL_MASK */
+    xen_ulong_t evtchn_pending_sel;
     struct arch_vcpu_info arch;
     struct vcpu_time_info time;
 }; /* 64 bytes (x86) */
@@ -595,6 +647,7 @@
 #endif
 
 /*
+ * `incontents 200 startofday_shared Start-of-day shared data structure
  * Xen/kernel shared data -- pointer provided in start_info.
  *
  * This structure is defined to be both smaller than a page, and the
@@ -636,8 +689,8 @@
      * per-vcpu selector word to be set. Each bit in the selector covers a
      * 'C long' in the PENDING bitfield array.
      */
-    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
-    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
+    xen_ulong_t evtchn_pending[sizeof(xen_ulong_t) * 8];
+    xen_ulong_t evtchn_mask[sizeof(xen_ulong_t) * 8];
 
     /*
      * Wallclock time: updated only by control software. Guests should base
@@ -646,6 +699,12 @@
     uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
     uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
     uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+#if !defined(__i386__)
+    uint32_t wc_sec_hi;
+# define xen_wc_sec_hi wc_sec_hi
+#elif !defined(__XEN__) && !defined(__XEN_TOOLS__)
+# define xen_wc_sec_hi arch.wc_sec_hi
+#endif
 
     struct arch_shared_info arch;
 
@@ -655,30 +714,43 @@
 #endif
 
 /*
- * Start-of-day memory layout:
+ * `incontents 200 startofday Start-of-day memory layout
+ *
  *  1. The domain is started within contiguous virtual-memory region.
  *  2. The contiguous region ends on an aligned 4MB boundary.
 *  3. This is the order of bootstrap elements in the initial virtual region:
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
+ *         (may be omitted)
  *      c. list of allocated page frames [mfn_list, nr_pages]
  *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
- *      e. bootstrap page tables         [pt_base, CR3 (x86)]
- *      f. bootstrap stack               [register ESP (x86)]
+ *         in case of dom0 this page contains the console info, too
+ *      e. unless dom0: xenstore ring page
+ *      f. unless dom0: console ring page
+ *      g. bootstrap page tables         [pt_base and CR3 (x86)]
+ *      h. bootstrap stack               [register ESP (x86)]
  *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
- *  5. The initial ram disk may be omitted.
- *  6. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *  5. The list of page frames forms a contiguous 'pseudo-physical' memory
  *     layout for the domain. In particular, the bootstrap virtual-memory
  *     region is a 1:1 mapping to the first section of the pseudo-physical map.
- *  7. All bootstrap elements are mapped read-writable for the guest OS. The
+ *  6. All bootstrap elements are mapped read-writable for the guest OS. The
  *     only exception is the bootstrap page table, which is mapped read-only.
- *  8. There is guaranteed to be at least 512kB padding after the final
+ *  7. There is guaranteed to be at least 512kB padding after the final
  *     bootstrap element. If necessary, the bootstrap virtual region is
  *     extended by an extra 4MB to ensure this.
+ *
+ * Note: Prior to 25833:bb85bbccb1c9 ("x86/32-on-64 adjust Dom0 initial page
+ * table layout") a bug caused pt_base (3.g above) and cr3 to not point to
+ * the start of the guest page tables (they were offset by two pages).
+ * This only manifested itself on 32-on-64 dom0 kernels, not on 32-on-64 domU
+ * or 64-bit kernels of any colour. The page tables for a 32-on-64 dom0 were
+ * allocated in the order 'first L1', 'first L2', 'first L3', so the page
+ * table base was offset by two pages. A 32-bit initial domain running under
+ * a 64-bit hypervisor should therefore _NOT_ use the two pages preceding
+ * pt_base, and should mark them as reserved/unused.
  */
-
-#define MAX_GUEST_CMDLINE 1024
+#ifdef XEN_HAVE_PV_GUEST_ENTRY
 struct start_info {
     /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
     char magic[32];             /* "xen-<version>-<platform>".            */
@@ -705,6 +777,7 @@
                                 /* (PFN of pre-loaded module if           */
                                 /*  SIF_MOD_START_PFN set in flags).      */
     unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
+#define MAX_GUEST_CMDLINE 1024
     int8_t cmd_line[MAX_GUEST_CMDLINE];
     /* The pfn range here covers both page table and p->m table frames.   */
     unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
@@ -717,6 +790,7 @@
 #define console_mfn    console.domU.mfn
 #define console_evtchn console.domU.evtchn
 #endif
+#endif /* XEN_HAVE_PV_GUEST_ENTRY */
 
 /* These flags are passed in the 'flags' field of start_info_t. */
 #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
@@ -723,6 +797,8 @@
 #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
 #define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
 #define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */
+                                   /* P->M making the 3 level tree obsolete? */
 #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
 
 /*
@@ -750,7 +826,14 @@
     /* Unused, must be zero */
     uint32_t pad;
 };
-
+/*
+ * `incontents 200 startofday_dom0_console Dom0_console
+ *
+ * The console structure in start_info.console.dom0
+ *
+ * This structure includes a variety of information required to
+ * have a working VGA/VESA console.
+ */
 typedef struct dom0_vga_console_info {
     uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
 #define XEN_VGATYPE_TEXT_MODE_3 0x03
@@ -815,6 +898,9 @@
 /* Default definitions for macros used by domctl/sysctl. */
 #if defined(__XEN__) || defined(__XEN_TOOLS__)
 
+#ifndef int64_aligned_t
+#define int64_aligned_t int64_t
+#endif
 #ifndef uint64_aligned_t
 #define uint64_aligned_t uint64_t
 #endif
@@ -823,9 +909,9 @@
 #endif
 
 #ifndef __ASSEMBLY__
-struct xenctl_cpumap {
+struct xenctl_bitmap {
     XEN_GUEST_HANDLE_64(uint8) bitmap;
-    uint32_t nr_cpus;
+    uint32_t nr_bits;
 };
 #endif
 
@@ -836,7 +922,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil

Modified: trunk/sys/xen/interface/xenoprof.h
===================================================================
--- trunk/sys/xen/interface/xenoprof.h	2020-02-08 19:27:58 UTC (rev 12305)
+++ trunk/sys/xen/interface/xenoprof.h	2020-02-08 19:28:08 UTC (rev 12306)
@@ -145,7 +145,7 @@
 /*
  * Local variables:
  * mode: C
- * c-set-style: "BSD"
+ * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
  * indent-tabs-mode: nil


From laffer1 at midnightbsd.org  Sat Feb  8 14:28:38 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:28:38 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12307] trunk/sys/xen/xenmem: sync with
 FreeBSD 11-stable
Message-ID: <202002081928.018JScT7060978@stargazer.midnightbsd.org>

Revision: 12307
          http://svnweb.midnightbsd.org/src/?rev=12307
Author:   laffer1
Date:     2020-02-08 14:28:38 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Added Paths:
-----------
    trunk/sys/xen/xenmem/
    trunk/sys/xen/xenmem/xenmem_if.m

Added: trunk/sys/xen/xenmem/xenmem_if.m
===================================================================
--- trunk/sys/xen/xenmem/xenmem_if.m	                        (rev 0)
+++ trunk/sys/xen/xenmem/xenmem_if.m	2020-02-08 19:28:38 UTC (rev 12307)
@@ -0,0 +1,95 @@
+#-
+# Copyright (c) 2015 Roger Pau Monné <royger at FreeBSD.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: stable/11/sys/xen/xenmem/xenmem_if.m 282634 2015-05-08 14:48:40Z royger $
+# $MidnightBSD$
+
+#include <sys/bus.h>
+
+INTERFACE xenmem;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+        static struct resource *
+        xenmem_generic_alloc(device_t dev, device_t child, int *res_id,
+            size_t size)
+        {
+                device_t parent;
+
+                parent = device_get_parent(dev);
+                if (parent == NULL)
+                        return (NULL);
+                return (XENMEM_ALLOC(parent, child, res_id, size));
+        }
+
+        static int
+        xenmem_generic_free(device_t dev, device_t child, int res_id,
+            struct resource *res)
+        {
+                device_t parent;
+
+                parent = device_get_parent(dev);
+                if (parent == NULL)
+                        return (ENXIO);
+                return (XENMEM_FREE(parent, child, res_id, res));
+        }
+};
+
+/**
+ * @brief Request for unused physical memory regions.
+ *
+ * @param _dev          the parent device through which the request is made.
+ * @param _child        the device requesting the memory region.
+ * @param _res_id       a pointer to the resource identifier.
+ * @param _size         size of the required memory region.
+ *
+ * @returns             the resource which was allocated or @c NULL if no
+ *                      resource could be allocated.
+ */
+METHOD struct resource * alloc {
+	device_t                _dev;
+	device_t                _child;
+	int                    *_res_id;
+	size_t                  _size;
+} DEFAULT xenmem_generic_alloc;
+
+/**
+ * @brief Free physical memory regions.
+ *
+ * @param _dev          the parent device through which the request is made.
+ * @param _child        the device that owns the memory region.
+ * @param _res_id       the resource identifier.
+ * @param _res          the resource.
+ *
+ * @returns             0 on success, otherwise an error code.
+ */
+METHOD int free {
+	device_t                _dev;
+	device_t                _child;
+	int                     _res_id;
+	struct resource        *_res;
+} DEFAULT xenmem_generic_free;
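
A hedged sketch of how a child driver might call the kobj methods generated
from this interface (the generated xenmem_if.h header, the 'dev' argument and
the helper name are assumptions, not part of this commit):

    /* Sketch only: ask the parent bus for an unused physical memory window. */
    static int
    my_driver_get_window(device_t dev, int *res_idp, struct resource **resp)
    {
            *res_idp = 0;
            *resp = XENMEM_ALLOC(device_get_parent(dev), dev, res_idp,
                PAGE_SIZE);
            return (*resp == NULL ? ENOMEM : 0);
    }

The matching release would go through XENMEM_FREE(device_get_parent(dev), dev,
res_id, res), mirroring the default implementations in the CODE block above.
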


Property changes on: trunk/sys/xen/xenmem/xenmem_if.m
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property

From laffer1 at midnightbsd.org  Sat Feb  8 14:28:55 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:28:55 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12308] trunk/sys/xen/evtchn/evtchnvar.h:
 sync with FreeBSD 11-stable
Message-ID: <202002081928.018JStBX061035@stargazer.midnightbsd.org>

Revision: 12308
          http://svnweb.midnightbsd.org/src/?rev=12308
Author:   laffer1
Date:     2020-02-08 14:28:55 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/evtchn/evtchnvar.h

Modified: trunk/sys/xen/evtchn/evtchnvar.h
===================================================================
--- trunk/sys/xen/evtchn/evtchnvar.h	2020-02-08 19:28:38 UTC (rev 12307)
+++ trunk/sys/xen/evtchn/evtchnvar.h	2020-02-08 19:28:55 UTC (rev 12308)
@@ -29,7 +29,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/evtchn/evtchnvar.h 255040 2013-08-29 19:52:18Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/evtchn/evtchnvar.h 255040 2013-08-29 19:52:18Z gibbs $
  */
 
 #ifndef __XEN_EVTCHN_EVTCHNVAR_H__


From laffer1 at midnightbsd.org  Sat Feb  8 14:29:01 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:29:01 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12309] trunk/sys/xen: sync with FreeBSD
 11-stable
Message-ID: <202002081929.018JT1kQ061079@stargazer.midnightbsd.org>

Revision: 12309
          http://svnweb.midnightbsd.org/src/?rev=12309
Author:   laffer1
Date:     2020-02-08 14:29:01 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xen/blkif.h
    trunk/sys/xen/evtchn.h
    trunk/sys/xen/features.c
    trunk/sys/xen/gnttab.h
    trunk/sys/xen/hvm.h
    trunk/sys/xen/hypervisor.h
    trunk/sys/xen/xen-os.h
    trunk/sys/xen/xen_intr.h
    trunk/sys/xen/xenbus/xenbus.c
    trunk/sys/xen/xenbus/xenbus_if.m
    trunk/sys/xen/xenbus/xenbusb.c
    trunk/sys/xen/xenbus/xenbusb.h
    trunk/sys/xen/xenbus/xenbusb_back.c
    trunk/sys/xen/xenbus/xenbusb_front.c
    trunk/sys/xen/xenbus/xenbusb_if.m
    trunk/sys/xen/xenbus/xenbusvar.h
    trunk/sys/xen/xenstore/xenstore_internal.h
    trunk/sys/xen/xenstore/xenstorevar.h

Added Paths:
-----------
    trunk/sys/xen/error.h
    trunk/sys/xen/privcmd.h
    trunk/sys/xen/xen_msi.h
    trunk/sys/xen/xen_pci.h
    trunk/sys/xen/xen_pv.h

Modified: trunk/sys/xen/blkif.h
===================================================================
--- trunk/sys/xen/blkif.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/blkif.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -18,7 +18,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/blkif.h 285738 2015-07-21 07:22:18Z royger $
+ * $FreeBSD: stable/11/sys/xen/blkif.h 289686 2015-10-21 10:44:07Z royger $
  */
 
 #ifndef __XEN_BLKIF_H__
@@ -121,7 +121,7 @@
 	dst->handle = src->handle;
 	dst->id = src->id;
 	dst->sector_number = src->sector_number;
-	barrier();
+	__compiler_membar();
 	if (n > dst->nr_segments)
 		n = dst->nr_segments;
 	for (i = 0; i < n; i++)
@@ -136,7 +136,7 @@
 	dst->handle = src->handle;
 	dst->id = src->id;
 	dst->sector_number = src->sector_number;
-	barrier();
+	__compiler_membar();
 	if (n > dst->nr_segments)
 		n = dst->nr_segments;
 	for (i = 0; i < n; i++)

Added: trunk/sys/xen/error.h
===================================================================
--- trunk/sys/xen/error.h	                        (rev 0)
+++ trunk/sys/xen/error.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -0,0 +1,102 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2014 Roger Pau Monné <royger at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/xen/error.h 301195 2016-06-02 07:45:01Z royger $
+ */
+
+#ifndef __XEN_ERROR_H__
+#define __XEN_ERROR_H__
+
+#include <xen/interface/errno.h>
+
+/* Translation table */
+static int xen_errors[] =
+{
+	[XEN_EPERM]		= EPERM,
+	[XEN_ENOENT]		= ENOENT,
+	[XEN_ESRCH]		= ESRCH,
+	[XEN_EIO]		= EIO,
+	[XEN_ENXIO]		= ENXIO,
+	[XEN_E2BIG]		= E2BIG,
+	[XEN_ENOEXEC]		= ENOEXEC,
+	[XEN_EBADF]		= EBADF,
+	[XEN_ECHILD]		= ECHILD,
+	[XEN_EAGAIN]		= EAGAIN,
+	[XEN_ENOMEM]		= ENOMEM,
+	[XEN_EACCES]		= EACCES,
+	[XEN_EFAULT]		= EFAULT,
+	[XEN_EBUSY]		= EBUSY,
+	[XEN_EEXIST]		= EEXIST,
+	[XEN_EXDEV]		= EXDEV,
+	[XEN_ENODEV]		= ENODEV,
+	[XEN_EINVAL]		= EINVAL,
+	[XEN_ENFILE]		= ENFILE,
+	[XEN_EMFILE]		= EMFILE,
+	[XEN_ENOSPC]		= ENOSPC,
+	[XEN_EMLINK]		= EMLINK,
+	[XEN_EDOM]		= EDOM,
+	[XEN_ERANGE]		= ERANGE,
+	[XEN_EDEADLK]		= EDEADLK,
+	[XEN_ENAMETOOLONG]	= ENAMETOOLONG,
+	[XEN_ENOLCK]		= ENOLCK,
+	[XEN_ENOSYS]		= ENOSYS,
+	[XEN_ENODATA]		= ENOENT,
+	[XEN_ETIME]		= ETIMEDOUT,
+	[XEN_EBADMSG]		= EBADMSG,
+	[XEN_EOVERFLOW]		= EOVERFLOW,
+	[XEN_EILSEQ]		= EILSEQ,
+	[XEN_ENOTSOCK]		= ENOTSOCK,
+	[XEN_EOPNOTSUPP]	= EOPNOTSUPP,
+	[XEN_EADDRINUSE]	= EADDRINUSE,
+	[XEN_EADDRNOTAVAIL]	= EADDRNOTAVAIL,
+	[XEN_ENOBUFS]		= ENOBUFS,
+	[XEN_EISCONN]		= EISCONN,
+	[XEN_ENOTCONN]		= ENOTCONN,
+	[XEN_ETIMEDOUT]		= ETIMEDOUT,
+};
+
+static inline int
+xen_translate_error(int error)
+{
+	int bsd_error;
+
+	KASSERT((error < 0), ("Value is not a valid Xen error code"));
+
+	if (-error >= nitems(xen_errors)) {
+		/*
+		 * We received an error value that cannot be translated,
+		 * return EINVAL.
+		 */
+		return (EINVAL);
+	}
+
+	bsd_error = xen_errors[-error];
+	KASSERT((bsd_error != 0), ("Unknown Xen error code"));
+
+	return (bsd_error);
+}
+
+#endif /* !__XEN_ERROR_H__ */


Property changes on: trunk/sys/xen/error.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
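
The new xen/error.h above maps negative Xen hypercall error codes onto native
errno values via the xen_errors[] table, returning EINVAL for anything it does
not recognize.  A minimal caller sketch, assuming a hypercall wrapper such as
HYPERVISOR_yield() that returns zero or a negative Xen error; the helper below
is an illustration only, not part of this commit:

#include <xen/hypervisor.h>
#include <xen/error.h>

/* Hypothetical helper: yield the CPU and convert a failed hypercall
 * result into a FreeBSD errno for the caller. */
static int
xen_yield_checked(void)
{
	int ret;

	ret = HYPERVISOR_yield();	/* 0 on success, negative Xen error */
	if (ret < 0)
		return (xen_translate_error(ret));
	return (0);
}
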
Modified: trunk/sys/xen/evtchn.h
===================================================================
--- trunk/sys/xen/evtchn.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/evtchn.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -27,7 +27,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/evtchn.h 255040 2013-08-29 19:52:18Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/evtchn.h 255040 2013-08-29 19:52:18Z gibbs $
  */
 
 #ifndef __XEN_EVTCHN_H__

Modified: trunk/sys/xen/features.c
===================================================================
--- trunk/sys/xen/features.c	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/features.c	2020-02-08 19:29:01 UTC (rev 12309)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/xen/features.c 255040 2013-08-29 19:52:18Z gibbs $");
+__FBSDID("$FreeBSD: stable/11/sys/xen/features.c 255040 2013-08-29 19:52:18Z gibbs $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/xen/gnttab.h
===================================================================
--- trunk/sys/xen/gnttab.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/gnttab.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -36,6 +36,7 @@
  */
 
 #ifndef __ASM_GNTTAB_H__
+#define __ASM_GNTTAB_H__
 
 #include <xen/xen-os.h>
 #include <xen/hypervisor.h>
@@ -52,8 +53,6 @@
 	uint16_t count;
 };
 
-int gnttab_init(void);
-
 /*
  * Allocate a grant table reference and return it in *result. Returns
  * zero on success or errno on error.
@@ -117,7 +116,7 @@
 				       unsigned long pfn);
 
 int gnttab_suspend(void);
-int gnttab_resume(void);
+int gnttab_resume(device_t);
 
 #if 0
 
@@ -129,10 +128,8 @@
 {
 	if (flags & GNTMAP_contains_pte)
 		map->host_addr = addr;
-	else if (xen_feature(XENFEAT_auto_translated_physmap))
+	else
 		map->host_addr = vtophys(addr);
-	else
-		map->host_addr = addr;
 
 	map->flags = flags;
 	map->ref = ref;
@@ -145,10 +142,8 @@
 {
 	if (flags & GNTMAP_contains_pte)
 		unmap->host_addr = addr;
-	else if (xen_feature(XENFEAT_auto_translated_physmap))
+	else
 		unmap->host_addr = vtophys(addr);
-	else
-		unmap->host_addr = addr;
 
 	unmap->handle = handle;
 	unmap->dev_bus_addr = 0;
@@ -158,13 +153,8 @@
 gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr,
 		      vm_paddr_t new_addr, grant_handle_t handle)
 {
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
-		unmap->host_addr = vtophys(addr);
-		unmap->new_addr = vtophys(new_addr);
-	} else {
-		unmap->host_addr = addr;
-		unmap->new_addr = new_addr;
-	}
+	unmap->host_addr = vtophys(addr);
+	unmap->new_addr = vtophys(new_addr);
 
 	unmap->handle = handle;
 }

Modified: trunk/sys/xen/hvm.h
===================================================================
--- trunk/sys/xen/hvm.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/hvm.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -18,7 +18,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/hvm.h 255744 2013-09-20 22:59:22Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/hvm.h 255744 2013-09-20 22:59:22Z gibbs $
  */
 
 #ifndef	__XEN_HVM_H__

Modified: trunk/sys/xen/hypervisor.h
===================================================================
--- trunk/sys/xen/hypervisor.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/hypervisor.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -6,28 +6,12 @@
  * 
  * Copyright (c) 2002, K A Fraser
  *
- * $FreeBSD: stable/10/sys/xen/hypervisor.h 196322 2009-08-17 14:38:59Z jhb $
+ * $FreeBSD: stable/11/sys/xen/hypervisor.h 289686 2015-10-21 10:44:07Z royger $
  */
 
 #ifndef __XEN_HYPERVISOR_H__
 #define __XEN_HYPERVISOR_H__
 
-#ifdef XENHVM
-
-#define is_running_on_xen()	(HYPERVISOR_shared_info != NULL)
-
-#else
-
-#define is_running_on_xen() 1
-
-#endif
-
-#ifdef PAE
-#ifndef CONFIG_X86_PAE
-#define CONFIG_X86_PAE
-#endif
-#endif
-
 #include <sys/cdefs.h>
 #include <sys/systm.h>
 #include <xen/interface/xen.h>
@@ -39,32 +23,14 @@
 #include <xen/interface/memory.h>
 #include <machine/xen/hypercall.h>
 
-#if defined(__amd64__)
-#define MULTI_UVMFLAGS_INDEX 2
-#define MULTI_UVMDOMID_INDEX 3
-#else
-#define MULTI_UVMFLAGS_INDEX 3
-#define MULTI_UVMDOMID_INDEX 4
-#endif
-
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
-#else
-#define is_initial_xendomain() 0
-#endif
-
-extern start_info_t *xen_start_info;
-
 extern uint64_t get_system_time(int ticks);
 
 static inline int 
-HYPERVISOR_console_write(char *str, int count)
+HYPERVISOR_console_write(const char *str, int count)
 {
     return HYPERVISOR_console_io(CONSOLEIO_write, count, str); 
 }
 
-static inline void HYPERVISOR_crash(void) __dead2;
-
 static inline int
 HYPERVISOR_yield(void)
 {
@@ -133,23 +99,4 @@
 	return (rc);
 }
 
-static inline void
-MULTI_update_va_mapping(
-	multicall_entry_t *mcl, unsigned long va,
-        uint64_t new_val, unsigned long flags)
-{
-    mcl->op = __HYPERVISOR_update_va_mapping;
-    mcl->args[0] = va;
-#if defined(__amd64__)
-    mcl->args[1] = new_val;
-#elif defined(PAE)
-    mcl->args[1] = (uint32_t)(new_val & 0xffffffff) ;
-    mcl->args[2] = (uint32_t)(new_val >> 32);
-#else
-    mcl->args[1] = new_val;
-    mcl->args[2] = 0;
-#endif
-    mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
-}
-
 #endif /* __XEN_HYPERVISOR_H__ */
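
With HYPERVISOR_console_write() now taking a const char * (see the hunk
above), callers can pass string literals or const buffers without casts.  A
trivial sketch; the function below is invented for illustration only:

/* Hypothetical: print a fixed banner straight to the hypervisor console. */
static void
xen_hello_console(void)
{
	const char msg[] = "guest: hello from the Xen console\n";

	HYPERVISOR_console_write(msg, sizeof(msg) - 1);
}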

Added: trunk/sys/xen/privcmd.h
===================================================================
--- trunk/sys/xen/privcmd.h	                        (rev 0)
+++ trunk/sys/xen/privcmd.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -0,0 +1,59 @@
+/* $MidnightBSD$ */
+/******************************************************************************
+ * privcmd.h
+ * 
+ * Interface to /proc/xen/privcmd.
+ * 
+ * Copyright (c) 2003-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * $FreeBSD: stable/11/sys/xen/privcmd.h 273476 2014-10-22 17:07:20Z royger $
+ */
+
+#ifndef __XEN_PRIVCMD_H__
+#define __XEN_PRIVCMD_H__
+
+struct ioctl_privcmd_hypercall
+{
+	unsigned long op; /* hypercall number */
+	unsigned long arg[5]; /* arguments */
+	long retval; /* return value */
+};
+
+struct ioctl_privcmd_mmapbatch {
+	int num;     /* number of pages to populate */
+	domid_t dom; /* target domain */
+	unsigned long addr;  /* virtual address */
+	const xen_pfn_t *arr; /* array of mfns */
+	int *err; /* array of error codes */
+};
+
+#define IOCTL_PRIVCMD_HYPERCALL					\
+	_IOWR('E', 0, struct ioctl_privcmd_hypercall)
+#define IOCTL_PRIVCMD_MMAPBATCH					\
+	_IOWR('E', 1, struct ioctl_privcmd_mmapbatch)
+
+#endif /* !__XEN_PRIVCMD_H__ */


Property changes on: trunk/sys/xen/privcmd.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
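
The new privcmd.h above describes the ioctl ABI a privileged toolstack uses to
issue raw hypercalls and batch-map foreign pages.  A hypothetical userspace
sketch follows; the /dev/xen/privcmd device path and the wrapper function are
assumptions for illustration and are not defined by this header:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <xen/privcmd.h>

/* Issue a single-argument hypercall through the privcmd device.  Returns the
 * hypercall's return value, or -1 if the device or the ioctl fails. */
static long
privcmd_hypercall(unsigned long op, unsigned long arg0)
{
	struct ioctl_privcmd_hypercall hc = { .op = op, .arg = { arg0 } };
	long ret;
	int fd;

	fd = open("/dev/xen/privcmd", O_RDWR);	/* path is an assumption */
	if (fd < 0)
		return (-1);
	ret = (ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc) == 0) ? hc.retval : -1;
	close(fd);
	return (ret);
}
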
Modified: trunk/sys/xen/xen-os.h
===================================================================
--- trunk/sys/xen/xen-os.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xen-os.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -25,7 +25,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
  * DEALINGS IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/xen-os.h 315676 2017-03-21 09:38:59Z royger $
+ * $FreeBSD: stable/11/sys/xen/xen-os.h 315668 2017-03-21 08:38:12Z royger $
  */
 
 #ifndef _XEN_XEN_OS_H_
@@ -48,15 +48,14 @@
 /* Everything below this point is not included by assembler (.S) files. */
 #ifndef __ASSEMBLY__
 
-/* Force a proper event-channel callback from Xen. */
-void force_evtchn_callback(void);
-
 extern shared_info_t *HYPERVISOR_shared_info;
+extern start_info_t *HYPERVISOR_start_info;
 
-#ifdef XENHVM
+/* XXX: we need to get rid of this and use HYPERVISOR_start_info directly */
+extern char *console_page;
+
 extern int xen_disable_pv_disks;
 extern int xen_disable_pv_nics;
-#endif
 
 extern bool xen_suspend_cancelled;
 
@@ -86,6 +85,54 @@
 	return (xen_domain_type == XEN_HVM_DOMAIN);
 }
 
+static inline bool
+xen_initial_domain(void)
+{
+	return (xen_domain() && HYPERVISOR_start_info != NULL &&
+	    (HYPERVISOR_start_info->flags & SIF_INITDOMAIN) != 0);
+}
+
+/*
+ * Based on ofed/include/linux/bitops.h
+ *
+ * Those helpers are prefixed by xen_ because xen-os.h is widely included
+ * and we don't want the other drivers using them.
+ *
+ */
+#define NBPL (NBBY * sizeof(long))
+
+static inline bool
+xen_test_bit(int bit, volatile long *addr)
+{
+	unsigned long mask = 1UL << (bit % NBPL);
+
+	return !!(atomic_load_acq_long(&addr[bit / NBPL]) & mask);
+}
+
+static inline void
+xen_set_bit(int bit, volatile long *addr)
+{
+	atomic_set_long(&addr[bit / NBPL], 1UL << (bit % NBPL));
+}
+
+static inline void
+xen_clear_bit(int bit, volatile long *addr)
+{
+	atomic_clear_long(&addr[bit / NBPL], 1UL << (bit % NBPL));
+}
+
+#undef NBPL
+
+/*
+ * Functions to allocate/free unused memory in order
+ * to map memory from other domains.
+ */
+struct resource *xenmem_alloc(device_t dev, int *res_id, size_t size);
+int xenmem_free(device_t dev, int res_id, struct resource *res);
+
+/* Debug/emergency function, prints directly to hypervisor console */
+void xc_printf(const char *, ...) __printflike(1, 2);
+
 #ifndef xen_mb
 #define xen_mb() mb()
 #endif
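
The xen-os.h changes above add atomic xen_test_bit()/xen_set_bit()/
xen_clear_bit() helpers (modelled on the Linux bitops but deliberately
prefixed), plus xen_initial_domain() and xc_printf().  A small usage sketch;
the mask array and function below are illustrations only:

static long demo_port_mask[4];		/* hypothetical event-channel mask */

static void
demo_toggle_port(int port)
{
	xen_set_bit(port, demo_port_mask);
	if (xen_test_bit(port, demo_port_mask))
		xen_clear_bit(port, demo_port_mask);

	/* xc_printf() writes straight to the hypervisor console. */
	if (xen_initial_domain())
		xc_printf("dom0: toggled event channel %d\n", port);
}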

Modified: trunk/sys/xen/xen_intr.h
===================================================================
--- trunk/sys/xen/xen_intr.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xen_intr.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -29,16 +29,12 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/xen_intr.h 255331 2013-09-06 22:17:02Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/xen_intr.h 340016 2018-11-01 18:34:26Z jhb $
  */
 #ifndef _XEN_INTR_H_
 #define _XEN_INTR_H_
 
-#ifndef __XEN_EVTCHN_PORT_DEFINED__
-typedef uint32_t evtchn_port_t;
-DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
-#define __XEN_EVTCHN_PORT_DEFINED__ 1
-#endif
+#include <xen/interface/event_channel.h>
 
 /** Registered Xen interrupt callback handle. */
 typedef void * xen_intr_handle_t;
@@ -46,6 +42,8 @@
 /** If non-zero, the hypervisor has been configured to use a direct vector */
 extern int xen_vector_callback_enabled;
 
+void xen_intr_handle_upcall(struct trapframe *trap_frame);
+
 /**
  * Associate an already allocated local event channel port with an
  * interrupt handler.
@@ -146,7 +144,6 @@
  * interrupts and, if successful, associate the port with the specified
  * interrupt handler.
  *
- * \param dev       The device making this bind request.
  * \param cpu       The cpu receiving the IPI.
  * \param filter    The interrupt filter servicing this IPI.
  * \param irqflags  Interrupt handler flags.  See sys/bus.h.
@@ -155,11 +152,23 @@
  *
  * \returns  0 on success, otherwise an errno.
  */
-int xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
+int xen_intr_alloc_and_bind_ipi(u_int cpu,
 	driver_filter_t filter, enum intr_type irqflags,
 	xen_intr_handle_t *handlep);
 
 /**
+ * Register a physical interrupt vector and setup the interrupt source.
+ *
+ * \param vector        The global vector to use.
+ * \param trig          Default trigger method.
+ * \param pol           Default polarity of the interrupt.
+ *
+ * \returns  0 on success, otherwise an errno.
+ */
+int xen_register_pirq(int vector, enum intr_trigger trig,
+	enum intr_polarity pol);
+
+/**
  * Unbind an interrupt handler from its interrupt source.
  *
  * \param handlep  A pointer to the opaque handle that was initialized
@@ -213,4 +222,55 @@
  */
 evtchn_port_t xen_intr_port(xen_intr_handle_t handle);
 
+/**
+ * Setup MSI vector interrupt(s).
+ *
+ * \param dev     The device that requests the binding.
+ *
+ * \param vector  Requested initial vector to bind the MSI interrupt(s) to.
+ *
+ * \param count   Number of vectors to allocate.
+ *
+ * \returns  0 on success, otherwise an errno.
+ */
+int xen_register_msi(device_t dev, int vector, int count);
+
+/**
+ * Teardown a MSI vector interrupt.
+ *
+ * \param vector  Requested vector to release.
+ *
+ * \returns  0 on success, otherwise an errno.
+ */
+int xen_release_msi(int vector);
+
+/**
+ * Bind an event channel port with a handler
+ *
+ * \param dev       The device making this bind request.
+ * \param filter    An interrupt filter handler.  Specify NULL
+ *                  to always dispatch to the ithread handler.
+ * \param handler   An interrupt ithread handler.  Optional (can
+ *                  specify NULL) if all necessary event actions
+ *                  are performed by filter.
+ * \param arg       Argument to present to both filter and handler.
+ * \param irqflags  Interrupt handler flags.  See sys/bus.h.
+ * \param handle    Opaque handle used to manage this registration.
+ *
+ * \returns  0 on success, otherwise an errno.
+ */
+int xen_intr_add_handler(const char *name, driver_filter_t filter,
+	driver_intr_t handler, void *arg, enum intr_type flags,
+	xen_intr_handle_t handle);
+
+/**
+ * Register the IO-APIC PIRQs when running in legacy PVH Dom0 mode.
+ *
+ * \param pic	    PIC instance.
+ *
+ * NB: this should be removed together with the support for legacy PVH mode.
+ */
+struct pic;
+void xenpv_register_pirqs(struct pic *pic);
+
 #endif /* _XEN_INTR_H_ */
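
Among the xen_intr.h additions above, xen_register_pirq() takes the global
vector plus its default trigger mode and polarity.  A hedged sketch; the
vector and trigger/polarity values below are examples only:

#include <sys/bus.h>
#include <xen/xen_intr.h>

/* Hypothetical: register ISA IRQ 4 (edge triggered, active high). */
static int
demo_register_isa_pirq(void)
{
	return (xen_register_pirq(4, INTR_TRIGGER_EDGE, INTR_POLARITY_HIGH));
}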

Added: trunk/sys/xen/xen_msi.h
===================================================================
--- trunk/sys/xen/xen_msi.h	                        (rev 0)
+++ trunk/sys/xen/xen_msi.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -0,0 +1,40 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/xen/xen_msi.h 276767 2015-01-06 21:26:35Z imp $
+ */
+
+#ifndef __XEN_MSI_H__
+#define __XEN_MSI_H__
+
+void	xen_msi_init(void);
+int	xen_msi_map(int irq, uint64_t *addr, uint32_t *data);
+int	xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs);
+int	xen_msi_release(int *irqs, int count);
+int	xen_msix_alloc(device_t dev, int *irq);
+int	xen_msix_release(int irq);
+
+#endif /* !__XEN_MSI_H__ */


Property changes on: trunk/sys/xen/xen_msi.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/xen/xen_pci.h
===================================================================
--- trunk/sys/xen/xen_pci.h	                        (rev 0)
+++ trunk/sys/xen/xen_pci.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -0,0 +1,38 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/xen/xen_pci.h 275649 2014-12-09 18:03:25Z royger $
+ */
+
+#ifndef __XEN_PCI_H__
+#define __XEN_PCI_H__
+
+void xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
+     uint16_t data);
+void xen_pci_disable_msi_method(device_t dev, device_t child);
+void xen_pci_child_added_method(device_t dev, device_t child);
+
+#endif /* !__XEN_PCI_H__ */


Property changes on: trunk/sys/xen/xen_pci.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/xen/xen_pv.h
===================================================================
--- trunk/sys/xen/xen_pv.h	                        (rev 0)
+++ trunk/sys/xen/xen_pv.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -0,0 +1,35 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/xen/xen_pv.h 267536 2014-06-16 08:54:04Z royger $
+ */
+
+#ifndef __XEN_PV_H__
+#define __XEN_PV_H__
+
+extern struct apic_ops xen_apic_ops;
+
+#endif


Property changes on: trunk/sys/xen/xen_pv.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/xen/xenbus/xenbus.c
===================================================================
--- trunk/sys/xen/xenbus/xenbus.c	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbus.c	2020-02-08 19:29:01 UTC (rev 12309)
@@ -41,7 +41,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbus.c 255040 2013-08-29 19:52:18Z gibbs $");
+__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbus.c 255040 2013-08-29 19:52:18Z gibbs $");
 
 #include <sys/cdefs.h>
 #include <sys/param.h>

Modified: trunk/sys/xen/xenbus/xenbus_if.m
===================================================================
--- trunk/sys/xen/xenbus/xenbus_if.m	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbus_if.m	2020-02-08 19:29:01 UTC (rev 12309)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 #-
 # Copyright (c) 2008 Doug Rabson
 # All rights reserved.
@@ -23,8 +24,8 @@
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
-# $FreeBSD: stable/10/sys/xen/xenbus/xenbus_if.m 255040 2013-08-29 19:52:18Z gibbs $
-# $MidnightBSD$
+# $FreeBSD: stable/11/sys/xen/xenbus/xenbus_if.m 255040 2013-08-29 19:52:18Z gibbs $
+#
 
 #include <sys/bus.h>
 

Modified: trunk/sys/xen/xenbus/xenbusb.c
===================================================================
--- trunk/sys/xen/xenbus/xenbusb.c	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusb.c	2020-02-08 19:29:01 UTC (rev 12309)
@@ -53,7 +53,7 @@
  *                        xnb1
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb.c 315676 2017-03-21 09:38:59Z royger $");
+__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb.c 315668 2017-03-21 08:38:12Z royger $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -331,7 +331,7 @@
 	default:
 		return (EINVAL);
 	}
-	return (SYSCTL_OUT(req, value, strlen(value)));
+	return (SYSCTL_OUT_STR(req, value));
 }
 
 /**

Modified: trunk/sys/xen/xenbus/xenbusb.h
===================================================================
--- trunk/sys/xen/xenbus/xenbusb.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusb.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -31,7 +31,7 @@
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
- * $FreeBSD: stable/10/sys/xen/xenbus/xenbusb.h 222975 2011-06-11 04:59:01Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/xenbus/xenbusb.h 222975 2011-06-11 04:59:01Z gibbs $
  */
 #ifndef _XEN_XENBUS_XENBUSB_H
 #define _XEN_XENBUS_XENBUSB_H

Modified: trunk/sys/xen/xenbus/xenbusb_back.c
===================================================================
--- trunk/sys/xen/xenbus/xenbusb_back.c	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusb_back.c	2020-02-08 19:29:01 UTC (rev 12309)
@@ -37,7 +37,7 @@
  * Xen split devices.
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb_back.c 225704 2011-09-20 23:44:34Z gibbs $");
+__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb_back.c 225704 2011-09-20 23:44:34Z gibbs $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/xen/xenbus/xenbusb_front.c
===================================================================
--- trunk/sys/xen/xenbus/xenbusb_front.c	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusb_front.c	2020-02-08 19:29:01 UTC (rev 12309)
@@ -37,7 +37,7 @@
  * Xen split devices.
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/xen/xenbus/xenbusb_front.c 255040 2013-08-29 19:52:18Z gibbs $");
+__FBSDID("$FreeBSD: stable/11/sys/xen/xenbus/xenbusb_front.c 255040 2013-08-29 19:52:18Z gibbs $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/xen/xenbus/xenbusb_if.m
===================================================================
--- trunk/sys/xen/xenbus/xenbusb_if.m	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusb_if.m	2020-02-08 19:29:01 UTC (rev 12309)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 #-
 # Copyright (c) 2010 Spectra Logic Corporation
 # All rights reserved.
@@ -27,8 +28,8 @@
 # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGES.
 #
-# $FreeBSD: stable/10/sys/xen/xenbus/xenbusb_if.m 222975 2011-06-11 04:59:01Z gibbs $
-# $MidnightBSD$
+# $FreeBSD: stable/11/sys/xen/xenbus/xenbusb_if.m 222975 2011-06-11 04:59:01Z gibbs $
+#
 
 #include <sys/bus.h>
 #include <sys/lock.h>

Modified: trunk/sys/xen/xenbus/xenbusvar.h
===================================================================
--- trunk/sys/xen/xenbus/xenbusvar.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenbus/xenbusvar.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -24,7 +24,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/xenbus/xenbusvar.h 255040 2013-08-29 19:52:18Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/xenbus/xenbusvar.h 294090 2016-01-15 14:34:31Z royger $
  */
 
 /**
@@ -83,7 +83,13 @@
 };
 
 /**
- * Simplified accessors for xenbus devices
+ * Simplified accessors for xenbus devices:
+ *
+ * xenbus_get_node
+ * xenbus_get_type
+ * xenbus_get_state
+ * xenbus_get_otherend_id
+ * xenbus_get_otherend_path
  */
 #define	XENBUS_ACCESSOR(var, ivar, type) \
 	__BUS_ACCESSOR(xenbus, var, XENBUS, ivar, type)

Modified: trunk/sys/xen/xenstore/xenstore_internal.h
===================================================================
--- trunk/sys/xen/xenstore/xenstore_internal.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenstore/xenstore_internal.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -30,11 +30,8 @@
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
- * $FreeBSD: stable/10/sys/xen/xenstore/xenstore_internal.h 214077 2010-10-19 20:53:30Z gibbs $
+ * $FreeBSD: stable/11/sys/xen/xenstore/xenstore_internal.h 272318 2014-09-30 17:31:04Z royger $
  */
 
-/* Initialize support for userspace access to the XenStore. */
-void xs_dev_init(void);
-
 /* Used by the XenStore character device to borrow kernel's store connection. */
 int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);

Modified: trunk/sys/xen/xenstore/xenstorevar.h
===================================================================
--- trunk/sys/xen/xenstore/xenstorevar.h	2020-02-08 19:28:55 UTC (rev 12308)
+++ trunk/sys/xen/xenstore/xenstorevar.h	2020-02-08 19:29:01 UTC (rev 12309)
@@ -29,7 +29,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
- * $FreeBSD: stable/10/sys/xen/xenstore/xenstorevar.h 315675 2017-03-21 09:27:24Z royger $
+ * $FreeBSD: stable/11/sys/xen/xenstore/xenstorevar.h 315667 2017-03-21 08:36:25Z royger $
  */
 
 #ifndef _XEN_XENSTORE_XENSTOREVAR_H


From laffer1 at midnightbsd.org  Sat Feb  8 14:32:42 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:32:42 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12310] trunk/sys/x86: sync with FreeBSD
 11-stable
Message-ID: <202002081932.018JWgDh061873@stargazer.midnightbsd.org>

Revision: 12310
          http://svnweb.midnightbsd.org/src/?rev=12310
Author:   laffer1
Date:     2020-02-08 14:32:41 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/x86/iommu/busdma_dmar.c
    trunk/sys/x86/iommu/busdma_dmar.h
    trunk/sys/x86/iommu/intel_ctx.c
    trunk/sys/x86/iommu/intel_dmar.h
    trunk/sys/x86/iommu/intel_drv.c
    trunk/sys/x86/iommu/intel_fault.c
    trunk/sys/x86/iommu/intel_gas.c
    trunk/sys/x86/iommu/intel_idpgtbl.c
    trunk/sys/x86/iommu/intel_qi.c
    trunk/sys/x86/iommu/intel_quirks.c
    trunk/sys/x86/iommu/intel_reg.h
    trunk/sys/x86/iommu/intel_utils.c
    trunk/sys/x86/isa/atpic.c
    trunk/sys/x86/isa/atrtc.c
    trunk/sys/x86/isa/clock.c
    trunk/sys/x86/isa/elcr.c
    trunk/sys/x86/isa/icu.h
    trunk/sys/x86/isa/isa.c
    trunk/sys/x86/isa/isa_dma.c
    trunk/sys/x86/isa/nmi.c
    trunk/sys/x86/isa/orm.c
    trunk/sys/x86/pci/pci_bus.c
    trunk/sys/x86/pci/qpi.c
    trunk/sys/x86/x86/bus_machdep.c
    trunk/sys/x86/x86/busdma_bounce.c
    trunk/sys/x86/x86/busdma_machdep.c
    trunk/sys/x86/x86/dump_machdep.c
    trunk/sys/x86/x86/fdt_machdep.c
    trunk/sys/x86/x86/identcpu.c
    trunk/sys/x86/x86/intr_machdep.c
    trunk/sys/x86/x86/io_apic.c
    trunk/sys/x86/x86/legacy.c
    trunk/sys/x86/x86/local_apic.c
    trunk/sys/x86/x86/mca.c
    trunk/sys/x86/x86/mptable.c
    trunk/sys/x86/x86/mptable_pci.c
    trunk/sys/x86/x86/msi.c
    trunk/sys/x86/x86/nexus.c
    trunk/sys/x86/x86/tsc.c
    trunk/sys/x86/xen/hvm.c
    trunk/sys/x86/xen/xen_intr.c

Added Paths:
-----------
    trunk/sys/x86/iommu/intel_intrmap.c
    trunk/sys/x86/iommu/iommu_intrmap.h
    trunk/sys/x86/x86/autoconf.c
    trunk/sys/x86/x86/cpu_machdep.c
    trunk/sys/x86/x86/delay.c
    trunk/sys/x86/x86/mp_watchdog.c
    trunk/sys/x86/x86/mp_x86.c
    trunk/sys/x86/x86/pvclock.c
    trunk/sys/x86/x86/stack_machdep.c
    trunk/sys/x86/x86/ucode.c
    trunk/sys/x86/x86/x86_mem.c
    trunk/sys/x86/xen/pv.c
    trunk/sys/x86/xen/pvcpu_enum.c
    trunk/sys/x86/xen/xen_apic.c
    trunk/sys/x86/xen/xen_msi.c
    trunk/sys/x86/xen/xen_nexus.c
    trunk/sys/x86/xen/xen_pci_bus.c
    trunk/sys/x86/xen/xenpv.c

Modified: trunk/sys/x86/iommu/busdma_dmar.c
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.c 284021 2015-06-05 08:36:25Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.c 316392 2017-04-02 07:11:15Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -48,6 +48,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
@@ -74,14 +75,34 @@
 dmar_bus_dma_is_dev_disabled(int domain, int bus, int slot, int func)
 {
 	char str[128], *env;
+	int default_bounce;
+	bool ret;
+	static const char bounce_str[] = "bounce";
+	static const char dmar_str[] = "dmar";
 
-	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d.bounce",
+	default_bounce = 0;
+	env = kern_getenv("hw.busdma.default");
+	if (env != NULL) {
+		if (strcmp(env, bounce_str) == 0)
+			default_bounce = 1;
+		else if (strcmp(env, dmar_str) == 0)
+			default_bounce = 0;
+		freeenv(env);
+	}
+
+	snprintf(str, sizeof(str), "hw.busdma.pci%d.%d.%d.%d",
 	    domain, bus, slot, func);
-	env = getenv(str);
+	env = kern_getenv(str);
 	if (env == NULL)
-		return (false);
+		return (default_bounce != 0);
+	if (strcmp(env, bounce_str) == 0)
+		ret = true;
+	else if (strcmp(env, dmar_str) == 0)
+		ret = false;
+	else
+		ret = default_bounce != 0;
 	freeenv(env);
-	return (true);
+	return (ret);
 }
 
 /*
@@ -93,7 +114,7 @@
  * domain, and must collectively be assigned to use either DMAR or
  * bounce mapping.
  */
-static device_t
+device_t
 dmar_get_requester(device_t dev, uint16_t *rid)
 {
 	devclass_t pci_class;
@@ -225,7 +246,7 @@
 	disabled = dmar_bus_dma_is_dev_disabled(pci_get_domain(requester), 
 	    pci_get_bus(requester), pci_get_slot(requester), 
 	    pci_get_function(requester));
-	ctx = dmar_get_ctx(dmar, requester, rid, disabled, rmrr);
+	ctx = dmar_get_ctx_for_dev(dmar, requester, rid, disabled, rmrr);
 	if (ctx == NULL)
 		return (NULL);
 	if (disabled) {
@@ -256,6 +277,8 @@
 	/* Not in scope of any DMAR ? */
 	if (dmar == NULL)
 		return (NULL);
+	if (!dmar->dma_enabled)
+		return (NULL);
 	dmar_quirks_pre_use(dmar);
 	dmar_instantiate_rmrr_ctxs(dmar);
 
@@ -369,16 +392,18 @@
 {
 	struct bus_dma_tag_dmar *tag;
 	struct bus_dmamap_dmar *map;
+	struct dmar_domain *domain;
 
 	tag = (struct bus_dma_tag_dmar *)dmat;
 	map = (struct bus_dmamap_dmar *)map1;
 	if (map != NULL) {
-		DMAR_CTX_LOCK(tag->ctx);
+		domain = tag->ctx->domain;
+		DMAR_DOMAIN_LOCK(domain);
 		if (!TAILQ_EMPTY(&map->map_entries)) {
-			DMAR_CTX_UNLOCK(tag->ctx);
+			DMAR_DOMAIN_UNLOCK(domain);
 			return (EBUSY);
 		}
-		DMAR_CTX_UNLOCK(tag->ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		free(map, M_DMAR_DMAMAP);
 	}
 	tag->map_count--;
@@ -455,6 +480,7 @@
     struct dmar_map_entries_tailq *unroll_list)
 {
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entry *entry;
 	dmar_gaddr_t size;
 	bus_size_t buflen1;
@@ -464,6 +490,7 @@
 	if (segs == NULL)
 		segs = tag->segments;
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	seg = *segp;
 	error = 0;
 	idx = 0;
@@ -485,7 +512,7 @@
 		if (seg + 1 < tag->common.nsegments)
 			gas_flags |= DMAR_GM_CANSPLIT;
 
-		error = dmar_gas_map(ctx, &tag->common, size, offset,
+		error = dmar_gas_map(domain, &tag->common, size, offset,
 		    DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
 		    gas_flags, ma + idx, &entry);
 		if (error != 0)
@@ -532,10 +559,10 @@
 		    (uintmax_t)entry->start, (uintmax_t)entry->end,
 		    (uintmax_t)buflen1, (uintmax_t)tag->common.maxsegsz));
 
-		DMAR_CTX_LOCK(ctx);
+		DMAR_DOMAIN_LOCK(domain);
 		TAILQ_INSERT_TAIL(&map->map_entries, entry, dmamap_link);
 		entry->flags |= DMAR_MAP_ENTRY_MAP;
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		TAILQ_INSERT_TAIL(unroll_list, entry, unroll_link);
 
 		segs[seg].ds_addr = entry->start + offset;
@@ -557,11 +584,13 @@
     int flags, bus_dma_segment_t *segs, int *segp)
 {
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entry *entry, *entry1;
 	struct dmar_map_entries_tailq unroll_list;
 	int error;
 
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	atomic_add_long(&ctx->loads, 1);
 
 	TAILQ_INIT(&unroll_list);
@@ -573,7 +602,7 @@
 		 * partial buffer load, so unfortunately we have to
 		 * revert all work done.
 		 */
-		DMAR_CTX_LOCK(ctx);
+		DMAR_DOMAIN_LOCK(domain);
 		TAILQ_FOREACH_SAFE(entry, &unroll_list, unroll_link,
 		    entry1) {
 			/*
@@ -584,12 +613,12 @@
 			 */
 			TAILQ_REMOVE(&map->map_entries, entry, dmamap_link);
 			TAILQ_REMOVE(&unroll_list, entry, unroll_link);
-			TAILQ_INSERT_TAIL(&ctx->unload_entries, entry,
+			TAILQ_INSERT_TAIL(&domain->unload_entries, entry,
 			    dmamap_link);
 		}
-		DMAR_CTX_UNLOCK(ctx);
-		taskqueue_enqueue(ctx->dmar->delayed_taskqueue,
-		    &ctx->unload_task);
+		DMAR_DOMAIN_UNLOCK(domain);
+		taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+		    &domain->unload_task);
 	}
 
 	if (error == ENOMEM && (flags & BUS_DMA_NOWAIT) == 0 &&
@@ -596,7 +625,7 @@
 	    !map->cansleep)
 		error = EINPROGRESS;
 	if (error == EINPROGRESS)
-		dmar_bus_schedule_dmamap(ctx->dmar, map);
+		dmar_bus_schedule_dmamap(domain->dmar, map);
 	return (error);
 }
 
@@ -762,6 +791,7 @@
 	struct bus_dma_tag_dmar *tag;
 	struct bus_dmamap_dmar *map;
 	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 #if defined(__amd64__)
 	struct dmar_map_entries_tailq entries;
 #endif
@@ -769,20 +799,22 @@
 	tag = (struct bus_dma_tag_dmar *)dmat;
 	map = (struct bus_dmamap_dmar *)map1;
 	ctx = tag->ctx;
+	domain = ctx->domain;
 	atomic_add_long(&ctx->unloads, 1);
 
 #if defined(__i386__)
-	DMAR_CTX_LOCK(ctx);
-	TAILQ_CONCAT(&ctx->unload_entries, &map->map_entries, dmamap_link);
-	DMAR_CTX_UNLOCK(ctx);
-	taskqueue_enqueue(ctx->dmar->delayed_taskqueue, &ctx->unload_task);
+	DMAR_DOMAIN_LOCK(domain);
+	TAILQ_CONCAT(&domain->unload_entries, &map->map_entries, dmamap_link);
+	DMAR_DOMAIN_UNLOCK(domain);
+	taskqueue_enqueue(domain->dmar->delayed_taskqueue,
+	    &domain->unload_task);
 #else /* defined(__amd64__) */
 	TAILQ_INIT(&entries);
-	DMAR_CTX_LOCK(ctx);
+	DMAR_DOMAIN_LOCK(domain);
 	TAILQ_CONCAT(&entries, &map->map_entries, dmamap_link);
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	THREAD_NO_SLEEPING();
-	dmar_ctx_unload(ctx, &entries, false);
+	dmar_domain_unload(domain, &entries, false);
 	THREAD_SLEEPING_OK();
 	KASSERT(TAILQ_EMPTY(&entries), ("lazy dmar_ctx_unload %p", ctx));
 #endif
@@ -855,6 +887,8 @@
 dmar_init_busdma(struct dmar_unit *unit)
 {
 
+	unit->dma_enabled = 1;
+	TUNABLE_INT_FETCH("hw.dmar.dma", &unit->dma_enabled);
 	TAILQ_INIT(&unit->delayed_maps);
 	TASK_INIT(&unit->dmamap_load_task, 0, dmar_bus_task_dmamap, unit);
 	unit->delayed_taskqueue = taskqueue_create("dmar", M_WAITOK,
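
The busdma_dmar.c changes above make dmar_bus_dma_is_dev_disabled() honour a
global hw.busdma.default tunable plus per-device
hw.busdma.pci<domain>.<bus>.<slot>.<func> overrides, each taking the string
"bounce" or "dmar", while dmar_init_busdma() gains a hw.dmar.dma on/off
switch.  A loader.conf sketch; the PCI address below is an example only:

# Keep DMAR remapping as the default, but force bounce buffers for one device.
hw.busdma.default="dmar"
hw.busdma.pci0.2.0.0="bounce"	# domain 0, bus 2, slot 0, function 0 (example)
# Set to 0 to disable DMAR-backed busdma on every unit.
hw.dmar.dma=1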

Modified: trunk/sys/x86/iommu/busdma_dmar.h
===================================================================
--- trunk/sys/x86/iommu/busdma_dmar.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/busdma_dmar.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/busdma_dmar.h 257251 2013-10-28 13:33:29Z kib $
  */
 
 #ifndef __X86_IOMMU_BUSDMA_DMAR_H

Modified: trunk/sys/x86/iommu/intel_ctx.c
===================================================================
--- trunk/sys/x86/iommu/intel_ctx.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_ctx.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_ctx.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_ctx.c 320357 2017-06-26 12:30:39Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +49,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -68,8 +69,12 @@
 #include <dev/pci/pcivar.h>
 
 static MALLOC_DEFINE(M_DMAR_CTX, "dmar_ctx", "Intel DMAR Context");
+static MALLOC_DEFINE(M_DMAR_DOMAIN, "dmar_dom", "Intel DMAR Domain");
 
-static void dmar_ctx_unload_task(void *arg, int pending);
+static void dmar_domain_unload_task(void *arg, int pending);
+static void dmar_unref_domain_locked(struct dmar_unit *dmar,
+    struct dmar_domain *domain);
+static void dmar_domain_destroy(struct dmar_domain *domain);
 
 static void
 dmar_ensure_ctx_page(struct dmar_unit *dmar, int bus)
@@ -108,8 +113,8 @@
 {
 	dmar_ctx_entry_t *ctxp;
 
-	ctxp = dmar_map_pgtbl(ctx->dmar->ctx_obj, 1 + PCI_RID2BUS(ctx->rid),
-	    DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
+	ctxp = dmar_map_pgtbl(ctx->domain->dmar->ctx_obj, 1 +
+	    PCI_RID2BUS(ctx->rid), DMAR_PGF_NOALLOC | DMAR_PGF_WAITOK, sfp);
 	ctxp += ctx->rid & 0xff;
 	return (ctxp);
 }
@@ -119,7 +124,7 @@
 {
 	bus_addr_t maxaddr;
 
-	maxaddr = MIN(ctx->end, BUS_SPACE_MAXADDR);
+	maxaddr = MIN(ctx->domain->end, BUS_SPACE_MAXADDR);
 	ctx->ctx_tag.common.ref_count = 1; /* Prevent free */
 	ctx->ctx_tag.common.impl = &bus_dma_dmar_impl;
 	ctx->ctx_tag.common.boundary = PCI_DMA_BOUNDARY;
@@ -130,33 +135,42 @@
 	ctx->ctx_tag.common.maxsegsz = maxaddr;
 	ctx->ctx_tag.ctx = ctx;
 	ctx->ctx_tag.owner = dev;
-	/* XXXKIB initialize tag further */
 }
 
 static void
-ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp)
+ctx_id_entry_init(struct dmar_ctx *ctx, dmar_ctx_entry_t *ctxp, bool move)
 {
 	struct dmar_unit *unit;
+	struct dmar_domain *domain;
 	vm_page_t ctx_root;
 
-	unit = ctx->dmar;
-	KASSERT(ctxp->ctx1 == 0 && ctxp->ctx2 == 0,
+	domain = ctx->domain;
+	unit = domain->dmar;
+	KASSERT(move || (ctxp->ctx1 == 0 && ctxp->ctx2 == 0),
 	    ("dmar%d: initialized ctx entry %d:%d:%d 0x%jx 0x%jx",
 	    unit->unit, pci_get_bus(ctx->ctx_tag.owner),
 	    pci_get_slot(ctx->ctx_tag.owner),
 	    pci_get_function(ctx->ctx_tag.owner),
-	    ctxp->ctx1,
-	    ctxp->ctx2));
-	ctxp->ctx2 = DMAR_CTX2_DID(ctx->domain);
-	ctxp->ctx2 |= ctx->awlvl;
-	if ((ctx->flags & DMAR_CTX_IDMAP) != 0 &&
+	    ctxp->ctx1, ctxp->ctx2));
+	/*
+	 * For an update due to a move, the store is not atomic.  It is
+	 * possible that the DMAR reads the upper doubleword while the low
+	 * doubleword is not yet updated.  The domain id is stored in the
+	 * upper doubleword, while the table pointer is in the lower.
+	 *
+	 * There is no good solution, for the same reason it is wrong
+	 * to clear P bit in the ctx entry for update.
+	 */
+	dmar_pte_store1(&ctxp->ctx2, DMAR_CTX2_DID(domain->domain) |
+	    domain->awlvl);
+	if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0 &&
 	    (unit->hw_ecap & DMAR_ECAP_PT) != 0) {
-		KASSERT(ctx->pgtbl_obj == NULL,
+		KASSERT(domain->pgtbl_obj == NULL,
 		    ("ctx %p non-null pgtbl_obj", ctx));
-		dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
+		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_PASS | DMAR_CTX1_P);
 	} else {
-		ctx_root = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
-		dmar_pte_store(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
+		ctx_root = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_NOALLOC);
+		dmar_pte_store1(&ctxp->ctx1, DMAR_CTX1_T_UNTR |
 		    (DMAR_CTX1_ASR_MASK & VM_PAGE_TO_PHYS(ctx_root)) |
 		    DMAR_CTX1_P);
 	}
@@ -164,8 +178,32 @@
 }
 
 static int
-ctx_init_rmrr(struct dmar_ctx *ctx, device_t dev)
+dmar_flush_for_ctx_entry(struct dmar_unit *dmar, bool force)
 {
+	int error;
+
+	/*
+	 * If dmar declares Caching Mode as Set, follow 11.5 "Caching
+	 * Mode Consideration" and do the (global) invalidation of the
+	 * negative TLB entries.
+	 */
+	if ((dmar->hw_cap & DMAR_CAP_CM) == 0 && !force)
+		return (0);
+	if (dmar->qi_enabled) {
+		dmar_qi_invalidate_ctx_glob_locked(dmar);
+		if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force)
+			dmar_qi_invalidate_iotlb_glob_locked(dmar);
+		return (0);
+	}
+	error = dmar_inv_ctx_glob(dmar);
+	if (error == 0 && ((dmar->hw_ecap & DMAR_ECAP_DI) != 0 || force))
+		error = dmar_inv_iotlb_glob(dmar);
+	return (error);
+}
+
+static int
+domain_init_rmrr(struct dmar_domain *domain, device_t dev)
+{
 	struct dmar_map_entries_tailq rmrr_entries;
 	struct dmar_map_entry *entry, *entry1;
 	vm_page_t *ma;
@@ -175,7 +213,7 @@
 
 	error = 0;
 	TAILQ_INIT(&rmrr_entries);
-	dmar_ctx_parse_rmrr(ctx, dev, &rmrr_entries);
+	dmar_dev_parse_rmrr(domain, dev, &rmrr_entries);
 	TAILQ_FOREACH_SAFE(entry, &rmrr_entries, unroll_link, entry1) {
 		/*
 		 * VT-d specification requires that the start of an
@@ -195,7 +233,7 @@
 			if (bootverbose) {
 				device_printf(dev, "BIOS bug: dmar%d RMRR "
 				    "region (%jx, %jx) corrected\n",
-				    ctx->dmar->unit, start, end);
+				    domain->dmar->unit, start, end);
 			}
 			entry->end += DMAR_PAGE_SIZE * 0x20;
 		}
@@ -205,8 +243,9 @@
 			ma[i] = vm_page_getfake(entry->start + PAGE_SIZE * i,
 			    VM_MEMATTR_DEFAULT);
 		}
-		error1 = dmar_gas_map_region(ctx, entry, DMAR_MAP_ENTRY_READ |
-		    DMAR_MAP_ENTRY_WRITE, DMAR_GM_CANWAIT, ma);
+		error1 = dmar_gas_map_region(domain, entry,
+		    DMAR_MAP_ENTRY_READ | DMAR_MAP_ENTRY_WRITE,
+		    DMAR_GM_CANWAIT, ma);
 		/*
 		 * Non-failed RMRR entries are owned by context rb
 		 * tree.  Get rid of the failed entry, but do not stop
@@ -214,18 +253,19 @@
 		 * loaded and removed on the context destruction.
 		 */
 		if (error1 == 0 && entry->end != entry->start) {
-			DMAR_LOCK(ctx->dmar);
-			ctx->flags |= DMAR_CTX_RMRR;
-			DMAR_UNLOCK(ctx->dmar);
+			DMAR_LOCK(domain->dmar);
+			domain->refs++; /* XXXKIB prevent free */
+			domain->flags |= DMAR_DOMAIN_RMRR;
+			DMAR_UNLOCK(domain->dmar);
 		} else {
 			if (error1 != 0) {
 				device_printf(dev,
 			    "dmar%d failed to map RMRR region (%jx, %jx) %d\n",
-				    ctx->dmar->unit, start, end, error1);
+				    domain->dmar->unit, start, end, error1);
 				error = error1;
 			}
 			TAILQ_REMOVE(&rmrr_entries, entry, unroll_link);
-			dmar_gas_free_entry(ctx, entry);
+			dmar_gas_free_entry(domain, entry);
 		}
 		for (i = 0; i < size; i++)
 			vm_page_putfake(ma[i]);
@@ -234,47 +274,144 @@
 	return (error);
 }
 
+static struct dmar_domain *
+dmar_domain_alloc(struct dmar_unit *dmar, bool id_mapped)
+{
+	struct dmar_domain *domain;
+	int error, id, mgaw;
+
+	id = alloc_unr(dmar->domids);
+	if (id == -1)
+		return (NULL);
+	domain = malloc(sizeof(*domain), M_DMAR_DOMAIN, M_WAITOK | M_ZERO);
+	domain->domain = id;
+	LIST_INIT(&domain->contexts);
+	RB_INIT(&domain->rb_root);
+	TAILQ_INIT(&domain->unload_entries);
+	TASK_INIT(&domain->unload_task, 0, dmar_domain_unload_task, domain);
+	mtx_init(&domain->lock, "dmardom", NULL, MTX_DEF);
+	domain->dmar = dmar;
+
+	/*
+	 * For now, use the maximal usable physical address of the
+	 * installed memory to calculate the mgaw on id_mapped domain.
+	 * It is useful for the identity mapping, and less so for the
+	 * virtualized bus address space.
+	 */
+	domain->end = id_mapped ? ptoa(Maxmem) : BUS_SPACE_MAXADDR;
+	mgaw = dmar_maxaddr2mgaw(dmar, domain->end, !id_mapped);
+	error = domain_set_agaw(domain, mgaw);
+	if (error != 0)
+		goto fail;
+	if (!id_mapped)
+		/* Use all supported address space for remapping. */
+		domain->end = 1ULL << (domain->agaw - 1);
+
+	dmar_gas_init_domain(domain);
+
+	if (id_mapped) {
+		if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
+			domain->pgtbl_obj = domain_get_idmap_pgtbl(domain,
+			    domain->end);
+		}
+		domain->flags |= DMAR_DOMAIN_IDMAP;
+	} else {
+		error = domain_alloc_pgtbl(domain);
+		if (error != 0)
+			goto fail;
+		/* Disable local apic region access */
+		error = dmar_gas_reserve_region(domain, 0xfee00000,
+		    0xfeefffff + 1);
+		if (error != 0)
+			goto fail;
+	}
+	return (domain);
+
+fail:
+	dmar_domain_destroy(domain);
+	return (NULL);
+}
+
 static struct dmar_ctx *
-dmar_get_ctx_alloc(struct dmar_unit *dmar, uint16_t rid)
+dmar_ctx_alloc(struct dmar_domain *domain, uint16_t rid)
 {
 	struct dmar_ctx *ctx;
 
 	ctx = malloc(sizeof(*ctx), M_DMAR_CTX, M_WAITOK | M_ZERO);
-	RB_INIT(&ctx->rb_root);
-	TAILQ_INIT(&ctx->unload_entries);
-	TASK_INIT(&ctx->unload_task, 0, dmar_ctx_unload_task, ctx);
-	mtx_init(&ctx->lock, "dmarctx", NULL, MTX_DEF);
-	ctx->dmar = dmar;
+	ctx->domain = domain;
 	ctx->rid = rid;
+	ctx->refs = 1;
 	return (ctx);
 }
 
 static void
-dmar_ctx_dtr(struct dmar_ctx *ctx, bool gas_inited, bool pgtbl_inited)
+dmar_ctx_link(struct dmar_ctx *ctx)
 {
+	struct dmar_domain *domain;
 
-	if (gas_inited) {
-		DMAR_CTX_LOCK(ctx);
-		dmar_gas_fini_ctx(ctx);
-		DMAR_CTX_UNLOCK(ctx);
+	domain = ctx->domain;
+	DMAR_ASSERT_LOCKED(domain->dmar);
+	KASSERT(domain->refs >= domain->ctx_cnt,
+	    ("dom %p ref underflow %d %d", domain, domain->refs,
+	    domain->ctx_cnt));
+	domain->refs++;
+	domain->ctx_cnt++;
+	LIST_INSERT_HEAD(&domain->contexts, ctx, link);
+}
+
+static void
+dmar_ctx_unlink(struct dmar_ctx *ctx)
+{
+	struct dmar_domain *domain;
+
+	domain = ctx->domain;
+	DMAR_ASSERT_LOCKED(domain->dmar);
+	KASSERT(domain->refs > 0,
+	    ("domain %p ctx dtr refs %d", domain, domain->refs));
+	KASSERT(domain->ctx_cnt >= domain->refs,
+	    ("domain %p ctx dtr refs %d ctx_cnt %d", domain,
+	    domain->refs, domain->ctx_cnt));
+	domain->refs--;
+	domain->ctx_cnt--;
+	LIST_REMOVE(ctx, link);
+}
+
+static void
+dmar_domain_destroy(struct dmar_domain *domain)
+{
+
+	KASSERT(TAILQ_EMPTY(&domain->unload_entries),
+	    ("unfinished unloads %p", domain));
+	KASSERT(LIST_EMPTY(&domain->contexts),
+	    ("destroying dom %p with contexts", domain));
+	KASSERT(domain->ctx_cnt == 0,
+	    ("destroying dom %p with ctx_cnt %d", domain, domain->ctx_cnt));
+	KASSERT(domain->refs == 0,
+	    ("destroying dom %p with refs %d", domain, domain->refs));
+	if ((domain->flags & DMAR_DOMAIN_GAS_INITED) != 0) {
+		DMAR_DOMAIN_LOCK(domain);
+		dmar_gas_fini_domain(domain);
+		DMAR_DOMAIN_UNLOCK(domain);
 	}
-	if (pgtbl_inited) {
-		if (ctx->pgtbl_obj != NULL)
-			DMAR_CTX_PGLOCK(ctx);
-		ctx_free_pgtbl(ctx);
+	if ((domain->flags & DMAR_DOMAIN_PGTBL_INITED) != 0) {
+		if (domain->pgtbl_obj != NULL)
+			DMAR_DOMAIN_PGLOCK(domain);
+		domain_free_pgtbl(domain);
 	}
-	mtx_destroy(&ctx->lock);
-	free(ctx, M_DMAR_CTX);
+	mtx_destroy(&domain->lock);
+	free_unr(domain->dmar->domids, domain->domain);
+	free(domain, M_DMAR_DOMAIN);
 }
 
 struct dmar_ctx *
-dmar_get_ctx(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped,
-    bool rmrr_init)
+dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid,
+    bool id_mapped, bool rmrr_init)
 {
+	struct dmar_domain *domain, *domain1;
 	struct dmar_ctx *ctx, *ctx1;
 	dmar_ctx_entry_t *ctxp;
 	struct sf_buf *sf;
-	int bus, slot, func, error, mgaw;
+	int bus, slot, func, error;
 	bool enable;
 
 	bus = pci_get_bus(dev);
@@ -292,67 +429,20 @@
 		 */
 		DMAR_UNLOCK(dmar);
 		dmar_ensure_ctx_page(dmar, PCI_RID2BUS(rid));
-		ctx1 = dmar_get_ctx_alloc(dmar, rid);
-
-		if (id_mapped) {
-			/*
-			 * For now, use the maximal usable physical
-			 * address of the installed memory to
-			 * calculate the mgaw.  It is useful for the
-			 * identity mapping, and less so for the
-			 * virtualized bus address space.
-			 */
-			ctx1->end = ptoa(Maxmem);
-			mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, false);
-			error = ctx_set_agaw(ctx1, mgaw);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, false, false);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-		} else {
-			ctx1->end = BUS_SPACE_MAXADDR;
-			mgaw = dmar_maxaddr2mgaw(dmar, ctx1->end, true);
-			error = ctx_set_agaw(ctx1, mgaw);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, false, false);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-			/* Use all supported address space for remapping. */
-			ctx1->end = 1ULL << (ctx1->agaw - 1);
+		domain1 = dmar_domain_alloc(dmar, id_mapped);
+		if (domain1 == NULL) {
+			TD_PINNED_ASSERT;
+			return (NULL);
 		}
-
-
-		dmar_gas_init_ctx(ctx1);
-		if (id_mapped) {
-			if ((dmar->hw_ecap & DMAR_ECAP_PT) == 0) {
-				ctx1->pgtbl_obj = ctx_get_idmap_pgtbl(ctx1,
-				    ctx1->end);
-			}
-			ctx1->flags |= DMAR_CTX_IDMAP;
-		} else {
-			error = ctx_alloc_pgtbl(ctx1);
+		if (!id_mapped) {
+			error = domain_init_rmrr(domain1, dev);
 			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, false);
+				dmar_domain_destroy(domain1);
 				TD_PINNED_ASSERT;
 				return (NULL);
 			}
-			/* Disable local apic region access */
-			error = dmar_gas_reserve_region(ctx1, 0xfee00000,
-			    0xfeefffff + 1);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-			error = ctx_init_rmrr(ctx1, dev);
-			if (error != 0) {
-				dmar_ctx_dtr(ctx1, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
 		}
+		ctx1 = dmar_ctx_alloc(domain1, rid);
 		ctxp = dmar_map_ctx_entry(ctx1, &sf);
 		DMAR_LOCK(dmar);
 
@@ -362,16 +452,10 @@
 		 */
 		ctx = dmar_find_ctx_locked(dmar, rid);
 		if (ctx == NULL) {
+			domain = domain1;
 			ctx = ctx1;
+			dmar_ctx_link(ctx);
 			ctx->ctx_tag.owner = dev;
-			ctx->domain = alloc_unrl(dmar->domids);
-			if (ctx->domain == -1) {
-				DMAR_UNLOCK(dmar);
-				dmar_unmap_pgtbl(sf);
-				dmar_ctx_dtr(ctx, true, true);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
 			ctx_tag_init(ctx, dev);
 
 			/*
@@ -379,46 +463,35 @@
 			 * DMAR unit.  Enable the translation after
 			 * everything is set up.
 			 */
-			if (LIST_EMPTY(&dmar->contexts))
+			if (LIST_EMPTY(&dmar->domains))
 				enable = true;
-			LIST_INSERT_HEAD(&dmar->contexts, ctx, link);
-			ctx_id_entry_init(ctx, ctxp);
+			LIST_INSERT_HEAD(&dmar->domains, domain, link);
+			ctx_id_entry_init(ctx, ctxp, false);
 			device_printf(dev,
 			    "dmar%d pci%d:%d:%d:%d rid %x domain %d mgaw %d "
 			    "agaw %d %s-mapped\n",
 			    dmar->unit, dmar->segment, bus, slot,
-			    func, rid, ctx->domain, ctx->mgaw, ctx->agaw,
-			    id_mapped ? "id" : "re");
+			    func, rid, domain->domain, domain->mgaw,
+			    domain->agaw, id_mapped ? "id" : "re");
+			dmar_unmap_pgtbl(sf);
 		} else {
-			dmar_ctx_dtr(ctx1, true, true);
+			dmar_unmap_pgtbl(sf);
+			dmar_domain_destroy(domain1);
+			/* Nothing needs to be done to destroy ctx1. */
+			free(ctx1, M_DMAR_CTX);
+			domain = ctx->domain;
+			ctx->refs++; /* tag referenced us */
 		}
-		dmar_unmap_pgtbl(sf);
+	} else {
+		domain = ctx->domain;
+		ctx->refs++; /* tag referenced us */
 	}
-	ctx->refs++;
-	if ((ctx->flags & DMAR_CTX_RMRR) != 0)
-		ctx->refs++; /* XXXKIB */
 
-	/*
-	 * If dmar declares Caching Mode as Set, follow 11.5 "Caching
-	 * Mode Consideration" and do the (global) invalidation of the
-	 * negative TLB entries.
-	 */
-	if ((dmar->hw_cap & DMAR_CAP_CM) != 0 || enable) {
-		if (dmar->qi_enabled) {
-			dmar_qi_invalidate_ctx_glob_locked(dmar);
-			if ((dmar->hw_ecap & DMAR_ECAP_DI) != 0)
-				dmar_qi_invalidate_iotlb_glob_locked(dmar);
-		} else {
-			error = dmar_inv_ctx_glob(dmar);
-			if (error == 0 &&
-			    (dmar->hw_ecap & DMAR_ECAP_DI) != 0)
-				error = dmar_inv_iotlb_glob(dmar);
-			if (error != 0) {
-				dmar_free_ctx_locked(dmar, ctx);
-				TD_PINNED_ASSERT;
-				return (NULL);
-			}
-		}
+	error = dmar_flush_for_ctx_entry(dmar, enable);
+	if (error != 0) {
+		dmar_free_ctx_locked(dmar, ctx);
+		TD_PINNED_ASSERT;
+		return (NULL);
 	}
 
 	/*
@@ -439,11 +512,74 @@
 	return (ctx);
 }
 
+int
+dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx)
+{
+	struct dmar_unit *dmar;
+	struct dmar_domain *old_domain;
+	dmar_ctx_entry_t *ctxp;
+	struct sf_buf *sf;
+	int error;
+
+	dmar = domain->dmar;
+	old_domain = ctx->domain;
+	if (domain == old_domain)
+		return (0);
+	KASSERT(old_domain->dmar == dmar,
+	    ("domain %p %u moving between dmars %u %u", domain,
+	    domain->domain, old_domain->dmar->unit, domain->dmar->unit));
+	TD_PREP_PINNED_ASSERT;
+
+	ctxp = dmar_map_ctx_entry(ctx, &sf);
+	DMAR_LOCK(dmar);
+	dmar_ctx_unlink(ctx);
+	ctx->domain = domain;
+	dmar_ctx_link(ctx);
+	ctx_id_entry_init(ctx, ctxp, true);
+	dmar_unmap_pgtbl(sf);
+	error = dmar_flush_for_ctx_entry(dmar, true);
+	/* If flush failed, rolling back would not work as well. */
+	printf("dmar%d rid %x domain %d->%d %s-mapped\n",
+	    dmar->unit, ctx->rid, old_domain->domain, domain->domain,
+	    (domain->flags & DMAR_DOMAIN_IDMAP) != 0 ? "id" : "re");
+	dmar_unref_domain_locked(dmar, old_domain);
+	TD_PINNED_ASSERT;
+	return (error);
+}
+
+static void
+dmar_unref_domain_locked(struct dmar_unit *dmar, struct dmar_domain *domain)
+{
+
+	DMAR_ASSERT_LOCKED(dmar);
+	KASSERT(domain->refs >= 1,
+	    ("dmar %d domain %p refs %u", dmar->unit, domain, domain->refs));
+	KASSERT(domain->refs > domain->ctx_cnt,
+	    ("dmar %d domain %p refs %d ctx_cnt %d", dmar->unit, domain,
+	    domain->refs, domain->ctx_cnt));
+
+	if (domain->refs > 1) {
+		domain->refs--;
+		DMAR_UNLOCK(dmar);
+		return;
+	}
+
+	KASSERT((domain->flags & DMAR_DOMAIN_RMRR) == 0,
+	    ("lost ref on RMRR domain %p", domain));
+
+	LIST_REMOVE(domain, link);
+	DMAR_UNLOCK(dmar);
+
+	taskqueue_drain(dmar->delayed_taskqueue, &domain->unload_task);
+	dmar_domain_destroy(domain);
+}
+
 void
 dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx)
 {
 	struct sf_buf *sf;
 	dmar_ctx_entry_t *ctxp;
+	struct dmar_domain *domain;
 
 	DMAR_ASSERT_LOCKED(dmar);
 	KASSERT(ctx->refs >= 1,
@@ -459,8 +595,6 @@
 		return;
 	}
 
-	KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
-	    ("lost ref on RMRR ctx %p", ctx));
 	KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
 	    ("lost ref on disabled ctx %p", ctx));
 
@@ -488,8 +622,6 @@
 		return;
 	}
 
-	KASSERT((ctx->flags & DMAR_CTX_RMRR) == 0,
-	    ("lost ref on RMRR ctx %p", ctx));
 	KASSERT((ctx->flags & DMAR_CTX_DISABLED) == 0,
 	    ("lost ref on disabled ctx %p", ctx));
 
@@ -507,19 +639,11 @@
 		else
 			dmar_inv_iotlb_glob(dmar);
 	}
-	LIST_REMOVE(ctx, link);
-	DMAR_UNLOCK(dmar);
-
-	/*
-	 * The rest of the destruction is invisible for other users of
-	 * the dmar unit.
-	 */
-	taskqueue_drain(dmar->delayed_taskqueue, &ctx->unload_task);
-	KASSERT(TAILQ_EMPTY(&ctx->unload_entries),
-	    ("unfinished unloads %p", ctx));
 	dmar_unmap_pgtbl(sf);
-	free_unr(dmar->domids, ctx->domain);
-	dmar_ctx_dtr(ctx, true, true);
+	domain = ctx->domain;
+	dmar_ctx_unlink(ctx);
+	free(ctx, M_DMAR_CTX);
+	dmar_unref_domain_locked(dmar, domain);
 	TD_PINNED_ASSERT;
 }
 
@@ -528,86 +652,101 @@
 {
 	struct dmar_unit *dmar;
 
-	dmar = ctx->dmar;
+	dmar = ctx->domain->dmar;
 	DMAR_LOCK(dmar);
 	dmar_free_ctx_locked(dmar, ctx);
 }
 
+/*
+ * Returns with the domain locked.
+ */
 struct dmar_ctx *
 dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid)
 {
+	struct dmar_domain *domain;
 	struct dmar_ctx *ctx;
 
 	DMAR_ASSERT_LOCKED(dmar);
 
-	LIST_FOREACH(ctx, &dmar->contexts, link) {
-		if (ctx->rid == rid)
-			return (ctx);
+	LIST_FOREACH(domain, &dmar->domains, link) {
+		LIST_FOREACH(ctx, &domain->contexts, link) {
+			if (ctx->rid == rid)
+				return (ctx);
+		}
 	}
 	return (NULL);
 }
 
 void
-dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_free_entry(struct dmar_map_entry *entry, bool free)
 {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 
-	ctx = entry->ctx;
-	DMAR_CTX_LOCK(ctx);
+	domain = entry->domain;
+	DMAR_DOMAIN_LOCK(domain);
 	if ((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0)
-		dmar_gas_free_region(ctx, entry);
+		dmar_gas_free_region(domain, entry);
 	else
-		dmar_gas_free_space(ctx, entry);
-	DMAR_CTX_UNLOCK(ctx);
+		dmar_gas_free_space(domain, entry);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (free)
-		dmar_gas_free_entry(ctx, entry);
+		dmar_gas_free_entry(domain, entry);
 	else
 		entry->flags = 0;
 }
 
 void
-dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free)
+dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free)
 {
 	struct dmar_unit *unit;
 
-	unit = entry->ctx->dmar;
+	unit = entry->domain->dmar;
 	if (unit->qi_enabled) {
 		DMAR_LOCK(unit);
-		dmar_qi_invalidate_locked(entry->ctx, entry->start,
-		    entry->end - entry->start, &entry->gseq);
+		dmar_qi_invalidate_locked(entry->domain, entry->start,
+		    entry->end - entry->start, &entry->gseq, true);
 		if (!free)
 			entry->flags |= DMAR_MAP_ENTRY_QI_NF;
 		TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
 		DMAR_UNLOCK(unit);
 	} else {
-		ctx_flush_iotlb_sync(entry->ctx, entry->start, entry->end -
-		    entry->start);
-		dmar_ctx_free_entry(entry, free);
+		domain_flush_iotlb_sync(entry->domain, entry->start,
+		    entry->end - entry->start);
+		dmar_domain_free_entry(entry, free);
 	}
 }
 
+static bool
+dmar_domain_unload_emit_wait(struct dmar_domain *domain,
+    struct dmar_map_entry *entry)
+{
+
+	if (TAILQ_NEXT(entry, dmamap_link) == NULL)
+		return (true);
+	return (domain->batch_no++ % dmar_batch_coalesce == 0);
+}
+
 void
-dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
-    bool cansleep)
+dmar_domain_unload(struct dmar_domain *domain,
+    struct dmar_map_entries_tailq *entries, bool cansleep)
 {
 	struct dmar_unit *unit;
 	struct dmar_map_entry *entry, *entry1;
-	struct dmar_qi_genseq gseq;
 	int error;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 
 	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
 		KASSERT((entry->flags & DMAR_MAP_ENTRY_MAP) != 0,
-		    ("not mapped entry %p %p", ctx, entry));
-		error = ctx_unmap_buf(ctx, entry->start, entry->end -
+		    ("not mapped entry %p %p", domain, entry));
+		error = domain_unmap_buf(domain, entry->start, entry->end -
 		    entry->start, cansleep ? DMAR_PGF_WAITOK : 0);
-		KASSERT(error == 0, ("unmap %p error %d", ctx, error));
+		KASSERT(error == 0, ("unmap %p error %d", domain, error));
 		if (!unit->qi_enabled) {
-			ctx_flush_iotlb_sync(ctx, entry->start,
+			domain_flush_iotlb_sync(domain, entry->start,
 			    entry->end - entry->start);
 			TAILQ_REMOVE(entries, entry, dmamap_link);
-			dmar_ctx_free_entry(entry, true);
+			dmar_domain_free_entry(entry, true);
 		}
 	}
 	if (TAILQ_EMPTY(entries))
@@ -616,36 +755,30 @@
 	KASSERT(unit->qi_enabled, ("loaded entry left"));
 	DMAR_LOCK(unit);
 	TAILQ_FOREACH(entry, entries, dmamap_link) {
-		entry->gseq.gen = 0;
-		entry->gseq.seq = 0;
-		dmar_qi_invalidate_locked(ctx, entry->start, entry->end -
-		    entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ?
-		    &gseq : NULL);
+		dmar_qi_invalidate_locked(domain, entry->start, entry->end -
+		    entry->start, &entry->gseq,
+		    dmar_domain_unload_emit_wait(domain, entry));
 	}
-	TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
-		entry->gseq = gseq;
-		TAILQ_REMOVE(entries, entry, dmamap_link);
-		TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry, dmamap_link);
-	}
+	TAILQ_CONCAT(&unit->tlb_flush_entries, entries, dmamap_link);
 	DMAR_UNLOCK(unit);
 }	
 
 static void
-dmar_ctx_unload_task(void *arg, int pending)
+dmar_domain_unload_task(void *arg, int pending)
 {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_map_entries_tailq entries;
 
-	ctx = arg;
+	domain = arg;
 	TAILQ_INIT(&entries);
 
 	for (;;) {
-		DMAR_CTX_LOCK(ctx);
-		TAILQ_SWAP(&ctx->unload_entries, &entries, dmar_map_entry,
+		DMAR_DOMAIN_LOCK(domain);
+		TAILQ_SWAP(&domain->unload_entries, &entries, dmar_map_entry,
 		    dmamap_link);
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		if (TAILQ_EMPTY(&entries))
 			break;
-		dmar_ctx_unload(ctx, &entries, true);
+		dmar_domain_unload(domain, &entries, true);
 	}
 }
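
The intel_ctx.c rework above splits the old per-device dmar_ctx into a thin dmar_ctx (busdma root tag, RID, reference counts) and a dmar_domain that owns the DID, the address space and the page tables, so several contexts can now share one domain.  A minimal usage sketch of the new dmar_move_ctx_to_domain() entry point follows; the wrapper name is illustrative only, and the surrounding locking and error handling are elided (the function takes the dmar lock itself, so the caller must not hold it):

/*
 * Hypothetical sketch: re-point a second device's context at an
 * existing domain so both devices share one set of page tables.
 * dmar_move_ctx_to_domain() re-links the context, rewrites its
 * context-table entry and flushes the context cache.
 */
static int
share_domain_sketch(struct dmar_ctx *primary, struct dmar_ctx *secondary)
{

	return (dmar_move_ctx_to_domain(primary->domain, secondary));
}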

Modified: trunk/sys/x86/iommu/intel_dmar.h
===================================================================
--- trunk/sys/x86/iommu/intel_dmar.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_dmar.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/intel_dmar.h 281545 2015-04-15 06:56:51Z kib $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_dmar.h 320357 2017-06-26 12:30:39Z kib $
  */
 
 #ifndef __X86_IOMMU_INTEL_DMAR_H
@@ -51,10 +51,10 @@
 					   current R/B tree node */
 	u_int flags;
 	TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */
-	RB_ENTRY(dmar_map_entry) rb_entry;	 /* Links for ctx entries */
+	RB_ENTRY(dmar_map_entry) rb_entry;	 /* Links for domain entries */
 	TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after
 						    dmamap_load failure */
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	struct dmar_qi_genseq gseq;
 };
 
@@ -74,51 +74,85 @@
 #define	DMAR_MAP_ENTRY_SNOOP	0x4000	/* Snoop */
 #define	DMAR_MAP_ENTRY_TM	0x8000	/* Transient */
 
+/*
+ * Locking annotations:
+ * (u) - Protected by dmar unit lock
+ * (d) - Protected by domain lock
+ * (c) - Immutable after initialization
+ */
+
+/*
+ * The domain abstraction.  Most non-constant members of the domain
+ * are protected by the owning dmar unit lock, not by the domain lock.
+ * Most importantly, the dmar lock protects the list of contexts.
+ *
+ * The domain lock protects the address map for the domain, and the
+ * list of delayed unload entries.
+ *
+ * Page table pages and their contents are protected by the lock of
+ * the vm object pgtbl_obj, which contains the page table pages.
+ */
+struct dmar_domain {
+	int domain;			/* (c) DID, written in context entry */
+	int mgaw;			/* (c) Real max address width */
+	int agaw;			/* (c) Adjusted guest address width */
+	int pglvl;			/* (c) The pagelevel */
+	int awlvl;			/* (c) The pagelevel as the bitmask,
+					   to set in context entry */
+	dmar_gaddr_t end;		/* (c) Highest address + 1 in
+					   the guest AS */
+	u_int ctx_cnt;			/* (u) Number of contexts owned */
+	u_int refs;			/* (u) Refs, including ctx */
+	struct dmar_unit *dmar;		/* (c) */
+	struct mtx lock;		/* (c) */
+	LIST_ENTRY(dmar_domain) link;	/* (u) Member in the dmar list */
+	LIST_HEAD(, dmar_ctx) contexts;	/* (u) */
+	vm_object_t pgtbl_obj;		/* (c) Page table pages */
+	u_int flags;			/* (u) */
+	u_int entries_cnt;		/* (d) */
+	struct dmar_gas_entries_tree rb_root; /* (d) */
+	struct dmar_map_entries_tailq unload_entries; /* (d) Entries to
+							 unload */
+	struct dmar_map_entry *first_place, *last_place; /* (d) */
+	struct task unload_task;	/* (c) */
+	u_int batch_no;
+};
+
 struct dmar_ctx {
-	uint16_t rid;	/* pci RID */
-	int domain;	/* DID */
-	int mgaw;	/* Real max address width */
-	int agaw;	/* Adjusted guest address width */
-	int pglvl;	/* The pagelevel */
-	int awlvl;	/* The pagelevel as the bitmask, to set in
-			   context entry */
-	dmar_gaddr_t end;/* Highest address + 1 in the guest AS */
-	u_int refs;	/* References to the context, from tags */
-	struct dmar_unit *dmar;
-	struct bus_dma_tag_dmar ctx_tag; /* Root tag */
-	struct mtx lock;
-	LIST_ENTRY(dmar_ctx) link;	/* Member in the dmar list */
-	vm_object_t pgtbl_obj;		/* Page table pages */
-	u_int flags;			/* Protected by dmar lock */
+	struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */
+	uint16_t rid;			/* (c) pci RID */
 	uint64_t last_fault_rec[2];	/* Last fault reported */
-	u_int entries_cnt;
-	u_long loads;
-	u_long unloads;
-	struct dmar_gas_entries_tree rb_root;
-	struct dmar_map_entries_tailq unload_entries; /* Entries to unload */
-	struct dmar_map_entry *first_place, *last_place;
-	struct task unload_task;
+	struct dmar_domain *domain;	/* (c) */
+	LIST_ENTRY(dmar_ctx) link;	/* (u) Member in the domain list */
+	u_int refs;			/* (u) References from tags */
+	u_int flags;			/* (u) */
+	u_long loads;			/* atomic updates, for stat only */
+	u_long unloads;			/* same */
 };
 
+#define	DMAR_DOMAIN_GAS_INITED		0x0001
+#define	DMAR_DOMAIN_PGTBL_INITED	0x0002
+#define	DMAR_DOMAIN_IDMAP		0x0010	/* Domain uses identity
+						   page table */
+#define	DMAR_DOMAIN_RMRR		0x0020	/* Domain contains RMRR entry,
+						   cannot be turned off */
+
 /* struct dmar_ctx flags */
 #define	DMAR_CTX_FAULTED	0x0001	/* Fault was reported,
 					   last_fault_rec is valid */
-#define	DMAR_CTX_IDMAP		0x0002	/* Context uses identity page table */
-#define	DMAR_CTX_RMRR		0x0004	/* Context contains RMRR entry,
-					   cannot be turned off */
-#define	DMAR_CTX_DISABLED	0x0008	/* Device is disabled, the
+#define	DMAR_CTX_DISABLED	0x0002	/* Device is disabled, the
 					   ephemeral reference is kept
 					   to prevent context destruction */
 
-#define	DMAR_CTX_PGLOCK(ctx)	VM_OBJECT_WLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_PGTRYLOCK(ctx)	VM_OBJECT_TRYWLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_PGUNLOCK(ctx)	VM_OBJECT_WUNLOCK((ctx)->pgtbl_obj)
-#define	DMAR_CTX_ASSERT_PGLOCKED(ctx) \
-	VM_OBJECT_ASSERT_WLOCKED((ctx)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGLOCK(dom)		VM_OBJECT_WLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGTRYLOCK(dom)	VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_PGUNLOCK(dom)	VM_OBJECT_WUNLOCK((dom)->pgtbl_obj)
+#define	DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \
+	VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj)
 
-#define	DMAR_CTX_LOCK(ctx)	mtx_lock(&(ctx)->lock)
-#define	DMAR_CTX_UNLOCK(ctx)	mtx_unlock(&(ctx)->lock)
-#define	DMAR_CTX_ASSERT_LOCKED(ctx) mtx_assert(&(ctx)->lock, MA_OWNED)
+#define	DMAR_DOMAIN_LOCK(dom)	mtx_lock(&(dom)->lock)
+#define	DMAR_DOMAIN_UNLOCK(dom)	mtx_unlock(&(dom)->lock)
+#define	DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED)
 
 struct dmar_msi_data {
 	int irq;
@@ -158,7 +192,7 @@
 
 	/* Data for being a dmar */
 	struct mtx lock;
-	LIST_HEAD(, dmar_ctx) contexts;
+	LIST_HEAD(, dmar_domain) domains;
 	struct unrhdr *domids;
 	vm_object_t ctx_obj;
 	u_int barrier_flags;
@@ -186,6 +220,13 @@
 	u_int inv_seq_waiters;	/* count of waiters for seq */
 	u_int inv_queue_full;	/* informational counter */
 
+	/* IR */
+	int ir_enabled;
+	vm_paddr_t irt_phys;
+	dmar_irte_t *irt;
+	u_int irte_cnt;
+	vmem_t *irtids;
+
 	/* Delayed freeing of map entries queue processing */
 	struct dmar_map_entries_tailq tlb_flush_entries;
 	struct task qi_task;
@@ -195,6 +236,8 @@
 	struct task dmamap_load_task;
 	TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps;
 	struct taskqueue *delayed_taskqueue;
+
+	int dma_enabled;
 };
 
 #define	DMAR_LOCK(dmar)		mtx_lock(&(dmar)->lock)
@@ -207,6 +250,8 @@
 
 #define	DMAR_IS_COHERENT(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_C) != 0)
 #define	DMAR_HAS_QI(dmar)	(((dmar)->hw_ecap & DMAR_ECAP_QI) != 0)
+#define	DMAR_X2APIC(dmar) \
+	(x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0)
 
 /* Barrier ids */
 #define	DMAR_BARRIER_RMRR	0
@@ -213,16 +258,18 @@
 #define	DMAR_BARRIER_USEQ	1
 
 struct dmar_unit *dmar_find(device_t dev);
+struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid);
+struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid);
 
 u_int dmar_nd2mask(u_int nd);
 bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl);
-int ctx_set_agaw(struct dmar_ctx *ctx, int mgaw);
-int dmar_maxaddr2mgaw(struct dmar_unit* unit, dmar_gaddr_t maxaddr,
+int domain_set_agaw(struct dmar_domain *domain, int mgaw);
+int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr,
     bool allow_less);
 vm_pindex_t pglvl_max_pages(int pglvl);
-int ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl);
+int domain_is_sp_lvl(struct dmar_domain *domain, int lvl);
 dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl);
-dmar_gaddr_t ctx_page_size(struct dmar_ctx *ctx, int lvl);
+dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl);
 int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
     dmar_gaddr_t *isizep);
 struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags);
@@ -239,8 +286,13 @@
 void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst);
 int dmar_enable_translation(struct dmar_unit *unit);
 int dmar_disable_translation(struct dmar_unit *unit);
+int dmar_load_irt_ptr(struct dmar_unit *unit);
+int dmar_enable_ir(struct dmar_unit *unit);
+int dmar_disable_ir(struct dmar_unit *unit);
 bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id);
 void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id);
+uint64_t dmar_get_timeout(void);
+void dmar_update_timeout(uint64_t newval);
 
 int dmar_fault_intr(void *arg);
 void dmar_enable_fault_intr(struct dmar_unit *unit);
@@ -253,52 +305,61 @@
 void dmar_disable_qi_intr(struct dmar_unit *unit);
 int dmar_init_qi(struct dmar_unit *unit);
 void dmar_fini_qi(struct dmar_unit *unit);
-void dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t start,
-    dmar_gaddr_t size, struct dmar_qi_genseq *pseq);
+void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start,
+    dmar_gaddr_t size, struct dmar_qi_genseq *pseq, bool emit_wait);
 void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit);
 void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit);
+void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt);
 
-vm_object_t ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr);
+vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain,
+    dmar_gaddr_t maxaddr);
 void put_idmap_pgtbl(vm_object_t obj);
-int ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    vm_page_t *ma, uint64_t pflags, int flags);
-int ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    int flags);
-void ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base,
+int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags);
+int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, int flags);
+void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size);
-int ctx_alloc_pgtbl(struct dmar_ctx *ctx);
-void ctx_free_pgtbl(struct dmar_ctx *ctx);
+int domain_alloc_pgtbl(struct dmar_domain *domain);
+void domain_free_pgtbl(struct dmar_domain *domain);
 
 struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev,
     bool rmrr);
-struct dmar_ctx *dmar_get_ctx(struct dmar_unit *dmar, device_t dev, 
+struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev,
     uint16_t rid, bool id_mapped, bool rmrr_init);
+int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx);
 void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx);
 void dmar_free_ctx(struct dmar_ctx *ctx);
 struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid);
-void dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free);
-void dmar_ctx_unload(struct dmar_ctx *ctx,
+void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_unload(struct dmar_domain *domain,
     struct dmar_map_entries_tailq *entries, bool cansleep);
-void dmar_ctx_free_entry(struct dmar_map_entry *entry, bool free);
+void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free);
 
 int dmar_init_busdma(struct dmar_unit *unit);
 void dmar_fini_busdma(struct dmar_unit *unit);
+device_t dmar_get_requester(device_t dev, uint16_t *rid);
 
-void dmar_gas_init_ctx(struct dmar_ctx *ctx);
-void dmar_gas_fini_ctx(struct dmar_ctx *ctx);
-struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags);
-void dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-void dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
-    dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
-    struct dmar_map_entry **res);
-void dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry);
-int dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
-    u_int eflags, u_int flags, vm_page_t *ma);
-int dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+void dmar_gas_init_domain(struct dmar_domain *domain);
+void dmar_gas_fini_domain(struct dmar_domain *domain);
+struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain,
+    u_int flags);
+void dmar_gas_free_entry(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+void dmar_gas_free_space(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+int dmar_gas_map(struct dmar_domain *domain,
+    const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+    u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res);
+void dmar_gas_free_region(struct dmar_domain *domain,
+    struct dmar_map_entry *entry);
+int dmar_gas_map_region(struct dmar_domain *domain,
+    struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma);
+int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t end);
 
-void dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
     struct dmar_map_entries_tailq *rmrr_entries);
 int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar);
 
@@ -305,6 +366,9 @@
 void dmar_quirks_post_ident(struct dmar_unit *dmar);
 void dmar_quirks_pre_use(struct dmar_unit *dmar);
 
+int dmar_init_irt(struct dmar_unit *unit);
+void dmar_fini_irt(struct dmar_unit *unit);
+
 #define	DMAR_GM_CANWAIT	0x0001
 #define	DMAR_GM_CANSPLIT 0x0002
 
@@ -318,6 +382,7 @@
 extern int haw;
 extern int dmar_tbl_pagecnt;
 extern int dmar_match_verbose;
+extern int dmar_batch_coalesce;
 extern int dmar_check_free;
 
 static inline uint32_t
@@ -375,13 +440,16 @@
  * containing the P or R and W bits, is set only after the high word
  * is written.  For clear, the P bit is cleared first, then the high
  * word is cleared.
+ *
+ * dmar_pte_update updates the pte.  For amd64, the update is atomic.
+ * For i386, it first disables the entry by clearing the word
+ * containing the P bit, and then defers to dmar_pte_store.  The locked
+ * cmpxchg8b is probably available on any machine having DMAR support,
+ * but the interrupt translation table may be mapped uncached.
  */
 static inline void
-dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+dmar_pte_store1(volatile uint64_t *dst, uint64_t val)
 {
-
-	KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
-	    dst, (uintmax_t)*dst, (uintmax_t)val));
 #ifdef __i386__
 	volatile uint32_t *p;
 	uint32_t hi, lo;
@@ -397,6 +465,28 @@
 }
 
 static inline void
+dmar_pte_store(volatile uint64_t *dst, uint64_t val)
+{
+
+	KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx",
+	    dst, (uintmax_t)*dst, (uintmax_t)val));
+	dmar_pte_store1(dst, val);
+}
+
+static inline void
+dmar_pte_update(volatile uint64_t *dst, uint64_t val)
+{
+
+#ifdef __i386__
+	volatile uint32_t *p;
+
+	p = (volatile uint32_t *)dst;
+	*p = 0;
+#endif
+	dmar_pte_store1(dst, val);
+}
+
+static inline void
 dmar_pte_clear(volatile uint64_t *dst)
 {
 #ifdef __i386__
@@ -420,6 +510,36 @@
 	return (start + size <= ((start + boundary) & ~(boundary - 1)));
 }
 
+extern struct timespec dmar_hw_timeout;
+
+#define	DMAR_WAIT_UNTIL(cond)					\
+{								\
+	struct timespec last, curr;				\
+	bool forever;						\
+								\
+	if (dmar_hw_timeout.tv_sec == 0 &&			\
+	    dmar_hw_timeout.tv_nsec == 0) {			\
+		forever = true;					\
+	} else {						\
+		forever = false;				\
+		nanouptime(&curr);				\
+		last = curr;					\
+		timespecadd(&last, &dmar_hw_timeout);		\
+	}							\
+	for (;;) {						\
+		if (cond) {					\
+			error = 0;				\
+			break;					\
+		}						\
+		nanouptime(&curr);				\
+		if (!forever && timespeccmp(&last, &curr, <)) {	\
+			error = ETIMEDOUT;			\
+			break;					\
+		}						\
+		cpu_spinwait();					\
+	}							\
+}
+
 #ifdef INVARIANTS
 #define	TD_PREP_PINNED_ASSERT						\
 	int old_td_pinned;						\
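
The header now carries the locking annotations, the struct dmar_domain definition and the DMAR_WAIT_UNTIL() helper, which bounds register polling by the new hw.dmar.timeout tunable.  A short sketch of how the macro is meant to be invoked, assuming the dmar_read4() accessor declared in this header and the DMAR_GSTS_REG/DMAR_GSTS_TES definitions from intel_reg.h; the wrapper function itself is illustrative:

/*
 * DMAR_WAIT_UNTIL() expects a local "error" variable in scope and
 * leaves it at 0 on success, or ETIMEDOUT once dmar_hw_timeout expires.
 */
static int
dmar_wait_tes_sketch(struct dmar_unit *unit)
{
	int error;

	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) &
	    DMAR_GSTS_TES) != 0));
	return (error);
}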

Modified: trunk/sys/x86/iommu/intel_drv.c
===================================================================
--- trunk/sys/x86/iommu/intel_drv.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_drv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,10 +29,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_drv.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_drv.c 323921 2017-09-22 10:51:32Z kib $");
 
 #include "opt_acpi.h"
-#if defined(__amd64__) /* || defined(__ia64__) */
+#if defined(__amd64__)
 #define	DEV_APIC
 #else
 #include "opt_apic.h"
@@ -51,6 +51,7 @@
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -66,10 +67,14 @@
 #include <x86/iommu/intel_reg.h>
 #include <x86/iommu/busdma_dmar.h>
 #include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #ifdef DEV_APIC
 #include "pcib_if.h"
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
 #endif
 
 #define	DMAR_FAULT_IRQ_RID	0
@@ -108,6 +113,7 @@
 		if (!iter(dmarh, arg))
 			break;
 	}
+	AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
 }
 
 struct find_iter_args {
@@ -183,6 +189,7 @@
 		    (unsigned)dmartbl->Flags,
 		    "\020\001INTR_REMAP\002X2APIC_OPT_OUT");
 	}
+	AcpiPutTable((ACPI_TABLE_HEADER *)dmartbl);
 
 	dmar_iterate_tbl(dmar_count_iter, NULL);
 	if (dmar_devcnt == 0)
@@ -244,6 +251,7 @@
 	int i;
 
 	dmar_fini_busdma(unit);
+	dmar_fini_irt(unit);
 	dmar_fini_qi(unit);
 	dmar_fini_fault_log(unit);
 	for (i = 0; i < DMAR_INTR_TOTAL; i++)
@@ -304,7 +312,7 @@
 		    dmd->name, error);
 		goto err4;
 	}
-	bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, dmd->name);
+	bus_describe_intr(dev, dmd->irq_res, dmd->intr_handle, "%s", dmd->name);
 	error = PCIB_MAP_MSI(pcib, dev, dmd->irq, &msi_addr, &msi_data);
 	if (error != 0) {
 		device_printf(dev, "cannot map %s interrupt, %d\n",
@@ -398,6 +406,7 @@
 {
 	struct dmar_unit *unit;
 	ACPI_DMAR_HARDWARE_UNIT *dmaru;
+	uint64_t timeout;
 	int i, error;
 
 	unit = device_get_softc(dev);
@@ -422,6 +431,10 @@
 		dmar_print_caps(dev, unit, dmaru);
 	dmar_quirks_post_ident(unit);
 
+	timeout = dmar_get_timeout();
+	TUNABLE_UINT64_FETCH("hw.dmar.timeout", &timeout);
+	dmar_update_timeout(timeout);
+
 	for (i = 0; i < DMAR_INTR_TOTAL; i++)
 		unit->intrs[i].irq = -1;
 
@@ -457,6 +470,7 @@
 	mtx_init(&unit->lock, "dmarhw", NULL, MTX_DEF);
 	unit->domids = new_unrhdr(0, dmar_nd2mask(DMAR_CAP_ND(unit->hw_cap)),
 	    &unit->lock);
+	LIST_INIT(&unit->domains);
 
 	/*
 	 * 9.2 "Context Entry":
@@ -510,6 +524,11 @@
 		dmar_release_resources(dev, unit);
 		return (error);
 	}
+	error = dmar_init_irt(unit);
+	if (error != 0) {
+		dmar_release_resources(dev, unit);
+		return (error);
+	}
 	error = dmar_init_busdma(unit);
 	if (error != 0) {
 		dmar_release_resources(dev, unit);
@@ -764,8 +783,87 @@
 	return (device_get_softc(dmar_dev));
 }
 
+static struct dmar_unit *
+dmar_find_nonpci(u_int id, u_int entry_type, uint16_t *rid)
+{
+	device_t dmar_dev;
+	struct dmar_unit *unit;
+	ACPI_DMAR_HARDWARE_UNIT *dmarh;
+	ACPI_DMAR_DEVICE_SCOPE *devscope;
+	ACPI_DMAR_PCI_PATH *path;
+	char *ptr, *ptrend;
+#ifdef DEV_APIC
+	int error;
+#endif
+	int i;
+
+	for (i = 0; i < dmar_devcnt; i++) {
+		dmar_dev = dmar_devs[i];
+		if (dmar_dev == NULL)
+			continue;
+		unit = (struct dmar_unit *)device_get_softc(dmar_dev);
+		dmarh = dmar_find_by_index(i);
+		if (dmarh == NULL)
+			continue;
+		ptr = (char *)dmarh + sizeof(*dmarh);
+		ptrend = (char *)dmarh + dmarh->Header.Length;
+		for (;;) {
+			if (ptr >= ptrend)
+				break;
+			devscope = (ACPI_DMAR_DEVICE_SCOPE *)ptr;
+			ptr += devscope->Length;
+			if (devscope->EntryType != entry_type)
+				continue;
+			if (devscope->EnumerationId != id)
+				continue;
+#ifdef DEV_APIC
+			if (entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
+				error = ioapic_get_rid(id, rid);
+				/*
+				 * If our IOAPIC has PCI bindings then
+				 * use the PCI device rid.
+				 */
+				if (error == 0)
+					return (unit);
+			}
+#endif
+			if (devscope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE)
+			    == 2) {
+				if (rid != NULL) {
+					path = (ACPI_DMAR_PCI_PATH *)
+					    (devscope + 1);
+					*rid = PCI_RID(devscope->Bus,
+					    path->Device, path->Function);
+				}
+				return (unit);
+			}
+			printf(
+		           "dmar_find_nonpci: id %d type %d path length != 2\n",
+			    id, entry_type);
+			break;
+		}
+	}
+	return (NULL);
+}
+
+
+struct dmar_unit *
+dmar_find_hpet(device_t dev, uint16_t *rid)
+{
+
+	return (dmar_find_nonpci(hpet_get_uid(dev), ACPI_DMAR_SCOPE_TYPE_HPET,
+	    rid));
+}
+
+struct dmar_unit *
+dmar_find_ioapic(u_int apic_id, uint16_t *rid)
+{
+
+	return (dmar_find_nonpci(apic_id, ACPI_DMAR_SCOPE_TYPE_IOAPIC, rid));
+}
+
 struct rmrr_iter_args {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	device_t dev;
 	int dev_domain;
 	int dev_busno;
@@ -810,7 +908,8 @@
 		if (match == 1) {
 			if (dmar_match_verbose)
 				printf("matched\n");
-			entry = dmar_gas_alloc_entry(ria->ctx, DMAR_PGF_WAITOK);
+			entry = dmar_gas_alloc_entry(ria->domain,
+			    DMAR_PGF_WAITOK);
 			entry->start = resmem->BaseAddress;
 			/* The RMRR entry end address is inclusive. */
 			entry->end = resmem->EndAddress;
@@ -825,7 +924,7 @@
 }
 
 void
-dmar_ctx_parse_rmrr(struct dmar_ctx *ctx, device_t dev,
+dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev,
     struct dmar_map_entries_tailq *rmrr_entries)
 {
 	struct rmrr_iter_args ria;
@@ -841,7 +940,7 @@
 		    dev_path);
 	}
 
-	ria.ctx = ctx;
+	ria.domain = domain;
 	ria.dev = dev;
 	ria.dev_path = dev_path;
 	ria.rmrr_entries = rmrr_entries;
@@ -961,7 +1060,7 @@
 		printf("dmar%d: instantiating RMRR contexts\n", dmar->unit);
 	dmar_iterate_tbl(dmar_inst_rmrr_iter, &iria);
 	DMAR_LOCK(dmar);
-	if (!LIST_EMPTY(&dmar->contexts)) {
+	if (!LIST_EMPTY(&dmar->domains)) {
 		KASSERT((dmar->hw_gcmd & DMAR_GCMD_TE) == 0,
 	    ("dmar%d: RMRR not handled but translation is already enabled",
 		    dmar->unit));
@@ -976,7 +1075,7 @@
 #include <ddb/db_lex.h>
 
 static void
-dmar_print_ctx_entry(const struct dmar_map_entry *entry)
+dmar_print_domain_entry(const struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *l, *r;
 
@@ -1000,24 +1099,39 @@
 }
 
 static void
-dmar_print_ctx(struct dmar_ctx *ctx, bool show_mappings)
+dmar_print_ctx(struct dmar_ctx *ctx)
 {
-	struct dmar_map_entry *entry;
 
 	db_printf(
-	    "  @%p pci%d:%d:%d dom %d mgaw %d agaw %d pglvl %d end %jx\n"
-	    "    refs %d flags %x pgobj %p map_ents %u loads %lu unloads %lu\n",
+	    "    @%p pci%d:%d:%d refs %d flags %x loads %lu unloads %lu\n",
 	    ctx, pci_get_bus(ctx->ctx_tag.owner),
 	    pci_get_slot(ctx->ctx_tag.owner),
-	    pci_get_function(ctx->ctx_tag.owner), ctx->domain, ctx->mgaw,
-	    ctx->agaw, ctx->pglvl, (uintmax_t)ctx->end, ctx->refs,
-	    ctx->flags, ctx->pgtbl_obj, ctx->entries_cnt, ctx->loads,
-	    ctx->unloads);
+	    pci_get_function(ctx->ctx_tag.owner), ctx->refs, ctx->flags,
+	    ctx->loads, ctx->unloads);
+}
+
+static void
+dmar_print_domain(struct dmar_domain *domain, bool show_mappings)
+{
+	struct dmar_map_entry *entry;
+	struct dmar_ctx *ctx;
+
+	db_printf(
+	    "  @%p dom %d mgaw %d agaw %d pglvl %d end %jx refs %d\n"
+	    "   ctx_cnt %d flags %x pgobj %p map_ents %u\n",
+	    domain, domain->domain, domain->mgaw, domain->agaw, domain->pglvl,
+	    (uintmax_t)domain->end, domain->refs, domain->ctx_cnt,
+	    domain->flags, domain->pgtbl_obj, domain->entries_cnt);
+	if (!LIST_EMPTY(&domain->contexts)) {
+		db_printf("  Contexts:\n");
+		LIST_FOREACH(ctx, &domain->contexts, link)
+			dmar_print_ctx(ctx);
+	}
 	if (!show_mappings)
 		return;
 	db_printf("    mapped:\n");
-	RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
-		dmar_print_ctx_entry(entry);
+	RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+		dmar_print_domain_entry(entry);
 		if (db_pager_quit)
 			break;
 	}
@@ -1024,19 +1138,20 @@
 	if (db_pager_quit)
 		return;
 	db_printf("    unloading:\n");
-	TAILQ_FOREACH(entry, &ctx->unload_entries, dmamap_link) {
-		dmar_print_ctx_entry(entry);
+	TAILQ_FOREACH(entry, &domain->unload_entries, dmamap_link) {
+		dmar_print_domain_entry(entry);
 		if (db_pager_quit)
 			break;
 	}
 }
 
-DB_FUNC(dmar_ctx, db_dmar_print_ctx, db_show_table, CS_OWN, NULL)
+DB_FUNC(dmar_domain, db_dmar_print_domain, db_show_table, CS_OWN, NULL)
 {
 	struct dmar_unit *unit;
+	struct dmar_domain *domain;
 	struct dmar_ctx *ctx;
 	bool show_mappings, valid;
-	int domain, bus, device, function, i, t;
+	int pci_domain, bus, device, function, i, t;
 	db_expr_t radix;
 
 	valid = false;
@@ -1057,7 +1172,7 @@
 		show_mappings = false;
 	}
 	if (t == tNUMBER) {
-		domain = db_tok_number;
+		pci_domain = db_tok_number;
 		t = db_read_token();
 		if (t == tNUMBER) {
 			bus = db_tok_number;
@@ -1075,19 +1190,24 @@
 			db_radix = radix;
 	db_skip_to_eol();
 	if (!valid) {
-		db_printf("usage: show dmar_ctx [/m] "
+		db_printf("usage: show dmar_domain [/m] "
 		    "<domain> <bus> <device> <func>\n");
 		return;
 	}
 	for (i = 0; i < dmar_devcnt; i++) {
 		unit = device_get_softc(dmar_devs[i]);
-		LIST_FOREACH(ctx, &unit->contexts, link) {
-			if (domain == unit->segment && 
-			    bus == pci_get_bus(ctx->ctx_tag.owner) &&
-			    device == pci_get_slot(ctx->ctx_tag.owner) && 
-			    function == pci_get_function(ctx->ctx_tag.owner)) {
-				dmar_print_ctx(ctx, show_mappings);
-				goto out;
+		LIST_FOREACH(domain, &unit->domains, link) {
+			LIST_FOREACH(ctx, &domain->contexts, link) {
+				if (pci_domain == unit->segment && 
+				    bus == pci_get_bus(ctx->ctx_tag.owner) &&
+				    device ==
+				    pci_get_slot(ctx->ctx_tag.owner) &&
+				    function ==
+				    pci_get_function(ctx->ctx_tag.owner)) {
+					dmar_print_domain(domain,
+					    show_mappings);
+					goto out;
+				}
 			}
 		}
 	}
@@ -1095,10 +1215,10 @@
 }
 
 static void
-dmar_print_one(int idx, bool show_ctxs, bool show_mappings)
+dmar_print_one(int idx, bool show_domains, bool show_mappings)
 {
 	struct dmar_unit *unit;
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	int i, frir;
 
 	unit = device_get_softc(dmar_devs[idx]);
@@ -1110,6 +1230,10 @@
 	    dmar_read4(unit, DMAR_GSTS_REG),
 	    dmar_read4(unit, DMAR_FSTS_REG),
 	    dmar_read4(unit, DMAR_FECTL_REG));
+	if (unit->ir_enabled) {
+		db_printf("ir is enabled; IRT @%p phys 0x%jx maxcnt %d\n",
+		    unit->irt, (uintmax_t)unit->irt_phys, unit->irte_cnt);
+	}
 	db_printf("fed 0x%x fea 0x%x feua 0x%x\n",
 	    dmar_read4(unit, DMAR_FEDATA_REG),
 	    dmar_read4(unit, DMAR_FEADDR_REG),
@@ -1148,10 +1272,10 @@
 			db_printf("qi is disabled\n");
 		}
 	}
-	if (show_ctxs) {
-		db_printf("contexts:\n");
-		LIST_FOREACH(ctx, &unit->contexts, link) {
-			dmar_print_ctx(ctx, show_mappings);
+	if (show_domains) {
+		db_printf("domains:\n");
+		LIST_FOREACH(domain, &unit->domains, link) {
+			dmar_print_domain(domain, show_mappings);
 			if (db_pager_quit)
 				break;
 		}
@@ -1160,27 +1284,27 @@
 
 DB_SHOW_COMMAND(dmar, db_dmar_print)
 {
-	bool show_ctxs, show_mappings;
+	bool show_domains, show_mappings;
 
-	show_ctxs = strchr(modif, 'c') != NULL;
+	show_domains = strchr(modif, 'd') != NULL;
 	show_mappings = strchr(modif, 'm') != NULL;
 	if (!have_addr) {
-		db_printf("usage: show dmar [/c] [/m] index\n");
+		db_printf("usage: show dmar [/d] [/m] index\n");
 		return;
 	}
-	dmar_print_one((int)addr, show_ctxs, show_mappings);
+	dmar_print_one((int)addr, show_domains, show_mappings);
 }
 
 DB_SHOW_ALL_COMMAND(dmars, db_show_all_dmars)
 {
 	int i;
-	bool show_ctxs, show_mappings;
+	bool show_domains, show_mappings;
 
-	show_ctxs = strchr(modif, 'c') != NULL;
+	show_domains = strchr(modif, 'd') != NULL;
 	show_mappings = strchr(modif, 'm') != NULL;
 
 	for (i = 0; i < dmar_devcnt; i++) {
-		dmar_print_one(i, show_ctxs, show_mappings);
+		dmar_print_one(i, show_domains, show_mappings);
 		if (db_pager_quit)
 			break;
 	}
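
The new dmar_find_nonpci() walks the DRHD device scopes to map an IOAPIC or HPET enumeration ID back to its DMAR unit and 16-bit requester ID, preferring the RID reported by ioapic_get_rid() and otherwise building one from the PCI path in the scope entry.  A tiny sketch of the RID encoding it relies on, using PCI_RID() from dev/pci/pcireg.h; the wrapper is illustrative:

/*
 * A requester ID packs bus[15:8], slot[7:3] and function[2:0], which is
 * what dmar_find_nonpci() builds from the scope's bus and path entry.
 */
static uint16_t
rid_from_scope_sketch(uint8_t bus, uint8_t slot, uint8_t func)
{

	return (PCI_RID(bus, slot, func));
}

The ddb commands follow the data-structure change: "show dmar_domain [/m] <domain> <bus> <device> <func>" replaces "show dmar_ctx", and "show dmar" now takes /d to list domains instead of /c for contexts.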

Modified: trunk/sys/x86/iommu/intel_fault.c
===================================================================
--- trunk/sys/x86/iommu/intel_fault.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_fault.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_fault.c 279485 2015-03-01 10:35:54Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_fault.c 309882 2016-12-12 09:43:48Z kib $");
 
 #include "opt_acpi.h"
 
@@ -42,6 +42,7 @@
 #include <sys/rman.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -179,7 +180,7 @@
 	}
 
 	if (enqueue) {
-		taskqueue_enqueue_fast(unit->fault_taskqueue,
+		taskqueue_enqueue(unit->fault_taskqueue,
 		    &unit->fault_task);
 	}
 	return (FILTER_HANDLED);
@@ -271,7 +272,7 @@
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	TASK_INIT(&unit->fault_task, 0, dmar_fault_task, unit);
-	unit->fault_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+	unit->fault_taskqueue = taskqueue_create_fast("dmarff", M_WAITOK,
 	    taskqueue_thread_enqueue, &unit->fault_taskqueue);
 	taskqueue_start_threads(&unit->fault_taskqueue, 1, PI_AV,
 	    "dmar%d fault taskq", unit->unit);

Modified: trunk/sys/x86/iommu/intel_gas.c
===================================================================
--- trunk/sys/x86/iommu/intel_gas.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_gas.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_gas.c 281545 2015-04-15 06:56:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_gas.c 329942 2018-02-25 00:32:42Z markj $");
 
 #define	RB_AUGMENT(entry) dmar_gas_augment_entry(entry)
 
@@ -50,6 +50,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -79,12 +80,12 @@
 
 	dmar_map_entry_zone = uma_zcreate("DMAR_MAP_ENTRY",
 	    sizeof(struct dmar_map_entry), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NODUMP);
 }
 SYSINIT(intel_gas, SI_SUB_DRIVERS, SI_ORDER_FIRST, intel_gas_init, NULL);
 
 struct dmar_map_entry *
-dmar_gas_alloc_entry(struct dmar_ctx *ctx, u_int flags)
+dmar_gas_alloc_entry(struct dmar_domain *domain, u_int flags)
 {
 	struct dmar_map_entry *res;
 
@@ -94,20 +95,20 @@
 	res = uma_zalloc(dmar_map_entry_zone, ((flags & DMAR_PGF_WAITOK) !=
 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
 	if (res != NULL) {
-		res->ctx = ctx;
-		atomic_add_int(&ctx->entries_cnt, 1);
+		res->domain = domain;
+		atomic_add_int(&domain->entries_cnt, 1);
 	}
 	return (res);
 }
 
 void
-dmar_gas_free_entry(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_entry(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 
-	KASSERT(ctx == entry->ctx,
-	    ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
-	    entry, entry->ctx));
-	atomic_subtract_int(&ctx->entries_cnt, 1);
+	KASSERT(domain == entry->domain,
+	    ("mismatched free domain %p entry %p entry->domain %p", domain,
+	    entry, entry->domain));
+	atomic_subtract_int(&domain->entries_cnt, 1);
 	uma_zfree(dmar_map_entry_zone, entry);
 }
 
@@ -158,12 +159,12 @@
     dmar_gas_cmp_entries);
 
 static void
-dmar_gas_fix_free(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_fix_free(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *next;
 
-	next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	entry->free_after = (next != NULL ? next->start : ctx->end) -
+	next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	entry->free_after = (next != NULL ? next->start : domain->end) -
 	    entry->end;
 	dmar_gas_augment_entry(entry);
 }
@@ -170,18 +171,18 @@
 
 #ifdef INVARIANTS
 static void
-dmar_gas_check_free(struct dmar_ctx *ctx)
+dmar_gas_check_free(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *entry, *next, *l, *r;
 	dmar_gaddr_t v;
 
-	RB_FOREACH(entry, dmar_gas_entries_tree, &ctx->rb_root) {
-		KASSERT(ctx == entry->ctx,
-		    ("mismatched free ctx %p entry %p entry->ctx %p", ctx,
-		    entry, entry->ctx));
-		next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	RB_FOREACH(entry, dmar_gas_entries_tree, &domain->rb_root) {
+		KASSERT(domain == entry->domain,
+		    ("mismatched free domain %p entry %p entry->domain %p",
+		    domain, entry, entry->domain));
+		next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
 		if (next == NULL) {
-			MPASS(entry->free_after == ctx->end - entry->end);
+			MPASS(entry->free_after == domain->end - entry->end);
 		} else {
 			MPASS(entry->free_after == next->start - entry->end);
 			MPASS(entry->end <= next->start);
@@ -198,7 +199,7 @@
 			    l->free_down));
 		} else {
 			v = MAX(entry->free_after, l->free_down);
-			v = MAX(entry->free_down, r->free_down);
+			v = MAX(v, r->free_down);
 			MPASS(entry->free_down == v);
 		}
 	}
@@ -206,93 +207,95 @@
 #endif
 
 static bool
-dmar_gas_rb_insert(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_insert(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *prev, *found;
 
-	found = RB_INSERT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_fix_free(ctx, entry);
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	found = RB_INSERT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_fix_free(domain, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
 	if (prev != NULL)
-		dmar_gas_fix_free(ctx, prev);
+		dmar_gas_fix_free(domain, prev);
 	return (found == NULL);
 }
 
 static void
-dmar_gas_rb_remove(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_rb_remove(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *prev;
 
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
 	if (prev != NULL)
-		dmar_gas_fix_free(ctx, prev);
+		dmar_gas_fix_free(domain, prev);
 }
 
 void
-dmar_gas_init_ctx(struct dmar_ctx *ctx)
+dmar_gas_init_domain(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *begin, *end;
 
-	begin = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
-	end = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+	begin = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
+	end = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
 
-	DMAR_CTX_LOCK(ctx);
-	KASSERT(ctx->entries_cnt == 2, ("dirty ctx %p", ctx));
-	KASSERT(RB_EMPTY(&ctx->rb_root), ("non-empty entries %p", ctx));
+	DMAR_DOMAIN_LOCK(domain);
+	KASSERT(domain->entries_cnt == 2, ("dirty domain %p", domain));
+	KASSERT(RB_EMPTY(&domain->rb_root), ("non-empty entries %p", domain));
 
 	begin->start = 0;
 	begin->end = DMAR_PAGE_SIZE;
-	begin->free_after = ctx->end - begin->end;
+	begin->free_after = domain->end - begin->end;
 	begin->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
-	dmar_gas_rb_insert(ctx, begin);
+	dmar_gas_rb_insert(domain, begin);
 
-	end->start = ctx->end;
-	end->end = ctx->end;
+	end->start = domain->end;
+	end->end = domain->end;
 	end->free_after = 0;
 	end->flags = DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_UNMAPPED;
-	dmar_gas_rb_insert(ctx, end);
+	dmar_gas_rb_insert(domain, end);
 
-	ctx->first_place = begin;
-	ctx->last_place = end;
-	DMAR_CTX_UNLOCK(ctx);
+	domain->first_place = begin;
+	domain->last_place = end;
+	domain->flags |= DMAR_DOMAIN_GAS_INITED;
+	DMAR_DOMAIN_UNLOCK(domain);
 }
 
 void
-dmar_gas_fini_ctx(struct dmar_ctx *ctx)
+dmar_gas_fini_domain(struct dmar_domain *domain)
 {
 	struct dmar_map_entry *entry, *entry1;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
-	KASSERT(ctx->entries_cnt == 2, ("ctx still in use %p", ctx));
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
+	KASSERT(domain->entries_cnt == 2, ("domain still in use %p", domain));
 
-	entry = RB_MIN(dmar_gas_entries_tree, &ctx->rb_root);
-	KASSERT(entry->start == 0, ("start entry start %p", ctx));
-	KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", ctx));
+	entry = RB_MIN(dmar_gas_entries_tree, &domain->rb_root);
+	KASSERT(entry->start == 0, ("start entry start %p", domain));
+	KASSERT(entry->end == DMAR_PAGE_SIZE, ("start entry end %p", domain));
 	KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
-	    ("start entry flags %p", ctx));
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_free_entry(ctx, entry);
+	    ("start entry flags %p", domain));
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_free_entry(domain, entry);
 
-	entry = RB_MAX(dmar_gas_entries_tree, &ctx->rb_root);
-	KASSERT(entry->start == ctx->end, ("end entry start %p", ctx));
-	KASSERT(entry->end == ctx->end, ("end entry end %p", ctx));
-	KASSERT(entry->free_after == 0, ("end entry free_after%p", ctx));
+	entry = RB_MAX(dmar_gas_entries_tree, &domain->rb_root);
+	KASSERT(entry->start == domain->end, ("end entry start %p", domain));
+	KASSERT(entry->end == domain->end, ("end entry end %p", domain));
+	KASSERT(entry->free_after == 0, ("end entry free_after %p", domain));
 	KASSERT(entry->flags == DMAR_MAP_ENTRY_PLACE,
-	    ("end entry flags %p", ctx));
-	RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_free_entry(ctx, entry);
+	    ("end entry flags %p", domain));
+	RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_free_entry(domain, entry);
 
-	RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &ctx->rb_root, entry1) {
+	RB_FOREACH_SAFE(entry, dmar_gas_entries_tree, &domain->rb_root,
+	    entry1) {
 		KASSERT((entry->flags & DMAR_MAP_ENTRY_RMRR) != 0,
-		    ("non-RMRR entry left %p", ctx));
-		RB_REMOVE(dmar_gas_entries_tree, &ctx->rb_root, entry);
-		dmar_gas_free_entry(ctx, entry);
+		    ("non-RMRR entry left %p", domain));
+		RB_REMOVE(dmar_gas_entries_tree, &domain->rb_root, entry);
+		dmar_gas_free_entry(domain, entry);
 	}
 }
 
 struct dmar_gas_match_args {
-	struct dmar_ctx *ctx;
+	struct dmar_domain *domain;
 	dmar_gaddr_t size;
 	int offset;
 	const struct bus_dma_tag_common *common;
@@ -325,8 +328,8 @@
 	 * the boundary.  Check if there is enough space after the
 	 * next boundary after the prev->end.
 	 */
-	bs = (a->entry->start + a->offset + a->common->boundary) &
-	    ~(a->common->boundary - 1);
+	bs = rounddown2(a->entry->start + a->offset + a->common->boundary,
+	    a->common->boundary);
 	start = roundup2(bs, a->common->alignment);
 	/* DMAR_PAGE_SIZE to create gap after new entry. */
 	if (start + a->offset + a->size + DMAR_PAGE_SIZE <=
@@ -371,12 +374,12 @@
 	 */
 	a->entry->end = a->entry->start + a->size;
 
-	next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
+	next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, prev);
 	KASSERT(next->start >= a->entry->end &&
 	    next->start - a->entry->start >= a->size &&
 	    prev->end <= a->entry->end,
 	    ("dmar_gas_match_insert hole failed %p prev (%jx, %jx) "
-	    "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->ctx,
+	    "free_after %jx next (%jx, %jx) entry (%jx, %jx)", a->domain,
 	    (uintmax_t)prev->start, (uintmax_t)prev->end,
 	    (uintmax_t)prev->free_after,
 	    (uintmax_t)next->start, (uintmax_t)next->end,
@@ -385,19 +388,19 @@
 	prev->free_after = a->entry->start - prev->end;
 	a->entry->free_after = next->start - a->entry->end;
 
-	found = dmar_gas_rb_insert(a->ctx, a->entry);
+	found = dmar_gas_rb_insert(a->domain, a->entry);
 	KASSERT(found, ("found dup %p start %jx size %jx",
-	    a->ctx, (uintmax_t)a->entry->start, (uintmax_t)a->size));
+	    a->domain, (uintmax_t)a->entry->start, (uintmax_t)a->size));
 	a->entry->flags = DMAR_MAP_ENTRY_MAP;
 
-	KASSERT(RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root,
+	KASSERT(RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root,
 	    a->entry) == prev,
 	    ("entry %p prev %p inserted prev %p", a->entry, prev,
-	    RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
-	KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root,
+	    RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
+	KASSERT(RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
 	    a->entry) == next,
 	    ("entry %p next %p inserted next %p", a->entry, next,
-	    RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, a->entry)));
+	    RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root, a->entry)));
 }
 
 static int
@@ -434,11 +437,12 @@
 	struct dmar_map_entry *next, *prev, find_entry;
 
 	find_entry.start = a->common->highaddr;
-	next = RB_NFIND(dmar_gas_entries_tree, &a->ctx->rb_root, &find_entry);
+	next = RB_NFIND(dmar_gas_entries_tree, &a->domain->rb_root,
+	    &find_entry);
 	if (next == NULL)
 		return (ENOMEM);
-	prev = RB_PREV(dmar_gas_entries_tree, &a->ctx->rb_root, next);
-	KASSERT(prev != NULL, ("no prev %p %jx", a->ctx,
+	prev = RB_PREV(dmar_gas_entries_tree, &a->domain->rb_root, next);
+	KASSERT(prev != NULL, ("no prev %p %jx", a->domain,
 	    (uintmax_t)find_entry.start));
 	for (;;) {
 		a->entry->start = prev->start + DMAR_PAGE_SIZE;
@@ -446,7 +450,7 @@
 			a->entry->start = a->common->highaddr;
 		a->entry->start = roundup2(a->entry->start,
 		    a->common->alignment);
-		if (dmar_gas_match_one(a, prev, a->ctx->end)) {
+		if (dmar_gas_match_one(a, prev, a->domain->end)) {
 			dmar_gas_match_insert(a, prev);
 			return (0);
 		}
@@ -459,16 +463,17 @@
 		 * non-optimal way.
 		 */
 		prev = next;
-		next = RB_NEXT(dmar_gas_entries_tree, &a->ctx->rb_root, prev);
-		KASSERT(next != NULL, ("no next %p %jx", a->ctx,
+		next = RB_NEXT(dmar_gas_entries_tree, &a->domain->rb_root,
+		    prev);
+		KASSERT(next != NULL, ("no next %p %jx", a->domain,
 		    (uintmax_t)find_entry.start));
-		if (next->end >= a->ctx->end)
+		if (next->end >= a->domain->end)
 			return (ENOMEM);
 	}
 }
 
 static int
-dmar_gas_find_space(struct dmar_ctx *ctx,
+dmar_gas_find_space(struct dmar_domain *domain,
     const struct bus_dma_tag_common *common, dmar_gaddr_t size,
     int offset, u_int flags, struct dmar_map_entry *entry)
 {
@@ -475,11 +480,11 @@
 	struct dmar_gas_match_args a;
 	int error;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
-	KASSERT(entry->flags == 0, ("dirty entry %p %p", ctx, entry));
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
+	KASSERT(entry->flags == 0, ("dirty entry %p %p", domain, entry));
 	KASSERT((size & DMAR_PAGE_MASK) == 0, ("size %jx", (uintmax_t)size));
 
-	a.ctx = ctx;
+	a.domain = domain;
 	a.size = size;
 	a.offset = offset;
 	a.common = common;
@@ -488,7 +493,7 @@
 
 	/* Handle lower region. */
 	if (common->lowaddr > 0) {
-		error = dmar_gas_lowermatch(&a, RB_ROOT(&ctx->rb_root));
+		error = dmar_gas_lowermatch(&a, RB_ROOT(&domain->rb_root));
 		if (error == 0)
 			return (0);
 		KASSERT(error == ENOMEM,
@@ -495,7 +500,7 @@
 		    ("error %d from dmar_gas_lowermatch", error));
 	}
 	/* Handle upper region. */
-	if (common->highaddr >= ctx->end)
+	if (common->highaddr >= domain->end)
 		return (ENOMEM);
 	error = dmar_gas_uppermatch(&a);
 	KASSERT(error == ENOMEM,
@@ -504,13 +509,13 @@
 }
 
 static int
-dmar_gas_alloc_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_alloc_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
     u_int flags)
 {
 	struct dmar_map_entry *next, *prev;
 	bool found;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 
 	if ((entry->start & DMAR_PAGE_MASK) != 0 ||
 	    (entry->end & DMAR_PAGE_MASK) != 0)
@@ -517,13 +522,13 @@
 		return (EINVAL);
 	if (entry->start >= entry->end)
 		return (EINVAL);
-	if (entry->end >= ctx->end)
+	if (entry->end >= domain->end)
 		return (EINVAL);
 
-	next = RB_NFIND(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	KASSERT(next != NULL, ("next must be non-null %p %jx", ctx,
+	next = RB_NFIND(dmar_gas_entries_tree, &domain->rb_root, entry);
+	KASSERT(next != NULL, ("next must be non-null %p %jx", domain,
 	    (uintmax_t)entry->start));
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, next);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, next);
 	/* prev could be NULL */
 
 	/*
@@ -551,23 +556,23 @@
 
 	if (prev != NULL && prev->end > entry->start) {
 		/* This assumes that prev is the placeholder entry. */
-		dmar_gas_rb_remove(ctx, prev);
+		dmar_gas_rb_remove(domain, prev);
 		prev = NULL;
 	}
 	if (next != NULL && next->start < entry->end) {
-		dmar_gas_rb_remove(ctx, next);
+		dmar_gas_rb_remove(domain, next);
 		next = NULL;
 	}
 
-	found = dmar_gas_rb_insert(ctx, entry);
+	found = dmar_gas_rb_insert(domain, entry);
 	KASSERT(found, ("found RMRR dup %p start %jx end %jx",
-	    ctx, (uintmax_t)entry->start, (uintmax_t)entry->end));
+	    domain, (uintmax_t)entry->start, (uintmax_t)entry->end));
 	entry->flags = DMAR_MAP_ENTRY_RMRR;
 
 #ifdef INVARIANTS
 	struct dmar_map_entry *ip, *in;
-	ip = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	in = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
+	ip = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	in = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
 	KASSERT(prev == NULL || ip == prev,
 	    ("RMRR %p (%jx %jx) prev %p (%jx %jx) ins prev %p (%jx %jx)",
 	    entry, entry->start, entry->end, prev,
@@ -584,47 +589,47 @@
 }
 
 void
-dmar_gas_free_space(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_space(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 	KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
 	    DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_MAP,
-	    ("permanent entry %p %p", ctx, entry));
+	    ("permanent entry %p %p", domain, entry));
 
-	dmar_gas_rb_remove(ctx, entry);
+	dmar_gas_rb_remove(domain, entry);
 	entry->flags &= ~DMAR_MAP_ENTRY_MAP;
 #ifdef INVARIANTS
 	if (dmar_check_free)
-		dmar_gas_check_free(ctx);
+		dmar_gas_check_free(domain);
 #endif
 }
 
 void
-dmar_gas_free_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry)
+dmar_gas_free_region(struct dmar_domain *domain, struct dmar_map_entry *entry)
 {
 	struct dmar_map_entry *next, *prev;
 
-	DMAR_CTX_ASSERT_LOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_LOCKED(domain);
 	KASSERT((entry->flags & (DMAR_MAP_ENTRY_PLACE | DMAR_MAP_ENTRY_RMRR |
 	    DMAR_MAP_ENTRY_MAP)) == DMAR_MAP_ENTRY_RMRR,
-	    ("non-RMRR entry %p %p", ctx, entry));
+	    ("non-RMRR entry %p %p", domain, entry));
 
-	prev = RB_PREV(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	next = RB_NEXT(dmar_gas_entries_tree, &ctx->rb_root, entry);
-	dmar_gas_rb_remove(ctx, entry);
+	prev = RB_PREV(dmar_gas_entries_tree, &domain->rb_root, entry);
+	next = RB_NEXT(dmar_gas_entries_tree, &domain->rb_root, entry);
+	dmar_gas_rb_remove(domain, entry);
 	entry->flags &= ~DMAR_MAP_ENTRY_RMRR;
 
 	if (prev == NULL)
-		dmar_gas_rb_insert(ctx, ctx->first_place);
+		dmar_gas_rb_insert(domain, domain->first_place);
 	if (next == NULL)
-		dmar_gas_rb_insert(ctx, ctx->last_place);
+		dmar_gas_rb_insert(domain, domain->last_place);
 }
 
 int
-dmar_gas_map(struct dmar_ctx *ctx, const struct bus_dma_tag_common *common,
-    dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma,
-    struct dmar_map_entry **res)
+dmar_gas_map(struct dmar_domain *domain,
+    const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset,
+    u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res)
 {
 	struct dmar_map_entry *entry;
 	int error;
@@ -632,29 +637,31 @@
 	KASSERT((flags & ~(DMAR_GM_CANWAIT | DMAR_GM_CANSPLIT)) == 0,
 	    ("invalid flags 0x%x", flags));
 
-	entry = dmar_gas_alloc_entry(ctx, (flags & DMAR_GM_CANWAIT) != 0 ?
+	entry = dmar_gas_alloc_entry(domain, (flags & DMAR_GM_CANWAIT) != 0 ?
 	    DMAR_PGF_WAITOK : 0);
 	if (entry == NULL)
 		return (ENOMEM);
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_find_space(ctx, common, size, offset, flags, entry);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_find_space(domain, common, size, offset, flags,
+	    entry);
 	if (error == ENOMEM) {
-		DMAR_CTX_UNLOCK(ctx);
-		dmar_gas_free_entry(ctx, entry);
+		DMAR_DOMAIN_UNLOCK(domain);
+		dmar_gas_free_entry(domain, entry);
 		return (error);
 	}
 #ifdef INVARIANTS
 	if (dmar_check_free)
-		dmar_gas_check_free(ctx);
+		dmar_gas_check_free(domain);
 #endif
 	KASSERT(error == 0,
 	    ("unexpected error %d from dmar_gas_find_entry", error));
-	KASSERT(entry->end < ctx->end, ("allocated GPA %jx, max GPA %jx",
-	    (uintmax_t)entry->end, (uintmax_t)ctx->end));
+	KASSERT(entry->end < domain->end, ("allocated GPA %jx, max GPA %jx",
+	    (uintmax_t)entry->end, (uintmax_t)domain->end));
 	entry->flags |= eflags;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 
-	error = ctx_map_buf(ctx, entry->start, entry->end - entry->start, ma,
+	error = domain_map_buf(domain, entry->start, entry->end - entry->start,
+	    ma,
 	    ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0 ? DMAR_PTE_SNP : 0) |
@@ -661,11 +668,11 @@
 	    ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
 	    (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
 	if (error == ENOMEM) {
-		dmar_ctx_unload_entry(entry, true);
+		dmar_domain_unload_entry(entry, true);
 		return (error);
 	}
 	KASSERT(error == 0,
-	    ("unexpected error %d from ctx_map_buf", error));
+	    ("unexpected error %d from domain_map_buf", error));
 
 	*res = entry;
 	return (0);
@@ -672,30 +679,30 @@
 }
 
 int
-dmar_gas_map_region(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
+dmar_gas_map_region(struct dmar_domain *domain, struct dmar_map_entry *entry,
     u_int eflags, u_int flags, vm_page_t *ma)
 {
 	dmar_gaddr_t start;
 	int error;
 
-	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", ctx,
+	KASSERT(entry->flags == 0, ("used RMRR entry %p %p %x", domain,
 	    entry, entry->flags));
 	KASSERT((flags & ~(DMAR_GM_CANWAIT)) == 0,
 	    ("invalid flags 0x%x", flags));
 
 	start = entry->start;
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_alloc_region(ctx, entry, flags);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_alloc_region(domain, entry, flags);
 	if (error != 0) {
-		DMAR_CTX_UNLOCK(ctx);
+		DMAR_DOMAIN_UNLOCK(domain);
 		return (error);
 	}
 	entry->flags |= eflags;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (entry->end == entry->start)
 		return (0);
 
-	error = ctx_map_buf(ctx, entry->start, entry->end - entry->start,
+	error = domain_map_buf(domain, entry->start, entry->end - entry->start,
 	    ma + OFF_TO_IDX(start - entry->start),
 	    ((eflags & DMAR_MAP_ENTRY_READ) != 0 ? DMAR_PTE_R : 0) |
 	    ((eflags & DMAR_MAP_ENTRY_WRITE) != 0 ? DMAR_PTE_W : 0) |
@@ -703,31 +710,31 @@
 	    ((eflags & DMAR_MAP_ENTRY_TM) != 0 ? DMAR_PTE_TM : 0),
 	    (flags & DMAR_GM_CANWAIT) != 0 ? DMAR_PGF_WAITOK : 0);
 	if (error == ENOMEM) {
-		dmar_ctx_unload_entry(entry, false);
+		dmar_domain_unload_entry(entry, false);
 		return (error);
 	}
 	KASSERT(error == 0,
-	    ("unexpected error %d from ctx_map_buf", error));
+	    ("unexpected error %d from domain_map_buf", error));
 
 	return (0);
 }
 
 int
-dmar_gas_reserve_region(struct dmar_ctx *ctx, dmar_gaddr_t start,
+dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start,
     dmar_gaddr_t end)
 {
 	struct dmar_map_entry *entry;
 	int error;
 
-	entry = dmar_gas_alloc_entry(ctx, DMAR_PGF_WAITOK);
+	entry = dmar_gas_alloc_entry(domain, DMAR_PGF_WAITOK);
 	entry->start = start;
 	entry->end = end;
-	DMAR_CTX_LOCK(ctx);
-	error = dmar_gas_alloc_region(ctx, entry, DMAR_GM_CANWAIT);
+	DMAR_DOMAIN_LOCK(domain);
+	error = dmar_gas_alloc_region(domain, entry, DMAR_GM_CANWAIT);
 	if (error == 0)
 		entry->flags |= DMAR_MAP_ENTRY_UNMAPPED;
-	DMAR_CTX_UNLOCK(ctx);
+	DMAR_DOMAIN_UNLOCK(domain);
 	if (error != 0)
-		dmar_gas_free_entry(ctx, entry);
+		dmar_gas_free_entry(domain, entry);
 	return (error);
 }
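
Both mapping paths above, dmar_gas_map() and dmar_gas_map_region(), translate the map-entry eflags into hardware PTE permission bits before handing the range to domain_map_buf().  A minimal sketch of that translation, using the constants visible in the diff but with a hypothetical helper name:

/* Sketch only: map-entry eflags -> PTE bits, mirroring the expressions above. */
static uint64_t
map_entry_eflags_to_pte(u_int eflags)
{
	uint64_t pflags;

	pflags = 0;
	if ((eflags & DMAR_MAP_ENTRY_READ) != 0)
		pflags |= DMAR_PTE_R;	/* allow DMA reads */
	if ((eflags & DMAR_MAP_ENTRY_WRITE) != 0)
		pflags |= DMAR_PTE_W;	/* allow DMA writes */
	if ((eflags & DMAR_MAP_ENTRY_SNOOP) != 0)
		pflags |= DMAR_PTE_SNP;	/* force snooping of the DMA access */
	if ((eflags & DMAR_MAP_ENTRY_TM) != 0)
		pflags |= DMAR_PTE_TM;	/* transient mapping hint */
	return (pflags);
}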

Modified: trunk/sys/x86/iommu/intel_idpgtbl.c
===================================================================
--- trunk/sys/x86/iommu/intel_idpgtbl.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_idpgtbl.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_idpgtbl.c 286854 2015-08-17 18:36:16Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_idpgtbl.c 286777 2015-08-14 13:51:59Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +49,7 @@
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/uio.h>
+#include <sys/vmem.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
@@ -66,8 +67,8 @@
 #include <x86/iommu/busdma_dmar.h>
 #include <x86/iommu/intel_dmar.h>
 
-static int ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    dmar_gaddr_t size, int flags);
+static int domain_unmap_buf_locked(struct dmar_domain *domain,
+    dmar_gaddr_t base, dmar_gaddr_t size, int flags);
 
 /*
  * The cache of the identity mapping page tables for the DMARs.  Using
@@ -105,7 +106,7 @@
  *   mapped by the page table page.
  */
 static void
-ctx_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
+domain_idmap_nextlvl(struct idpgtbl *tbl, int lvl, vm_pindex_t idx,
     dmar_gaddr_t addr)
 {
 	vm_page_t m1;
@@ -124,7 +125,7 @@
 	pg_sz = pglvl_page_size(tbl->pglvl, lvl);
 	if (lvl != tbl->leaf) {
 		for (i = 0, f = addr; i < DMAR_NPTEPG; i++, f += pg_sz)
-			ctx_idmap_nextlvl(tbl, lvl + 1, base + i, f);
+			domain_idmap_nextlvl(tbl, lvl + 1, base + i, f);
 	}
 	VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
 	pte = dmar_map_pgtbl(tbl->pgtbl_obj, idx, DMAR_PGF_WAITOK, &sf);
@@ -146,7 +147,7 @@
 			    VM_PAGE_TO_PHYS(m1)) | DMAR_PTE_R | DMAR_PTE_W;
 		}
 	}
-	/* ctx_get_idmap_pgtbl flushes CPU cache if needed. */
+	/* domain_get_idmap_pgtbl flushes CPU cache if needed. */
 	dmar_unmap_pgtbl(sf);
 	VM_OBJECT_WLOCK(tbl->pgtbl_obj);
 }
@@ -160,7 +161,7 @@
  * maxaddr is typically mapped.
  */
 vm_object_t
-ctx_get_idmap_pgtbl(struct dmar_ctx *ctx, dmar_gaddr_t maxaddr)
+domain_get_idmap_pgtbl(struct dmar_domain *domain, dmar_gaddr_t maxaddr)
 {
 	struct dmar_unit *unit;
 	struct idpgtbl *tbl;
@@ -173,8 +174,8 @@
 	/*
 	 * First, determine where to stop the paging structures.
 	 */
-	for (i = 0; i < ctx->pglvl; i++) {
-		if (i == ctx->pglvl - 1 || ctx_is_sp_lvl(ctx, i)) {
+	for (i = 0; i < domain->pglvl; i++) {
+		if (i == domain->pglvl - 1 || domain_is_sp_lvl(domain, i)) {
 			leaf = i;
 			break;
 		}
@@ -191,12 +192,12 @@
 	sx_slock(&idpgtbl_lock);
 	LIST_FOREACH(tbl, &idpgtbls, link) {
 		if (tbl->maxaddr >= maxaddr &&
-		    dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+		    dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
 		    tbl->leaf == leaf) {
 			res = tbl->pgtbl_obj;
 			vm_object_reference(res);
 			sx_sunlock(&idpgtbl_lock);
-			ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+			domain->pglvl = tbl->pglvl; /* XXXKIB ? */
 			goto end;
 		}
 	}
@@ -210,12 +211,12 @@
 	sx_xlock(&idpgtbl_lock);
 	LIST_FOREACH(tbl, &idpgtbls, link) {
 		if (tbl->maxaddr >= maxaddr &&
-		    dmar_pglvl_supported(ctx->dmar, tbl->pglvl) &&
+		    dmar_pglvl_supported(domain->dmar, tbl->pglvl) &&
 		    tbl->leaf == leaf) {
 			res = tbl->pgtbl_obj;
 			vm_object_reference(res);
 			sx_xunlock(&idpgtbl_lock);
-			ctx->pglvl = tbl->pglvl; /* XXXKIB ? */
+			domain->pglvl = tbl->pglvl; /* XXXKIB ? */
 			return (res);
 		}
 	}
@@ -224,13 +225,13 @@
 	 * Still not found, create new page table.
 	 */
 	tbl = malloc(sizeof(*tbl), M_DMAR_IDPGTBL, M_WAITOK);
-	tbl->pglvl = ctx->pglvl;
+	tbl->pglvl = domain->pglvl;
 	tbl->leaf = leaf;
 	tbl->maxaddr = maxaddr;
 	tbl->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
 	    IDX_TO_OFF(pglvl_max_pages(tbl->pglvl)), 0, 0, NULL);
 	VM_OBJECT_WLOCK(tbl->pgtbl_obj);
-	ctx_idmap_nextlvl(tbl, 0, 0, 0);
+	domain_idmap_nextlvl(tbl, 0, 0, 0);
 	VM_OBJECT_WUNLOCK(tbl->pgtbl_obj);
 	LIST_INSERT_HEAD(&idpgtbls, tbl, link);
 	res = tbl->pgtbl_obj;
@@ -251,7 +252,7 @@
 	 * If DMAR cannot look into the chipset write buffer, flush it
 	 * as well.
 	 */
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	if (!DMAR_IS_COHERENT(unit)) {
 		VM_OBJECT_WLOCK(res);
 		for (m = vm_page_lookup(res, 0); m != NULL;
@@ -320,10 +321,11 @@
  * the level lvl.
  */
 static int
-ctx_pgtbl_pte_off(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_pte_off(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
 {
 
-	base >>= DMAR_PAGE_SHIFT + (ctx->pglvl - lvl - 1) * DMAR_NPTEPGSHIFT;
+	base >>= DMAR_PAGE_SHIFT + (domain->pglvl - lvl - 1) *
+	    DMAR_NPTEPGSHIFT;
 	return (base & DMAR_PTEMASK);
 }
 
@@ -333,21 +335,24 @@
  * lvl.
  */
 static vm_pindex_t
-ctx_pgtbl_get_pindex(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl)
+domain_pgtbl_get_pindex(struct dmar_domain *domain, dmar_gaddr_t base, int lvl)
 {
 	vm_pindex_t idx, pidx;
 	int i;
 
-	KASSERT(lvl >= 0 && lvl < ctx->pglvl, ("wrong lvl %p %d", ctx, lvl));
+	KASSERT(lvl >= 0 && lvl < domain->pglvl,
+	    ("wrong lvl %p %d", domain, lvl));
 
-	for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx)
-		idx = ctx_pgtbl_pte_off(ctx, base, i) + pidx * DMAR_NPTEPG + 1;
+	for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx) {
+		idx = domain_pgtbl_pte_off(domain, base, i) +
+		    pidx * DMAR_NPTEPG + 1;
+	}
 	return (idx);
 }
 
 static dmar_pte_t *
-ctx_pgtbl_map_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags,
-    vm_pindex_t *idxp, struct sf_buf **sf)
+domain_pgtbl_map_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
+    int flags, vm_pindex_t *idxp, struct sf_buf **sf)
 {
 	vm_page_t m;
 	struct sf_buf *sfp;
@@ -354,10 +359,10 @@
 	dmar_pte_t *pte, *ptep;
 	vm_pindex_t idx, idx1;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 	KASSERT((flags & DMAR_PGF_OBJL) != 0, ("lost PGF_OBJL"));
 
-	idx = ctx_pgtbl_get_pindex(ctx, base, lvl);
+	idx = domain_pgtbl_get_pindex(domain, base, lvl);
 	if (*sf != NULL && idx == *idxp) {
 		pte = (dmar_pte_t *)sf_buf_kva(*sf);
 	} else {
@@ -365,15 +370,16 @@
 			dmar_unmap_pgtbl(*sf);
 		*idxp = idx;
 retry:
-		pte = dmar_map_pgtbl(ctx->pgtbl_obj, idx, flags, sf);
+		pte = dmar_map_pgtbl(domain->pgtbl_obj, idx, flags, sf);
 		if (pte == NULL) {
-			KASSERT(lvl > 0, ("lost root page table page %p", ctx));
+			KASSERT(lvl > 0,
+			    ("lost root page table page %p", domain));
 			/*
 			 * Page table page does not exist, allocate
 			 * it and create a pte in the preceding page level
 			 * to reference the allocated page table page.
 			 */
-			m = dmar_pgalloc(ctx->pgtbl_obj, idx, flags |
+			m = dmar_pgalloc(domain->pgtbl_obj, idx, flags |
 			    DMAR_PGF_ZERO);
 			if (m == NULL)
 				return (NULL);
@@ -381,25 +387,26 @@
 			/*
 			 * Prevent potential free while pgtbl_obj is
 			 * unlocked in the recursive call to
-			 * ctx_pgtbl_map_pte(), if other thread did
-			 * pte write and clean while the lock if
+			 * domain_pgtbl_map_pte(), if other thread did
+			 * pte write and clean while the lock is
 			 * dropped.
 			 */
 			m->wire_count++;
 
 			sfp = NULL;
-			ptep = ctx_pgtbl_map_pte(ctx, base, lvl - 1, flags,
-			    &idx1, &sfp);
+			ptep = domain_pgtbl_map_pte(domain, base, lvl - 1,
+			    flags, &idx1, &sfp);
 			if (ptep == NULL) {
 				KASSERT(m->pindex != 0,
-				    ("loosing root page %p", ctx));
+				    ("losing root page %p", domain));
 				m->wire_count--;
-				dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
+				dmar_pgfree(domain->pgtbl_obj, m->pindex,
+				    flags);
 				return (NULL);
 			}
 			dmar_pte_store(&ptep->pte, DMAR_PTE_R | DMAR_PTE_W |
 			    VM_PAGE_TO_PHYS(m));
-			dmar_flush_pte_to_ram(ctx->dmar, ptep);
+			dmar_flush_pte_to_ram(domain->dmar, ptep);
 			sf_buf_page(sfp)->wire_count += 1;
 			m->wire_count--;
 			dmar_unmap_pgtbl(sfp);
@@ -407,13 +414,13 @@
 			goto retry;
 		}
 	}
-	pte += ctx_pgtbl_pte_off(ctx, base, lvl);
+	pte += domain_pgtbl_pte_off(domain, base, lvl);
 	return (pte);
 }
 
 static int
-ctx_map_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    vm_page_t *ma, uint64_t pflags, int flags)
+domain_map_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags)
 {
 	dmar_pte_t *pte;
 	struct sf_buf *sf;
@@ -422,7 +429,7 @@
 	int lvl;
 	bool superpage;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 
 	base1 = base;
 	size1 = size;
@@ -432,15 +439,15 @@
 	for (sf = NULL, pi = 0; size > 0; base += pg_sz, size -= pg_sz,
 	    pi += run_sz) {
 		for (lvl = 0, c = 0, superpage = false;; lvl++) {
-			pg_sz = ctx_page_size(ctx, lvl);
+			pg_sz = domain_page_size(domain, lvl);
 			run_sz = pg_sz >> DMAR_PAGE_SHIFT;
-			if (lvl == ctx->pglvl - 1)
+			if (lvl == domain->pglvl - 1)
 				break;
 			/*
 			 * Check if the current base suitable for the
 			 * superpage mapping.  First, verify the level.
 			 */
-			if (!ctx_is_sp_lvl(ctx, lvl))
+			if (!domain_is_sp_lvl(domain, lvl))
 				continue;
 			/*
 			 * Next, look at the size of the mapping and
@@ -464,22 +471,23 @@
 			}
 		}
 		KASSERT(size >= pg_sz,
-		    ("mapping loop overflow %p %jx %jx %jx", ctx,
+		    ("mapping loop overflow %p %jx %jx %jx", domain,
 		    (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
 		KASSERT(pg_sz > 0, ("pg_sz 0 lvl %d", lvl));
-		pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
+		pte = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
 		if (pte == NULL) {
 			KASSERT((flags & DMAR_PGF_WAITOK) == 0,
-			    ("failed waitable pte alloc %p", ctx));
+			    ("failed waitable pte alloc %p", domain));
 			if (sf != NULL)
 				dmar_unmap_pgtbl(sf);
-			ctx_unmap_buf_locked(ctx, base1, base - base1, flags);
+			domain_unmap_buf_locked(domain, base1, base - base1,
+			    flags);
 			TD_PINNED_ASSERT;
 			return (ENOMEM);
 		}
 		dmar_pte_store(&pte->pte, VM_PAGE_TO_PHYS(ma[pi]) | pflags |
 		    (superpage ? DMAR_PTE_SP : 0));
-		dmar_flush_pte_to_ram(ctx->dmar, pte);
+		dmar_flush_pte_to_ram(domain->dmar, pte);
 		sf_buf_page(sf)->wire_count += 1;
 	}
 	if (sf != NULL)
@@ -489,32 +497,32 @@
 }
 
 int
-ctx_map_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
+domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size,
     vm_page_t *ma, uint64_t pflags, int flags)
 {
 	struct dmar_unit *unit;
 	int error;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 
-	KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
-	    ("modifying idmap pagetable ctx %p", ctx));
+	KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+	    ("modifying idmap pagetable domain %p", domain));
 	KASSERT((base & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((size & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(size > 0, ("zero size %p %jx %jx", ctx, (uintmax_t)base,
+	KASSERT(size > 0, ("zero size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(base < (1ULL << ctx->agaw),
-	    ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
-	KASSERT(base + size < (1ULL << ctx->agaw),
-	    ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
+	KASSERT(base < (1ULL << domain->agaw),
+	    ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
+	KASSERT(base + size < (1ULL << domain->agaw),
+	    ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
 	KASSERT(base + size > base,
-	    ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+	    ("size overflow %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((pflags & (DMAR_PTE_R | DMAR_PTE_W)) != 0,
 	    ("neither read nor write %jx", (uintmax_t)pflags));
@@ -524,21 +532,21 @@
 	KASSERT((pflags & DMAR_PTE_SNP) == 0 ||
 	    (unit->hw_ecap & DMAR_ECAP_SC) != 0,
 	    ("PTE_SNP for dmar without snoop control %p %jx",
-	    ctx, (uintmax_t)pflags));
+	    domain, (uintmax_t)pflags));
 	KASSERT((pflags & DMAR_PTE_TM) == 0 ||
 	    (unit->hw_ecap & DMAR_ECAP_DI) != 0,
 	    ("PTE_TM for dmar without DIOTLB %p %jx",
-	    ctx, (uintmax_t)pflags));
+	    domain, (uintmax_t)pflags));
 	KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
 
-	DMAR_CTX_PGLOCK(ctx);
-	error = ctx_map_buf_locked(ctx, base, size, ma, pflags, flags);
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGLOCK(domain);
+	error = domain_map_buf_locked(domain, base, size, ma, pflags, flags);
+	DMAR_DOMAIN_PGUNLOCK(domain);
 	if (error != 0)
 		return (error);
 
 	if ((unit->hw_cap & DMAR_CAP_CM) != 0)
-		ctx_flush_iotlb_sync(ctx, base, size);
+		domain_flush_iotlb_sync(domain, base, size);
 	else if ((unit->hw_cap & DMAR_CAP_RWBF) != 0) {
 		/* See 11.1 Write Buffer Flushing. */
 		DMAR_LOCK(unit);
@@ -548,11 +556,13 @@
 	return (0);
 }
 
-static void ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    int lvl, int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_fs);
+static void domain_unmap_clear_pte(struct dmar_domain *domain,
+    dmar_gaddr_t base, int lvl, int flags, dmar_pte_t *pte,
+    struct sf_buf **sf, bool free_fs);
 
 static void
-ctx_free_pgtbl_pde(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl, int flags)
+domain_free_pgtbl_pde(struct dmar_domain *domain, dmar_gaddr_t base,
+    int lvl, int flags)
 {
 	struct sf_buf *sf;
 	dmar_pte_t *pde;
@@ -559,18 +569,18 @@
 	vm_pindex_t idx;
 
 	sf = NULL;
-	pde = ctx_pgtbl_map_pte(ctx, base, lvl, flags, &idx, &sf);
-	ctx_unmap_clear_pte(ctx, base, lvl, flags, pde, &sf, true);
+	pde = domain_pgtbl_map_pte(domain, base, lvl, flags, &idx, &sf);
+	domain_unmap_clear_pte(domain, base, lvl, flags, pde, &sf, true);
 }
 
 static void
-ctx_unmap_clear_pte(struct dmar_ctx *ctx, dmar_gaddr_t base, int lvl,
+domain_unmap_clear_pte(struct dmar_domain *domain, dmar_gaddr_t base, int lvl,
     int flags, dmar_pte_t *pte, struct sf_buf **sf, bool free_sf)
 {
 	vm_page_t m;
 
 	dmar_pte_clear(&pte->pte);
-	dmar_flush_pte_to_ram(ctx->dmar, pte);
+	dmar_flush_pte_to_ram(domain->dmar, pte);
 	m = sf_buf_page(*sf);
 	if (free_sf) {
 		dmar_unmap_pgtbl(*sf);
@@ -580,13 +590,13 @@
 	if (m->wire_count != 0)
 		return;
 	KASSERT(lvl != 0,
-	    ("lost reference (lvl) on root pg ctx %p base %jx lvl %d",
-	    ctx, (uintmax_t)base, lvl));
+	    ("lost reference (lvl) on root pg domain %p base %jx lvl %d",
+	    domain, (uintmax_t)base, lvl));
 	KASSERT(m->pindex != 0,
-	    ("lost reference (idx) on root pg ctx %p base %jx lvl %d",
-	    ctx, (uintmax_t)base, lvl));
-	dmar_pgfree(ctx->pgtbl_obj, m->pindex, flags);
-	ctx_free_pgtbl_pde(ctx, base, lvl - 1, flags);
+	    ("lost reference (idx) on root pg domain %p base %jx lvl %d",
+	    domain, (uintmax_t)base, lvl));
+	dmar_pgfree(domain->pgtbl_obj, m->pindex, flags);
+	domain_free_pgtbl_pde(domain, base, lvl - 1, flags);
 }
 
 /*
@@ -593,7 +603,7 @@
  * Assumes that the unmap is never partial.
  */
 static int
-ctx_unmap_buf_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
+domain_unmap_buf_locked(struct dmar_domain *domain, dmar_gaddr_t base,
     dmar_gaddr_t size, int flags)
 {
 	dmar_pte_t *pte;
@@ -602,26 +612,26 @@
 	dmar_gaddr_t pg_sz;
 	int lvl;
 
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
 	if (size == 0)
 		return (0);
 
-	KASSERT((ctx->flags & DMAR_CTX_IDMAP) == 0,
-	    ("modifying idmap pagetable ctx %p", ctx));
+	KASSERT((domain->flags & DMAR_DOMAIN_IDMAP) == 0,
+	    ("modifying idmap pagetable domain %p", domain));
 	KASSERT((base & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned base %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned base %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((size & DMAR_PAGE_MASK) == 0,
-	    ("non-aligned size %p %jx %jx", ctx, (uintmax_t)base,
+	    ("non-aligned size %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
-	KASSERT(base < (1ULL << ctx->agaw),
-	    ("base too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
-	KASSERT(base + size < (1ULL << ctx->agaw),
-	    ("end too high %p %jx %jx agaw %d", ctx, (uintmax_t)base,
-	    (uintmax_t)size, ctx->agaw));
+	KASSERT(base < (1ULL << domain->agaw),
+	    ("base too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
+	KASSERT(base + size < (1ULL << domain->agaw),
+	    ("end too high %p %jx %jx agaw %d", domain, (uintmax_t)base,
+	    (uintmax_t)size, domain->agaw));
 	KASSERT(base + size > base,
-	    ("size overflow %p %jx %jx", ctx, (uintmax_t)base,
+	    ("size overflow %p %jx %jx", domain, (uintmax_t)base,
 	    (uintmax_t)size));
 	KASSERT((flags & ~DMAR_PGF_WAITOK) == 0, ("invalid flags %x", flags));
 
@@ -630,26 +640,27 @@
 	TD_PREP_PINNED_ASSERT;
 
 	for (sf = NULL; size > 0; base += pg_sz, size -= pg_sz) {
-		for (lvl = 0; lvl < ctx->pglvl; lvl++) {
-			if (lvl != ctx->pglvl - 1 && !ctx_is_sp_lvl(ctx, lvl))
+		for (lvl = 0; lvl < domain->pglvl; lvl++) {
+			if (lvl != domain->pglvl - 1 &&
+			    !domain_is_sp_lvl(domain, lvl))
 				continue;
-			pg_sz = ctx_page_size(ctx, lvl);
+			pg_sz = domain_page_size(domain, lvl);
 			if (pg_sz > size)
 				continue;
-			pte = ctx_pgtbl_map_pte(ctx, base, lvl, flags,
+			pte = domain_pgtbl_map_pte(domain, base, lvl, flags,
 			    &idx, &sf);
 			KASSERT(pte != NULL,
 			    ("sleeping or page missed %p %jx %d 0x%x",
-			    ctx, (uintmax_t)base, lvl, flags));
+			    domain, (uintmax_t)base, lvl, flags));
 			if ((pte->pte & DMAR_PTE_SP) != 0 ||
-			    lvl == ctx->pglvl - 1) {
-				ctx_unmap_clear_pte(ctx, base, lvl, flags,
-				    pte, &sf, false);
+			    lvl == domain->pglvl - 1) {
+				domain_unmap_clear_pte(domain, base, lvl,
+				    flags, pte, &sf, false);
 				break;
 			}
 		}
 		KASSERT(size >= pg_sz,
-		    ("unmapping loop overflow %p %jx %jx %jx", ctx,
+		    ("unmapping loop overflow %p %jx %jx %jx", domain,
 		    (uintmax_t)base, (uintmax_t)size, (uintmax_t)pg_sz));
 	}
 	if (sf != NULL)
@@ -664,54 +675,58 @@
 }
 
 int
-ctx_unmap_buf(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size,
-    int flags)
+domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, int flags)
 {
 	int error;
 
-	DMAR_CTX_PGLOCK(ctx);
-	error = ctx_unmap_buf_locked(ctx, base, size, flags);
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGLOCK(domain);
+	error = domain_unmap_buf_locked(domain, base, size, flags);
+	DMAR_DOMAIN_PGUNLOCK(domain);
 	return (error);
 }
 
 int
-ctx_alloc_pgtbl(struct dmar_ctx *ctx)
+domain_alloc_pgtbl(struct dmar_domain *domain)
 {
 	vm_page_t m;
 
-	KASSERT(ctx->pgtbl_obj == NULL, ("already initialized %p", ctx));
+	KASSERT(domain->pgtbl_obj == NULL,
+	    ("already initialized %p", domain));
 
-	ctx->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
-	    IDX_TO_OFF(pglvl_max_pages(ctx->pglvl)), 0, 0, NULL);
-	DMAR_CTX_PGLOCK(ctx);
-	m = dmar_pgalloc(ctx->pgtbl_obj, 0, DMAR_PGF_WAITOK |
+	domain->pgtbl_obj = vm_pager_allocate(OBJT_PHYS, NULL,
+	    IDX_TO_OFF(pglvl_max_pages(domain->pglvl)), 0, 0, NULL);
+	DMAR_DOMAIN_PGLOCK(domain);
+	m = dmar_pgalloc(domain->pgtbl_obj, 0, DMAR_PGF_WAITOK |
 	    DMAR_PGF_ZERO | DMAR_PGF_OBJL);
 	/* No implicit free of the top level page table page. */
 	m->wire_count = 1;
-	DMAR_CTX_PGUNLOCK(ctx);
+	DMAR_DOMAIN_PGUNLOCK(domain);
+	DMAR_LOCK(domain->dmar);
+	domain->flags |= DMAR_DOMAIN_PGTBL_INITED;
+	DMAR_UNLOCK(domain->dmar);
 	return (0);
 }
 
 void
-ctx_free_pgtbl(struct dmar_ctx *ctx)
+domain_free_pgtbl(struct dmar_domain *domain)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
-	obj = ctx->pgtbl_obj;
+	obj = domain->pgtbl_obj;
 	if (obj == NULL) {
-		KASSERT((ctx->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
-		    (ctx->flags & DMAR_CTX_IDMAP) != 0,
-		    ("lost pagetable object ctx %p", ctx));
+		KASSERT((domain->dmar->hw_ecap & DMAR_ECAP_PT) != 0 &&
+		    (domain->flags & DMAR_DOMAIN_IDMAP) != 0,
+		    ("lost pagetable object domain %p", domain));
 		return;
 	}
-	DMAR_CTX_ASSERT_PGLOCKED(ctx);
-	ctx->pgtbl_obj = NULL;
+	DMAR_DOMAIN_ASSERT_PGLOCKED(domain);
+	domain->pgtbl_obj = NULL;
 
-	if ((ctx->flags & DMAR_CTX_IDMAP) != 0) {
+	if ((domain->flags & DMAR_DOMAIN_IDMAP) != 0) {
 		put_idmap_pgtbl(obj);
-		ctx->flags &= ~DMAR_CTX_IDMAP;
+		domain->flags &= ~DMAR_DOMAIN_IDMAP;
 		return;
 	}
 
@@ -724,7 +739,7 @@
 }
 
 static inline uint64_t
-ctx_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
+domain_wait_iotlb_flush(struct dmar_unit *unit, uint64_t wt, int iro)
 {
 	uint64_t iotlbr;
 
@@ -740,7 +755,8 @@
 }
 
 void
-ctx_flush_iotlb_sync(struct dmar_ctx *ctx, dmar_gaddr_t base, dmar_gaddr_t size)
+domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size)
 {
 	struct dmar_unit *unit;
 	dmar_gaddr_t isize;
@@ -747,14 +763,14 @@
 	uint64_t iotlbr;
 	int am, iro;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	KASSERT(!unit->qi_enabled, ("dmar%d: sync iotlb flush call",
 	    unit->unit));
 	iro = DMAR_ECAP_IRO(unit->hw_ecap) * 16;
 	DMAR_LOCK(unit);
 	if ((unit->hw_cap & DMAR_CAP_PSI) == 0 || size > 2 * 1024 * 1024) {
-		iotlbr = ctx_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
-		    DMAR_IOTLB_DID(ctx->domain), iro);
+		iotlbr = domain_wait_iotlb_flush(unit, DMAR_IOTLB_IIRG_DOM |
+		    DMAR_IOTLB_DID(domain->domain), iro);
 		KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
 		    DMAR_IOTLB_IAIG_INVLD,
 		    ("dmar%d: invalidation failed %jx", unit->unit,
@@ -763,9 +779,9 @@
 		for (; size > 0; base += isize, size -= isize) {
 			am = calc_am(unit, base, size, &isize);
 			dmar_write8(unit, iro, base | am);
-			iotlbr = ctx_wait_iotlb_flush(unit,
-			    DMAR_IOTLB_IIRG_PAGE | DMAR_IOTLB_DID(ctx->domain),
-			    iro);
+			iotlbr = domain_wait_iotlb_flush(unit,
+			    DMAR_IOTLB_IIRG_PAGE |
+			    DMAR_IOTLB_DID(domain->domain), iro);
 			KASSERT((iotlbr & DMAR_IOTLB_IAIG_MASK) !=
 			    DMAR_IOTLB_IAIG_INVLD,
 			    ("dmar%d: PSI invalidation failed "

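All page table pages of a domain live in a single pgtbl_obj VM object, and domain_pgtbl_get_pindex() above linearizes the tree into object page indices: the root table is pindex 0, and the page reached through slot s of a parent at pindex p sits at s + p * DMAR_NPTEPG + 1.  A stand-alone sketch of the same arithmetic (hypothetical names, user-space, assuming the x86 values of 4 KB pages and 512 PTEs per page):

#include <stdio.h>
#include <stdint.h>

#define	PG_SHIFT	12			/* DMAR_PAGE_SHIFT */
#define	NPTEPGSHIFT	9			/* DMAR_NPTEPGSHIFT */
#define	NPTEPG		(1 << NPTEPGSHIFT)	/* 512 PTEs per page */
#define	PTEMASK		(NPTEPG - 1)

/* Slot selected by 'base' in a page table page at level 'lvl'. */
static unsigned int
pte_off(int pglvl, uint64_t base, int lvl)
{

	return ((base >> (PG_SHIFT + (pglvl - lvl - 1) * NPTEPGSHIFT)) &
	    PTEMASK);
}

int
main(void)
{
	uint64_t base = 0x12345678000ULL;	/* arbitrary guest address */
	uint64_t idx, pidx;
	int pglvl = 4, lvl, i;

	for (lvl = 0; lvl < pglvl; lvl++) {
		/* Same recurrence as domain_pgtbl_get_pindex(). */
		for (pidx = idx = 0, i = 0; i < lvl; i++, pidx = idx)
			idx = pte_off(pglvl, base, i) + pidx * NPTEPG + 1;
		printf("lvl %d: page table page pindex %ju, slot %u\n",
		    lvl, (uintmax_t)idx, pte_off(pglvl, base, lvl));
	}
	return (0);
}
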
Added: trunk/sys/x86/iommu/intel_intrmap.c
===================================================================
--- trunk/sys/x86/iommu/intel_intrmap.c	                        (rev 0)
+++ trunk/sys/x86/iommu/intel_intrmap.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,381 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_intrmap.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/rman.h>
+#include <sys/rwlock.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <sys/vmem.h>
+#include <machine/bus.h>
+#include <machine/intr_machdep.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <x86/include/apicreg.h>
+#include <x86/include/apicvar.h>
+#include <x86/include/busdma_impl.h>
+#include <x86/iommu/intel_reg.h>
+#include <x86/iommu/busdma_dmar.h>
+#include <x86/iommu/intel_dmar.h>
+#include <dev/pci/pcivar.h>
+#include <x86/iommu/iommu_intrmap.h>
+
+static struct dmar_unit *dmar_ir_find(device_t src, uint16_t *rid,
+    int *is_dmar);
+static void dmar_ir_program_irte(struct dmar_unit *unit, u_int idx,
+    uint64_t low, uint16_t rid);
+static int dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie);
+
+int
+iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count)
+{
+	struct dmar_unit *unit;
+	vmem_addr_t vmem_res;
+	u_int idx, i;
+	int error;
+
+	unit = dmar_ir_find(src, NULL, NULL);
+	if (unit == NULL || !unit->ir_enabled) {
+		for (i = 0; i < count; i++)
+			cookies[i] = -1;
+		return (EOPNOTSUPP);
+	}
+
+	error = vmem_alloc(unit->irtids, count, M_FIRSTFIT | M_NOWAIT,
+	    &vmem_res);
+	if (error != 0) {
+		KASSERT(error != EOPNOTSUPP,
+		    ("impossible EOPNOTSUPP from vmem"));
+		return (error);
+	}
+	idx = vmem_res;
+	for (i = 0; i < count; i++)
+		cookies[i] = idx + i;
+	return (0);
+}
+
+int
+iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+    uint64_t *addr, uint32_t *data)
+{
+	struct dmar_unit *unit;
+	uint64_t low;
+	uint16_t rid;
+	int is_dmar;
+
+	unit = dmar_ir_find(src, &rid, &is_dmar);
+	if (is_dmar) {
+		KASSERT(unit == NULL, ("DMAR cannot translate itself"));
+
+		/*
+		 * See VT-d specification, 5.1.6 Remapping Hardware -
+		 * Interrupt Programming.
+		 */
+		*data = vector;
+		*addr = MSI_INTEL_ADDR_BASE | ((cpu & 0xff) << 12);
+		if (x2apic_mode)
+			*addr |= ((uint64_t)cpu & 0xffffff00) << 32;
+		else
+			KASSERT(cpu <= 0xff, ("cpu id too big %d", cpu));
+		return (0);
+	}
+	if (unit == NULL || !unit->ir_enabled || cookie == -1)
+		return (EOPNOTSUPP);
+
+	low = (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+	    DMAR_IRTE1_DST_xAPIC(cpu)) | DMAR_IRTE1_V(vector) |
+	    DMAR_IRTE1_DLM_FM | DMAR_IRTE1_TM_EDGE | DMAR_IRTE1_RH_DIRECT |
+	    DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+	dmar_ir_program_irte(unit, cookie, low, rid);
+
+	if (addr != NULL) {
+		/*
+		 * See VT-d specification, 5.1.5.2 MSI and MSI-X
+		 * Register Programming.
+		 */
+		*addr = MSI_INTEL_ADDR_BASE | ((cookie & 0x7fff) << 5) |
+		    ((cookie & 0x8000) << 2) | 0x18;
+		*data = 0;
+	}
+	return (0);
+}
+
+int
+iommu_unmap_msi_intr(device_t src, u_int cookie)
+{
+	struct dmar_unit *unit;
+
+	if (cookie == -1)
+		return (0);
+	unit = dmar_ir_find(src, NULL, NULL);
+	return (dmar_ir_free_irte(unit, cookie));
+}
+
+int
+iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+    bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo)
+{
+	struct dmar_unit *unit;
+	vmem_addr_t vmem_res;
+	uint64_t low, iorte;
+	u_int idx;
+	int error;
+	uint16_t rid;
+
+	unit = dmar_find_ioapic(ioapic_id, &rid);
+	if (unit == NULL || !unit->ir_enabled) {
+		*cookie = -1;
+		return (EOPNOTSUPP);
+	}
+
+	error = vmem_alloc(unit->irtids, 1, M_FIRSTFIT | M_NOWAIT, &vmem_res);
+	if (error != 0) {
+		KASSERT(error != EOPNOTSUPP,
+		    ("impossible EOPNOTSUPP from vmem"));
+		return (error);
+	}
+	idx = vmem_res;
+	low = 0;
+	switch (irq) {
+	case IRQ_EXTINT:
+		low |= DMAR_IRTE1_DLM_ExtINT;
+		break;
+	case IRQ_NMI:
+		low |= DMAR_IRTE1_DLM_NMI;
+		break;
+	case IRQ_SMI:
+		low |= DMAR_IRTE1_DLM_SMI;
+		break;
+	default:
+		KASSERT(vector != 0, ("No vector for IRQ %u", irq));
+		low |= DMAR_IRTE1_DLM_FM | DMAR_IRTE1_V(vector);
+		break;
+	}
+	low |= (DMAR_X2APIC(unit) ? DMAR_IRTE1_DST_x2APIC(cpu) :
+	    DMAR_IRTE1_DST_xAPIC(cpu)) |
+	    (edge ? DMAR_IRTE1_TM_EDGE : DMAR_IRTE1_TM_LEVEL) |
+	    DMAR_IRTE1_RH_DIRECT | DMAR_IRTE1_DM_PHYSICAL | DMAR_IRTE1_P;
+	dmar_ir_program_irte(unit, idx, low, rid);
+
+	if (hi != NULL) {
+		/*
+		 * See VT-d specification, 5.1.5.1 I/OxAPIC
+		 * Programming.
+		 */
+		iorte = (1ULL << 48) | ((uint64_t)(idx & 0x7fff) << 49) |
+		    ((idx & 0x8000) != 0 ? (1 << 11) : 0) |
+		    (edge ? IOART_TRGREDG : IOART_TRGRLVL) |
+		    (activehi ? IOART_INTAHI : IOART_INTALO) |
+		    IOART_DELFIXED | vector;
+		*hi = iorte >> 32;
+		*lo = iorte;
+	}
+	*cookie = idx;
+	return (0);
+}
+
+int
+iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie)
+{
+	struct dmar_unit *unit;
+	u_int idx;
+
+	idx = *cookie;
+	if (idx == -1)
+		return (0);
+	*cookie = -1;
+	unit = dmar_find_ioapic(ioapic_id, NULL);
+	KASSERT(unit != NULL && unit->ir_enabled,
+	    ("unmap: cookie %d unit %p", idx, unit));
+	return (dmar_ir_free_irte(unit, idx));
+}
+
+static struct dmar_unit *
+dmar_ir_find(device_t src, uint16_t *rid, int *is_dmar)
+{
+	devclass_t src_class;
+	struct dmar_unit *unit;
+
+	/*
+	 * We need to determine whether the interrupt source generates
+	 * FSB interrupts.  If so, it is either the DMAR itself, in
+	 * which case interrupts are not remapped, or an HPET, whose
+	 * interrupts are remapped.  For HPET, the source id is
+	 * reported by the HPET record in the DMAR ACPI table.
+	 */
+	if (is_dmar != NULL)
+		*is_dmar = FALSE;
+	src_class = device_get_devclass(src);
+	if (src_class == devclass_find("dmar")) {
+		unit = NULL;
+		if (is_dmar != NULL)
+			*is_dmar = TRUE;
+	} else if (src_class == devclass_find("hpet")) {
+		unit = dmar_find_hpet(src, rid);
+	} else {
+		unit = dmar_find(src);
+		if (unit != NULL && rid != NULL)
+			dmar_get_requester(src, rid);
+	}
+	return (unit);
+}
+
+static void
+dmar_ir_program_irte(struct dmar_unit *unit, u_int idx, uint64_t low,
+    uint16_t rid)
+{
+	dmar_irte_t *irte;
+	uint64_t high;
+
+	KASSERT(idx < unit->irte_cnt,
+	    ("bad cookie %d %d", idx, unit->irte_cnt));
+	irte = &(unit->irt[idx]);
+	high = DMAR_IRTE2_SVT_RID | DMAR_IRTE2_SQ_RID |
+	    DMAR_IRTE2_SID_RID(rid);
+	device_printf(unit->dev,
+	    "programming irte[%d] rid %#x high %#jx low %#jx\n",
+	    idx, rid, (uintmax_t)high, (uintmax_t)low);
+	DMAR_LOCK(unit);
+	if ((irte->irte1 & DMAR_IRTE1_P) != 0) {
+		/*
+		 * The rte is already valid.  Assume that the request
+		 * is to remap the interrupt for balancing.  Only the
+		 * low word of the rte needs to be changed.  Assert
+		 * that the high word contains the expected value.
+		 */
+		KASSERT(irte->irte2 == high,
+		    ("irte2 mismatch, %jx %jx", (uintmax_t)irte->irte2,
+		    (uintmax_t)high));
+		dmar_pte_update(&irte->irte1, low);
+	} else {
+		dmar_pte_store(&irte->irte2, high);
+		dmar_pte_store(&irte->irte1, low);
+	}
+	dmar_qi_invalidate_iec(unit, idx, 1);
+	DMAR_UNLOCK(unit);
+
+}
+
+static int
+dmar_ir_free_irte(struct dmar_unit *unit, u_int cookie)
+{
+	dmar_irte_t *irte;
+
+	KASSERT(unit != NULL && unit->ir_enabled,
+	    ("unmap: cookie %d unit %p", cookie, unit));
+	KASSERT(cookie < unit->irte_cnt,
+	    ("bad cookie %u %u", cookie, unit->irte_cnt));
+	irte = &(unit->irt[cookie]);
+	dmar_pte_clear(&irte->irte1);
+	dmar_pte_clear(&irte->irte2);
+	DMAR_LOCK(unit);
+	dmar_qi_invalidate_iec(unit, cookie, 1);
+	DMAR_UNLOCK(unit);
+	vmem_free(unit->irtids, cookie, 1);
+	return (0);
+}
+
+static u_int
+clp2(u_int v)
+{
+
+	return (powerof2(v) ? v : 1 << fls(v));
+}
+
+int
+dmar_init_irt(struct dmar_unit *unit)
+{
+
+	if ((unit->hw_ecap & DMAR_ECAP_IR) == 0)
+		return (0);
+	unit->ir_enabled = 1;
+	TUNABLE_INT_FETCH("hw.dmar.ir", &unit->ir_enabled);
+	if (!unit->ir_enabled)
+		return (0);
+	if (!unit->qi_enabled) {
+		unit->ir_enabled = 0;
+		if (bootverbose)
+			device_printf(unit->dev,
+	     "QI disabled, disabling interrupt remapping\n");
+		return (0);
+	}
+	unit->irte_cnt = clp2(num_io_irqs);
+	unit->irt = (dmar_irte_t *)(uintptr_t)kmem_alloc_contig(kernel_arena,
+	    unit->irte_cnt * sizeof(dmar_irte_t), M_ZERO | M_WAITOK, 0,
+	    dmar_high, PAGE_SIZE, 0, DMAR_IS_COHERENT(unit) ?
+	    VM_MEMATTR_DEFAULT : VM_MEMATTR_UNCACHEABLE);
+	if (unit->irt == NULL)
+		return (ENOMEM);
+	unit->irt_phys = pmap_kextract((vm_offset_t)unit->irt);
+	unit->irtids = vmem_create("dmarirt", 0, unit->irte_cnt, 1, 0,
+	    M_FIRSTFIT | M_NOWAIT);
+	DMAR_LOCK(unit);
+	dmar_load_irt_ptr(unit);
+	dmar_qi_invalidate_iec_glob(unit);
+	DMAR_UNLOCK(unit);
+
+	/*
+	 * Initialize mappings for already configured interrupt pins.
+	 * Required because otherwise the interrupts fault without
+	 * IRTEs.
+	 */
+	intr_reprogram();
+
+	DMAR_LOCK(unit);
+	dmar_enable_ir(unit);
+	DMAR_UNLOCK(unit);
+	return (0);
+}
+
+void
+dmar_fini_irt(struct dmar_unit *unit)
+{
+
+	unit->ir_enabled = 0;
+	if (unit->irt != NULL) {
+		dmar_disable_ir(unit);
+		dmar_qi_invalidate_iec_glob(unit);
+		vmem_destroy(unit->irtids);
+		kmem_free(kernel_arena, (vm_offset_t)unit->irt,
+		    unit->irte_cnt * sizeof(dmar_irte_t));
+	}
+}
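
dmar_init_irt() above rounds num_io_irqs up to a power of two with clp2() to size the interrupt remapping table, and dmar_load_irt_ptr() (in the intel_utils.c diff below) encodes that size into the IRTA register as s = fls(irte_cnt) - 2, i.e. a table of 2^(s+1) entries.  A tiny user-space sketch of the same arithmetic, with a portable stand-in for the kernel's fls():

#include <stdio.h>

/* Stand-in for the kernel's fls(): 1-based index of the highest set bit. */
static int
fls_sketch(unsigned int v)
{
	int i;

	for (i = 0; v != 0; v >>= 1)
		i++;
	return (i);
}

/* clp2() from intel_intrmap.c above: round up to the next power of two. */
static unsigned int
clp2_sketch(unsigned int v)
{

	return ((v & (v - 1)) == 0 ? v : 1U << fls_sketch(v));
}

int
main(void)
{
	unsigned int irqs, cnt;
	int s;

	for (irqs = 2; irqs <= 768; irqs *= 3) {
		cnt = clp2_sketch(irqs);
		s = fls_sketch(cnt) - 2;	/* IRTA.S: 2^(s+1) entries */
		printf("num_io_irqs %u -> irte_cnt %u, IRTA.S %d\n",
		    irqs, cnt, s);
	}
	return (0);
}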


Property changes on: trunk/sys/x86/iommu/intel_intrmap.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/iommu/intel_qi.c
===================================================================
--- trunk/sys/x86/iommu/intel_qi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_qi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_qi.c 284019 2015-06-05 08:23:33Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_qi.c 320357 2017-06-26 12:30:39Z kib $");
 
 #include "opt_acpi.h"
 
@@ -41,7 +41,9 @@
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -70,27 +72,27 @@
 static int
 dmar_enable_qi(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd |= DMAR_GCMD_QIE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+	    != 0));
+	return (error);
 }
 
 static int
 dmar_disable_qi(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd &= ~DMAR_GCMD_QIE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_QIES)
+	    == 0));
+	return (error);
 }
 
 static void
@@ -170,7 +172,8 @@
 }
 
 static void
-dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq)
+dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct dmar_qi_genseq *pseq,
+    bool emit_wait)
 {
 	struct dmar_qi_genseq gsec;
 	uint32_t seq;
@@ -191,17 +194,21 @@
 	seq = unit->inv_waitd_seq++;
 	pseq->gen = unit->inv_waitd_gen;
 	pseq->seq = seq;
-	dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+	if (emit_wait) {
+		dmar_qi_ensure(unit, 1);
+		dmar_qi_emit_wait_descr(unit, seq, true, true, false);
+	}
 }
 
 static void
-dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq)
+dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct dmar_qi_genseq *gseq,
+    bool nowait)
 {
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->inv_seq_waiters++;
 	while (!dmar_qi_seq_processed(unit, gseq)) {
-		if (cold) {
+		if (cold || nowait) {
 			cpu_spinwait();
 		} else {
 			msleep(&unit->inv_seq_waiters, &unit->lock, 0,
@@ -212,14 +219,14 @@
 }
 
 void
-dmar_qi_invalidate_locked(struct dmar_ctx *ctx, dmar_gaddr_t base,
-    dmar_gaddr_t size, struct dmar_qi_genseq *pseq)
+dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t base,
+    dmar_gaddr_t size, struct dmar_qi_genseq *pseq, bool emit_wait)
 {
 	struct dmar_unit *unit;
 	dmar_gaddr_t isize;
 	int am;
 
-	unit = ctx->dmar;
+	unit = domain->dmar;
 	DMAR_ASSERT_LOCKED(unit);
 	for (; size > 0; base += isize, size -= isize) {
 		am = calc_am(unit, base, size, &isize);
@@ -227,13 +234,10 @@
 		dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV |
 		    DMAR_IQ_DESCR_IOTLB_PAGE | DMAR_IQ_DESCR_IOTLB_DW |
 		    DMAR_IQ_DESCR_IOTLB_DR |
-		    DMAR_IQ_DESCR_IOTLB_DID(ctx->domain),
+		    DMAR_IQ_DESCR_IOTLB_DID(domain->domain),
 		    base | am);
 	}
-	if (pseq != NULL) {
-		dmar_qi_ensure(unit, 1);
-		dmar_qi_emit_wait_seq(unit, pseq);
-	}
+	dmar_qi_emit_wait_seq(unit, pseq, emit_wait);
 	dmar_qi_advance_tail(unit);
 }
 
@@ -245,9 +249,9 @@
 	DMAR_ASSERT_LOCKED(unit);
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
 
 void
@@ -259,11 +263,64 @@
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB |
 	    DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
 
+void
+dmar_qi_invalidate_iec_glob(struct dmar_unit *unit)
+{
+	struct dmar_qi_genseq gseq;
+
+	DMAR_ASSERT_LOCKED(unit);
+	dmar_qi_ensure(unit, 2);
+	dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV, 0);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	dmar_qi_advance_tail(unit);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
+}
+
+void
+dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt)
+{
+	struct dmar_qi_genseq gseq;
+	u_int c, l;
+
+	DMAR_ASSERT_LOCKED(unit);
+	KASSERT(start < unit->irte_cnt && start < start + cnt &&
+	    start + cnt <= unit->irte_cnt,
+	    ("inv iec overflow %d %d %d", unit->irte_cnt, start, cnt));
+	for (; cnt > 0; cnt -= c, start += c) {
+		l = ffs(start | cnt) - 1;
+		c = 1 << l;
+		dmar_qi_ensure(unit, 1);
+		dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV |
+		    DMAR_IQ_DESCR_IEC_IDX | DMAR_IQ_DESCR_IEC_IIDX(start) |
+		    DMAR_IQ_DESCR_IEC_IM(l), 0);
+	}
+	dmar_qi_ensure(unit, 1);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	dmar_qi_advance_tail(unit);
+
+	/*
+	 * The caller of this function, in particular
+	 * dmar_ir_program_irte(), may run in a context where sleeping
+	 * is forbidden (in fact, the intr_table_lock mutex may be
+	 * held, locked from intr_shuffle_irqs()).  Wait for the
+	 * invalidation completion using a busy wait.
+	 *
+	 * The impact on the interrupt input setup code is small: the
+	 * expected overhead is comparable with a chipset register
+	 * read.  It is more harmful for parallel DMA operations,
+	 * since we own the dmar unit lock until the whole
+	 * invalidation queue is processed, which includes requests
+	 * possibly issued before ours.
+	 */
+	dmar_qi_wait_for_seq(unit, &gseq, true);
+}
+
 int
 dmar_qi_intr(void *arg)
 {
@@ -271,7 +328,7 @@
 
 	unit = arg;
 	KASSERT(unit->qi_enabled, ("dmar%d: QI is not enabled", unit->unit));
-	taskqueue_enqueue_fast(unit->qi_taskqueue, &unit->qi_task);
+	taskqueue_enqueue(unit->qi_taskqueue, &unit->qi_task);
 	return (FILTER_HANDLED);
 }
 
@@ -289,12 +346,11 @@
 		entry = TAILQ_FIRST(&unit->tlb_flush_entries);
 		if (entry == NULL)
 			break;
-		if ((entry->gseq.gen == 0 && entry->gseq.seq == 0) ||
-		    !dmar_qi_seq_processed(unit, &entry->gseq))
+		if (!dmar_qi_seq_processed(unit, &entry->gseq))
 			break;
 		TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link);
 		DMAR_UNLOCK(unit);
-		dmar_ctx_free_entry(entry, (entry->flags &
+		dmar_domain_free_entry(entry, (entry->flags &
 		    DMAR_MAP_ENTRY_QI_NF) == 0);
 		DMAR_LOCK(unit);
 	}
@@ -324,7 +380,7 @@
 
 	TAILQ_INIT(&unit->tlb_flush_entries);
 	TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit);
-	unit->qi_taskqueue = taskqueue_create_fast("dmar", M_WAITOK,
+	unit->qi_taskqueue = taskqueue_create_fast("dmarqf", M_WAITOK,
 	    taskqueue_thread_enqueue, &unit->qi_taskqueue);
 	taskqueue_start_threads(&unit->qi_taskqueue, 1, PI_AV,
 	    "dmar%d qi taskq", unit->unit);
@@ -377,9 +433,9 @@
 	DMAR_LOCK(unit);
 	/* quiesce */
 	dmar_qi_ensure(unit, 1);
-	dmar_qi_emit_wait_seq(unit, &gseq);
+	dmar_qi_emit_wait_seq(unit, &gseq, true);
 	dmar_qi_advance_tail(unit);
-	dmar_qi_wait_for_seq(unit, &gseq);
+	dmar_qi_wait_for_seq(unit, &gseq, false);
 	/* only after the quiesce, disable queue */
 	dmar_disable_qi_intr(unit);
 	dmar_disable_qi(unit);

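dmar_qi_invalidate_iec() above splits the IRTE range into naturally aligned power-of-two blocks (l = ffs(start | cnt) - 1, c = 1 << l), since the index-mask field of a queued IEC descriptor expresses a power-of-two run of entries, and then busy-waits on completion for the reason spelled out in the comment.  A stand-alone sketch of just the block decomposition (hypothetical names; it prints the blocks instead of queueing descriptors):

#include <stdio.h>

/* Stand-in for the kernel's ffs(): 1-based index of the lowest set bit. */
static int
ffs_sketch(unsigned int v)
{
	int i;

	if (v == 0)
		return (0);
	for (i = 1; (v & 1) == 0; v >>= 1)
		i++;
	return (i);
}

/*
 * Same decomposition as the loop in dmar_qi_invalidate_iec(): split
 * [start, start + cnt) into naturally aligned power-of-two blocks.
 */
static void
iec_blocks(unsigned int start, unsigned int cnt)
{
	unsigned int c;
	int l;

	for (; cnt > 0; cnt -= c, start += c) {
		l = ffs_sketch(start | cnt) - 1;
		c = 1U << l;
		printf("  IIDX %u, IM %d (%u entries)\n", start, l, c);
	}
}

int
main(void)
{

	/* e.g. invalidate 12 IRTEs starting at index 16. */
	iec_blocks(16, 12);
	return (0);
}
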
Modified: trunk/sys/x86/iommu/intel_quirks.c
===================================================================
--- trunk/sys/x86/iommu/intel_quirks.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_quirks.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -1,6 +1,6 @@
 /* $MidnightBSD$ */
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013, 2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_quirks.c 257251 2013-10-28 13:33:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_quirks.c 280260 2015-03-19 13:57:47Z kib $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -43,6 +43,7 @@
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <machine/bus.h>
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/accommon.h>
@@ -60,7 +61,7 @@
 #include <x86/iommu/intel_dmar.h>
 #include <dev/pci/pcivar.h>
 
-typedef void (*dmar_quirk_fun)(struct dmar_unit *);
+typedef void (*dmar_quirk_cpu_fun)(struct dmar_unit *);
 
 struct intel_dmar_quirk_cpu {
 	u_int ext_family;
@@ -68,17 +69,21 @@
 	u_int family_code;
 	u_int model;
 	u_int stepping;
-	dmar_quirk_fun quirk;
+	dmar_quirk_cpu_fun quirk;
 	const char *descr;
 };
 
+typedef void (*dmar_quirk_nb_fun)(struct dmar_unit *, device_t nb);
+
 struct intel_dmar_quirk_nb {
 	u_int dev_id;
 	u_int rev_no;
-	dmar_quirk_fun quirk;
+	dmar_quirk_nb_fun quirk;
 	const char *descr;
 };
 
+#define	QUIRK_NB_ALL_REV	0xffffffff
+
 static void
 dmar_match_quirks(struct dmar_unit *dmar,
     const struct intel_dmar_quirk_nb *nb_quirks, int nb_quirks_len,
@@ -100,13 +105,14 @@
 			for (i = 0; i < nb_quirks_len; i++) {
 				nb_quirk = &nb_quirks[i];
 				if (nb_quirk->dev_id == dev_id &&
-				    nb_quirk->rev_no == rev_no) {
+				    (nb_quirk->rev_no == rev_no ||
+				    nb_quirk->rev_no == QUIRK_NB_ALL_REV)) {
 					if (bootverbose) {
 						device_printf(dmar->dev,
 						    "NB IOMMU quirk %s\n",
 						    nb_quirk->descr);
 					}
-					nb_quirk->quirk(dmar);
+					nb_quirk->quirk(dmar, nb);
 				}
 			}
 		} else {
@@ -140,12 +146,29 @@
 }
 
 static void
-nb_5400_no_low_high_prot_mem(struct dmar_unit *unit)
+nb_5400_no_low_high_prot_mem(struct dmar_unit *unit, device_t nb __unused)
 {
 
 	unit->hw_cap &= ~(DMAR_CAP_PHMR | DMAR_CAP_PLMR);
 }
 
+static void
+nb_no_ir(struct dmar_unit *unit, device_t nb __unused)
+{
+
+	unit->hw_ecap &= ~(DMAR_ECAP_IR | DMAR_ECAP_EIM);
+}
+
+static void
+nb_5500_no_ir_rev13(struct dmar_unit *unit, device_t nb)
+{
+	u_int rev_no;
+
+	rev_no = pci_get_revid(nb);
+	if (rev_no <= 0x13)
+		nb_no_ir(unit, nb);
+}
+
 static const struct intel_dmar_quirk_nb pre_use_nb[] = {
 	{
 	    .dev_id = 0x4001, .rev_no = 0x20,
@@ -157,6 +180,26 @@
 	    .quirk = nb_5400_no_low_high_prot_mem,
 	    .descr = "5400 E23" /* no low/high protected memory */
 	},
+	{
+	    .dev_id = 0x3403, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3405, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3405, .rev_no = 0x22,
+	    .quirk = nb_no_ir,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
+	{
+	    .dev_id = 0x3406, .rev_no = QUIRK_NB_ALL_REV,
+	    .quirk = nb_5500_no_ir_rev13,
+	    .descr = "5500 E47, E53" /* interrupt remapping does not work */
+	},
 };
 
 static void

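The quirk tables above now distinguish CPU quirks from northbridge quirks (separate callback typedefs, with the northbridge device_t passed to the latter), and QUIRK_NB_ALL_REV lets an entry match any revision so that the callback can apply its own revision check, as nb_5500_no_ir_rev13() does.  A sketch of what an additional table entry would look like, in the style of intel_quirks.c; the device id, callback, and description are made up for illustration:

/* Hypothetical quirk callback: pretend interrupt remapping is broken. */
static void
nb_example_no_ir(struct dmar_unit *unit, device_t nb __unused)
{

	unit->hw_ecap &= ~(DMAR_ECAP_IR | DMAR_ECAP_EIM);
}

static const struct intel_dmar_quirk_nb example_nb_quirks[] = {
	{
	    .dev_id = 0x1234,			/* made-up PCI device id */
	    .rev_no = QUIRK_NB_ALL_REV,		/* match any revision */
	    .quirk = nb_example_no_ir,
	    .descr = "example erratum"		/* IR broken on this NB */
	},
};
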
Modified: trunk/sys/x86/iommu/intel_reg.h
===================================================================
--- trunk/sys/x86/iommu/intel_reg.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_reg.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
+ * $FreeBSD: stable/11/sys/x86/iommu/intel_reg.h 306466 2016-09-30 00:31:17Z jhb $
  */
 
 #ifndef __X86_IOMMU_INTEL_REG_H

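Several register-polling loops in intel_qi.c above and intel_utils.c below drop their "XXXKIB should have a timeout" comment and become DMAR_WAIT_UNTIL(cond); the macro itself lives in intel_dmar.h and is not part of this diff.  A rough, hypothetical sketch of such a bounded-polling helper, assuming it sets the caller's local error and is limited by the dmar_hw_timeout introduced below:

/*
 * Sketch only, not the real DMAR_WAIT_UNTIL(): spin on a condition and
 * give up with ETIMEDOUT once dmar_hw_timeout has elapsed.  The caller
 * declares a local 'int error' and returns it, as the converted
 * functions do.
 */
#define	DMAR_WAIT_UNTIL_SKETCH(cond) do {				\
	struct timespec deadline, now;					\
									\
	nanouptime(&now);						\
	deadline.tv_sec = now.tv_sec + dmar_hw_timeout.tv_sec;		\
	deadline.tv_nsec = now.tv_nsec + dmar_hw_timeout.tv_nsec;	\
	if (deadline.tv_nsec >= 1000000000) {				\
		deadline.tv_sec++;					\
		deadline.tv_nsec -= 1000000000;				\
	}								\
	error = 0;							\
	while (!(cond)) {						\
		nanouptime(&now);					\
		if (now.tv_sec > deadline.tv_sec ||			\
		    (now.tv_sec == deadline.tv_sec &&			\
		    now.tv_nsec >= deadline.tv_nsec)) {			\
			error = ETIMEDOUT;				\
			break;						\
		}							\
		cpu_spinwait();						\
	}								\
} while (0)
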
Modified: trunk/sys/x86/iommu/intel_utils.c
===================================================================
--- trunk/sys/x86/iommu/intel_utils.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/iommu/intel_utils.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_utils.c 279470 2015-03-01 04:22:06Z rstone $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/iommu/intel_utils.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -47,7 +47,9 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/tree.h>
+#include <sys/vmem.h>
 #include <dev/pci/pcivar.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -58,6 +60,8 @@
 #include <vm/vm_pageout.h>
 #include <machine/bus.h>
 #include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/include/apicvar.h>
 #include <x86/include/busdma_impl.h>
 #include <x86/iommu/intel_reg.h>
 #include <x86/iommu/busdma_dmar.h>
@@ -98,7 +102,6 @@
 	{.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL,
 	    .pglvl = 6}
 };
-#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0]))
 
 bool
 dmar_pglvl_supported(struct dmar_unit *unit, int pglvl)
@@ -105,7 +108,7 @@
 {
 	int i;
 
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if (sagaw_bits[i].pglvl != pglvl)
 			continue;
 		if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
@@ -115,26 +118,23 @@
 }
 
 int
-ctx_set_agaw(struct dmar_ctx *ctx, int mgaw)
+domain_set_agaw(struct dmar_domain *domain, int mgaw)
 {
 	int sagaw, i;
 
-	ctx->mgaw = mgaw;
-	sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap);
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	domain->mgaw = mgaw;
+	sagaw = DMAR_CAP_SAGAW(domain->dmar->hw_cap);
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if (sagaw_bits[i].agaw >= mgaw) {
-			ctx->agaw = sagaw_bits[i].agaw;
-			ctx->pglvl = sagaw_bits[i].pglvl;
-			ctx->awlvl = sagaw_bits[i].awlvl;
+			domain->agaw = sagaw_bits[i].agaw;
+			domain->pglvl = sagaw_bits[i].pglvl;
+			domain->awlvl = sagaw_bits[i].awlvl;
 			return (0);
 		}
 	}
-	device_printf(ctx->dmar->dev,
-	    "context request mgaw %d for pci%d:%d:%d:%d, "
-	    "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment, 
-	    pci_get_bus(ctx->ctx_tag.owner),
-	    pci_get_slot(ctx->ctx_tag.owner),
-	    pci_get_function(ctx->ctx_tag.owner), sagaw);
+	device_printf(domain->dmar->dev,
+	    "context request mgaw %d: no agaw found, sagaw %x\n",
+	    mgaw, sagaw);
 	return (EINVAL);
 }
 
@@ -150,18 +150,18 @@
 {
 	int i;
 
-	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
+	for (i = 0; i < nitems(sagaw_bits); i++) {
 		if ((1ULL << sagaw_bits[i].agaw) >= maxaddr &&
 		    (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
 			break;
 	}
-	if (allow_less && i == SIZEOF_SAGAW_BITS) {
+	if (allow_less && i == nitems(sagaw_bits)) {
 		do {
 			i--;
 		} while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap)
 		    == 0);
 	}
-	if (i < SIZEOF_SAGAW_BITS)
+	if (i < nitems(sagaw_bits))
 		return (sagaw_bits[i].agaw);
 	KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d",
 	    (uintmax_t) maxaddr, allow_less));
@@ -190,7 +190,7 @@
  * the context ctx.
  */
 int
-ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl)
+domain_is_sp_lvl(struct dmar_domain *domain, int lvl)
 {
 	int alvl, cap_sps;
 	static const int sagaw_sp[] = {
@@ -200,10 +200,9 @@
 		DMAR_CAP_SPS_1T
 	};
 
-	alvl = ctx->pglvl - lvl - 1;
-	cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap);
-	return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) &&
-	    (sagaw_sp[alvl] & cap_sps) != 0);
+	alvl = domain->pglvl - lvl - 1;
+	cap_sps = DMAR_CAP_SPS(domain->dmar->hw_cap);
+	return (alvl < nitems(sagaw_sp) && (sagaw_sp[alvl] & cap_sps) != 0);
 }
 
 dmar_gaddr_t
@@ -222,16 +221,15 @@
 	KASSERT(lvl >= 0 && lvl < total_pglvl,
 	    ("total %d lvl %d", total_pglvl, lvl));
 	rlvl = total_pglvl - lvl - 1;
-	KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]),
-	    ("sizeof pg_sz lvl %d", lvl));
+	KASSERT(rlvl < nitems(pg_sz), ("sizeof pg_sz lvl %d", lvl));
 	return (pg_sz[rlvl]);
 }
 
 dmar_gaddr_t
-ctx_page_size(struct dmar_ctx *ctx, int lvl)
+domain_page_size(struct dmar_domain *domain, int lvl)
 {
 
-	return (pglvl_page_size(ctx->pglvl, lvl));
+	return (pglvl_page_size(domain->pglvl, lvl));
 }
 
 int
@@ -260,9 +258,12 @@
 dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags)
 {
 	vm_page_t m;
-	int zeroed;
+	int zeroed, aflags;
 
 	zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
+	aflags = zeroed | VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP |
+	    ((flags & DMAR_PGF_WAITOK) != 0 ? VM_ALLOC_WAITFAIL :
+	    VM_ALLOC_NOWAIT);
 	for (;;) {
 		if ((flags & DMAR_PGF_OBJL) == 0)
 			VM_OBJECT_WLOCK(obj);
@@ -272,8 +273,7 @@
 				VM_OBJECT_WUNLOCK(obj);
 			break;
 		}
-		m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0,
+		m = vm_page_alloc_contig(obj, idx, aflags, 1, 0,
 		    dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 		if ((flags & DMAR_PGF_OBJL) == 0)
 			VM_OBJECT_WUNLOCK(obj);
@@ -285,11 +285,6 @@
 		}
 		if ((flags & DMAR_PGF_WAITOK) == 0)
 			break;
-		if ((flags & DMAR_PGF_OBJL) != 0)
-			VM_OBJECT_WUNLOCK(obj);
-		VM_WAIT;
-		if ((flags & DMAR_PGF_OBJL) != 0)
-			VM_OBJECT_WLOCK(obj);
 	}
 	return (m);
 }
@@ -405,6 +400,7 @@
 dmar_load_root_entry_ptr(struct dmar_unit *unit)
 {
 	vm_page_t root_entry;
+	int error;
 
 	/*
 	 * Access to the GCMD register must be serialized while the
@@ -417,10 +413,9 @@
 	VM_OBJECT_RUNLOCK(unit->ctx_obj);
 	dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry));
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS)
+	    != 0));
+	return (error);
 }
 
 /*
@@ -430,6 +425,7 @@
 int
 dmar_inv_ctx_glob(struct dmar_unit *unit)
 {
+	int error;
 
 	/*
 	 * Access to the CCMD register must be serialized while the
@@ -445,10 +441,9 @@
 	 * writes the upper dword last.
 	 */
 	dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32)
+	    == 0));
+	return (error);
 }
 
 /*
@@ -457,7 +452,7 @@
 int
 dmar_inv_iotlb_glob(struct dmar_unit *unit)
 {
-	int reg;
+	int error, reg;
 
 	DMAR_ASSERT_LOCKED(unit);
 	KASSERT(!unit->qi_enabled, ("QI enabled"));
@@ -466,11 +461,9 @@
 	/* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. */
 	dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT |
 	    DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
-	    DMAR_IOTLB_IVT32) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
+	    DMAR_IOTLB_IVT32) == 0));
+	return (error);
 }
 
 /*
@@ -480,6 +473,7 @@
 int
 dmar_flush_write_bufs(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 
@@ -490,38 +484,86 @@
 	    ("dmar%d: no RWBF", unit->unit));
 
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS)
+	    != 0));
+	return (error);
 }
 
 int
 dmar_enable_translation(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd |= DMAR_GCMD_TE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+	    != 0));
+	return (error);
 }
 
 int
 dmar_disable_translation(struct dmar_unit *unit)
 {
+	int error;
 
 	DMAR_ASSERT_LOCKED(unit);
 	unit->hw_gcmd &= ~DMAR_GCMD_TE;
 	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
-	/* XXXKIB should have a timeout */
-	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0)
-		cpu_spinwait();
-	return (0);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES)
+	    == 0));
+	return (error);
 }
 
+int
+dmar_load_irt_ptr(struct dmar_unit *unit)
+{
+	uint64_t irta, s;
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	irta = unit->irt_phys;
+	if (DMAR_X2APIC(unit))
+		irta |= DMAR_IRTA_EIME;
+	s = fls(unit->irte_cnt) - 2;
+	KASSERT(unit->irte_cnt >= 2 && s <= DMAR_IRTA_S_MASK &&
+	    powerof2(unit->irte_cnt),
+	    ("IRTA_REG_S overflow %x", unit->irte_cnt));
+	irta |= s;
+	dmar_write8(unit, DMAR_IRTA_REG, irta);
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SIRTP);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRTPS)
+	    != 0));
+	return (error);
+}
+
+int
+dmar_enable_ir(struct dmar_unit *unit)
+{
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	unit->hw_gcmd |= DMAR_GCMD_IRE;
+	unit->hw_gcmd &= ~DMAR_GCMD_CFI;
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+	    != 0));
+	return (error);
+}
+
+int
+dmar_disable_ir(struct dmar_unit *unit)
+{
+	int error;
+
+	DMAR_ASSERT_LOCKED(unit);
+	unit->hw_gcmd &= ~DMAR_GCMD_IRE;
+	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
+	DMAR_WAIT_UNTIL(((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_IRES)
+	    == 0));
+	return (error);
+}
+
 #define BARRIER_F				\
 	u_int f_done, f_inproc, f_wakeup;	\
 						\
@@ -573,18 +615,62 @@
 }
 
 int dmar_match_verbose;
+int dmar_batch_coalesce = 100;
+struct timespec dmar_hw_timeout = {
+	.tv_sec = 0,
+	.tv_nsec = 1000000
+};
 
-static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL,
-    "");
-SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN,
+static const uint64_t d = 1000000000;
+
+void
+dmar_update_timeout(uint64_t newval)
+{
+
+	/* XXXKIB not atomic */
+	dmar_hw_timeout.tv_sec = newval / d;
+	dmar_hw_timeout.tv_nsec = newval % d;
+}
+
+uint64_t
+dmar_get_timeout(void)
+{
+
+	return ((uint64_t)dmar_hw_timeout.tv_sec * d +
+	    dmar_hw_timeout.tv_nsec);
+}
+
+static int
+dmar_timeout_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int error;
+
+	val = dmar_get_timeout();
+	error = sysctl_handle_long(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	dmar_update_timeout(val);
+	return (error);
+}
+
+static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
+SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
     &dmar_tbl_pagecnt, 0,
     "Count of pages used for DMAR pagetables");
-SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
     &dmar_match_verbose, 0,
     "Verbose matching of the PCI devices to DMAR paths");
+SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RWTUN,
+    &dmar_batch_coalesce, 0,
+    "Number of qi batches between interrupts");
+SYSCTL_PROC(_hw_dmar, OID_AUTO, timeout,
+    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+    dmar_timeout_sysctl, "QU",
+    "Timeout for command wait, in nanoseconds");
 #ifdef INVARIANTS
 int dmar_check_free;
-SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
     &dmar_check_free, 0,
     "Check the GPA RBtree for free_down and free_after validity");
 #endif
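
The hunks above replace the open-coded cpu_spinwait() loops with DMAR_WAIT_UNTIL() and make the bound tunable through the new hw.dmar.timeout sysctl.  The macro itself is defined elsewhere in the DMAR headers; the sketch below is only an illustration of such a bounded wait, reusing dmar_hw_timeout and the caller-declared "error" variable that the functions above rely on.

/*
 * Illustrative sketch only, not the tree's actual DMAR_WAIT_UNTIL()
 * definition: spin on a hardware condition, but give up once
 * dmar_hw_timeout has elapsed and report ETIMEDOUT through the
 * caller-declared "error" variable.  A zero timeout degenerates to a
 * single check of the condition.
 */
#define	EXAMPLE_WAIT_UNTIL(cond) do {				\
	sbintime_t _deadline;					\
								\
	_deadline = sbinuptime() + tstosbt(dmar_hw_timeout);	\
	for (;;) {						\
		if (cond) {					\
			error = 0;				\
			break;					\
		}						\
		if (sbinuptime() >= _deadline) {		\
			error = ETIMEDOUT;			\
			break;					\
		}						\
		cpu_spinwait();					\
	}							\
} while (0)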

Added: trunk/sys/x86/iommu/iommu_intrmap.h
===================================================================
--- trunk/sys/x86/iommu/iommu_intrmap.h	                        (rev 0)
+++ trunk/sys/x86/iommu/iommu_intrmap.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,44 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/iommu/iommu_intrmap.h 280260 2015-03-19 13:57:47Z kib $
+ */
+
+#ifndef __X86_IOMMU_IOMMU_INTRMAP_H
+#define	__X86_IOMMU_IOMMU_INTRMAP_H
+
+int iommu_alloc_msi_intr(device_t src, u_int *cookies, u_int count);
+int iommu_map_msi_intr(device_t src, u_int cpu, u_int vector, u_int cookie,
+    uint64_t *addr, uint32_t *data);
+int iommu_unmap_msi_intr(device_t src, u_int cookie);
+int iommu_map_ioapic_intr(u_int ioapic_id, u_int cpu, u_int vector, bool edge,
+    bool activehi, int irq, u_int *cookie, uint32_t *hi, uint32_t *lo);
+int iommu_unmap_ioapic_intr(u_int ioapic_id, u_int *cookie);
+
+#endif


Property changes on: trunk/sys/x86/iommu/iommu_intrmap.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/isa/atpic.c
===================================================================
--- trunk/sys/x86/isa/atpic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atpic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,10 +30,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atpic.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atpic.c 340016 2018-11-01 18:34:26Z jhb $");
 
 #include "opt_auto_eoi.h"
 #include "opt_isa.h"
+#include "opt_mca.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -55,9 +56,12 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #include <isa/isavar.h>
+#ifdef DEV_MCA
+#include <i386/bios/mca_machdep.h>
+#endif
 
 #ifdef __amd64__
 #define	SDT_ATPIC	SDT_SYSIGT
@@ -70,12 +74,12 @@
 #define	MASTER	0
 #define	SLAVE	1
 
+#define	IMEN_MASK(ai)		(IRQ_MASK((ai)->at_irq))
+
 #define	NUM_ISA_IRQS		16
 
 static void	atpic_init(void *dummy);
 
-unsigned int imen;	/* XXX */
-
 inthand_t
 	IDTVEC(atpic_intr0), IDTVEC(atpic_intr1), IDTVEC(atpic_intr2),
 	IDTVEC(atpic_intr3), IDTVEC(atpic_intr4), IDTVEC(atpic_intr5),
@@ -83,19 +87,42 @@
 	IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
 	IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
 	IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until pti comes */
+inthand_t
+	IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+	IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+	IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+	IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+	IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+	IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+	IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+	IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
 
 #define	IRQ(ap, ai)	((ap)->at_irqbase + (ai)->at_irq)
 
-#define	ATPIC(io, base, eoi, imenptr)					\
-     	{ { atpic_enable_source, atpic_disable_source, (eoi),		\
-	    atpic_enable_intr, atpic_disable_intr, atpic_vector,	\
-	    atpic_source_pending, NULL,	atpic_resume, atpic_config_intr,\
-	    atpic_assign_cpu }, (io), (base), IDT_IO_INTS + (base),	\
-	    (imenptr) }
+#define	ATPIC(io, base, eoi) {						\
+		.at_pic = {						\
+			.pic_register_sources = atpic_register_sources,	\
+			.pic_enable_source = atpic_enable_source,	\
+			.pic_disable_source = atpic_disable_source,	\
+			.pic_eoi_source = (eoi),			\
+			.pic_enable_intr = atpic_enable_intr,		\
+			.pic_disable_intr = atpic_disable_intr,		\
+			.pic_vector = atpic_vector,			\
+			.pic_source_pending = atpic_source_pending,	\
+			.pic_resume = atpic_resume,			\
+			.pic_config_intr = atpic_config_intr,		\
+			.pic_assign_cpu = atpic_assign_cpu		\
+		},							\
+		.at_ioaddr = (io),					\
+		.at_irqbase = (base),					\
+		.at_intbase = IDT_IO_INTS + (base),			\
+		.at_imen = 0xff,					\
+	}
 
 #define	INTSRC(irq)							\
 	{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ),	\
-	    (irq) % 8 }
+	    IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
 
 struct atpic {
 	struct pic at_pic;
@@ -102,12 +129,12 @@
 	int	at_ioaddr;
 	int	at_irqbase;
 	uint8_t	at_intbase;
-	uint8_t	*at_imen;
+	uint8_t	at_imen;
 };
 
 struct atpic_intsrc {
 	struct intsrc at_intsrc;
-	inthand_t *at_intr;
+	inthand_t *at_intr, *at_intr_pti;
 	int	at_irq;			/* Relative to PIC base. */
 	enum intr_trigger at_trigger;
 	u_long	at_count;
@@ -114,6 +141,7 @@
 	u_long	at_straycount;
 };
 
+static void atpic_register_sources(struct pic *pic);
 static void atpic_enable_source(struct intsrc *isrc);
 static void atpic_disable_source(struct intsrc *isrc, int eoi);
 static void atpic_eoi_master(struct intsrc *isrc);
@@ -129,8 +157,8 @@
 static void i8259_init(struct atpic *pic, int slave);
 
 static struct atpic atpics[] = {
-	ATPIC(IO_ICU1, 0, atpic_eoi_master, (uint8_t *)&imen),
-	ATPIC(IO_ICU2, 8, atpic_eoi_slave, ((uint8_t *)&imen) + 1)
+	ATPIC(IO_ICU1, 0, atpic_eoi_master),
+	ATPIC(IO_ICU2, 8, atpic_eoi_slave)
 };
 
 static struct atpic_intsrc atintrs[] = {
@@ -152,7 +180,7 @@
 	INTSRC(15),
 };
 
-CTASSERT(sizeof(atintrs) / sizeof(atintrs[0]) == NUM_ISA_IRQS);
+CTASSERT(nitems(atintrs) == NUM_ISA_IRQS);
 
 static __inline void
 _atpic_eoi_master(struct intsrc *isrc)
@@ -184,6 +212,42 @@
 }
 
 static void
+atpic_register_sources(struct pic *pic)
+{
+	struct atpic *ap = (struct atpic *)pic;
+	struct atpic_intsrc *ai;
+	int i;
+
+	/*
+	 * If any of the ISA IRQs have an interrupt source already, then
+	 * assume that the I/O APICs are being used and don't register any
+	 * of our interrupt sources.  This makes sure we don't accidentally
+	 * use mixed mode.  The "accidental" use could otherwise occur on
+	 * machines that route the ACPI SCI interrupt to a different ISA
+	 * IRQ (at least one machine routes it to IRQ 13) thus disabling
+	 * that APIC ISA routing and allowing the ATPIC source for that IRQ
+	 * to leak through.  We used to depend on this feature for routing
+	 * IRQ0 via mixed mode, but now we don't use mixed mode at all.
+	 *
+	 * To avoid the slave failing to register its sources after the
+	 * master has registered its own, register all IRQs when this
+	 * function is called on the master.
+	 */
+	if (ap != &atpics[MASTER])
+		return;
+	for (i = 0; i < NUM_ISA_IRQS; i++)
+		if (intr_lookup_source(i) != NULL)
+			return;
+
+	/* Loop through all interrupt sources and add them. */
+	for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
+		if (i == ICU_SLAVEID)
+			continue;
+		intr_register_source(&ai->at_intsrc);
+	}
+}
+
+static void
 atpic_enable_source(struct intsrc *isrc)
 {
 	struct atpic_intsrc *ai = (struct atpic_intsrc *)isrc;
@@ -190,9 +254,9 @@
 	struct atpic *ap = (struct atpic *)isrc->is_pic;
 
 	spinlock_enter();
-	if (*ap->at_imen & IMEN_MASK(ai)) {
-		*ap->at_imen &= ~IMEN_MASK(ai);
-		outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+	if (ap->at_imen & IMEN_MASK(ai)) {
+		ap->at_imen &= ~IMEN_MASK(ai);
+		outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
 	}
 	spinlock_exit();
 }
@@ -205,8 +269,8 @@
 
 	spinlock_enter();
 	if (ai->at_trigger != INTR_TRIGGER_EDGE) {
-		*ap->at_imen |= IMEN_MASK(ai);
-		outb(ap->at_ioaddr + ICU_IMR_OFFSET, *ap->at_imen);
+		ap->at_imen |= IMEN_MASK(ai);
+		outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
 	}
 
 	/*
@@ -400,7 +464,7 @@
 		outb(imr_addr, MASTER_MODE);
 
 	/* Set interrupt enable mask. */
-	outb(imr_addr, *pic->at_imen);
+	outb(imr_addr, pic->at_imen);
 
 	/* Reset is finished, default to IRR on read. */
 	outb(pic->at_ioaddr, OCW3_SEL | OCW3_RR);
@@ -420,7 +484,6 @@
 	int i;
 
 	/* Start off with all interrupts disabled. */
-	imen = 0xffff;
 	i8259_init(&atpics[MASTER], 0);
 	i8259_init(&atpics[SLAVE], 1);
 	atpic_enable_source((struct intsrc *)&atintrs[ICU_SLAVEID]);
@@ -432,7 +495,8 @@
 		ai->at_intsrc.is_count = &ai->at_count;
 		ai->at_intsrc.is_straycount = &ai->at_straycount;
 		setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
-		    ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+		    ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+		    SEL_KPL, GSEL_ATPIC);
 	}
 
 #ifdef DEV_MCA
@@ -492,8 +556,6 @@
 static void
 atpic_init(void *dummy __unused)
 {
-	struct atpic_intsrc *ai;
-	int i;
 
 	/*
 	 * Register our PICs, even if we aren't going to use any of their
@@ -503,29 +565,10 @@
 	    intr_register_pic(&atpics[1].at_pic) != 0)
 		panic("Unable to register ATPICs");
 
-	/*
-	 * If any of the ISA IRQs have an interrupt source already, then
-	 * assume that the APICs are being used and don't register any
-	 * of our interrupt sources.  This makes sure we don't accidentally
-	 * use mixed mode.  The "accidental" use could otherwise occur on
-	 * machines that route the ACPI SCI interrupt to a different ISA
-	 * IRQ (at least one machines routes it to IRQ 13) thus disabling
-	 * that APIC ISA routing and allowing the ATPIC source for that IRQ
-	 * to leak through.  We used to depend on this feature for routing
-	 * IRQ0 via mixed mode, but now we don't use mixed mode at all.
-	 */
-	for (i = 0; i < NUM_ISA_IRQS; i++)
-		if (intr_lookup_source(i) != NULL)
-			return;
-
-	/* Loop through all interrupt sources and add them. */
-	for (i = 0, ai = atintrs; i < NUM_ISA_IRQS; i++, ai++) {
-		if (i == ICU_SLAVEID)
-			continue;
-		intr_register_source(&ai->at_intsrc);
-	}
+	if (num_io_irqs == 0)
+		num_io_irqs = NUM_ISA_IRQS;
 }
-SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_SECOND + 1, atpic_init, NULL);
+SYSINIT(atpic_init, SI_SUB_INTR, SI_ORDER_FOURTH, atpic_init, NULL);
 
 void
 atpic_handle_intr(u_int vector, struct trapframe *frame)
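
As a hedged illustration of the mask ownership change above (not part of the patch): each struct atpic now carries its own 8-bit at_imen, so masking an ISA IRQ only touches the IMR of the chip that owns it, replacing the old shared 16-bit imen global.

/* Illustrative helper only; atpic_disable_source() above is the real user. */
static void
example_mask_isa_irq(int irq)
{
	struct atpic *ap = &atpics[irq / 8];	/* IRQs 0-7: master, 8-15: slave */
	uint8_t bit = 1 << (irq % 8);		/* bit within that chip's IMR */

	spinlock_enter();
	ap->at_imen |= bit;
	outb(ap->at_ioaddr + ICU_IMR_OFFSET, ap->at_imen);
	spinlock_exit();
}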

Modified: trunk/sys/x86/isa/atrtc.c
===================================================================
--- trunk/sys/x86/isa/atrtc.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/atrtc.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -25,12 +25,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $
+ * $FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/atrtc.c 285446 2015-07-13 11:58:08Z brueffer $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/atrtc.c 345590 2019-03-27 19:17:42Z wulf $");
 
+#include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
@@ -53,10 +54,24 @@
 #endif
 #include <machine/intr_machdep.h>
 #include "clock_if.h"
+#ifdef DEV_ACPI
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/accommon.h>
+#include <dev/acpica/acpivar.h>
+#include <machine/md_var.h>
+#endif
 
-#define	RTC_LOCK	do { if (!kdb_active) mtx_lock_spin(&clock_lock); } while (0)
-#define	RTC_UNLOCK	do { if (!kdb_active) mtx_unlock_spin(&clock_lock); } while (0)
+/*
+ * atrtc_lock protects low-level access to individual hardware registers.
+ * atrtc_time_lock protects the entire sequence of accessing multiple registers
+ * to read or write the date and time.
+ */
+static struct mtx atrtc_lock;
+MTX_SYSINIT(atrtc_lock_init, &atrtc_lock, "atrtc", MTX_SPIN);
 
+struct mtx atrtc_time_lock;
+MTX_SYSINIT(atrtc_time_lock_init, &atrtc_time_lock, "atrtc_time", MTX_DEF);
+
 int	atrtcclock_disable = 0;
 
 static	int	rtc_reg = -1;
@@ -63,16 +78,19 @@
 static	u_char	rtc_statusa = RTCSA_DIVIDER | RTCSA_NOPROF;
 static	u_char	rtc_statusb = RTCSB_24HR;
 
+#ifdef DEV_ACPI
+#define	_COMPONENT	ACPI_TIMER
+ACPI_MODULE_NAME("ATRTC")
+#endif
+
 /*
  * RTC support routines
  */
 
-int
-rtcin(int reg)
+static inline u_char
+rtcin_locked(int reg)
 {
-	u_char val;
 
-	RTC_LOCK;
 	if (rtc_reg != reg) {
 		inb(0x84);
 		outb(IO_RTC, reg);
@@ -79,16 +97,13 @@
 		rtc_reg = reg;
 		inb(0x84);
 	}
-	val = inb(IO_RTC + 1);
-	RTC_UNLOCK;
-	return (val);
+	return (inb(IO_RTC + 1));
 }
 
-void
-writertc(int reg, u_char val)
+static inline void
+rtcout_locked(int reg, u_char val)
 {
 
-	RTC_LOCK;
 	if (rtc_reg != reg) {
 		inb(0x84);
 		outb(IO_RTC, reg);
@@ -97,21 +112,36 @@
 	}
 	outb(IO_RTC + 1, val);
 	inb(0x84);
-	RTC_UNLOCK;
 }
 
-static __inline int
-readrtc(int port)
+int
+rtcin(int reg)
 {
-	return(bcd2bin(rtcin(port)));
+	u_char val;
+
+	mtx_lock_spin(&atrtc_lock);
+	val = rtcin_locked(reg);
+	mtx_unlock_spin(&atrtc_lock);
+	return (val);
 }
 
+void
+writertc(int reg, u_char val)
+{
+
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(reg, val);
+	mtx_unlock_spin(&atrtc_lock);
+}
+
 static void
 atrtc_start(void)
 {
 
-	writertc(RTC_STATUSA, rtc_statusa);
-	writertc(RTC_STATUSB, RTCSB_24HR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSA, rtc_statusa);
+	rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 static void
@@ -127,8 +157,10 @@
 {
 
 	rtc_statusb |= RTCSB_PINTR;
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 static void
@@ -136,8 +168,10 @@
 {
 
 	rtc_statusb &= ~RTCSB_PINTR;
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 void
@@ -145,11 +179,13 @@
 {
 
 	/* Restore all of the RTC's "status" (actually, control) registers. */
-	rtcin(RTC_STATUSA);	/* dummy to get rtc_reg set */
-	writertc(RTC_STATUSB, RTCSB_24HR);
-	writertc(RTC_STATUSA, rtc_statusa);
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	mtx_lock_spin(&atrtc_lock);
+	rtcin_locked(RTC_STATUSA);	/* dummy to get rtc_reg set */
+	rtcout_locked(RTC_STATUSB, RTCSB_24HR);
+	rtcout_locked(RTC_STATUSA, rtc_statusa);
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+	mtx_unlock_spin(&atrtc_lock);
 }
 
 /**********************************************************************
@@ -162,6 +198,9 @@
 	struct resource *intr_res;
 	void *intr_handler;
 	struct eventtimer et;
+#ifdef DEV_ACPI
+	ACPI_HANDLE acpi_handle;
+#endif
 };
 
 static int
@@ -216,7 +255,145 @@
 	return(flag ? FILTER_HANDLED : FILTER_STRAY);
 }
 
+#ifdef DEV_ACPI
 /*
+ *  ACPI RTC CMOS address space handler
+ */
+#define	ATRTC_LAST_REG	0x40
+
+static void
+rtcin_region(int reg, void *buf, int len)
+{
+	u_char *ptr = buf;
+
+	/* Drop lock after each IO as intr and settime have greater priority */
+	while (len-- > 0)
+		*ptr++ = rtcin(reg++) & 0xff;
+}
+
+static void
+rtcout_region(int reg, const void *buf, int len)
+{
+	const u_char *ptr = buf;
+
+	while (len-- > 0)
+		writertc(reg++, *ptr++);
+}
+
+static bool
+atrtc_check_cmos_access(bool is_read, ACPI_PHYSICAL_ADDRESS addr, UINT32 len)
+{
+
+	/* Block address space wrapping on out-of-bound access */
+	if (addr >= ATRTC_LAST_REG || addr + len > ATRTC_LAST_REG)
+		return (false);
+
+	if (is_read) {
+		/* Reading 0x0C will muck with interrupts */
+		if (addr <= RTC_INTR && addr + len > RTC_INTR)
+			return (false);
+	} else {
+		/*
+		 * Allow single-byte writes to alarm registers and
+		 * multi-byte writes to addr >= 0x30, else deny.
+		 */
+		if (!((len == 1 && (addr == RTC_SECALRM ||
+				    addr == RTC_MINALRM ||
+				    addr == RTC_HRSALRM)) ||
+		      addr >= 0x30))
+			return (false);
+	}
+	return (true);
+}
+
+static ACPI_STATUS
+atrtc_acpi_cmos_handler(UINT32 func, ACPI_PHYSICAL_ADDRESS addr,
+    UINT32 bitwidth, UINT64 *value, void *context, void *region_context)
+{
+	device_t dev = context;
+	UINT32 bytewidth = howmany(bitwidth, 8);
+	bool is_read = func == ACPI_READ;
+
+	/* ACPICA is very verbose on CMOS handler failures, so we are, too */
+#define	CMOS_HANDLER_ERR(fmt, ...) \
+	device_printf(dev, "ACPI [SystemCMOS] handler: " fmt, ##__VA_ARGS__)
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
+
+	if (value == NULL) {
+		CMOS_HANDLER_ERR("NULL parameter\n");
+		return (AE_BAD_PARAMETER);
+	}
+	if (bitwidth == 0 || (bitwidth & 0x07) != 0) {
+		CMOS_HANDLER_ERR("Invalid bitwidth: %u\n", bitwidth);
+		return (AE_BAD_PARAMETER);
+	}
+	if (!atrtc_check_cmos_access(is_read, addr, bytewidth)) {
+		CMOS_HANDLER_ERR("%s access rejected: addr=%#04jx, len=%u\n",
+		    is_read ? "Read" : "Write", (uintmax_t)addr, bytewidth);
+		return (AE_BAD_PARAMETER);
+	}
+
+	switch (func) {
+	case ACPI_READ:
+		rtcin_region(addr, value, bytewidth);
+		break;
+	case ACPI_WRITE:
+		rtcout_region(addr, value, bytewidth);
+		break;
+	default:
+		CMOS_HANDLER_ERR("Invalid function: %u\n", func);
+		return (AE_BAD_PARAMETER);
+	}
+
+	ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev),
+	    "ACPI RTC CMOS %s access: addr=%#04x, len=%u, val=%*D\n",
+	    is_read ? "read" : "write", (unsigned)addr, bytewidth,
+	    bytewidth, value, " ");
+
+	return (AE_OK);
+}
+
+static int
+atrtc_reg_acpi_cmos_handler(device_t dev)
+{
+	struct atrtc_softc *sc = device_get_softc(dev);
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+	/* Don't handle address space events if driver is disabled. */
+	if (acpi_disabled("atrtc"))
+		return (ENXIO);
+
+	sc->acpi_handle = acpi_get_handle(dev);
+	if (sc->acpi_handle == NULL ||
+	    ACPI_FAILURE(AcpiInstallAddressSpaceHandler(sc->acpi_handle,
+	      ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler, NULL, dev))) {
+		sc->acpi_handle = NULL;
+		device_printf(dev,
+		    "Can't register ACPI CMOS address space handler\n");
+		return (ENXIO);
+        }
+	}
+
+	return (0);
+
+static int
+atrtc_unreg_acpi_cmos_handler(device_t dev)
+{
+	struct atrtc_softc *sc = device_get_softc(dev);
+
+	ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__);
+
+	if (sc->acpi_handle != NULL)
+		AcpiRemoveAddressSpaceHandler(sc->acpi_handle,
+		    ACPI_ADR_SPACE_CMOS, atrtc_acpi_cmos_handler);
+
+	return (0);
+}
+#endif	/* DEV_ACPI */
+
+/*
  * Attach to the ISA PnP descriptors for the timer and realtime clock.
  */
 static struct isa_pnp_id atrtc_ids[] = {
@@ -242,7 +419,7 @@
 atrtc_attach(device_t dev)
 {
 	struct atrtc_softc *sc;
-	u_long s;
+	rman_res_t s;
 	int i;
 
 	sc = device_get_softc(dev);
@@ -288,6 +465,37 @@
 }
 
 static int
+atrtc_isa_attach(device_t dev)
+{
+
+	return (atrtc_attach(dev));
+}
+
+#ifdef DEV_ACPI
+static int
+atrtc_acpi_attach(device_t dev)
+{
+	int ret;
+
+	ret = atrtc_attach(dev);
+	if (ret)
+		return (ret);
+
+	(void)atrtc_reg_acpi_cmos_handler(dev);
+
+	return (0);
+}
+
+static int
+atrtc_acpi_detach(device_t dev)
+{
+
+	(void)atrtc_unreg_acpi_cmos_handler(dev);
+	return (0);
+}
+#endif	/* DEV_ACPI */
+
+static int
 atrtc_resume(device_t dev)
 {
 
@@ -298,28 +506,38 @@
 static int
 atrtc_settime(device_t dev __unused, struct timespec *ts)
 {
-	struct clocktime ct;
+	struct bcd_clocktime bct;
 
-	clock_ts_to_ct(ts, &ct);
+	clock_ts_to_bcd(ts, &bct, false);
+	clock_dbgprint_bcd(dev, CLOCK_DBG_WRITE, &bct);
 
-	/* Disable RTC updates and interrupts. */
-	writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
+	mtx_lock(&atrtc_time_lock);
+	mtx_lock_spin(&atrtc_lock);
 
-	writertc(RTC_SEC, bin2bcd(ct.sec)); 		/* Write back Seconds */
-	writertc(RTC_MIN, bin2bcd(ct.min)); 		/* Write back Minutes */
-	writertc(RTC_HRS, bin2bcd(ct.hour));		/* Write back Hours   */
+	/* Disable RTC updates and interrupts.  */
+	rtcout_locked(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR);
 
-	writertc(RTC_WDAY, ct.dow + 1);			/* Write back Weekday */
-	writertc(RTC_DAY, bin2bcd(ct.day));		/* Write back Day */
-	writertc(RTC_MONTH, bin2bcd(ct.mon));           /* Write back Month   */
-	writertc(RTC_YEAR, bin2bcd(ct.year % 100));	/* Write back Year    */
+	/* Write all the time registers. */
+	rtcout_locked(RTC_SEC,   bct.sec);
+	rtcout_locked(RTC_MIN,   bct.min);
+	rtcout_locked(RTC_HRS,   bct.hour);
+	rtcout_locked(RTC_WDAY,  bct.dow + 1);
+	rtcout_locked(RTC_DAY,   bct.day);
+	rtcout_locked(RTC_MONTH, bct.mon);
+	rtcout_locked(RTC_YEAR,  bct.year & 0xff);
 #ifdef USE_RTC_CENTURY
-	writertc(RTC_CENTURY, bin2bcd(ct.year / 100));	/* ... and Century    */
+	rtcout_locked(RTC_CENTURY, bct.year >> 8);
 #endif
 
-	/* Reenable RTC updates and interrupts. */
-	writertc(RTC_STATUSB, rtc_statusb);
-	rtcin(RTC_INTR);
+	/*
+	 * Re-enable RTC updates and interrupts.
+	 */
+	rtcout_locked(RTC_STATUSB, rtc_statusb);
+	rtcin_locked(RTC_INTR);
+
+	mtx_unlock_spin(&atrtc_lock);
+	mtx_unlock(&atrtc_time_lock);
+
 	return (0);
 }
 
@@ -326,7 +544,7 @@
 static int
 atrtc_gettime(device_t dev, struct timespec *ts)
 {
-	struct clocktime ct;
+	struct bcd_clocktime bct;
 
 	/* Look if we have a RTC present and the time is valid */
 	if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) {
@@ -341,32 +559,32 @@
 	 * to make sure that no more than 240us pass after we start reading,
 	 * and try again if so.
 	 */
+	mtx_lock(&atrtc_time_lock);
 	while (rtcin(RTC_STATUSA) & RTCSA_TUP)
 		continue;
-	critical_enter();
-	ct.nsec = 0;
-	ct.sec = readrtc(RTC_SEC);
-	ct.min = readrtc(RTC_MIN);
-	ct.hour = readrtc(RTC_HRS);
-	ct.day = readrtc(RTC_DAY);
-	ct.dow = readrtc(RTC_WDAY) - 1;
-	ct.mon = readrtc(RTC_MONTH);
-	ct.year = readrtc(RTC_YEAR);
+	mtx_lock_spin(&atrtc_lock);
+	bct.sec  = rtcin_locked(RTC_SEC);
+	bct.min  = rtcin_locked(RTC_MIN);
+	bct.hour = rtcin_locked(RTC_HRS);
+	bct.day  = rtcin_locked(RTC_DAY);
+	bct.mon  = rtcin_locked(RTC_MONTH);
+	bct.year = rtcin_locked(RTC_YEAR);
 #ifdef USE_RTC_CENTURY
-	ct.year += readrtc(RTC_CENTURY) * 100;
-#else
-	ct.year += (ct.year < 80 ? 2000 : 1900);
+	bct.year |= rtcin_locked(RTC_CENTURY) << 8;
 #endif
-	critical_exit();
-	/* Set dow = -1 because some clocks don't set it correctly. */
-	ct.dow = -1;
-	return (clock_ct_to_ts(&ct, ts));
+	mtx_unlock_spin(&atrtc_lock);
+	mtx_unlock(&atrtc_time_lock);
+	/* dow is unused in timespec conversion and we have no nsec info. */
+	bct.dow  = 0;
+	bct.nsec = 0;
+	clock_dbgprint_bcd(dev, CLOCK_DBG_READ, &bct);
+	return (clock_bcd_to_ts(&bct, ts, false));
 }
 
-static device_method_t atrtc_methods[] = {
+static device_method_t atrtc_isa_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		atrtc_probe),
-	DEVMETHOD(device_attach,	atrtc_attach),
+	DEVMETHOD(device_attach,	atrtc_isa_attach),
 	DEVMETHOD(device_detach,	bus_generic_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
@@ -380,26 +598,38 @@
 	{ 0, 0 }
 };
 
-static driver_t atrtc_driver = {
+static driver_t atrtc_isa_driver = {
 	"atrtc",
-	atrtc_methods,
+	atrtc_isa_methods,
 	sizeof(struct atrtc_softc),
 };
 
-static devclass_t atrtc_devclass;
+#ifdef DEV_ACPI
+static device_method_t atrtc_acpi_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		atrtc_probe),
+	DEVMETHOD(device_attach,	atrtc_acpi_attach),
+	DEVMETHOD(device_detach,	atrtc_acpi_detach),
+		/* XXX stop statclock? */
+	DEVMETHOD(device_resume,	atrtc_resume),
 
-DRIVER_MODULE(atrtc, isa, atrtc_driver, atrtc_devclass, 0, 0);
-DRIVER_MODULE(atrtc, acpi, atrtc_driver, atrtc_devclass, 0, 0);
+	/* clock interface */
+	DEVMETHOD(clock_gettime,	atrtc_gettime),
+	DEVMETHOD(clock_settime,	atrtc_settime),
 
-#include "opt_ddb.h"
-#ifdef DDB
-#include <ddb/ddb.h>
+	{ 0, 0 }
+};
 
-DB_SHOW_COMMAND(rtc, rtc)
-{
-	printf("%02x/%02x/%02x %02x:%02x:%02x, A = %02x, B = %02x, C = %02x\n",
-		rtcin(RTC_YEAR), rtcin(RTC_MONTH), rtcin(RTC_DAY),
-		rtcin(RTC_HRS), rtcin(RTC_MIN), rtcin(RTC_SEC),
-		rtcin(RTC_STATUSA), rtcin(RTC_STATUSB), rtcin(RTC_INTR));
-}
-#endif /* DDB */
+static driver_t atrtc_acpi_driver = {
+	"atrtc",
+	atrtc_acpi_methods,
+	sizeof(struct atrtc_softc),
+};
+#endif	/* DEV_ACPI */
+
+static devclass_t atrtc_devclass;
+
+DRIVER_MODULE(atrtc, isa, atrtc_isa_driver, atrtc_devclass, 0, 0);
+#ifdef DEV_ACPI
+DRIVER_MODULE(atrtc, acpi, atrtc_acpi_driver, atrtc_devclass, 0, 0);
+#endif
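
A hedged usage sketch of the two-lock scheme described in the comments above (names taken from the hunks, the helper itself is invented): the sleepable atrtc_time_lock serializes a whole multi-register date/time sequence, while the atrtc_lock spin mutex protects each access to the index/data register pair.

/* Illustrative only; atrtc_gettime() and atrtc_settime() are the real users. */
static void
example_read_hhmmss(u_char *hh, u_char *mm, u_char *ss)
{

	mtx_lock(&atrtc_time_lock);	/* keep the multi-register read consistent */
	mtx_lock_spin(&atrtc_lock);	/* protect the rtc_reg/IO_RTC register pair */
	*hh = rtcin_locked(RTC_HRS);
	*mm = rtcin_locked(RTC_MIN);
	*ss = rtcin_locked(RTC_SEC);
	mtx_unlock_spin(&atrtc_lock);
	mtx_unlock(&atrtc_time_lock);
}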

Modified: trunk/sys/x86/isa/clock.c
===================================================================
--- trunk/sys/x86/isa/clock.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/clock.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/clock.c 254373 2013-08-15 17:21:06Z brooks $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/clock.c 331722 2018-03-29 02:50:57Z eadler $");
 
 /*
  * Routines to handle clock hardware.
@@ -66,6 +66,7 @@
 #include <machine/intr_machdep.h>
 #include <machine/ppireg.h>
 #include <machine/timerreg.h>
+#include <x86/init.h>
 
 #ifdef PC98
 #include <pc98/pc98/pc98_machdep.h>
@@ -98,7 +99,7 @@
 int	i8254_max_count;
 static int i8254_timecounter = 1;
 
-struct mtx clock_lock;
+static	struct mtx clock_lock;
 static	struct intsrc *i8254_intsrc;
 static	uint16_t i8254_lastcount;
 static	uint16_t i8254_offset;
@@ -140,6 +141,15 @@
 static	unsigned i8254_get_timecount(struct timecounter *tc);
 static	void	set_i8254_freq(int mode, uint32_t period);
 
+void
+clock_init(void)
+{
+	/* Init the clock lock */
+	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
+	/* Init the clock in order to use DELAY */
+	init_ops.early_clock_source_init();
+}
+
 static int
 clkintr(void *arg)
 {
@@ -157,7 +167,7 @@
 		mtx_unlock_spin(&clock_lock);
 	}
 
-	if (sc && sc->et.et_active && sc->mode != MODE_STOP)
+	if (sc->et.et_active && sc->mode != MODE_STOP)
 		sc->et.et_event_cb(&sc->et, sc->et.et_arg);
 
 #ifdef DEV_MCA
@@ -248,54 +258,6 @@
 	return ((high << 8) | low);
 }
 
-#ifndef DELAYDEBUG
-static u_int
-get_tsc(__unused struct timecounter *tc)
-{
-
-	return (rdtsc32());
-}
-
-static __inline int
-delay_tc(int n)
-{
-	struct timecounter *tc;
-	timecounter_get_t *func;
-	uint64_t end, freq, now;
-	u_int last, mask, u;
-
-	tc = timecounter;
-	freq = atomic_load_acq_64(&tsc_freq);
-	if (tsc_is_invariant && freq != 0) {
-		func = get_tsc;
-		mask = ~0u;
-	} else {
-		if (tc->tc_quality <= 0)
-			return (0);
-		func = tc->tc_get_timecount;
-		mask = tc->tc_counter_mask;
-		freq = tc->tc_frequency;
-	}
-	now = 0;
-	end = freq * n / 1000000;
-	if (func == get_tsc)
-		sched_pin();
-	last = func(tc) & mask;
-	do {
-		cpu_spinwait();
-		u = func(tc) & mask;
-		if (u < last)
-			now += mask - last + u + 1;
-		else
-			now += u - last;
-		last = u;
-	} while (now < end);
-	if (func == get_tsc)
-		sched_unpin();
-	return (1);
-}
-#endif
-
 /*
  * Wait "n" microseconds.
  * Relies on timer 1 counting down from (i8254_freq / hz)
@@ -302,7 +264,7 @@
  * Note: timer had better have been programmed before this is first used!
  */
 void
-DELAY(int n)
+i8254_delay(int n)
 {
 	int delta, prev_tick, tick, ticks_left;
 #ifdef DELAYDEBUG
@@ -318,9 +280,6 @@
 	}
 	if (state == 1)
 		printf("DELAY(%d)...", n);
-#else
-	if (delay_tc(n))
-		return;
 #endif
 	/*
 	 * Read the counter first, so that the rest of the setup overhead is
@@ -500,7 +459,6 @@
 i8254_init(void)
 {
 
-	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
 #ifdef PC98
 	if (pc98_machine_type & M_8M)
 		i8254_freq = 1996800L; /* 1.9968 MHz */
@@ -518,8 +476,27 @@
 void
 cpu_initclocks(void)
 {
+#ifdef EARLY_AP_STARTUP
+	struct thread *td;
+	int i;
 
+	td = curthread;
 	cpu_initclocks_bsp();
+	CPU_FOREACH(i) {
+		if (i == 0)
+			continue;
+		thread_lock(td);
+		sched_bind(td, i);
+		thread_unlock(td);
+		cpu_initclocks_ap();
+	}
+	thread_lock(td);
+	if (sched_is_bound(td))
+		sched_unbind(td);
+	thread_unlock(td);
+#else
+	cpu_initclocks_bsp();
+#endif
 }
 
 static int
@@ -699,7 +676,7 @@
 attimer_attach(device_t dev)
 {
 	struct attimer_softc *sc;
-	u_long s;
+	rman_res_t s;
 	int i;
 
 	attimer_sc = sc = device_get_softc(dev);
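
The EARLY_AP_STARTUP block added to cpu_initclocks() above uses a standard pattern: temporarily bind curthread to each application processor so the per-CPU clock setup runs on that CPU, then drop the binding.  A hedged, generic sketch of the same pattern (the helper name is invented):

/* Illustrative only; mirrors the loop added to cpu_initclocks() above. */
static void
example_run_on_each_ap(void (*fn)(void))
{
	struct thread *td;
	int cpu;

	td = curthread;
	CPU_FOREACH(cpu) {
		if (cpu == 0)
			continue;		/* the BSP is handled separately */
		thread_lock(td);
		sched_bind(td, cpu);		/* migrate to the target CPU */
		thread_unlock(td);
		fn();				/* now executing on 'cpu' */
	}
	thread_lock(td);
	if (sched_is_bound(td))
		sched_unbind(td);
	thread_unlock(td);
}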

Modified: trunk/sys/x86/isa/elcr.c
===================================================================
--- trunk/sys/x86/isa/elcr.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/elcr.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/elcr.c 262192 2014-02-18 20:27:17Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/elcr.c 261520 2014-02-05 18:13:27Z jhb $");
 
 /*
  * The ELCR is a register that controls the trigger mode and polarity of

Modified: trunk/sys/x86/isa/icu.h
===================================================================
--- trunk/sys/x86/isa/icu.h	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/icu.h	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)icu.h	5.6 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/isa/icu.h 233031 2012-03-16 12:13:44Z nyan $
+ * $FreeBSD: stable/11/sys/x86/isa/icu.h 339928 2018-10-30 19:10:41Z jhb $
  */
 
 /*
@@ -88,7 +88,6 @@
 #endif
 
 #define	IRQ_MASK(irq)		(1 << (irq))
-#define	IMEN_MASK(ai)		(IRQ_MASK((ai)->at_irq))
 
 void	atpic_handle_intr(u_int vector, struct trapframe *frame);
 void	atpic_startup(void);

Modified: trunk/sys/x86/isa/isa.c
===================================================================
--- trunk/sys/x86/isa/isa.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa.c 221526 2011-05-06 13:48:53Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa.c 295832 2016-02-20 01:32:58Z jhibbits $");
 
 /*-
  * Modifications for Intel architecture by Garrett A. Wollman.
@@ -89,13 +89,13 @@
  */
 struct resource *
 isa_alloc_resource(device_t bus, device_t child, int type, int *rid,
-		   u_long start, u_long end, u_long count, u_int flags)
+		   rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	/*
 	 * Consider adding a resource definition.
 	 */
 	int passthrough = (device_get_parent(child) != bus);
-	int isdefault = (start == 0UL && end == ~0UL);
+	int isdefault = RMAN_IS_DEFAULT_RANGE(start, end);
 	struct isa_device* idev = DEVTOISA(child);
 	struct resource_list *rl = &idev->id_resources;
 	struct resource_list_entry *rle;
@@ -242,3 +242,8 @@
  * On this platform, isa can also attach to the legacy bus.
  */
 DRIVER_MODULE(isa, legacy, isa_driver, isa_devclass, 0, 0);
+
+/*
+ * Attach the ISA bus to the xenpv bus in order to get syscons.
+ */
+DRIVER_MODULE(isa, xenpv, isa_driver, isa_devclass, 0, 0);

Modified: trunk/sys/x86/isa/isa_dma.c
===================================================================
--- trunk/sys/x86/isa/isa_dma.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/isa_dma.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/isa_dma.c 233675 2012-03-29 18:58:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/isa_dma.c 332304 2018-04-08 20:52:09Z emaste $");
 
 /*
  * code to manage AT bus
@@ -62,7 +62,7 @@
 #include <isa/isavar.h>
 #include <isa/isa_dmareg.h>
 
-#define	ISARAM_END	RAM_END
+#define	ISARAM_END	0x1000000
 
 static int isa_dmarangecheck(caddr_t va, u_int length, int chan);
 
@@ -145,8 +145,7 @@
  * in open() or during its initialization.
  */
 int
-isa_dma_acquire(chan)
-	int chan;
+isa_dma_acquire(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)
@@ -171,8 +170,7 @@
  * during close() or during its shutdown.
  */
 void
-isa_dma_release(chan)
-	int chan;
+isa_dma_release(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)
@@ -206,8 +204,7 @@
  * external dma control by a board.
  */
 void
-isa_dmacascade(chan)
-	int chan;
+isa_dmacascade(int chan)
 {
 #ifdef DIAGNOSTIC
 	if (chan & ~VALID_DMA_MASK)

Modified: trunk/sys/x86/isa/nmi.c
===================================================================
--- trunk/sys/x86/isa/nmi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/nmi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/nmi.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/nmi.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include "opt_mca.h"
 

Modified: trunk/sys/x86/isa/orm.c
===================================================================
--- trunk/sys/x86/isa/orm.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/isa/orm.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/isa/orm.c 204309 2010-02-25 14:13:39Z attilio $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/isa/orm.c 299392 2016-05-10 22:28:06Z bz $");
 
 /*
  * Driver to take care of holes in ISA I/O memory occupied
@@ -59,7 +59,7 @@
 	{ 0,		NULL },
 };
 
-#define MAX_ROMS	16
+#define MAX_ROMS	32
 
 struct orm_softc {
 	int		rnum;
@@ -92,6 +92,9 @@
 	struct orm_softc	*sc;
 	u_int8_t		buf[3];
 
+	if (resource_disabled("orm", 0))
+		return;
+
 	child = BUS_ADD_CHILD(parent, ISA_ORDER_SENSITIVE, "orm", -1);
 	device_set_driver(child, driver);
 	isa_set_logicalid(child, ORM_ID);
@@ -98,7 +101,7 @@
 	isa_set_vendorid(child, ORM_ID);
 	sc = device_get_softc(child);
 	sc->rnum = 0;
-	while (chunk < IOMEM_END) {
+	while (sc->rnum < MAX_ROMS && chunk < IOMEM_END) {
 		bus_set_resource(child, SYS_RES_MEMORY, sc->rnum, chunk,
 		    IOMEM_STEP);
 		rid = sc->rnum;

Modified: trunk/sys/x86/pci/pci_bus.c
===================================================================
--- trunk/sys/x86/pci/pci_bus.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/pci_bus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/pci_bus.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/pci_bus.c 294883 2016-01-27 02:23:54Z jhibbits $");
 
 #include "opt_cpu.h"
 
@@ -525,7 +525,7 @@
 			device_probe_and_attach(pir);
 	}
 #endif
-	device_add_child(dev, "pci", bus);
+	device_add_child(dev, "pci", -1);
 	return bus_generic_attach(dev);
 }
 
@@ -576,12 +576,11 @@
 SYSCTL_DECL(_hw_pci);
 
 static unsigned long host_mem_start = 0x80000000;
-TUNABLE_ULONG("hw.pci.host_mem_start", &host_mem_start);
 SYSCTL_ULONG(_hw_pci, OID_AUTO, host_mem_start, CTLFLAG_RDTUN, &host_mem_start,
     0, "Limit the host bridge memory to being above this address.");
 
-u_long
-hostb_alloc_start(int type, u_long start, u_long end, u_long count)
+rman_res_t
+hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count)
 {
 
 	if (start + count - 1 != end) {
@@ -595,7 +594,7 @@
 
 struct resource *
 legacy_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
@@ -611,7 +610,7 @@
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
 int
 legacy_pcib_adjust_resource(device_t dev, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 
 	if (type == PCI_RES_BUS)

Modified: trunk/sys/x86/pci/qpi.c
===================================================================
--- trunk/sys/x86/pci/qpi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/pci/qpi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,14 +27,14 @@
  */
 
 /*
- * This driver provides a psuedo-bus to enumerate the PCI buses
- * present on a sytem using a QPI chipset.  It creates a qpi0 bus that
- * is a child of nexus0 and then creates two Host-PCI bridges as a
+ * This driver provides a pseudo-bus to enumerate the PCI buses
+ * present on a system using a QPI chipset.  It creates a qpi0 bus that
+ * is a child of nexus0 and then creates Host-PCI bridges as a
  * child of that.
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/pci/qpi.c 283927 2015-06-02 19:20:39Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/pci/qpi.c 323609 2017-09-15 09:03:01Z kib $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -64,17 +64,23 @@
 static void
 qpi_identify(driver_t *driver, device_t parent)
 {
+	int do_qpi;
 
-        /* Check CPUID to ensure this is an i7 CPU of some sort. */
-        if (!(cpu_vendor_id == CPU_VENDOR_INTEL &&
-	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
-	    (CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2c)))
-                return;
+	/* Check CPUID to ensure this is an i7 CPU of some sort. */
+	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
+	    CPUID_TO_FAMILY(cpu_id) != 0x6)
+		return;
 
-        /* PCI config register access is required. */
-        if (pci_cfgregopen() == 0)
-                return;
+	/* Only discover buses with configuration devices if allowed by user */
+	do_qpi = 0;
+	TUNABLE_INT_FETCH("hw.attach_intel_csr_pci", &do_qpi);
+	if (!do_qpi)
+		return;
 
+	/* PCI config register access is required. */
+	if (pci_cfgregopen() == 0)
+		return;
+
 	/* Add a qpi bus device. */
 	if (BUS_ADD_CHILD(parent, 20, "qpi", -1) == NULL)
 		panic("Failed to add qpi bus");
@@ -98,6 +104,7 @@
 	struct qpi_device *qdev;
 	device_t child;
 	uint32_t devid;
+	int s;
 
 	/*
 	 * If a PCI bus already exists for this bus number, then
@@ -107,18 +114,23 @@
 		return (EEXIST);
 
 	/*
-	 * Attempt to read the device id for device 0, function 0 on
-	 * the bus.  A value of 0xffffffff means that the bus is not
-	 * present.
+	 * Attempt to read the device id for every slot, function 0 on
+	 * the bus.  If all read values are 0xffffffff this means that
+	 * the bus is not present.
 	 */
-	devid = pci_cfgregread(bus, 0, 0, PCIR_DEVVENDOR, 4);
+	for (s = 0; s <= PCI_SLOTMAX; s++) {
+		devid = pci_cfgregread(bus, s, 0, PCIR_DEVVENDOR, 4);
+		if (devid != 0xffffffff)
+			break;
+	}
 	if (devid == 0xffffffff)
 		return (ENOENT);
 
 	if ((devid & 0xffff) != 0x8086) {
-		device_printf(dev,
-		    "Device at pci%d.0.0 has non-Intel vendor 0x%x\n", bus,
-		    devid & 0xffff);
+		if (bootverbose)
+			device_printf(dev,
+			    "Device at pci%d.%d.0 has non-Intel vendor 0x%x\n",
+			    bus, s, devid & 0xffff);
 		return (ENXIO);
 	}
 
@@ -138,12 +150,12 @@
 	int bus;
 
 	/*
-	 * Each processor socket has a dedicated PCI bus counting down from
-	 * 255.  We keep probing buses until one fails.
+	 * Each processor socket has a dedicated PCI bus, sometimes
+	 * not enumerated by ACPI.  Probe all unattached buses from 0
+	 * to 255.
 	 */
-	for (bus = 255;; bus--)
-		if (qpi_probe_pcib(dev, bus) != 0)
-			break;
+	for (bus = PCI_BUSMAX; bus >= 0; bus--)
+		qpi_probe_pcib(dev, bus);
 
 	return (bus_generic_attach(dev));
 }
@@ -219,8 +231,8 @@
 qpi_pcib_attach(device_t dev)
 {
 
-	device_add_child(dev, "pci", pcib_get_bus(dev));      
-        return (bus_generic_attach(dev));
+	device_add_child(dev, "pci", -1);
+	return (bus_generic_attach(dev));
 }
 
 static int
@@ -242,7 +254,7 @@
 #if defined(NEW_PCIB) && defined(PCI_RES_BUS)
 static struct resource *
 qpi_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 
 	if (type == PCI_RES_BUS)
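
The qpi hunks above also gate bus discovery behind a loader tunable instead of a CPU model list.  A minimal hedged sketch of that gate (the wrapper function is invented; the tunable name comes from the diff):

/* Illustrative only; qpi_identify() above performs the same check inline. */
static int
example_csr_pci_enabled(void)
{
	int do_qpi;

	do_qpi = 0;
	TUNABLE_INT_FETCH("hw.attach_intel_csr_pci", &do_qpi);
	return (do_qpi != 0);
}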

Added: trunk/sys/x86/x86/autoconf.c
===================================================================
--- trunk/sys/x86/x86/autoconf.c	                        (rev 0)
+++ trunk/sys/x86/x86/autoconf.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,162 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)autoconf.c	7.1 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/autoconf.c 332304 2018-04-08 20:52:09Z emaste $");
+
+/*
+ * Setup the system to run on the current machine.
+ *
+ * Configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring.  Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+#include "opt_bootp.h"
+#include "opt_isa.h"
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/reboot.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cons.h>
+
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+
+#ifdef PC98
+#include <machine/bootinfo.h>
+#endif
+#include <machine/md_var.h>
+
+#ifdef DEV_ISA
+#include <isa/isavar.h>
+
+device_t isa_bus_device = 0;
+#endif
+
+static void	configure_first(void *);
+static void	configure(void *);
+static void	configure_final(void *);
+
+SYSINIT(configure1, SI_SUB_CONFIGURE, SI_ORDER_FIRST, configure_first, NULL);
+/* SI_ORDER_SECOND is hookable */
+SYSINIT(configure2, SI_SUB_CONFIGURE, SI_ORDER_THIRD, configure, NULL);
+/* SI_ORDER_MIDDLE is hookable */
+SYSINIT(configure3, SI_SUB_CONFIGURE, SI_ORDER_ANY, configure_final, NULL);
+
+/*
+ * Determine i/o configuration for a machine.
+ */
+static void
+configure_first(void *dummy)
+{
+
+	/* nexus0 is the top of the x86 device tree */
+	device_add_child(root_bus, "nexus", 0);
+}
+
+static void
+configure(void *dummy)
+{
+
+	/* initialize new bus architecture */
+	root_bus_configure();
+
+#ifdef DEV_ISA
+	/*
+	 * Explicitly probe and attach ISA last.  The isa bus saves
+	 * its device node at attach time for us here.
+	 */
+	if (isa_bus_device)
+		isa_probe_children(isa_bus_device);
+#endif
+}
+
+static void
+configure_final(void *dummy)
+{
+
+	cninit_finish(); 
+
+	if (bootverbose) {
+#ifdef PC98
+		int i;
+
+		/*
+		 * Print out the BIOS's idea of the disk geometries.
+		 */
+		printf("BIOS Geometries:\n");
+		for (i = 0; i < N_BIOS_GEOM; i++) {
+			unsigned long bios_geom;
+			int max_cylinder, max_head, max_sector;
+
+			bios_geom = bootinfo.bi_bios_geom[i];
+
+			/*
+			 * XXX the bootstrap punts a 1200K floppy geometry
+			 * when the get-disk-geometry interrupt fails.  Skip
+			 * drives that have this geometry.
+			 */
+			if (bios_geom == 0x4f020f)
+				continue;
+
+			printf(" %x:%08lx ", i, bios_geom);
+			max_cylinder = bios_geom >> 16;
+			max_head = (bios_geom >> 8) & 0xff;
+			max_sector = bios_geom & 0xff;
+			printf(
+		"0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n",
+			       max_cylinder, max_cylinder + 1,
+			       max_head, max_head + 1,
+			       max_sector, max_sector);
+		}
+		printf(" %d accounted for\n", bootinfo.bi_n_bios_used);
+#endif
+
+		printf("Device configuration finished.\n");
+	}
+	cold = 0;
+}


Property changes on: trunk/sys/x86/x86/autoconf.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
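
The new autoconf.c deliberately leaves the SI_ORDER_SECOND and SI_ORDER_MIDDLE slots of SI_SUB_CONFIGURE hookable, as its comments note.  A hedged sketch of hooking one of those slots (the function below is invented):

/* Illustrative only; runs after configure() but before configure_final(). */
static void
example_post_configure(void *dummy __unused)
{

	printf("example: bus hierarchy configured\n");
}
SYSINIT(example_post_configure, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE,
    example_post_configure, NULL);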
Modified: trunk/sys/x86/x86/bus_machdep.c
===================================================================
--- trunk/sys/x86/x86/bus_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/bus_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/bus_machdep.c 287126 2015-08-25 14:39:40Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/bus_machdep.c 286667 2015-08-12 15:26:32Z marcel $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/x86/x86/busdma_bounce.c
===================================================================
--- trunk/sys/x86/x86/busdma_bounce.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_bounce.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_bounce.c 318977 2017-05-27 08:17:59Z hselasky $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_bounce.c 343361 2019-01-23 20:49:14Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -80,7 +80,8 @@
 	vm_offset_t	vaddr;		/* kva of bounce buffer */
 	bus_addr_t	busaddr;	/* Physical address */
 	vm_offset_t	datavaddr;	/* kva of client data */
-	bus_addr_t	dataaddr;	/* client physical address */
+	vm_offset_t	dataoffs;	/* page offset of client data */
+	vm_page_t	datapage[2];	/* physical page(s) of client data */
 	bus_size_t	datacount;	/* client data count */
 	STAILQ_ENTRY(bounce_page) links;
 };
@@ -135,10 +136,9 @@
 static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				int commit);
 static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
-				  vm_offset_t vaddr, bus_addr_t addr,
-				  bus_size_t size);
+				  vm_offset_t vaddr, bus_addr_t addr1,
+				  bus_addr_t addr2, bus_size_t size);
 static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
-int run_filter(bus_dma_tag_t dmat, bus_addr_t paddr);
 static void _bus_dmamap_count_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				    pmap_t pmap, void *buf, bus_size_t buflen,
 				    int flags);
@@ -148,11 +148,6 @@
 static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
 				     int flags);
 
-#ifdef XEN
-#undef pmap_kextract
-#define pmap_kextract pmap_kextract_ma
-#endif
-
 /*
  * Allocate a device specific dma_tag.
  */
@@ -494,7 +489,8 @@
 		while (buflen != 0) {
 			sgsize = MIN(buflen, dmat->common.maxsegsz);
 			if (bus_dma_run_filter(&dmat->common, curaddr)) {
-				sgsize = MIN(sgsize, PAGE_SIZE);
+				sgsize = MIN(sgsize,
+				    PAGE_SIZE - (curaddr & PAGE_MASK));
 				map->pagesneeded++;
 			}
 			curaddr += sgsize;
@@ -544,6 +540,51 @@
 	}
 }
 
+static void
+_bus_dmamap_count_ma(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma,
+    int ma_offs, bus_size_t buflen, int flags)
+{
+	bus_size_t sg_len, max_sgsize;
+	int page_index;
+	vm_paddr_t paddr;
+
+	if ((map != &nobounce_dmamap && map->pagesneeded == 0)) {
+		CTR4(KTR_BUSDMA, "lowaddr= %d Maxmem= %d, boundary= %d, "
+		    "alignment= %d", dmat->common.lowaddr,
+		    ptoa((vm_paddr_t)Maxmem),
+		    dmat->common.boundary, dmat->common.alignment);
+		CTR3(KTR_BUSDMA, "map= %p, nobouncemap= %p, pagesneeded= %d",
+		    map, &nobounce_dmamap, map->pagesneeded);
+
+		/*
+		 * Count the number of bounce pages
+		 * needed in order to complete this transfer
+		 */
+		page_index = 0;
+		while (buflen > 0) {
+			paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+			sg_len = PAGE_SIZE - ma_offs;
+			max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+			sg_len = MIN(sg_len, max_sgsize);
+			if (bus_dma_run_filter(&dmat->common, paddr) != 0) {
+				sg_len = roundup2(sg_len,
+				    dmat->common.alignment);
+				sg_len = MIN(sg_len, max_sgsize);
+				KASSERT((sg_len & (dmat->common.alignment - 1))
+				    == 0, ("Segment size is not aligned"));
+				map->pagesneeded++;
+			}
+			if (((ma_offs + sg_len) & ~PAGE_MASK) != 0)
+				page_index++;
+			ma_offs = (ma_offs + sg_len) & PAGE_MASK;
+			KASSERT(buflen >= sg_len,
+			    ("Segment length overruns original buffer"));
+			buflen -= sg_len;
+		}
+		CTR1(KTR_BUSDMA, "pagesneeded= %d\n", map->pagesneeded);
+	}
+}
+
 static int
 _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags)
 {
@@ -648,8 +689,8 @@
 		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
 		    map->pagesneeded != 0 &&
 		    bus_dma_run_filter(&dmat->common, curaddr)) {
-			sgsize = MIN(sgsize, PAGE_SIZE);
-			curaddr = add_bounce_page(dmat, map, 0, curaddr,
+			sgsize = MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK));
+			curaddr = add_bounce_page(dmat, map, 0, curaddr, 0,
 			    sgsize);
 		}
 		sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs,
@@ -677,7 +718,7 @@
 {
 	bus_size_t sgsize, max_sgsize;
 	bus_addr_t curaddr;
-	vm_offset_t vaddr;
+	vm_offset_t kvaddr, vaddr;
 	int error;
 
 	if (map == NULL)
@@ -700,22 +741,25 @@
 		/*
 		 * Get the physical address for this segment.
 		 */
-		if (pmap == kernel_pmap)
+		if (pmap == kernel_pmap) {
 			curaddr = pmap_kextract(vaddr);
-		else
+			kvaddr = vaddr;
+		} else {
 			curaddr = pmap_extract(pmap, vaddr);
+			kvaddr = 0;
+		}
 
 		/*
 		 * Compute the segment size, and adjust counts.
 		 */
 		max_sgsize = MIN(buflen, dmat->common.maxsegsz);
-		sgsize = PAGE_SIZE - ((vm_offset_t)curaddr & PAGE_MASK);
+		sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
 		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
 		    map->pagesneeded != 0 &&
 		    bus_dma_run_filter(&dmat->common, curaddr)) {
 			sgsize = roundup2(sgsize, dmat->common.alignment);
 			sgsize = MIN(sgsize, max_sgsize);
-			curaddr = add_bounce_page(dmat, map, vaddr, curaddr,
+			curaddr = add_bounce_page(dmat, map, kvaddr, curaddr, 0,
 			    sgsize);
 		} else {
 			sgsize = MIN(sgsize, max_sgsize);
@@ -734,6 +778,88 @@
 	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
 }
 
+static int
+bounce_bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct vm_page **ma, bus_size_t buflen, int ma_offs, int flags,
+    bus_dma_segment_t *segs, int *segp)
+{
+	vm_paddr_t paddr, next_paddr;
+	int error, page_index;
+	bus_size_t sgsize, max_sgsize;
+
+	if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
+		/*
+		 * If we have to keep the offset of each page this function
+		 * is not suitable, switch back to bus_dmamap_load_ma_triv
+		 * which is going to do the right thing in this case.
+		 */
+		error = bus_dmamap_load_ma_triv(dmat, map, ma, buflen, ma_offs,
+		    flags, segs, segp);
+		return (error);
+	}
+
+	if (map == NULL)
+		map = &nobounce_dmamap;
+
+	if (segs == NULL)
+		segs = dmat->segments;
+
+	if ((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) {
+		_bus_dmamap_count_ma(dmat, map, ma, ma_offs, buflen, flags);
+		if (map->pagesneeded != 0) {
+			error = _bus_dmamap_reserve_pages(dmat, map, flags);
+			if (error)
+				return (error);
+		}
+	}
+
+	page_index = 0;
+	while (buflen > 0) {
+		/*
+		 * Compute the segment size, and adjust counts.
+		 */
+		paddr = VM_PAGE_TO_PHYS(ma[page_index]) + ma_offs;
+		max_sgsize = MIN(buflen, dmat->common.maxsegsz);
+		sgsize = PAGE_SIZE - ma_offs;
+		if (((dmat->bounce_flags & BUS_DMA_COULD_BOUNCE) != 0) &&
+		    map->pagesneeded != 0 &&
+		    bus_dma_run_filter(&dmat->common, paddr)) {
+			sgsize = roundup2(sgsize, dmat->common.alignment);
+			sgsize = MIN(sgsize, max_sgsize);
+			KASSERT((sgsize & (dmat->common.alignment - 1)) == 0,
+			    ("Segment size is not aligned"));
+			/*
+			 * Check if two pages of the user provided buffer
+			 * are used.
+			 */
+			if ((ma_offs + sgsize) > PAGE_SIZE)
+				next_paddr =
+				    VM_PAGE_TO_PHYS(ma[page_index + 1]);
+			else
+				next_paddr = 0;
+			paddr = add_bounce_page(dmat, map, 0, paddr,
+			    next_paddr, sgsize);
+		} else {
+			sgsize = MIN(sgsize, max_sgsize);
+		}
+		sgsize = _bus_dmamap_addseg(dmat, map, paddr, sgsize, segs,
+		    segp);
+		if (sgsize == 0)
+			break;
+		KASSERT(buflen >= sgsize,
+		    ("Segment length overruns original buffer"));
+		buflen -= sgsize;
+		if (((ma_offs + sgsize) & ~PAGE_MASK) != 0)
+			page_index++;
+		ma_offs = (ma_offs + sgsize) & PAGE_MASK;
+	}
+
+	/*
+	 * Did we fit?
+	 */
+	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
+}
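+
+/*
+ * Worked example of the two-page bounce case handled above (assuming
+ * 4 KB pages, illustrative numbers only): with ma_offs = 0xe00 and an
+ * aligned sgsize of 0x400, ma_offs + sgsize = 0x1200 > PAGE_SIZE, so
+ * the segment spills 0x200 bytes into ma[page_index + 1].
+ * add_bounce_page() records both pages in bpage->datapage[], and the
+ * sync routines below copy 0x200 bytes from each page.
+ */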
+
 static void
 bounce_bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg)
@@ -779,6 +905,8 @@
     bus_dmasync_op_t op)
 {
 	struct bounce_page *bpage;
+	vm_offset_t datavaddr, tempvaddr;
+	bus_size_t datacount1, datacount2;
 
 	if (map == NULL || (bpage = STAILQ_FIRST(&map->bpages)) == NULL)
 		return;
@@ -792,13 +920,40 @@
 
 	if ((op & BUS_DMASYNC_PREWRITE) != 0) {
 		while (bpage != NULL) {
-			if (bpage->datavaddr != 0) {
-				bcopy((void *)bpage->datavaddr,
-				    (void *)bpage->vaddr, bpage->datacount);
-			} else {
-				physcopyout(bpage->dataaddr,
-				    (void *)bpage->vaddr, bpage->datacount);
+			tempvaddr = 0;
+			datavaddr = bpage->datavaddr;
+			datacount1 = bpage->datacount;
+			if (datavaddr == 0) {
+				tempvaddr =
+				    pmap_quick_enter_page(bpage->datapage[0]);
+				datavaddr = tempvaddr | bpage->dataoffs;
+				datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+				    datacount1);
 			}
+
+			bcopy((void *)datavaddr,
+			    (void *)bpage->vaddr, datacount1);
+
+			if (tempvaddr != 0)
+				pmap_quick_remove_page(tempvaddr);
+
+			if (bpage->datapage[1] == 0) {
+				KASSERT(datacount1 == bpage->datacount,
+		("Mismatch between data size and provided memory space"));
+				goto next_w;
+			}
+
+			/*
+			 * We are dealing with an unmapped buffer that spans
+			 * two pages.
+			 */
+			datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+			datacount2 = bpage->datacount - datacount1;
+			bcopy((void *)datavaddr,
+			    (void *)(bpage->vaddr + datacount1), datacount2);
+			pmap_quick_remove_page(datavaddr);
+
+next_w:
 			bpage = STAILQ_NEXT(bpage, links);
 		}
 		dmat->bounce_zone->total_bounced++;
@@ -806,14 +961,40 @@
 
 	if ((op & BUS_DMASYNC_POSTREAD) != 0) {
 		while (bpage != NULL) {
-			if (bpage->datavaddr != 0) {
-				bcopy((void *)bpage->vaddr,
-				    (void *)bpage->datavaddr,
-				    bpage->datacount);
-			} else {
-				physcopyin((void *)bpage->vaddr,
-				    bpage->dataaddr, bpage->datacount);
+			tempvaddr = 0;
+			datavaddr = bpage->datavaddr;
+			datacount1 = bpage->datacount;
+			if (datavaddr == 0) {
+				tempvaddr =
+				    pmap_quick_enter_page(bpage->datapage[0]);
+				datavaddr = tempvaddr | bpage->dataoffs;
+				datacount1 = min(PAGE_SIZE - bpage->dataoffs,
+				    datacount1);
 			}
+
+			bcopy((void *)bpage->vaddr, (void *)datavaddr,
+			    datacount1);
+
+			if (tempvaddr != 0)
+				pmap_quick_remove_page(tempvaddr);
+
+			if (bpage->datapage[1] == 0) {
+				KASSERT(datacount1 == bpage->datacount,
+		("Mismatch between data size and provided memory space"));
+				goto next_r;
+			}
+
+			/*
+			 * We are dealing with an unmapped buffer that spans
+			 * two pages.
+			 */
+			datavaddr = pmap_quick_enter_page(bpage->datapage[1]);
+			datacount2 = bpage->datacount - datacount1;
+			bcopy((void *)(bpage->vaddr + datacount1),
+			    (void *)datavaddr, datacount2);
+			pmap_quick_remove_page(datavaddr);
+
+next_r:
 			bpage = STAILQ_NEXT(bpage, links);
 		}
 		dmat->bounce_zone->total_bounced++;
@@ -979,7 +1160,7 @@
 
 static bus_addr_t
 add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
-		bus_addr_t addr, bus_size_t size)
+		bus_addr_t addr1, bus_addr_t addr2, bus_size_t size)
 {
 	struct bounce_zone *bz;
 	struct bounce_page *bpage;
@@ -1009,11 +1190,16 @@
 
 	if (dmat->common.flags & BUS_DMA_KEEP_PG_OFFSET) {
 		/* Page offset needs to be preserved. */
-		bpage->vaddr |= addr & PAGE_MASK;
-		bpage->busaddr |= addr & PAGE_MASK;
+		bpage->vaddr |= addr1 & PAGE_MASK;
+		bpage->busaddr |= addr1 & PAGE_MASK;
+		KASSERT(addr2 == 0,
+	("Trying to bounce multiple pages with BUS_DMA_KEEP_PG_OFFSET"));
 	}
 	bpage->datavaddr = vaddr;
-	bpage->dataaddr = addr;
+	bpage->datapage[0] = PHYS_TO_VM_PAGE(addr1);
+	KASSERT((addr2 & PAGE_MASK) == 0, ("Second page is not aligned"));
+	bpage->datapage[1] = PHYS_TO_VM_PAGE(addr2);
+	bpage->dataoffs = addr1 & PAGE_MASK;
 	bpage->datacount = size;
 	STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
 	return (bpage->busaddr);
@@ -1085,7 +1271,7 @@
 	.mem_free = bounce_bus_dmamem_free,
 	.load_phys = bounce_bus_dmamap_load_phys,
 	.load_buffer = bounce_bus_dmamap_load_buffer,
-	.load_ma = bus_dmamap_load_ma_triv,
+	.load_ma = bounce_bus_dmamap_load_ma,
 	.map_waitok = bounce_bus_dmamap_waitok,
 	.map_complete = bounce_bus_dmamap_complete,
 	.map_unload = bounce_bus_dmamap_unload,

Modified: trunk/sys/x86/x86/busdma_machdep.c
===================================================================
--- trunk/sys/x86/x86/busdma_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/busdma_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/busdma_machdep.c 259511 2013-12-17 13:39:50Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/busdma_machdep.c 257230 2013-10-27 22:05:10Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Added: trunk/sys/x86/x86/cpu_machdep.c
===================================================================
--- trunk/sys/x86/x86/cpu_machdep.c	                        (rev 0)
+++ trunk/sys/x86/x86/cpu_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1359 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 Peter Wemm.
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/cpu_machdep.c 355701 2019-12-13 06:54:41Z scottl $");
+
+#include "opt_atpic.h"
+#include "opt_compat.h"
+#include "opt_cpu.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_isa.h"
+#include "opt_kdb.h"
+#include "opt_kstack_pages.h"
+#include "opt_maxmem.h"
+#include "opt_mp_watchdog.h"
+#include "opt_perfmon.h"
+#include "opt_platform.h"
+#ifdef __i386__
+#include "opt_apic.h"
+#include "opt_xbox.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+#include <machine/mp_watchdog.h>
+#ifdef PERFMON
+#include <machine/perfmon.h>
+#endif
+#include <machine/tss.h>
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+#ifdef CPU_ELAN
+#include <machine/elan_mmcr.h>
+#endif
+#include <x86/acpica_machdep.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#ifndef PC98
+#include <isa/isareg.h>
+#endif
+
+#define	STATE_RUNNING	0x0
+#define	STATE_MWAIT	0x1
+#define	STATE_SLEEPING	0x2
+
+#ifdef SMP
+static u_int	cpu_reset_proxyid;
+static volatile u_int	cpu_reset_proxy_active;
+#endif
+
+struct msr_op_arg {
+	u_int msr;
+	int op;
+	uint64_t arg1;
+};
+
+static void
+x86_msr_op_one(void *argp)
+{
+	struct msr_op_arg *a;
+	uint64_t v;
+
+	a = argp;
+	switch (a->op) {
+	case MSR_OP_ANDNOT:
+		v = rdmsr(a->msr);
+		v &= ~a->arg1;
+		wrmsr(a->msr, v);
+		break;
+	case MSR_OP_OR:
+		v = rdmsr(a->msr);
+		v |= a->arg1;
+		wrmsr(a->msr, v);
+		break;
+	case MSR_OP_WRITE:
+		wrmsr(a->msr, a->arg1);
+		break;
+	}
+}
+
+#define	MSR_OP_EXMODE_MASK	0xf0000000
+#define	MSR_OP_OP_MASK		0x000000ff
+
+void
+x86_msr_op(u_int msr, u_int op, uint64_t arg1)
+{
+	struct thread *td;
+	struct msr_op_arg a;
+	u_int exmode;
+	int bound_cpu, i, is_bound;
+
+	a.op = op & MSR_OP_OP_MASK;
+	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
+	    a.op == MSR_OP_WRITE);
+	exmode = op & MSR_OP_EXMODE_MASK;
+	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
+	    exmode == MSR_OP_RENDEZVOUS);
+	a.msr = msr;
+	a.arg1 = arg1;
+	switch (exmode) {
+	case MSR_OP_LOCAL:
+		x86_msr_op_one(&a);
+		break;
+	case MSR_OP_SCHED:
+		td = curthread;
+		thread_lock(td);
+		is_bound = sched_is_bound(td);
+		bound_cpu = td->td_oncpu;
+		CPU_FOREACH(i) {
+			sched_bind(td, i);
+			x86_msr_op_one(&a);
+		}
+		if (is_bound)
+			sched_bind(td, bound_cpu);
+		else
+			sched_unbind(td);
+		thread_unlock(td);
+		break;
+	case MSR_OP_RENDEZVOUS:
+		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
+		break;
+	}
+}
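+
+/*
+ * Usage sketch (illustrative): a speculation-control mitigation that
+ * needs to set or clear an IA32_SPEC_CTRL bit on every CPU combines an
+ * operation with an execution mode, e.g.
+ *
+ *	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_ANDNOT | MSR_OP_SCHED,
+ *	    IA32_SPEC_CTRL_SSBD);
+ *
+ * which binds to each CPU in turn and clears the SSBD bit, as
+ * hw_ssb_set() later in this file does.
+ */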
+
+/*
+ * Machine dependent boot() routine
+ *
+ * I haven't seen anything to put here yet
+ * Possibly some stuff might be grafted back here from boot()
+ */
+void
+cpu_boot(int howto)
+{
+}
+
+/*
+ * Flush the D-cache for non-DMA I/O so that the I-cache can
+ * be made coherent later.
+ */
+void
+cpu_flush_dcache(void *ptr, size_t len)
+{
+	/* Not applicable */
+}
+
+void
+acpi_cpu_c1(void)
+{
+
+	__asm __volatile("sti; hlt");
+}
+
+/*
+ * Use mwait to pause execution while waiting for an interrupt or
+ * another thread to signal that there is more work.
+ *
+ * NOTE: Interrupts will cause a wakeup; however, this function does
+ * not enable interrupt handling. The caller is responsible to enable
+ * interrupts.
+ */
+void
+acpi_cpu_idle_mwait(uint32_t mwait_hint)
+{
+	int *state;
+	uint64_t v;
+
+	/*
+	 * A comment in a Linux patch claims that 'CPUs run faster with
+	 * speculation protection disabled. All CPU threads in a core
+	 * must disable speculation protection for it to be
+	 * disabled. Disable it while we are idle so the other
+	 * hyperthread can run fast.'
+	 *
+	 * XXXKIB.  Software coordination mode should be supported,
+	 * but all Intel CPUs provide hardware coordination.
+	 */
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
+	    ("cpu_mwait_cx: wrong monitorbuf state"));
+	atomic_store_int(state, STATE_MWAIT);
+	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
+		v = rdmsr(MSR_IA32_SPEC_CTRL);
+		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
+		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
+	} else {
+		v = 0;
+	}
+	cpu_monitor(state, 0, 0);
+	if (atomic_load_int(state) == STATE_MWAIT)
+		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+
+	/*
+	 * SSB cannot be disabled while we sleep, or rather, if it was
+	 * disabled, the sysctl thread will bind to our cpu to tweak
+	 * MSR.
+	 */
+	if (v != 0)
+		wrmsr(MSR_IA32_SPEC_CTRL, v);
+
+	/*
+	 * We should exit on any event that interrupts mwait, because
+	 * that event might be a wanted interrupt.
+	 */
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+/* Get current clock frequency for the given cpu id. */
+int
+cpu_est_clockrate(int cpu_id, uint64_t *rate)
+{
+	uint64_t tsc1, tsc2;
+	uint64_t acnt, mcnt, perf;
+	register_t reg;
+
+	if (pcpu_find(cpu_id) == NULL || rate == NULL)
+		return (EINVAL);
+#ifdef __i386__
+	if ((cpu_feature & CPUID_TSC) == 0)
+		return (EOPNOTSUPP);
+#endif
+
+	/*
+	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
+	 * DELAY(9) based logic fails.
+	 */
+	if (tsc_is_invariant && !tsc_perf_stat)
+		return (EOPNOTSUPP);
+
+#ifdef SMP
+	if (smp_cpus > 1) {
+		/* Schedule ourselves on the indicated cpu. */
+		thread_lock(curthread);
+		sched_bind(curthread, cpu_id);
+		thread_unlock(curthread);
+	}
+#endif
+
+	/* Calibrate by measuring a short delay. */
+	reg = intr_disable();
+	if (tsc_is_invariant) {
+		wrmsr(MSR_MPERF, 0);
+		wrmsr(MSR_APERF, 0);
+		tsc1 = rdtsc();
+		DELAY(1000);
+		mcnt = rdmsr(MSR_MPERF);
+		acnt = rdmsr(MSR_APERF);
+		tsc2 = rdtsc();
+		intr_restore(reg);
+		perf = 1000 * acnt / mcnt;
+		*rate = (tsc2 - tsc1) * perf;
+	} else {
+		tsc1 = rdtsc();
+		DELAY(1000);
+		tsc2 = rdtsc();
+		intr_restore(reg);
+		*rate = (tsc2 - tsc1) * 1000;
+	}
+
+#ifdef SMP
+	if (smp_cpus > 1) {
+		thread_lock(curthread);
+		sched_unbind(curthread);
+		thread_unlock(curthread);
+	}
+#endif
+
+	return (0);
+}
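+
+/*
+ * Example of the estimate above (illustrative numbers only): with an
+ * invariant TSC, DELAY(1000) covers 1 ms, so tsc2 - tsc1 of roughly
+ * 3,000,000 ticks implies a nominal 3 GHz clock.  If APERF/MPERF show
+ * the core ran at half speed during that window (acnt/mcnt = 0.5),
+ * then perf = 500 and the reported rate becomes 1.5 GHz.
+ */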
+
+/*
+ * Shutdown the CPU as much as possible
+ */
+void
+cpu_halt(void)
+{
+	for (;;)
+		halt();
+}
+
+static void
+cpu_reset_real(void)
+{
+	struct region_descriptor null_idt;
+#ifndef PC98
+	int b;
+#endif
+
+	disable_intr();
+#ifdef CPU_ELAN
+	if (elan_mmcr != NULL)
+		elan_mmcr->RESCFG = 1;
+#endif
+#ifdef __i386__
+	if (cpu == CPU_GEODE1100) {
+		/* Attempt Geode's own reset */
+		outl(0xcf8, 0x80009044ul);
+		outl(0xcfc, 0xf);
+	}
+#endif
+#ifdef PC98
+	/*
+	 * Attempt to do a CPU reset via CPU reset port.
+	 */
+	if ((inb(0x35) & 0xa0) != 0xa0) {
+		outb(0x37, 0x0f);		/* SHUT0 = 0. */
+		outb(0x37, 0x0b);		/* SHUT1 = 0. */
+	}
+	outb(0xf0, 0x00);			/* Reset. */
+#else
+#if !defined(BROKEN_KEYBOARD_RESET)
+	/*
+	 * Attempt to do a CPU reset via the keyboard controller,
+	 * do not turn off GateA20, as any machine that fails
+	 * to do the reset here would then end up in no man's land.
+	 */
+	outb(IO_KBD + 4, 0xFE);
+	DELAY(500000);	/* wait 0.5 sec to see if that did it */
+#endif
+
+	/*
+	 * Attempt to force a reset via the Reset Control register at
+	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
+	 * transitions from 0 to 1.  Bit 1 selects the type of reset
+	 * to attempt: 0 selects a "soft" reset, and 1 selects a
+	 * "hard" reset.  We try a "hard" reset.  The first write sets
+	 * bit 1 to select a "hard" reset and clears bit 2.  The
+	 * second write forces a 0 -> 1 transition in bit 2 to trigger
+	 * a reset.
+	 */
+	outb(0xcf9, 0x2);
+	outb(0xcf9, 0x6);
+	DELAY(500000);  /* wait 0.5 sec to see if that did it */
+
+	/*
+	 * Attempt to force a reset via the Fast A20 and Init register
+	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
+	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
+	 * preserve bit 1 while setting bit 0.  We also must clear bit
+	 * 0 before setting it if it isn't already clear.
+	 */
+	b = inb(0x92);
+	if (b != 0xff) {
+		if ((b & 0x1) != 0)
+			outb(0x92, b & 0xfe);
+		outb(0x92, b | 0x1);
+		DELAY(500000);  /* wait 0.5 sec to see if that did it */
+	}
+#endif /* PC98 */
+
+	printf("No known reset method worked, attempting CPU shutdown\n");
+	DELAY(1000000); /* wait 1 sec for printf to complete */
+
+	/* Wipe the IDT. */
+	null_idt.rd_limit = 0;
+	null_idt.rd_base = 0;
+	lidt(&null_idt);
+
+	/* "good night, sweet prince .... <THUNK!>" */
+	breakpoint();
+
+	/* NOTREACHED */
+	while(1);
+}
+
+#ifdef SMP
+static void
+cpu_reset_proxy(void)
+{
+
+	cpu_reset_proxy_active = 1;
+	while (cpu_reset_proxy_active == 1)
+		ia32_pause(); /* Wait for other cpu to see that we've started */
+
+	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
+	DELAY(1000000);
+	cpu_reset_real();
+}
+#endif
+
+void
+cpu_reset(void)
+{
+#ifdef SMP
+	cpuset_t map;
+	u_int cnt;
+
+	if (smp_started) {
+		map = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &map);
+		CPU_NAND(&map, &stopped_cpus);
+		if (!CPU_EMPTY(&map)) {
+			printf("cpu_reset: Stopping other CPUs\n");
+			stop_cpus(map);
+		}
+
+		if (PCPU_GET(cpuid) != 0) {
+			cpu_reset_proxyid = PCPU_GET(cpuid);
+			cpustop_restartfunc = cpu_reset_proxy;
+			cpu_reset_proxy_active = 0;
+			printf("cpu_reset: Restarting BSP\n");
+
+			/* Restart CPU #0. */
+			CPU_SETOF(0, &started_cpus);
+			wmb();
+
+			cnt = 0;
+			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
+				ia32_pause();
+				cnt++;	/* Wait for BSP to announce restart */
+			}
+			if (cpu_reset_proxy_active == 0) {
+				printf("cpu_reset: Failed to restart BSP\n");
+			} else {
+				cpu_reset_proxy_active = 2;
+				while (1)
+					ia32_pause();
+				/* NOTREACHED */
+			}
+		}
+
+		DELAY(1000000);
+	}
+#endif
+	cpu_reset_real();
+	/* NOTREACHED */
+}
+
+bool
+cpu_mwait_usable(void)
+{
+
+	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
+}
+
+void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
+static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
+static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
+SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
+    0, "Use MONITOR/MWAIT for short idle");
+
+#ifndef PC98
+static void
+cpu_idle_acpi(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_SLEEPING);
+
+	/* See comments in cpu_idle_hlt(). */
+	disable_intr();
+	if (sched_runnable())
+		enable_intr();
+	else if (cpu_idle_hook)
+		cpu_idle_hook(sbt);
+	else
+		acpi_cpu_c1();
+	atomic_store_int(state, STATE_RUNNING);
+}
+#endif /* !PC98 */
+
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_SLEEPING);
+
+	/*
+	 * Since we may be in a critical section from cpu_idle(), if
+	 * an interrupt fires during that critical section we may have
+	 * a pending preemption.  If the CPU halts, then that thread
+	 * may not execute until a later interrupt awakens the CPU.
+	 * To handle this race, check for a runnable thread after
+	 * disabling interrupts and immediately return if one is
+	 * found.  Also, we must absolutely guarantee that hlt is
+	 * the next instruction after sti.  This ensures that any
+	 * interrupt that fires after the call to disable_intr() will
+	 * immediately awaken the CPU from hlt.  Finally, note that on
+	 * x86 this works because interrupts are recognized only once
+	 * the instruction following sti executes, while IF is set to 1
+	 * immediately, which allows the hlt instruction to acknowledge
+	 * the pending interrupt.
+	 */
+	disable_intr();
+	if (sched_runnable())
+		enable_intr();
+	else
+		acpi_cpu_c1();
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_MWAIT);
+
+	/* See comments in cpu_idle_hlt(). */
+	disable_intr();
+	if (sched_runnable()) {
+		atomic_store_int(state, STATE_RUNNING);
+		enable_intr();
+		return;
+	}
+
+	cpu_monitor(state, 0, 0);
+	if (atomic_load_int(state) == STATE_MWAIT)
+		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+	else
+		enable_intr();
+	atomic_store_int(state, STATE_RUNNING);
+}
+
+static void
+cpu_idle_spin(sbintime_t sbt)
+{
+	int *state;
+	int i;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	atomic_store_int(state, STATE_RUNNING);
+
+	/*
+	 * The sched_runnable() call is racy, but since it is retried in
+	 * a loop, missing it once has little impact, if any (and it is
+	 * much better than not checking at all).
+	 */
+	for (i = 0; i < 1000; i++) {
+		if (sched_runnable())
+			return;
+		cpu_spinwait();
+	}
+}
+
+/*
+ * C1E renders the local APIC timer dead, so we disable it by
+ * reading the Interrupt Pending Message register and clearing
+ * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
+ * 
+ * Reference:
+ *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
+ *   #32559 revision 3.00+
+ */
+#define	MSR_AMDK8_IPM		0xc0010055
+#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
+#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
+#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
+
+void
+cpu_probe_amdc1e(void)
+{
+
+	/*
+	 * Detect the presence of C1E capability mostly on latest
+	 * dual-cores (or future) k8 family.
+	 */
+	if (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    (cpu_id & 0x00000f00) == 0x00000f00 &&
+	    (cpu_id & 0x0fff0000) >=  0x00040000) {
+		cpu_ident_amdc1e = 1;
+	}
+}
+
+#if defined(__i386__) && defined(PC98)
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
+#else
+void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
+#endif
+
+void
+cpu_idle(int busy)
+{
+	uint64_t msr;
+	sbintime_t sbt = -1;
+
+	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
+	    busy, curcpu);
+#ifdef MP_WATCHDOG
+	ap_watchdog(PCPU_GET(cpuid));
+#endif
+
+	/* If we are busy - try to use fast methods. */
+	if (busy) {
+		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
+			cpu_idle_mwait(busy);
+			goto out;
+		}
+	}
+
+	/* If we have time - switch timers into idle mode. */
+	if (!busy) {
+		critical_enter();
+		sbt = cpu_idleclock();
+	}
+
+	/* Apply AMD APIC timer C1E workaround. */
+	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
+		msr = rdmsr(MSR_AMDK8_IPM);
+		if (msr & AMDK8_CMPHALT)
+			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
+	}
+
+	/* Call main idle method. */
+	cpu_idle_fn(sbt);
+
+	/* Switch timers back into active mode. */
+	if (!busy) {
+		cpu_activeclock();
+		critical_exit();
+	}
+out:
+	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
+	    busy, curcpu);
+}
+
+static int cpu_idle_apl31_workaround;
+SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
+    &cpu_idle_apl31_workaround, 0,
+    "Apollo Lake APL31 MWAIT bug workaround");
+
+int
+cpu_idle_wakeup(int cpu)
+{
+	int *state;
+
+	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
+	switch (atomic_load_int(state)) {
+	case STATE_SLEEPING:
+		return (0);
+	case STATE_MWAIT:
+		atomic_store_int(state, STATE_RUNNING);
+		return (cpu_idle_apl31_workaround ? 0 : 1);
+	case STATE_RUNNING:
+		return (1);
+	default:
+		panic("bad monitor state");
+		return (1);
+	}
+}
+
+/*
+ * Ordered by speed/power consumption.
+ */
+static struct {
+	void	*id_fn;
+	char	*id_name;
+	int	id_cpuid2_flag;
+} idle_tbl[] = {
+	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
+	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
+	    .id_cpuid2_flag = CPUID2_MON },
+	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
+#if !defined(__i386__) || !defined(PC98)
+	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
+#endif
+};
+
+static int
+idle_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+	char *avail, *p;
+	int error;
+	int i;
+
+	avail = malloc(256, M_TEMP, M_WAITOK);
+	p = avail;
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_cpuid2_flag != 0 &&
+		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+			continue;
+#if !defined(__i386__) || !defined(PC98)
+		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+		    cpu_idle_hook == NULL)
+			continue;
+#endif
+		p += sprintf(p, "%s%s", p != avail ? ", " : "",
+		    idle_tbl[i].id_name);
+	}
+	error = sysctl_handle_string(oidp, avail, 0, req);
+	free(avail, M_TEMP);
+	return (error);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
+    0, 0, idle_sysctl_available, "A", "list of available idle functions");
+
+static bool
+cpu_idle_selector(const char *new_idle_name)
+{
+	int i;
+
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_cpuid2_flag != 0 &&
+		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
+			continue;
+#if !defined(__i386__) || !defined(PC98)
+		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
+		    cpu_idle_hook == NULL)
+			continue;
+#endif
+		if (strcmp(idle_tbl[i].id_name, new_idle_name))
+			continue;
+		cpu_idle_fn = idle_tbl[i].id_fn;
+		if (bootverbose)
+			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
+		return (true);
+	}
+	return (false);
+}
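+
+/*
+ * Example (illustrative): the idle method can be selected at boot with
+ * the loader tunable machdep.idle (e.g. machdep.idle="hlt" in
+ * loader.conf, fetched by cpu_idle_tun() below) or at run time with
+ * sysctl machdep.idle=mwait; machdep.idle_available lists the methods
+ * usable on the running CPU.
+ */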
+
+static int
+cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16], *p;
+	int error, i;
+
+	p = "unknown";
+	for (i = 0; i < nitems(idle_tbl); i++) {
+		if (idle_tbl[i].id_fn == cpu_idle_fn) {
+			p = idle_tbl[i].id_name;
+			break;
+		}
+	}
+	strncpy(buf, p, sizeof(buf));
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	return (cpu_idle_selector(buf) ? 0 : EINVAL);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
+    cpu_idle_sysctl, "A", "currently selected idle function");
+
+static void
+cpu_idle_tun(void *unused __unused)
+{
+	char tunvar[16];
+
+	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
+		cpu_idle_selector(tunvar);
+	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
+		/* Ryzen errata 1057, 1109. */
+		cpu_idle_selector("hlt");
+		idle_mwait = 0;
+	}
+
+	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
+		/*
+		 * Apollo Lake errata APL31 (public errata APL30).
+		 * Stores to the armed address range may not trigger
+		 * MWAIT to resume execution.  OS needs to use
+		 * interrupts to wake processors from MWAIT-induced
+		 * sleep states.
+		 */
+		cpu_idle_apl31_workaround = 1;
+	}
+	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
+}
+SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
+
+static int panic_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
+    &panic_on_nmi, 0,
+    "Panic on NMI raised by hardware failure");
+int nmi_is_broadcast = 1;
+SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
+    &nmi_is_broadcast, 0,
+    "Chipset NMI is broadcast");
+#ifdef KDB
+int kdb_on_nmi = 1;
+SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
+    &kdb_on_nmi, 0,
+    "Go to KDB on NMI with unknown source");
+#endif
+
+void
+nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
+{
+	bool claimed = false;
+
+#ifdef DEV_ISA
+	/* machine/parity/power fail/"kitchen sink" faults */
+	if (isa_nmi(frame->tf_err)) {
+		claimed = true;
+		if (panic_on_nmi)
+			panic("NMI indicates hardware failure");
+	}
+#endif /* DEV_ISA */
+#ifdef KDB
+	if (!claimed && kdb_on_nmi) {
+		/*
+		 * NMI can be hooked up to a pushbutton for debugging.
+		 */
+		printf("NMI/cpu%d ... going to debugger\n", cpu);
+		kdb_trap(type, 0, frame);
+	}
+#endif /* KDB */
+}
+
+void
+nmi_handle_intr(u_int type, struct trapframe *frame)
+{
+
+#ifdef SMP
+	if (nmi_is_broadcast) {
+		nmi_call_kdb_smp(type, frame);
+		return;
+	}
+#endif
+	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
+}
+
+int hw_ibrs_active;
+int hw_ibrs_disable = 1;
+
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
+    "Indirect Branch Restricted Speculation active");
+
+void
+hw_ibrs_recalculate(void)
+{
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
+		x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL |
+		    (hw_ibrs_disable ? MSR_OP_ANDNOT : MSR_OP_OR),
+		    IA32_SPEC_CTRL_IBRS);
+		return;
+	}
+	hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
+	    !hw_ibrs_disable;
+}
+
+static int
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_ibrs_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	hw_ibrs_disable = val != 0;
+	hw_ibrs_recalculate();
+	return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
+    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
+    "Disable Indirect Branch Restricted Speculation");
+
+int hw_ssb_active;
+int hw_ssb_disable;
+
+SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
+    &hw_ssb_active, 0,
+    "Speculative Store Bypass Disable active");
+
+static void
+hw_ssb_set(bool enable, bool for_all_cpus)
+{
+
+	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
+		hw_ssb_active = 0;
+		return;
+	}
+	hw_ssb_active = enable;
+	x86_msr_op(MSR_IA32_SPEC_CTRL,
+	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+	    (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
+}
+
+void
+hw_ssb_recalculate(bool all_cpus)
+{
+
+	switch (hw_ssb_disable) {
+	default:
+		hw_ssb_disable = 0;
+		/* FALLTHROUGH */
+	case 0: /* off */
+		hw_ssb_set(false, all_cpus);
+		break;
+	case 1: /* on */
+		hw_ssb_set(true, all_cpus);
+		break;
+	case 2: /* auto */
+		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
+		    false : true, all_cpus);
+		break;
+	}
+}
+
+static int
+hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_ssb_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	hw_ssb_disable = val;
+	hw_ssb_recalculate(true);
+	return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    hw_ssb_disable_handler, "I",
+    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
+
+int hw_mds_disable;
+
+/*
+ * Handler for Microarchitectural Data Sampling issues.  Really not a
+ * pointer to C function: on amd64 the code must not change any CPU
+ * architectural state except possibly %rflags. Also, it is always
+ * called with interrupts disabled.
+ */
+void mds_handler_void(void);
+void mds_handler_verw(void);
+void mds_handler_ivb(void);
+void mds_handler_bdw(void);
+void mds_handler_skl_sse(void);
+void mds_handler_skl_avx(void);
+void mds_handler_skl_avx512(void);
+void mds_handler_silvermont(void);
+void (*mds_handler)(void) = mds_handler_void;
+
+static int
+sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	if (mds_handler == mds_handler_void)
+		state = "inactive";
+	else if (mds_handler == mds_handler_verw)
+		state = "VERW";
+	else if (mds_handler == mds_handler_ivb)
+		state = "software IvyBridge";
+	else if (mds_handler == mds_handler_bdw)
+		state = "software Broadwell";
+	else if (mds_handler == mds_handler_skl_sse)
+		state = "software Skylake SSE";
+	else if (mds_handler == mds_handler_skl_avx)
+		state = "software Skylake AVX";
+	else if (mds_handler == mds_handler_skl_avx512)
+		state = "software Skylake AVX512";
+	else if (mds_handler == mds_handler_silvermont)
+		state = "software Silvermont";
+	else
+		state = "unknown";
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_hw_mds_disable_state_handler, "A",
+    "Microarchitectural Data Sampling Mitigation state");
+
+_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
+
+void
+hw_mds_recalculate(void)
+{
+	struct pcpu *pc;
+	vm_offset_t b64;
+	u_long xcr0;
+	int i;
+
+	/*
+	 * Allow the user to force the VERW variant even if MD_CLEAR is
+	 * not reported.  For instance, a hypervisor might unknowingly
+	 * filter the cap out.
+	 * For similar reasons, and for testing, allow enabling the
+	 * mitigation even when the RDCL_NO or MDS_NO caps are present.
+	 */
+	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
+	    ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO |
+	    IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) {
+		mds_handler = mds_handler_void;
+	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
+	    hw_mds_disable == 3) || hw_mds_disable == 1) {
+		mds_handler = mds_handler_verw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Nehalem, SandyBridge, IvyBridge
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(672, M_TEMP,
+				    M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_ivb;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
+	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Haswell, Broadwell
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(1536, M_TEMP,
+				    M_WAITOK);
+				bzero(pc->pc_mds_buf, 16);
+			}
+		}
+		mds_handler = mds_handler_bdw;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
+	    CPUID_STEPPING) <= 5) ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
+	    CPUID_STEPPING) <= 0xb) ||
+	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
+	    CPUID_STEPPING) <= 0xc)) &&
+	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
+		/*
+		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
+		 * CascadeLake
+		 */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL) {
+				pc->pc_mds_buf = malloc(6 * 1024,
+				    M_TEMP, M_WAITOK);
+				b64 = (vm_offset_t)malloc(64 + 63,
+				    M_TEMP, M_WAITOK);
+				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
+				bzero(pc->pc_mds_buf64, 64);
+			}
+		}
+		xcr0 = rxcr(0);
+		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
+		    (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
+			mds_handler = mds_handler_skl_avx512;
+		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
+		    (cpu_feature2 & CPUID2_AVX) != 0)
+			mds_handler = mds_handler_skl_avx;
+		else
+			mds_handler = mds_handler_skl_sse;
+	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
+	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
+	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
+	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
+	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
+	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
+		/* Silvermont, Airmont */
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			if (pc->pc_mds_buf == NULL)
+				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
+		}
+		mds_handler = mds_handler_silvermont;
+	} else {
+		hw_mds_disable = 0;
+		mds_handler = mds_handler_void;
+	}
+}
+
+static void
+hw_mds_recalculate_boot(void *arg __unused)
+{
+
+	hw_mds_recalculate();
+}
+SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
+
+static int
+sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_mds_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < 0 || val > 3)
+		return (EINVAL);
+	hw_mds_disable = val;
+	hw_mds_recalculate();
+	return (0);
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_mds_disable_handler, "I",
+    "Microarchitectural Data Sampling Mitigation "
+    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
+
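+/*
+ * Example (illustrative): setting hw.mds_disable=3 via sysctl or as a
+ * loader tunable selects the automatic mitigation; the read-only
+ * hw.mds_disable_state node above then reports the chosen handler,
+ * e.g. "VERW" or "software Skylake AVX512".
+ */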
+
+/*
+ * Intel Transactional Memory Asynchronous Abort Mitigation
+ * CVE-2019-11135
+ */
+int x86_taa_enable;
+int x86_taa_state;
+enum {
+	TAA_NONE	= 0,	/* No mitigation enabled */
+	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
+	TAA_VERW	= 2,	/* Use VERW mitigation */
+	TAA_AUTO	= 3,	/* Automatically select the mitigation */
+
+	/* The states below are not selectable by the operator */
+
+	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
+	TAA_NOT_PRESENT	= 5	/* TSX is not present */
+};
+
+static void
+taa_set(bool enable, bool all)
+{
+
+	x86_msr_op(MSR_IA32_TSX_CTRL,
+	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
+	    (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
+	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
+}
+
+void
+x86_taa_recalculate(void)
+{
+	static int taa_saved_mds_disable = 0;
+	int taa_need = 0, taa_state = 0;
+	int mds_disable = 0, need_mds_recalc = 0;
+
+	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
+	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
+	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
+		/* TSX is not present */
+		x86_taa_state = TAA_NOT_PRESENT;
+		return;
+	}
+
+	/* Check to see what mitigation options the CPU gives us */
+	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
+		/* CPU is not susceptible to TAA */
+		taa_need = TAA_TAA_UC;
+	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
+		/*
+		 * CPU can turn off TSX.  This is the next best option
+		 * if the TAA_NO hardware mitigation isn't present.
+		 */
+		taa_need = TAA_TSX_DISABLE;
+	} else {
+		/* No TSX/TAA specific remedies are available. */
+		if (x86_taa_enable == TAA_TSX_DISABLE) {
+			if (bootverbose)
+				printf("TSX control not available\n");
+			return;
+		} else
+			taa_need = TAA_VERW;
+	}
+
+	/* Can we automatically take action, or are we being forced? */
+	if (x86_taa_enable == TAA_AUTO)
+		taa_state = taa_need;
+	else
+		taa_state = x86_taa_enable;
+
+	/* No state change, nothing to do */
+	if (taa_state == x86_taa_state) {
+		if (bootverbose)
+			printf("No TSX change made\n");
+		return;
+	}
+
+	/* Does the MSR need to be turned on or off? */
+	if (taa_state == TAA_TSX_DISABLE)
+		taa_set(true, true);
+	else if (x86_taa_state == TAA_TSX_DISABLE)
+		taa_set(false, true);
+
+	/* Does MDS need to be set to turn on VERW? */
+	if (taa_state == TAA_VERW) {
+		taa_saved_mds_disable = hw_mds_disable;
+		mds_disable = hw_mds_disable = 1;
+		need_mds_recalc = 1;
+	} else if (x86_taa_state == TAA_VERW) {
+		mds_disable = hw_mds_disable = taa_saved_mds_disable;
+		need_mds_recalc = 1;
+	}
+	if (need_mds_recalc) {
+		hw_mds_recalculate();
+		if (mds_disable != hw_mds_disable) {
+			if (bootverbose)
+				printf("Cannot change MDS state for TAA\n");
+			/* Don't update our state */
+			return;
+		}
+	}
+
+	x86_taa_state = taa_state;
+	return;
+}
+
+static void
+taa_recalculate_boot(void * arg __unused)
+{
+
+	x86_taa_recalculate();
+}
+SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
+
+SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
+	"TSX Asynchronous Abort Mitigation");
+
+static int
+sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = x86_taa_enable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (val < TAA_NONE || val > TAA_AUTO)
+		return (EINVAL);
+	x86_taa_enable = val;
+	x86_taa_recalculate();
+	return (0);
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_taa_handler, "I",
+    "TAA Mitigation enablement control "
+    "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
+
+static int
+sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	switch (x86_taa_state) {
+	case TAA_NONE:
+		state = "inactive";
+		break;
+	case TAA_TSX_DISABLE:
+		state = "TSX disabled";
+		break;
+	case TAA_VERW:
+		state = "VERW";
+		break;
+	case TAA_TAA_UC:
+		state = "Mitigated in microcode";
+		break;
+	case TAA_NOT_PRESENT:
+		state = "TSX not present";
+		break;
+	default:
+		state = "unknown";
+	}
+
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+
+SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_taa_state_handler, "A",
+    "TAA Mitigation state");
+


Property changes on: trunk/sys/x86/x86/cpu_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/delay.c
===================================================================
--- trunk/sys/x86/x86/delay.c	                        (rev 0)
+++ trunk/sys/x86/x86/delay.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,138 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz and Don Ahn.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)clock.c	7.2 (Berkeley) 5/12/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/delay.c 340270 2018-11-08 22:42:55Z jhb $");
+
+/* Generic x86 routines to handle delay */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/sched.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <x86/init.h>
+
+static void
+delay_tsc(int n)
+{
+	uint64_t end, now;
+
+	/*
+	 * Pin the current thread to ensure correct behavior if the TSCs
+	 * on different CPUs are not in sync.
+	 */
+	sched_pin();
+	now = rdtsc();
+	end = now + tsc_freq * n / 1000000;
+	do {
+		cpu_spinwait();
+		now = rdtsc();
+	} while (now < end);
+	sched_unpin();
+}
+
+static int
+delay_tc(int n)
+{
+	struct timecounter *tc;
+	timecounter_get_t *func;
+	uint64_t end, freq, now;
+	u_int last, mask, u;
+
+	/*
+	 * Only use the TSC if it is P-state invariant.  If the TSC is
+	 * not P-state invariant and the CPU is not running at the
+	 * "full" P-state, then the TSC will increment at some rate
+	 * less than tsc_freq and delay_tsc() will wait too long.
+	 */
+	if (tsc_is_invariant && tsc_freq != 0) {
+		delay_tsc(n);
+		return (1);
+	}
+	tc = timecounter;
+	if (tc->tc_quality <= 0)
+		return (0);
+	func = tc->tc_get_timecount;
+	mask = tc->tc_counter_mask;
+	freq = tc->tc_frequency;
+	now = 0;
+	end = freq * n / 1000000;
+	last = func(tc) & mask;
+	do {
+		cpu_spinwait();
+		u = func(tc) & mask;
+		if (u < last)
+			now += mask - last + u + 1;
+		else
+			now += u - last;
+		last = u;
+	} while (now < end);
+	return (1);
+}
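+
+/*
+ * Example of the wrap handling above (illustrative): with a 16-bit
+ * timecounter (mask = 0xffff), last = 0xfffe and a new reading
+ * u = 0x0003 give now += 0xffff - 0xfffe + 0x0003 + 1 = 5 ticks,
+ * the elapsed count across the rollover.
+ */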
+
+void
+DELAY(int n)
+{
+
+	if (delay_tc(n))
+		return;
+
+	init_ops.early_delay(n);
+}
+
+void
+cpu_lock_delay(void)
+{
+
+	/*
+	 * Use TSC to wait for a usec if present, otherwise fall back
+	 * to reading from port 0x84.  We can't call into timecounters
+	 * for this delay since timecounters might use spin locks.
+	 *
+	 * Note that unlike delay_tc(), this uses the TSC even if it
+	 * is not P-state invariant.  For this function it is ok to
+	 * wait even a few usecs.
+	 */
+	if (tsc_freq != 0)
+		delay_tsc(1);
+	else
+		inb(0x84);
+}


Property changes on: trunk/sys/x86/x86/delay.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/dump_machdep.c
===================================================================
--- trunk/sys/x86/x86/dump_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/dump_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,355 +26,30 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/dump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/dump_machdep.c 276772 2015-01-07 01:01:39Z markj $");
 
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
-#include <sys/systm.h>
 #include <sys/conf.h>
-#include <sys/cons.h>
+#include <sys/kerneldump.h>
 #include <sys/sysctl.h>
-#include <sys/kernel.h>
-#include <sys/kerneldump.h>
-#include <sys/watchdog.h>
+#include <sys/systm.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
-#include <machine/elf.h>
-#include <machine/md_var.h>
 
-#ifdef __amd64__
-#define	KERNELDUMP_VERSION	KERNELDUMP_AMD64_VERSION
-#define	EM_VALUE		EM_X86_64
-#else
-#define	KERNELDUMP_VERSION	KERNELDUMP_I386_VERSION
-#define	EM_VALUE		EM_386
-#endif
-
-CTASSERT(sizeof(struct kerneldumpheader) == 512);
-
 int do_minidump = 1;
-TUNABLE_INT("debug.minidump", &do_minidump);
-SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RW, &do_minidump, 0,
+SYSCTL_INT(_debug, OID_AUTO, minidump, CTLFLAG_RWTUN, &do_minidump, 0,
     "Enable mini crash dumps");
 
-/*
- * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
- * is to protect us from metadata and to protect metadata from us.
- */
-#define	SIZEOF_METADATA		(64*1024)
-
-#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
-#define	DEV_ALIGN(x)	(((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
-
-struct md_pa {
-	vm_paddr_t md_start;
-	vm_paddr_t md_size;
-};
-
-typedef int callback_t(struct md_pa *, int, void *);
-
-static struct kerneldumpheader kdh;
-static off_t dumplo, fileofs;
-
-/* Handle buffered writes. */
-static char buffer[DEV_BSIZE];
-static size_t fragsz;
-
-/* 20 phys_avail entry pairs correspond to 10 md_pa's */
-static struct md_pa dump_map[10];
-
-static void
-md_pa_init(void)
-{
-	int n, idx;
-
-	bzero(dump_map, sizeof(dump_map));
-	for (n = 0; n < sizeof(dump_map) / sizeof(dump_map[0]); n++) {
-		idx = n * 2;
-		if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0)
-			break;
-		dump_map[n].md_start = dump_avail[idx];
-		dump_map[n].md_size = dump_avail[idx + 1] - dump_avail[idx];
-	}
-}
-
-static struct md_pa *
-md_pa_first(void)
-{
-
-	return (&dump_map[0]);
-}
-
-static struct md_pa *
-md_pa_next(struct md_pa *mdp)
-{
-
-	mdp++;
-	if (mdp->md_size == 0)
-		mdp = NULL;
-	return (mdp);
-}
-
-static int
-buf_write(struct dumperinfo *di, char *ptr, size_t sz)
-{
-	size_t len;
-	int error;
-
-	while (sz) {
-		len = DEV_BSIZE - fragsz;
-		if (len > sz)
-			len = sz;
-		bcopy(ptr, buffer + fragsz, len);
-		fragsz += len;
-		ptr += len;
-		sz -= len;
-		if (fragsz == DEV_BSIZE) {
-			error = dump_write(di, buffer, 0, dumplo,
-			    DEV_BSIZE);
-			if (error)
-				return error;
-			dumplo += DEV_BSIZE;
-			fragsz = 0;
-		}
-	}
-
-	return (0);
-}
-
-static int
-buf_flush(struct dumperinfo *di)
-{
-	int error;
-
-	if (fragsz == 0)
-		return (0);
-
-	error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE);
-	dumplo += DEV_BSIZE;
-	fragsz = 0;
-	return (error);
-}
-
-#define PG2MB(pgs) ((pgs + (1 << 8) - 1) >> 8)
-
-static int
-cb_dumpdata(struct md_pa *mdp, int seqnr, void *arg)
-{
-	struct dumperinfo *di = (struct dumperinfo*)arg;
-	vm_paddr_t a, pa;
-	void *va;
-	uint64_t pgs;
-	size_t counter, sz, chunk;
-	int i, c, error, twiddle;
-	u_int maxdumppgs;
-
-	error = 0;	/* catch case in which chunk size is 0 */
-	counter = 0;	/* Update twiddle every 16MB */
-	twiddle = 0;
-	va = 0;
-	pgs = mdp->md_size / PAGE_SIZE;
-	pa = mdp->md_start;
-	maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS);
-	if (maxdumppgs == 0)	/* seatbelt */
-		maxdumppgs = 1;
-
-	printf("  chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs),
-	    (uintmax_t)pgs);
-
-	while (pgs) {
-		chunk = pgs;
-		if (chunk > maxdumppgs)
-			chunk = maxdumppgs;
-		sz = chunk << PAGE_SHIFT;
-		counter += sz;
-		if (counter >> 24) {
-			printf(" %ju", (uintmax_t)PG2MB(pgs));
-			counter &= (1<<24) - 1;
-		}
-		for (i = 0; i < chunk; i++) {
-			a = pa + i * PAGE_SIZE;
-			va = pmap_kenter_temporary(trunc_page(a), i);
-		}
-
-		wdog_kern_pat(WD_LASTVAL);
-
-		error = dump_write(di, va, 0, dumplo, sz);
-		if (error)
-			break;
-		dumplo += sz;
-		pgs -= chunk;
-		pa += sz;
-
-		/* Check for user abort. */
-		c = cncheckc();
-		if (c == 0x03)
-			return (ECANCELED);
-		if (c != -1)
-			printf(" (CTRL-C to abort) ");
-	}
-	printf(" ... %s\n", (error) ? "fail" : "ok");
-	return (error);
-}
-
-static int
-cb_dumphdr(struct md_pa *mdp, int seqnr, void *arg)
-{
-	struct dumperinfo *di = (struct dumperinfo*)arg;
-	Elf_Phdr phdr;
-	uint64_t size;
-	int error;
-
-	size = mdp->md_size;
-	bzero(&phdr, sizeof(phdr));
-	phdr.p_type = PT_LOAD;
-	phdr.p_flags = PF_R;			/* XXX */
-	phdr.p_offset = fileofs;
-	phdr.p_vaddr = mdp->md_start;
-	phdr.p_paddr = mdp->md_start;
-	phdr.p_filesz = size;
-	phdr.p_memsz = size;
-	phdr.p_align = PAGE_SIZE;
-
-	error = buf_write(di, (char*)&phdr, sizeof(phdr));
-	fileofs += phdr.p_filesz;
-	return (error);
-}
-
-static int
-cb_size(struct md_pa *mdp, int seqnr, void *arg)
-{
-	uint64_t *sz = (uint64_t*)arg;
-
-	*sz += (uint64_t)mdp->md_size;
-	return (0);
-}
-
-static int
-foreach_chunk(callback_t cb, void *arg)
-{
-	struct md_pa *mdp;
-	int error, seqnr;
-
-	seqnr = 0;
-	mdp = md_pa_first();
-	while (mdp != NULL) {
-		error = (*cb)(mdp, seqnr++, arg);
-		if (error)
-			return (-error);
-		mdp = md_pa_next(mdp);
-	}
-	return (seqnr);
-}
-
 void
-dumpsys(struct dumperinfo *di)
+dumpsys_map_chunk(vm_paddr_t pa, size_t chunk, void **va)
 {
-	Elf_Ehdr ehdr;
-	uint64_t dumpsize;
-	off_t hdrgap;
-	size_t hdrsz;
-	int error;
+	int i;
+	vm_paddr_t a;
 
-	if (do_minidump) {
-		minidumpsys(di);
-		return;
+	for (i = 0; i < chunk; i++) {
+		a = pa + i * PAGE_SIZE;
+		*va = pmap_kenter_temporary(trunc_page(a), i);
 	}
-	bzero(&ehdr, sizeof(ehdr));
-	ehdr.e_ident[EI_MAG0] = ELFMAG0;
-	ehdr.e_ident[EI_MAG1] = ELFMAG1;
-	ehdr.e_ident[EI_MAG2] = ELFMAG2;
-	ehdr.e_ident[EI_MAG3] = ELFMAG3;
-	ehdr.e_ident[EI_CLASS] = ELF_CLASS;
-#if BYTE_ORDER == LITTLE_ENDIAN
-	ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
-#else
-	ehdr.e_ident[EI_DATA] = ELFDATA2MSB;
-#endif
-	ehdr.e_ident[EI_VERSION] = EV_CURRENT;
-	ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE;	/* XXX big picture? */
-	ehdr.e_type = ET_CORE;
-	ehdr.e_machine = EM_VALUE;
-	ehdr.e_phoff = sizeof(ehdr);
-	ehdr.e_flags = 0;
-	ehdr.e_ehsize = sizeof(ehdr);
-	ehdr.e_phentsize = sizeof(Elf_Phdr);
-	ehdr.e_shentsize = sizeof(Elf_Shdr);
-
-	md_pa_init();
-
-	/* Calculate dump size. */
-	dumpsize = 0L;
-	ehdr.e_phnum = foreach_chunk(cb_size, &dumpsize);
-	hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize;
-	fileofs = MD_ALIGN(hdrsz);
-	dumpsize += fileofs;
-	hdrgap = fileofs - DEV_ALIGN(hdrsz);
-
-	/* Determine dump offset on device. */
-	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
-		error = ENOSPC;
-		goto fail;
-	}
-	dumplo = di->mediaoffset + di->mediasize - dumpsize;
-	dumplo -= sizeof(kdh) * 2;
-
-	mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_VERSION, dumpsize,
-	    di->blocksize);
-
-	printf("Dumping %llu MB (%d chunks)\n", (long long)dumpsize >> 20,
-	    ehdr.e_phnum);
-
-	/* Dump leader */
-	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
-	if (error)
-		goto fail;
-	dumplo += sizeof(kdh);
-
-	/* Dump ELF header */
-	error = buf_write(di, (char*)&ehdr, sizeof(ehdr));
-	if (error)
-		goto fail;
-
-	/* Dump program headers */
-	error = foreach_chunk(cb_dumphdr, di);
-	if (error < 0)
-		goto fail;
-	buf_flush(di);
-
-	/*
-	 * All headers are written using blocked I/O, so we know the
-	 * current offset is (still) block aligned. Skip the alignement
-	 * in the file to have the segment contents aligned at page
-	 * boundary. We cannot use MD_ALIGN on dumplo, because we don't
-	 * care and may very well be unaligned within the dump device.
-	 */
-	dumplo += hdrgap;
-
-	/* Dump memory chunks (updates dumplo) */
-	error = foreach_chunk(cb_dumpdata, di);
-	if (error < 0)
-		goto fail;
-
-	/* Dump trailer */
-	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
-	if (error)
-		goto fail;
-
-	/* Signal completion, signoff and exit stage left. */
-	dump_write(di, NULL, 0, 0, 0);
-	printf("\nDump complete\n");
-	return;
-
- fail:
-	if (error < 0)
-		error = -error;
-
-	if (error == ECANCELED)
-		printf("\nDump aborted\n");
-	else if (error == ENOSPC)
-		printf("\nDump failed. Partition too small.\n");
-	else
-		printf("\n** DUMP FAILED (ERROR %d) **\n", error);
 }

Modified: trunk/sys/x86/x86/fdt_machdep.c
===================================================================
--- trunk/sys/x86/x86/fdt_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/fdt_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/fdt_machdep.c 250840 2013-05-21 03:05:49Z marcel $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/fdt_machdep.c 287000 2015-08-21 15:57:57Z royger $");
 
 #include "opt_platform.h"
 
@@ -55,7 +55,7 @@
 	mdp = preload_search_by_type("elf kernel");
 	if (mdp == NULL)
 		mdp = preload_search_by_type("elf32 kernel");
-	dtbp = (mdp != NULL) ? MD_FETCH(mdp, MODINFOMD_DTBP, void *) : NULL;
+	dtbp = MD_FETCH(mdp, MODINFOMD_DTBP, void *);
 
 #if defined(FDT_DTB_STATIC)
 	/*

Modified: trunk/sys/x86/x86/identcpu.c
===================================================================
--- trunk/sys/x86/x86/identcpu.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/identcpu.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/identcpu.c 332743 2018-04-19 00:11:02Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/identcpu.c 354658 2019-11-12 19:35:46Z scottl $");
 
 #include "opt_cpu.h"
 
@@ -84,9 +84,46 @@
 static void print_via_padlock_info(void);
 static void print_vmx_info(void);
 
+#ifdef __i386__
+int	cpu;			/* Are we 386, 386sx, 486, etc? */
 int	cpu_class;
+#endif
+u_int	cpu_feature;		/* Feature flags */
+u_int	cpu_feature2;		/* Feature flags */
+u_int	amd_feature;		/* AMD feature flags */
+u_int	amd_feature2;		/* AMD feature flags */
+u_int	amd_pminfo;		/* AMD advanced power management info */
+u_int	amd_extended_feature_extensions;
+u_int	via_feature_rng;	/* VIA RNG features */
+u_int	via_feature_xcrypt;	/* VIA ACE features */
+u_int	cpu_high;		/* Highest arg to CPUID */
+u_int	cpu_exthigh;		/* Highest arg to extended CPUID */
+u_int	cpu_id;			/* Stepping ID */
+u_int	cpu_procinfo;		/* HyperThreading Info / Brand Index / CLFLUSH */
+u_int	cpu_procinfo2;		/* Multicore info */
+char	cpu_vendor[20];		/* CPU Origin code */
+u_int	cpu_vendor_id;		/* CPU vendor ID */
+u_int	cpu_fxsr;		/* SSE enabled */
+u_int	cpu_mxcsr_mask;		/* Valid bits in mxcsr */
+u_int	cpu_clflush_line_size = 32;
+u_int	cpu_stdext_feature;	/* %ebx */
+u_int	cpu_stdext_feature2;	/* %ecx */
+u_int	cpu_stdext_feature3;	/* %edx */
+uint64_t cpu_ia32_arch_caps;
+u_int	cpu_max_ext_state_size;
+u_int	cpu_mon_mwait_flags;	/* MONITOR/MWAIT flags (CPUID.05H.ECX) */
+u_int	cpu_mon_min_size;	/* MONITOR minimum range size, bytes */
+u_int	cpu_mon_max_size;	/* MONITOR maximum range size, bytes */
+u_int	cpu_maxphyaddr;		/* Max phys addr width in bits */
 char machine[] = MACHINE;
 
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD,
+    &via_feature_rng, 0,
+    "VIA RNG feature available in CPU");
+SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD,
+    &via_feature_xcrypt, 0,
+    "VIA xcrypt feature available in CPU");
+
 #ifdef __amd64__
 #ifdef SCTL_MASK32
 extern int adaptive_machine_arch;
@@ -109,8 +146,8 @@
 	return (error);
 
 }
-SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD,
-    NULL, 0, sysctl_hw_machine, "A", "Machine class");
+SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine, "A", "Machine class");
 #else
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD,
     machine, 0, "Machine class");
@@ -117,7 +154,7 @@
 #endif
 
 static char cpu_model[128];
-SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD,
+SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD | CTLFLAG_MPSAFE,
     cpu_model, 0, "Machine model");
 
 static int hw_clockrate;
@@ -126,8 +163,8 @@
 
 u_int hv_high;
 char hv_vendor[16];
-SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD, hv_vendor, 0,
-    "Hypervisor vendor");
+SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD | CTLFLAG_MPSAFE, hv_vendor,
+    0, "Hypervisor vendor");
 
 static eventhandler_tag tsc_post_tag;
 
@@ -147,13 +184,11 @@
 	NULL,
 	"Intel Pentium 4"
 };
-#endif
 
 static struct {
 	char	*cpu_name;
 	int	cpu_class;
 } cpus[] = {
-#ifdef __i386__
 	{ "Intel 80286",	CPUCLASS_286 },		/* CPU_286   */
 	{ "i386SX",		CPUCLASS_386 },		/* CPU_386SX */
 	{ "i386DX",		CPUCLASS_386 },		/* CPU_386   */
@@ -171,11 +206,8 @@
 	{ "Pentium II",		CPUCLASS_686 },		/* CPU_PII */
 	{ "Pentium III",	CPUCLASS_686 },		/* CPU_PIII */
 	{ "Pentium 4",		CPUCLASS_686 },		/* CPU_P4 */
-#else
-	{ "Clawhammer",		CPUCLASS_K8 },		/* CPU_CLAWHAMMER */
-	{ "Sledgehammer",	CPUCLASS_K8 },		/* CPU_SLEDGEHAMMER */
+};
 #endif
-};
 
 static struct {
 	char	*vendor;
@@ -205,9 +237,13 @@
 	u_int regs[4], i;
 	char *brand;
 
+	printf("CPU: ");
+#ifdef __i386__
 	cpu_class = cpus[cpu].cpu_class;
-	printf("CPU: ");
 	strncpy(cpu_model, cpus[cpu].cpu_name, sizeof (cpu_model));
+#else
+	strncpy(cpu_model, "Hammer", sizeof (cpu_model));
+#endif
 
 	/* Check for extended CPUID information and a processor name. */
 	if (cpu_exthigh >= 0x80000004) {
@@ -660,8 +696,8 @@
 		    (intmax_t)(tsc_freq + 4999) / 1000000,
 		    (u_int)((tsc_freq + 4999) / 10000) % 100);
 	}
+#ifdef __i386__
 	switch(cpu_class) {
-#ifdef __i386__
 	case CPUCLASS_286:
 		printf("286");
 		break;
@@ -683,14 +719,12 @@
 		printf("686");
 		break;
 #endif
-#else
-	case CPUCLASS_K8:
-		printf("K8");
-		break;
-#endif
 	default:
 		printf("Unknown");	/* will panic below... */
 	}
+#else
+	printf("K8");
+#endif
 	printf("-class CPU)\n");
 	if (*cpu_vendor)
 		printf("  Origin=\"%s\"", cpu_vendor);
@@ -914,6 +948,7 @@
 				       "\020PQE"
 				       /* AVX512 Foundation */
 				       "\021AVX512F"
+				       "\022AVX512DQ"
 				       /* Enhanced NRBG */
 				       "\023RDSEED"
 				       /* ADCX + ADOX */
@@ -920,12 +955,17 @@
 				       "\024ADX"
 				       /* Supervisor Mode Access Prevention */
 				       "\025SMAP"
+				       "\026AVX512IFMA"
+				       "\027PCOMMIT"
 				       "\030CLFLUSHOPT"
+				       "\031CLWB"
 				       "\032PROCTRACE"
 				       "\033AVX512PF"
 				       "\034AVX512ER"
 				       "\035AVX512CD"
 				       "\036SHA"
+				       "\037AVX512BW"
+				       "\040AVX512VL"
 				       );
 			}
 
@@ -934,14 +974,35 @@
 				    cpu_stdext_feature2,
 				       "\020"
 				       "\001PREFETCHWT1"
+				       "\002AVX512VBMI"
 				       "\003UMIP"
 				       "\004PKU"
 				       "\005OSPKE"
+				       "\006WAITPKG"
+				       "\011GFNI"
 				       "\027RDPID"
+				       "\032CLDEMOTE"
+				       "\034MOVDIRI"
+				       "\035MOVDIR64B"
 				       "\037SGXLC"
 				       );
 			}
 
+			if (cpu_stdext_feature3 != 0) {
+				printf("\n  Structured Extended Features3=0x%b",
+				    cpu_stdext_feature3,
+				       "\020"
+				       "\013MD_CLEAR"
+				       "\016TSXFA"
+				       "\033IBPB"
+				       "\034STIBP"
+				       "\035L1DFL"
+				       "\036ARCH_CAP"
+				       "\037CORE_CAP"
+				       "\040SSBD"
+				       );
+			}
+
 			if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
 				cpuid_count(0xd, 0x1, regs);
 				if (regs[0] != 0) {
@@ -955,6 +1016,31 @@
 				}
 			}
 
+			if (cpu_ia32_arch_caps != 0) {
+				printf("\n  IA32_ARCH_CAPS=0x%b",
+				    (u_int)cpu_ia32_arch_caps,
+				       "\020"
+				       "\001RDCL_NO"
+				       "\002IBRS_ALL"
+				       "\003RSBA"
+				       "\004SKIP_L1DFL_VME"
+				       "\005SSB_NO"
+				       "\006MDS_NO"
+				       "\010TSX_CTRL"
+				       "\011TAA_NO"
+				       );
+			}
+
+			if (amd_extended_feature_extensions != 0) {
+				printf("\n  "
+				    "AMD Extended Feature Extensions ID EBX="
+				    "0x%b", amd_extended_feature_extensions,
+				    "\020"
+				    "\001CLZERO"
+				    "\002IRPerf"
+				    "\003XSaveErPtr");
+			}
+
 			if (via_feature_rng != 0 || via_feature_xcrypt != 0)
 				print_via_padlock_info();
 
@@ -1008,11 +1094,11 @@
 	print_hypervisor_info();
 }
 
+#ifdef __i386__
 void
 panicifcpuunsupported(void)
 {
 
-#ifdef __i386__
 #if !defined(lint)
 #if !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU)
 #error This kernel is not configured for one of the supported CPUs
@@ -1019,17 +1105,11 @@
 #endif
 #else /* lint */
 #endif /* lint */
-#else /* __amd64__ */
-#ifndef HAMMER
-#error "You need to specify a cpu type"
-#endif
-#endif
 	/*
 	 * Now that we have told the user what they have,
 	 * let them know if that machine type isn't configured.
 	 */
 	switch (cpu_class) {
-#ifdef __i386__
 	case CPUCLASS_286:	/* a 286 should not make it this far, anyway */
 	case CPUCLASS_386:
 #if !defined(I486_CPU)
@@ -1041,12 +1121,6 @@
 #if !defined(I686_CPU)
 	case CPUCLASS_686:
 #endif
-#else /* __amd64__ */
-	case CPUCLASS_X86:
-#ifndef HAMMER
-	case CPUCLASS_K8:
-#endif
-#endif
 		panic("CPU class not configured");
 	default:
 		break;
@@ -1053,7 +1127,6 @@
 	}
 }
 
-#ifdef __i386__
 static	volatile u_int trap_by_rdmsr;
 
 /*
@@ -1210,7 +1283,6 @@
 
 SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL);
 
-#ifndef XEN
 static const char *const vm_bnames[] = {
 	"QEMU",				/* QEMU */
 	"Plex86",			/* Plex86 */
@@ -1270,6 +1342,10 @@
 				vm_guest = VM_GUEST_VMWARE;
 			else if (strcmp(hv_vendor, "Microsoft Hv") == 0)
 				vm_guest = VM_GUEST_HV;
+			else if (strcmp(hv_vendor, "KVMKVMKVM") == 0)
+				vm_guest = VM_GUEST_KVM;
+			else if (strcmp(hv_vendor, "bhyve bhyve") == 0)
+				vm_guest = VM_GUEST_BHYVE;
 		}
 		return;
 	}
@@ -1277,7 +1353,7 @@
 	/*
 	 * Examine SMBIOS strings for older hypervisors.
 	 */
-	p = getenv("smbios.system.serial");
+	p = kern_getenv("smbios.system.serial");
 	if (p != NULL) {
 		if (strncmp(p, "VMware-", 7) == 0 || strncmp(p, "VMW", 3) == 0) {
 			vmware_hvcall(VMW_HVCMD_GETVERSION, regs);
@@ -1294,7 +1370,7 @@
 	 * XXX: Some of these entries may not be needed since they were
 	 * added to FreeBSD before the checks above.
 	 */
-	p = getenv("smbios.bios.vendor");
+	p = kern_getenv("smbios.bios.vendor");
 	if (p != NULL) {
 		for (i = 0; vm_bnames[i] != NULL; i++)
 			if (strcmp(p, vm_bnames[i]) == 0) {
@@ -1304,7 +1380,7 @@
 			}
 		freeenv(p);
 	}
-	p = getenv("smbios.system.product");
+	p = kern_getenv("smbios.system.product");
 	if (p != NULL) {
 		for (i = 0; vm_pnames[i] != NULL; i++)
 			if (strcmp(p, vm_pnames[i]) == 0) {
@@ -1315,7 +1391,6 @@
 		freeenv(p);
 	}
 }
-#endif
 
 bool
 fix_cpuid(void)
@@ -1360,9 +1435,8 @@
 	return (false);
 }
 
-#ifdef __amd64__
 void
-identify_cpu(void)
+identify_cpu1(void)
 {
 	u_int regs[4];
 
@@ -1379,8 +1453,34 @@
 	cpu_feature = regs[3];
 	cpu_feature2 = regs[2];
 }
-#endif
 
+void
+identify_cpu2(void)
+{
+	u_int regs[4], cpu_stdext_disable;
+
+	if (cpu_high >= 7) {
+		cpuid_count(7, 0, regs);
+		cpu_stdext_feature = regs[1];
+
+		/*
+	 * Some hypervisors fail to filter out unsupported
+	 * extended features.  Allow disabling the
+	 * extensions whose activation requires setting a
+	 * bit in CR4, and which VM monitors do not support.
+		 */
+		cpu_stdext_disable = 0;
+		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+		cpu_stdext_feature &= ~cpu_stdext_disable;
+
+		cpu_stdext_feature2 = regs[2];
+		cpu_stdext_feature3 = regs[3];
+
+		if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0)
+			cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
+	}
+}
+
 /*
  * Final stage of CPU identification.
  */
@@ -1387,7 +1487,7 @@
 void
 finishidentcpu(void)
 {
-	u_int regs[4], cpu_stdext_disable;
+	u_int regs[4];
 #ifdef __i386__
 	u_char ccr3;
 #endif
@@ -1406,26 +1506,8 @@
 		cpu_mon_max_size = regs[1] &  CPUID5_MON_MAX_SIZE;
 	}
 
-	if (cpu_high >= 7) {
-		cpuid_count(7, 0, regs);
-		cpu_stdext_feature = regs[1];
+	identify_cpu2();
 
-		/*
-		 * Some hypervisors fail to filter out unsupported
-		 * extended features.  For now, disable the
-		 * extensions, activation of which requires setting a
-		 * bit in CR4, and which VM monitors do not support.
-		 */
-		if (cpu_feature2 & CPUID2_HV) {
-			cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
-			    CPUID_STDEXT_SMEP;
-		} else
-			cpu_stdext_disable = 0;
-		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
-		cpu_stdext_feature &= ~cpu_stdext_disable;
-		cpu_stdext_feature2 = regs[2];
-	}
-
 #ifdef __i386__
 	if (cpu_high > 0 &&
 	    (cpu_vendor_id == CPU_VENDOR_INTEL ||
@@ -1457,6 +1539,7 @@
 	if (cpu_exthigh >= 0x80000008) {
 		do_cpuid(0x80000008, regs);
 		cpu_maxphyaddr = regs[0] & 0xff;
+		amd_extended_feature_extensions = regs[1];
 		cpu_procinfo2 = regs[2];
 	} else {
 		cpu_maxphyaddr = (cpu_feature & CPUID_PAE) != 0 ? 36 : 32;
@@ -1550,18 +1633,26 @@
 			return;
 		}
 	}
-#else
-	/* XXX */
-	cpu = CPU_CLAWHAMMER;
 #endif
 }
 
+int
+pti_get_default(void)
+{
+
+	if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
+		return (0);
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
+		return (0);
+	return (1);
+}
+
 static u_int
 find_cpu_vendor_id(void)
 {
 	int	i;
 
-	for (i = 0; i < sizeof(cpu_vendors) / sizeof(cpu_vendors[0]); i++)
+	for (i = 0; i < nitems(cpu_vendors); i++)
 		if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0)
 			return (cpu_vendors[i].vendor_id);
 	return (0);

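As an aside for readers of the feature-flag hunks above: strings such as "\020" "\021AVX512F" "\022AVX512DQ" are arguments to the kernel printf(9) "%b" conversion, where the first octal escape gives the numeric base and each following escape is a 1-indexed bit position followed by that bit's name. The following is a minimal user-space sketch of that decoding (not part of this commit; names and values are illustrative only):

#include <stdio.h>

/*
 * Minimal sketch of printf(9)'s "%b" decoding (illustrative only).
 * The first character of "bits" is the numeric base; each following
 * character is a 1-indexed bit position, followed by the bit's name.
 */
static void
print_bits(unsigned int value, const char *bits)
{
	int base = *bits++;
	int any = 0;
	int bit;

	printf(base == 16 ? "0x%x" : base == 8 ? "0%o" : "%u", value);
	while ((bit = *bits++) != 0) {
		int match = (value & (1u << (bit - 1))) != 0;

		if (match)
			printf("%c", any++ ? ',' : '<');
		for (; *bits > ' '; bits++)	/* consume the bit name */
			if (match)
				putchar(*bits);
	}
	if (any)
		putchar('>');
	putchar('\n');
}

int
main(void)
{
	/* Bit 17 (1-indexed) is set: prints "0x10000<AVX512F>". */
	print_bits(0x10000, "\020" "\021AVX512F" "\022AVX512DQ");
	return (0);
}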
Modified: trunk/sys/x86/x86/intr_machdep.c
===================================================================
--- trunk/sys/x86/x86/intr_machdep.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/intr_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/x86/intr_machdep.c 307244 2016-10-14 02:03:53Z sephe $
+ * $FreeBSD: stable/11/sys/x86/x86/intr_machdep.c 340016 2018-11-01 18:34:26Z jhb $
  */
 
 /*
@@ -37,6 +37,7 @@
 
 #include "opt_atpic.h"
 #include "opt_ddb.h"
+#include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,6 +45,7 @@
 #include <sys/ktr.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
@@ -50,6 +52,7 @@
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/vmmeter.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
 #include <machine/smp.h>
@@ -65,7 +68,7 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #endif
 
@@ -74,22 +77,26 @@
 typedef void (*mask_fn)(void *);
 
 static int intrcnt_index;
-static struct intsrc *interrupt_sources[NUM_IO_INTS];
+static struct intsrc **interrupt_sources;
 static struct sx intrsrc_lock;
 static struct mtx intrpic_lock;
 static struct mtx intrcnt_lock;
 static TAILQ_HEAD(pics_head, pic) pics;
+u_int num_io_irqs;
 
-#ifdef SMP
+#if defined(SMP) && !defined(EARLY_AP_STARTUP)
 static int assign_cpu;
 #endif
 
-u_long intrcnt[INTRCNT_COUNT];
-char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)];
+u_long *intrcnt;
+char *intrnames;
 size_t sintrcnt = sizeof(intrcnt);
 size_t sintrnames = sizeof(intrnames);
+int nintrcnt;
 
-static int	intr_assign_cpu(void *arg, u_char cpu);
+static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources");
+
+static int	intr_assign_cpu(void *arg, int cpu);
 static void	intr_disable_src(void *arg);
 static void	intr_init(void *__dummy);
 static int	intr_pic_registered(struct pic *pic);
@@ -97,6 +104,18 @@
 static void	intrcnt_updatename(struct intsrc *is);
 static void	intrcnt_register(struct intsrc *is);
 
+/*
+ * SYSINIT levels for SI_SUB_INTR:
+ *
+ * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init
+ * SI_ORDER_SECOND: Xen PICs
+ * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges
+ * SI_ORDER_FOURTH: Add 8259A PICs
+ * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources
+ * SI_ORDER_MIDDLE: SMP interrupt counters
+ * SI_ORDER_ANY: Enable interrupts on BSP
+ */
+
 static int
 intr_pic_registered(struct pic *pic)
 {
@@ -132,6 +151,56 @@
 }
 
 /*
+ * Allocate interrupt source arrays and register interrupt sources
+ * once the number of interrupts is known.
+ */
+static void
+intr_init_sources(void *arg)
+{
+	struct pic *pic;
+
+	MPASS(num_io_irqs > 0);
+
+	interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources),
+	    M_INTR, M_WAITOK | M_ZERO);
+
+	/*
+	 * - 1 dummy counter (named "???").
+	 * - 2 counters for each I/O interrupt.
+	 * - 1 counter for each CPU for lapic timer.
+	 * - 1 counter for each CPU for the Hyper-V vmbus driver.
+	 * - 8 counters for each CPU for IPI counters for SMP.
+	 */
+	nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2;
+#ifdef COUNT_IPIS
+	if (mp_ncpus > 1)
+		nintrcnt += 8 * mp_ncpus;
+#endif
+	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK |
+	    M_ZERO);
+	intrnames = mallocarray(nintrcnt, MAXCOMLEN + 1, M_INTR, M_WAITOK |
+	    M_ZERO);
+	sintrcnt = nintrcnt * sizeof(u_long);
+	sintrnames = nintrcnt * (MAXCOMLEN + 1);
+
+	intrcnt_setname("???", 0);
+	intrcnt_index = 1;
+
+	/*
+	 * NB: intrpic_lock is not held here to avoid LORs due to
+	 * malloc() in intr_register_source().  However, we are still
+	 * single-threaded at this point in startup so the list of
+	 * PICs shouldn't change.
+	 */
+	TAILQ_FOREACH(pic, &pics, pics) {
+		if (pic->pic_register_sources != NULL)
+			pic->pic_register_sources(pic);
+	}
+}
+SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources,
+    NULL);
+
+/*
  * Register a new interrupt source with the global interrupt system.
  * The global interrupts need to be disabled when this function is
  * called.
@@ -143,6 +212,8 @@
 
 	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
 	vector = isrc->is_pic->pic_vector(isrc);
+	KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector,
+	    num_io_irqs));
 	if (interrupt_sources[vector] != NULL)
 		return (EEXIST);
 	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
@@ -168,6 +239,8 @@
 intr_lookup_source(int vector)
 {
 
+	if (vector < 0 || vector >= num_io_irqs)
+		return (NULL);
 	return (interrupt_sources[vector]);
 }
 
@@ -308,17 +381,24 @@
 }
 
 static int
-intr_assign_cpu(void *arg, u_char cpu)
+intr_assign_cpu(void *arg, int cpu)
 {
 #ifdef SMP
 	struct intsrc *isrc;
 	int error;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+
+	/* Nothing to do if there is only a single CPU. */
+	if (mp_ncpus > 1 && cpu != NOCPU) {
+#else
 	/*
 	 * Don't do anything during early boot.  We will pick up the
 	 * assignment once the APs are started.
 	 */
 	if (assign_cpu && cpu != NOCPU) {
+#endif
 		isrc = arg;
 		sx_xlock(&intrsrc_lock);
 		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
@@ -353,6 +433,7 @@
 
 	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
 	mtx_lock_spin(&intrcnt_lock);
+	MPASS(intrcnt_index + 2 <= nintrcnt);
 	is->is_index = intrcnt_index;
 	intrcnt_index += 2;
 	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
@@ -369,6 +450,7 @@
 {
 
 	mtx_lock_spin(&intrcnt_lock);
+	MPASS(intrcnt_index < nintrcnt);
 	*countp = &intrcnt[intrcnt_index];
 	intrcnt_setname(name, intrcnt_index);
 	intrcnt_index++;
@@ -379,8 +461,6 @@
 intr_init(void *dummy __unused)
 {
 
-	intrcnt_setname("???", 0);
-	intrcnt_index = 1;
 	TAILQ_INIT(&pics);
 	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
 	sx_init(&intrsrc_lock, "intrsrc");
@@ -388,6 +468,21 @@
 }
 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
 
+static void
+intr_init_final(void *dummy __unused)
+{
+
+	/*
+	 * Enable interrupts on the BSP after all of the interrupt
+	 * controllers are initialized.  Device interrupts are still
+	 * disabled in the interrupt controllers until interrupt
+	 * handlers are registered.  Interrupts are enabled on each AP
+	 * after their first context switch.
+	 */
+	enable_intr();
+}
+SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
+
 #ifndef DEV_ATPIC
 /* Initialize the two 8259A's to a known-good shutdown state. */
 void
@@ -427,6 +522,23 @@
 	return (0);
 }
 
+void
+intr_reprogram(void)
+{
+	struct intsrc *is;
+	u_int v;
+
+	sx_xlock(&intrsrc_lock);
+	for (v = 0; v < num_io_irqs; v++) {
+		is = interrupt_sources[v];
+		if (is == NULL)
+			continue;
+		if (is->is_pic->pic_reprogram_pin != NULL)
+			is->is_pic->pic_reprogram_pin(is);
+	}
+	sx_xunlock(&intrsrc_lock);
+}
+
 #ifdef DDB
 /*
  * Dump data about interrupt handlers
@@ -434,7 +546,8 @@
 DB_SHOW_COMMAND(irqs, db_show_irqs)
 {
 	struct intsrc **isrc;
-	int i, verbose;
+	u_int i;
+	int verbose;
 
 	if (strcmp(modif, "v") == 0)
 		verbose = 1;
@@ -441,7 +554,7 @@
 	else
 		verbose = 0;
 	isrc = interrupt_sources;
-	for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++)
+	for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++)
 		if (*isrc != NULL)
 			db_dump_intr_event((*isrc)->is_event, verbose);
 }
@@ -453,7 +566,7 @@
  * allocate CPUs round-robin.
  */
 
-static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
+cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
 static int current_cpu;
 
 /*
@@ -465,9 +578,15 @@
 {
 	u_int apic_id;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+	if (mp_ncpus == 1)
+		return (PCPU_GET(apic_id));
+#else
 	/* Leave all interrupts on the BSP during boot. */
 	if (!assign_cpu)
 		return (PCPU_GET(apic_id));
+#endif
 
 	mtx_lock_spin(&icu_lock);
 	apic_id = cpu_apic_ids[current_cpu];
@@ -509,6 +628,7 @@
 	CPU_SET(cpu, &intr_cpus);
 }
 
+#ifndef EARLY_AP_STARTUP
 /*
  * Distribute all the interrupt sources among the available CPUs once the
  * AP's have been launched.
@@ -517,15 +637,8 @@
 intr_shuffle_irqs(void *arg __unused)
 {
 	struct intsrc *isrc;
-	int i;
+	u_int i;
 
-#ifdef XEN
-	/*
-	 * Doesn't work yet
-	 */
-	return;
-#endif
-
 	/* Don't bother on UP. */
 	if (mp_ncpus == 1)
 		return;
@@ -533,7 +646,7 @@
 	/* Round-robin assign a CPU to each enabled source. */
 	sx_xlock(&intrsrc_lock);
 	assign_cpu = 1;
-	for (i = 0; i < NUM_IO_INTS; i++) {
+	for (i = 0; i < num_io_irqs; i++) {
 		isrc = interrupt_sources[i];
 		if (isrc != NULL && isrc->is_handlers > 0) {
 			/*
@@ -556,6 +669,7 @@
 }
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
     NULL);
+#endif
 #else
 /*
  * Always route interrupts to the current processor in the UP case.

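For context on the intr_shuffle_irqs()/intr_next_cpu() changes above: interrupt sources are handed out round-robin over the set of CPUs allowed to take interrupts (intr_cpus). A small stand-alone sketch of that selection policy, using a plain bit mask instead of cpuset_t and hypothetical names (not part of this commit), behaves roughly like this:

#include <stdio.h>

#define	NCPUS		8
#define	CPU_BIT(c)	(1u << (c))

/* Hypothetical stand-in for intr_cpus: CPUs allowed to take interrupts. */
static unsigned int intr_cpu_mask = CPU_BIT(0) | CPU_BIT(2) | CPU_BIT(3) |
    CPU_BIT(6);
static int current_cpu;		/* round-robin cursor, starts at the BSP */

/*
 * Return the CPU to use for the next interrupt source and advance the
 * cursor to the next CPU present in the mask, mirroring the policy of
 * intr_next_cpu().
 */
static int
next_intr_cpu(void)
{
	int cpu = current_cpu;

	do {
		current_cpu = (current_cpu + 1) % NCPUS;
	} while ((intr_cpu_mask & CPU_BIT(current_cpu)) == 0);
	return (cpu);
}

int
main(void)
{
	int irq;

	for (irq = 0; irq < 6; irq++)
		printf("irq%d -> cpu%d\n", irq, next_intr_cpu());
	return (0);
}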
Modified: trunk/sys/x86/x86/io_apic.c
===================================================================
--- trunk/sys/x86/x86/io_apic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/io_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,8 +26,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/io_apic.c 330959 2018-03-14 23:59:52Z marius $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/io_apic.c 340016 2018-11-01 18:34:26Z jhb $");
 
+#include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
@@ -38,6 +39,7 @@
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
+#include <sys/rman.h>
 #include <sys/sysctl.h>
 
 #include <dev/pci/pcireg.h>
@@ -49,9 +51,10 @@
 #include <x86/apicreg.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/resource.h>
 #include <machine/segments.h>
+#include <x86/iommu/iommu_intrmap.h>
 
 #define IOAPIC_ISA_INTS		16
 #define	IOAPIC_MEM_REGION	32
@@ -58,11 +61,6 @@
 #define	IOAPIC_REDTBL_LO(i)	(IOAPIC_REDTBL + (i) * 2)
 #define	IOAPIC_REDTBL_HI(i)	(IOAPIC_REDTBL_LO(i) + 1)
 
-#define	IRQ_EXTINT		(NUM_IO_INTS + 1)
-#define	IRQ_NMI			(NUM_IO_INTS + 2)
-#define	IRQ_SMI			(NUM_IO_INTS + 3)
-#define	IRQ_DISABLED		(NUM_IO_INTS + 4)
-
 static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
 
 /*
@@ -81,15 +79,16 @@
 
 struct ioapic_intsrc {
 	struct intsrc io_intsrc;
-	u_int io_irq;
+	int io_irq;
 	u_int io_intpin:8;
 	u_int io_vector:8;
-	u_int io_cpu:8;
+	u_int io_cpu;
 	u_int io_activehi:1;
 	u_int io_edgetrigger:1;
 	u_int io_masked:1;
 	int io_bus:4;
 	uint32_t io_lowreg;
+	u_int io_remap_cookie;
 };
 
 struct ioapic {
@@ -98,9 +97,13 @@
 	u_int io_apic_id:4;
 	u_int io_intbase:8;		/* System Interrupt base */
 	u_int io_numintr:8;
+	u_int io_haseoi:1;
 	volatile ioapic_t *io_addr;	/* XXX: should use bus_space */
 	vm_paddr_t io_paddr;
 	STAILQ_ENTRY(ioapic) io_next;
+	device_t pci_dev;		/* matched pci device, if found */
+	struct resource *pci_wnd;	/* BAR 0, should be same or alias to
+					   io_paddr */
 	struct ioapic_intsrc io_pins[0];
 };
 
@@ -108,6 +111,7 @@
 static void	ioapic_write(volatile ioapic_t *apic, int reg, u_int val);
 static const char *ioapic_bus_string(int bus_type);
 static void	ioapic_print_irq(struct ioapic_intsrc *intpin);
+static void	ioapic_register_sources(struct pic *pic);
 static void	ioapic_enable_source(struct intsrc *isrc);
 static void	ioapic_disable_source(struct intsrc *isrc, int eoi);
 static void	ioapic_eoi_source(struct intsrc *isrc);
@@ -120,27 +124,79 @@
 static void	ioapic_resume(struct pic *pic, bool suspend_cancelled);
 static int	ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void	ioapic_program_intpin(struct ioapic_intsrc *intpin);
+static void	ioapic_reprogram_intpin(struct intsrc *isrc);
 
 static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
-struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source,
-			       ioapic_eoi_source, ioapic_enable_intr,
-			       ioapic_disable_intr, ioapic_vector,
-			       ioapic_source_pending, NULL, ioapic_resume,
-			       ioapic_config_intr, ioapic_assign_cpu };
+struct pic ioapic_template = {
+	.pic_register_sources = ioapic_register_sources,
+	.pic_enable_source = ioapic_enable_source,
+	.pic_disable_source = ioapic_disable_source,
+	.pic_eoi_source = ioapic_eoi_source,
+	.pic_enable_intr = ioapic_enable_intr,
+	.pic_disable_intr = ioapic_disable_intr,
+	.pic_vector = ioapic_vector,
+	.pic_source_pending = ioapic_source_pending,
+	.pic_suspend = NULL,
+	.pic_resume = ioapic_resume,
+	.pic_config_intr = ioapic_config_intr,
+	.pic_assign_cpu = ioapic_assign_cpu,
+	.pic_reprogram_pin = ioapic_reprogram_intpin,
+};
 
-static int next_ioapic_base;
+static u_int next_ioapic_base;
 static u_int next_id;
 
-static SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
 static int enable_extint;
 SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0,
     "Enable the ExtINT pin in the first I/O APIC");
-TUNABLE_INT("hw.apic.enable_extint", &enable_extint);
 
-static __inline void
-_ioapic_eoi_source(struct intsrc *isrc)
+static void
+_ioapic_eoi_source(struct intsrc *isrc, int locked)
 {
+	struct ioapic_intsrc *src;
+	struct ioapic *io;
+	volatile uint32_t *apic_eoi;
+	uint32_t low1;
+
 	lapic_eoi();
+	if (!lapic_eoi_suppression)
+		return;
+	src = (struct ioapic_intsrc *)isrc;
+	if (src->io_edgetrigger)
+		return;
+	io = (struct ioapic *)isrc->is_pic;
+
+	/*
+	 * Handle targeted EOI for level-triggered pins, if broadcast
+	 * EOI suppression is supported by LAPICs.
+	 */
+	if (io->io_haseoi) {
+		/*
+		 * If IOAPIC has EOI Register, simply write vector
+		 * number into the reg.
+		 */
+		apic_eoi = (volatile uint32_t *)((volatile char *)
+		    io->io_addr + IOAPIC_EOIR);
+		*apic_eoi = src->io_vector;
+	} else {
+		/*
+		 * do what Intel did for the Linux kernel: temporarily
+		 * do what Intel did for the Linux kernel. Temporary
+		 * switch the pin to edge-trigger and back, masking
+		 * the pin during the trick.
+		 */
+		if (!locked)
+			mtx_lock_spin(&icu_lock);
+		low1 = src->io_lowreg;
+		low1 &= ~IOART_TRGRLVL;
+		low1 |= IOART_TRGREDG | IOART_INTMSET;
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+		    low1);
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
+		    src->io_lowreg);
+		if (!locked)
+			mtx_unlock_spin(&icu_lock);
+	}
 }
 
 static u_int
@@ -195,7 +251,7 @@
 		printf("SMI");
 		break;
 	default:
-		printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus),
+		printf("%s IRQ %d", ioapic_bus_string(intpin->io_bus),
 		    intpin->io_irq);
 	}
 }
@@ -233,7 +289,7 @@
 	}
 
 	if (eoi == PIC_EOI)
-		_ioapic_eoi_source(isrc);
+		_ioapic_eoi_source(isrc, 1);
 
 	mtx_unlock_spin(&icu_lock);
 }
@@ -242,7 +298,7 @@
 ioapic_eoi_source(struct intsrc *isrc)
 {
 
-	_ioapic_eoi_source(isrc);
+	_ioapic_eoi_source(isrc, 0);
 }
 
 /*
@@ -254,6 +310,9 @@
 {
 	struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic;
 	uint32_t low, high;
+#ifdef ACPI_DMAR
+	int error;
+#endif
 
 	/*
 	 * If a pin is completely invalid or if it is valid but hasn't
@@ -260,7 +319,7 @@
 	 * been enabled yet, just ensure that the pin is masked.
 	 */
 	mtx_assert(&icu_lock, MA_OWNED);
-	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS &&
+	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq >= 0 &&
 	    intpin->io_vector == 0)) {
 		low = ioapic_read(io->io_addr,
 		    IOAPIC_REDTBL_LO(intpin->io_intpin));
@@ -268,9 +327,34 @@
 			ioapic_write(io->io_addr,
 			    IOAPIC_REDTBL_LO(intpin->io_intpin),
 			    low | IOART_INTMSET);
+#ifdef ACPI_DMAR
+		mtx_unlock_spin(&icu_lock);
+		iommu_unmap_ioapic_intr(io->io_apic_id,
+		    &intpin->io_remap_cookie);
+		mtx_lock_spin(&icu_lock);
+#endif
 		return;
 	}
 
+#ifdef ACPI_DMAR
+	mtx_unlock_spin(&icu_lock);
+	error = iommu_map_ioapic_intr(io->io_apic_id,
+	    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+	    intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie,
+	    &high, &low);
+	mtx_lock_spin(&icu_lock);
+	if (error == 0) {
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin),
+		    high);
+		intpin->io_lowreg = low;
+		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
+		    low);
+		return;
+	} else if (error != EOPNOTSUPP) {
+		return;
+	}
+#endif
+
 	/*
 	 * Set the destination.  Note that with Intel interrupt remapping,
 	 * the previously reserved bits 55:48 now have a purpose so ensure
@@ -318,6 +402,15 @@
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
 }
 
+static void
+ioapic_reprogram_intpin(struct intsrc *isrc)
+{
+
+	mtx_lock_spin(&icu_lock);
+	ioapic_program_intpin((struct ioapic_intsrc *)isrc);
+	mtx_unlock_spin(&icu_lock);
+}
+
 static int
 ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
@@ -537,6 +630,8 @@
 	io = malloc(sizeof(struct ioapic) +
 	    numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
 	io->io_pic = ioapic_template;
+	io->pci_dev = NULL;
+	io->pci_wnd = NULL;
 	mtx_lock_spin(&icu_lock);
 	io->io_id = next_id++;
 	io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
@@ -557,11 +652,29 @@
 		    io->io_id, intbase, next_ioapic_base);
 	io->io_intbase = intbase;
 	next_ioapic_base = intbase + numintr;
+	if (next_ioapic_base > num_io_irqs)
+		num_io_irqs = next_ioapic_base;
 	io->io_numintr = numintr;
 	io->io_addr = apic;
 	io->io_paddr = addr;
 
+	if (bootverbose) {
+		printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id,
+		    (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR)
+		    >> MAXREDIRSHIFT);
+	}
 	/*
+	 * The summary information about IO-APIC versions is taken from
+	 * the Linux kernel source:
+	 *     0Xh     82489DX
+	 *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+	 *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
+	 *     30h-FFh Reserved
+	 * IO-APICs with version >= 0x20 have working EOIR register.
+	 */
+	io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20;
+
+	/*
 	 * Initialize pins.  Start off with interrupts disabled.  Default
 	 * to active-hi and edge-triggered for ISA interrupts and active-lo
 	 * and level-triggered for all others.
@@ -599,6 +712,15 @@
 		intpin->io_cpu = PCPU_GET(apic_id);
 		value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
 		ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
+#ifdef ACPI_DMAR
+		/* dummy, but sets cookie */
+		mtx_unlock_spin(&icu_lock);
+		iommu_map_ioapic_intr(io->io_apic_id,
+		    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
+		    intpin->io_activehi, intpin->io_irq,
+		    &intpin->io_remap_cookie, NULL, NULL);
+		mtx_lock_spin(&icu_lock);
+#endif
 	}
 	mtx_unlock_spin(&icu_lock);
 
@@ -640,7 +762,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || vector < 0)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_irq = vector;
 	if (bootverbose)
@@ -659,7 +781,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	if (io->io_pins[pin].io_bus == bus_type)
 		return (0);
@@ -680,7 +802,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_NMI)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_NMI;
@@ -703,7 +825,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_SMI)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_SMI;
@@ -726,7 +848,7 @@
 		return (EINVAL);
 	if (io->io_pins[pin].io_irq == IRQ_EXTINT)
 		return (0);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN;
 	io->io_pins[pin].io_irq = IRQ_EXTINT;
@@ -751,7 +873,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	activehi = (pol == INTR_POLARITY_HIGH);
 	if (io->io_pins[pin].io_activehi == activehi)
@@ -772,7 +894,7 @@
 	io = (struct ioapic *)cookie;
 	if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
-	if (io->io_pins[pin].io_irq >= NUM_IO_INTS)
+	if (io->io_pins[pin].io_irq < 0)
 		return (EINVAL);
 	edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 	if (io->io_pins[pin].io_edgetrigger == edgetrigger)
@@ -808,14 +930,26 @@
 
 	/*
 	 * Reprogram pins to handle special case pins (such as NMI and
-	 * SMI) and register valid pins as interrupt sources.
+	 * SMI) and disable normal pins until a handler is registered.
 	 */
 	intr_register_pic(&io->io_pic);
+	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++)
+		ioapic_reprogram_intpin(&pin->io_intsrc);
+}
+
+/*
+ * Add interrupt sources for I/O APIC interrupt pins.
+ */
+static void
+ioapic_register_sources(struct pic *pic)
+{
+	struct ioapic_intsrc *pin;
+	struct ioapic *io;
+	int i;
+
+	io = (struct ioapic *)pic;
 	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) {
-		mtx_lock_spin(&icu_lock);
-		ioapic_program_intpin(pin);
-		mtx_unlock_spin(&icu_lock);
-		if (pin->io_irq < NUM_IO_INTS)
+		if (pin->io_irq >= 0)
 			intr_register_source(&pin->io_intsrc);
 	}
 }
@@ -846,7 +980,72 @@
 static int
 ioapic_pci_attach(device_t dev)
 {
+	struct resource *res;
+	volatile ioapic_t *apic;
+	struct ioapic *io;
+	int rid;
+	u_int apic_id;
 
+	/*
+	 * Try to match the enumerated ioapic.  Match BAR start
+	 * against io_paddr.  Because the PCI window might not be the
+	 * same as the MADT-reported I/O window but merely an alias, read
+	 * the APIC ID from the mapped BAR and match against it.
+	 */
+	rid = PCIR_BAR(0);
+	res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
+	    RF_ACTIVE | RF_SHAREABLE);
+	if (res == NULL) {
+		if (bootverbose)
+			device_printf(dev, "cannot activate BAR0\n");
+		return (ENXIO);
+	}
+	apic = (volatile ioapic_t *)rman_get_virtual(res);
+	if (rman_get_size(res) < IOAPIC_WND_SIZE) {
+		if (bootverbose)
+			device_printf(dev,
+			    "BAR0 too small (%jd) for IOAPIC window\n",
+			    (uintmax_t)rman_get_size(res));
+		goto fail;
+	}
+	mtx_lock_spin(&icu_lock);
+	apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
+	/* First match by io window address */
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_paddr == (vm_paddr_t)rman_get_start(res))
+			goto found;
+	}
+	/* Then by apic id */
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_apic_id == apic_id)
+			goto found;
+	}
+	mtx_unlock_spin(&icu_lock);
+	if (bootverbose)
+		device_printf(dev,
+		    "cannot match pci bar apic id %d against MADT\n",
+		    apic_id);
+fail:
+	bus_release_resource(dev, SYS_RES_MEMORY, rid, res);
+	return (ENXIO);
+found:
+	KASSERT(io->pci_dev == NULL,
+	    ("ioapic %d pci_dev not NULL", io->io_id));
+	KASSERT(io->pci_wnd == NULL,
+	    ("ioapic %d pci_wnd not NULL", io->io_id));
+
+	io->pci_dev = dev;
+	io->pci_wnd = res;
+	if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) ||
+	    io->io_apic_id != apic_id)) {
+		device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d "
+		    "MADT id %d paddr@%jx\n",
+		    pci_get_domain(dev), pci_get_bus(dev),
+		    pci_get_slot(dev), pci_get_function(dev),
+		    (uintmax_t)rman_get_start(res), apic_id,
+		    io->io_apic_id, (uintmax_t)io->io_paddr);
+	}
+	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
@@ -863,6 +1062,28 @@
 static devclass_t ioapic_devclass;
 DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
 
+int
+ioapic_get_rid(u_int apic_id, uint16_t *ridp)
+{
+	struct ioapic *io;
+	uintptr_t rid;
+	int error;
+
+	mtx_lock_spin(&icu_lock);
+	STAILQ_FOREACH(io, &ioapic_list, io_next) {
+		if (io->io_apic_id == apic_id)
+			break;
+	}
+	mtx_unlock_spin(&icu_lock);
+	if (io == NULL || io->pci_dev == NULL)
+		return (EINVAL);
+	error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid);
+	if (error != 0)
+		return (error);
+	*ridp = rid;
+	return (0);
+}
+
 /*
  * A new-bus driver to consume the memory resources associated with
  * the APICs in the system.  On some systems ACPI or PnPBIOS system
@@ -896,19 +1117,11 @@
 {
 	int error;
 
-#ifdef PAE
-	/*
-	 * Resources use long's to track resources, so we can't
-	 * include memory regions above 4GB.
-	 */
-	if (base >= ~0ul)
-		return;
-#endif
 	error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
 	if (error)
 		panic("apic_add_resource: resource %d failed set with %d", rid,
 		    error);
-	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
+	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE);
 }
 
 static int
@@ -918,7 +1131,7 @@
 	int i;
 
 	/* Reserve the local APIC. */
-	apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t));
+	apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION);
 	i = 1;
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION);

Modified: trunk/sys/x86/x86/legacy.c
===================================================================
--- trunk/sys/x86/x86/legacy.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/legacy.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,7 +33,7 @@
 #include "opt_mca.h"
 #endif
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/legacy.c 233707 2012-03-30 19:10:14Z jhb $");
 
 /*
  * This code implements a system driver for legacy systems that do not

Modified: trunk/sys/x86/x86/local_apic.c
===================================================================
--- trunk/sys/x86/x86/local_apic.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/local_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -33,11 +33,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/local_apic.c 314662 2017-03-04 12:04:24Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/local_apic.c 351757 2019-09-03 16:27:23Z emaste $");
 
 #include "opt_atpic.h"
 #include "opt_hwpmc_hooks.h"
-#include "opt_kdtrace.h"
 
 #include "opt_ddb.h"
 
@@ -51,6 +50,7 @@
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
+#include <sys/sysctl.h>
 #include <sys/timeet.h>
 
 #include <vm/vm.h>
@@ -58,14 +58,16 @@
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
+#include <machine/cpufunc.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
+#include <x86/init.h>
 
 #ifdef DDB
 #include <sys/interrupt.h>
@@ -88,12 +90,24 @@
 CTASSERT(APIC_LOCAL_INTS == 240);
 CTASSERT(IPI_STOP < APIC_SPURIOUS_INT);
 
-/* Magic IRQ values for the timer and syscalls. */
-#define	IRQ_TIMER	(NUM_IO_INTS + 1)
-#define	IRQ_SYSCALL	(NUM_IO_INTS + 2)
-#define	IRQ_DTRACE_RET	(NUM_IO_INTS + 3)
-#define	IRQ_EVTCHN	(NUM_IO_INTS + 4)
+/*
+ * I/O interrupts use non-negative IRQ values.  The negative values
+ * below mark unused IDT entries or IDT entries reserved for a non-I/O
+ * interrupt.
+ */
+#define	IRQ_FREE	-1
+#define	IRQ_TIMER	-2
+#define	IRQ_SYSCALL	-3
+#define	IRQ_DTRACE_RET	-4
+#define	IRQ_EVTCHN	-5
 
+enum lat_timer_mode {
+	LAT_MODE_UNDEF =	0,
+	LAT_MODE_PERIODIC =	1,
+	LAT_MODE_ONESHOT =	2,
+	LAT_MODE_DEADLINE =	3,
+};
+
 /*
  * Support for local APICs.  Local APICs manage interrupts on each
  * individual processor as opposed to I/O APICs which receive interrupts
@@ -114,14 +128,16 @@
 
 struct lapic {
 	struct lvt la_lvts[APIC_LVT_MAX + 1];
+	struct lvt la_elvts[APIC_ELVT_MAX + 1];
 	u_int la_id:8;
 	u_int la_cluster:4;
 	u_int la_cluster_id:2;
 	u_int la_present:1;
 	u_long *la_timer_count;
-	u_long la_timer_period;
-	u_int la_timer_mode;
-	uint32_t lvt_timer_cache;
+	uint64_t la_timer_period;
+	enum lat_timer_mode la_timer_mode;
+	uint32_t lvt_timer_base;
+	uint32_t lvt_timer_last;
 	/* Include IDT_SYSCALL to make indexing easier. */
 	int la_ioint_irqs[APIC_NUM_IOINTS + 1];
 } static lapics[MAX_APIC_ID + 1];
@@ -137,6 +153,14 @@
 	{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT },	/* CMCI */
 };
 
+/* Global defaults for AMD local APIC ELVT entries. */
+static struct lvt elvts[APIC_ELVT_MAX + 1] = {
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+	{ 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 },
+};
+
 static inthand_t *ioint_handlers[] = {
 	NULL,			/* 0 - 31 */
 	IDTVEC(apic_isr1),	/* 32 - 63 */
@@ -148,6 +172,16 @@
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
+static inthand_t *ioint_pti_handlers[] = {
+	NULL,			/* 0 - 31 */
+	IDTVEC(apic_isr1_pti),	/* 32 - 63 */
+	IDTVEC(apic_isr2_pti),	/* 64 - 95 */
+	IDTVEC(apic_isr3_pti),	/* 96 - 127 */
+	IDTVEC(apic_isr4_pti),	/* 128 - 159 */
+	IDTVEC(apic_isr5_pti),	/* 160 - 191 */
+	IDTVEC(apic_isr6_pti),	/* 192 - 223 */
+	IDTVEC(apic_isr7_pti),	/* 224 - 255 */
+};
 
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
@@ -154,42 +188,223 @@
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
 
-volatile lapic_t *lapic;
+volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
-static u_long lapic_timer_divisor;
+int x2apic_mode;
+int lapic_eoi_suppression;
+static int lapic_timer_tsc_deadline;
+static u_long lapic_timer_divisor, count_freq;
 static struct eventtimer lapic_et;
 #ifdef SMP
 static uint64_t lapic_ipi_wait_mult;
 #endif
 
+SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options");
+SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD,
+    &lapic_eoi_suppression, 0, "");
+SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD,
+    &lapic_timer_tsc_deadline, 0, "");
+
+static void lapic_calibrate_initcount(struct lapic *la);
+static void lapic_calibrate_deadline(struct lapic *la);
+
+static uint32_t
+lapic_read32(enum LAPIC_REGISTERS reg)
+{
+	uint32_t res;
+
+	if (x2apic_mode) {
+		res = rdmsr32(MSR_APIC_000 + reg);
+	} else {
+		res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL);
+	}
+	return (res);
+}
+
+static void
+lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+	if (x2apic_mode) {
+		mfence();
+		lfence();
+		wrmsr(MSR_APIC_000 + reg, val);
+	} else {
+		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+	}
+}
+
+static void
+lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val)
+{
+
+	if (x2apic_mode) {
+		wrmsr(MSR_APIC_000 + reg, val);
+	} else {
+		*(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val;
+	}
+}
+
+#ifdef SMP
+static uint64_t
+lapic_read_icr(void)
+{
+	uint64_t v;
+	uint32_t vhi, vlo;
+
+	if (x2apic_mode) {
+		v = rdmsr(MSR_APIC_000 + LAPIC_ICR_LO);
+	} else {
+		vhi = lapic_read32(LAPIC_ICR_HI);
+		vlo = lapic_read32(LAPIC_ICR_LO);
+		v = ((uint64_t)vhi << 32) | vlo;
+	}
+	return (v);
+}
+
+static uint64_t
+lapic_read_icr_lo(void)
+{
+
+	return (lapic_read32(LAPIC_ICR_LO));
+}
+
+static void
+lapic_write_icr(uint32_t vhi, uint32_t vlo)
+{
+	uint64_t v;
+
+	if (x2apic_mode) {
+		v = ((uint64_t)vhi << 32) | vlo;
+		mfence();
+		wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v);
+	} else {
+		lapic_write32(LAPIC_ICR_HI, vhi);
+		lapic_write32(LAPIC_ICR_LO, vlo);
+	}
+}
+#endif /* SMP */
+
+static void
+native_lapic_enable_x2apic(void)
+{
+	uint64_t apic_base;
+
+	apic_base = rdmsr(MSR_APICBASE);
+	apic_base |= APICBASE_X2APIC | APICBASE_ENABLED;
+	wrmsr(MSR_APICBASE, apic_base);
+}
+
+static bool
+native_lapic_is_x2apic(void)
+{
+	uint64_t apic_base;
+
+	apic_base = rdmsr(MSR_APICBASE);
+	return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) ==
+	    (APICBASE_X2APIC | APICBASE_ENABLED));
+}
+
 static void	lapic_enable(void);
 static void	lapic_resume(struct pic *pic, bool suspend_cancelled);
-static void	lapic_timer_oneshot(struct lapic *,
-		    u_int count, int enable_int);
-static void	lapic_timer_periodic(struct lapic *,
-		    u_int count, int enable_int);
+static void	lapic_timer_oneshot(struct lapic *);
+static void	lapic_timer_oneshot_nointr(struct lapic *, uint32_t);
+static void	lapic_timer_periodic(struct lapic *);
+static void	lapic_timer_deadline(struct lapic *);
 static void	lapic_timer_stop(struct lapic *);
 static void	lapic_timer_set_divisor(u_int divisor);
 static uint32_t	lvt_mode(struct lapic *la, u_int pin, uint32_t value);
 static int	lapic_et_start(struct eventtimer *et,
-    sbintime_t first, sbintime_t period);
+		    sbintime_t first, sbintime_t period);
 static int	lapic_et_stop(struct eventtimer *et);
+static u_int	apic_idt_to_irq(u_int apic_id, u_int vector);
+static void	lapic_set_tpr(u_int vector);
 
 struct pic lapic_pic = { .pic_resume = lapic_resume };
 
+/* Forward declarations for apic_ops */
+static void	native_lapic_create(u_int apic_id, int boot_cpu);
+static void	native_lapic_init(vm_paddr_t addr);
+static void	native_lapic_xapic_mode(void);
+static void	native_lapic_setup(int boot);
+static void	native_lapic_dump(const char *str);
+static void	native_lapic_disable(void);
+static void	native_lapic_eoi(void);
+static int	native_lapic_id(void);
+static int	native_lapic_intr_pending(u_int vector);
+static u_int	native_apic_cpuid(u_int apic_id);
+static u_int	native_apic_alloc_vector(u_int apic_id, u_int irq);
+static u_int	native_apic_alloc_vectors(u_int apic_id, u_int *irqs,
+		    u_int count, u_int align);
+static void 	native_apic_disable_vector(u_int apic_id, u_int vector);
+static void 	native_apic_enable_vector(u_int apic_id, u_int vector);
+static void 	native_apic_free_vector(u_int apic_id, u_int vector, u_int irq);
+static void 	native_lapic_set_logical_id(u_int apic_id, u_int cluster,
+		    u_int cluster_id);
+static int 	native_lapic_enable_pmc(void);
+static void 	native_lapic_disable_pmc(void);
+static void 	native_lapic_reenable_pmc(void);
+static void 	native_lapic_enable_cmc(void);
+static int 	native_lapic_enable_mca_elvt(void);
+static int 	native_lapic_set_lvt_mask(u_int apic_id, u_int lvt,
+		    u_char masked);
+static int 	native_lapic_set_lvt_mode(u_int apic_id, u_int lvt,
+		    uint32_t mode);
+static int 	native_lapic_set_lvt_polarity(u_int apic_id, u_int lvt,
+		    enum intr_polarity pol);
+static int 	native_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+		    enum intr_trigger trigger);
+#ifdef SMP
+static void 	native_lapic_ipi_raw(register_t icrlo, u_int dest);
+static void 	native_lapic_ipi_vectored(u_int vector, int dest);
+static int 	native_lapic_ipi_wait(int delay);
+#endif /* SMP */
+static int	native_lapic_ipi_alloc(inthand_t *ipifunc);
+static void	native_lapic_ipi_free(int vector);
+
+struct apic_ops apic_ops = {
+	.create			= native_lapic_create,
+	.init			= native_lapic_init,
+	.xapic_mode		= native_lapic_xapic_mode,
+	.is_x2apic		= native_lapic_is_x2apic,
+	.setup			= native_lapic_setup,
+	.dump			= native_lapic_dump,
+	.disable		= native_lapic_disable,
+	.eoi			= native_lapic_eoi,
+	.id			= native_lapic_id,
+	.intr_pending		= native_lapic_intr_pending,
+	.set_logical_id		= native_lapic_set_logical_id,
+	.cpuid			= native_apic_cpuid,
+	.alloc_vector		= native_apic_alloc_vector,
+	.alloc_vectors		= native_apic_alloc_vectors,
+	.enable_vector		= native_apic_enable_vector,
+	.disable_vector		= native_apic_disable_vector,
+	.free_vector		= native_apic_free_vector,
+	.enable_pmc		= native_lapic_enable_pmc,
+	.disable_pmc		= native_lapic_disable_pmc,
+	.reenable_pmc		= native_lapic_reenable_pmc,
+	.enable_cmc		= native_lapic_enable_cmc,
+	.enable_mca_elvt	= native_lapic_enable_mca_elvt,
+#ifdef SMP
+	.ipi_raw		= native_lapic_ipi_raw,
+	.ipi_vectored		= native_lapic_ipi_vectored,
+	.ipi_wait		= native_lapic_ipi_wait,
+#endif
+	.ipi_alloc		= native_lapic_ipi_alloc,
+	.ipi_free		= native_lapic_ipi_free,
+	.set_lvt_mask		= native_lapic_set_lvt_mask,
+	.set_lvt_mode		= native_lapic_set_lvt_mode,
+	.set_lvt_polarity	= native_lapic_set_lvt_polarity,
+	.set_lvt_triggermode	= native_lapic_set_lvt_triggermode,
+};
+
 static uint32_t
-lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value)
 {
-	struct lvt *lvt;
 
-	KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin));
-	if (la->la_lvts[pin].lvt_active)
-		lvt = &la->la_lvts[pin];
-	else
-		lvt = &lvts[pin];
-
 	value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM |
 	    APIC_LVT_VECTOR);
 	if (lvt->lvt_edgetrigger == 0)
@@ -204,7 +419,7 @@
 	case APIC_LVT_DM_SMI:
 	case APIC_LVT_DM_INIT:
 	case APIC_LVT_DM_EXTINT:
-		if (!lvt->lvt_edgetrigger) {
+		if (!lvt->lvt_edgetrigger && bootverbose) {
 			printf("lapic%u: Forcing LINT%u to edge trigger\n",
 			    la->la_id, pin);
 			value &= ~APIC_LVT_TM;
@@ -220,23 +435,70 @@
 	return (value);
 }
 
+static uint32_t
+lvt_mode(struct lapic *la, u_int pin, uint32_t value)
+{
+	struct lvt *lvt;
+
+	KASSERT(pin <= APIC_LVT_MAX,
+	    ("%s: pin %u out of range", __func__, pin));
+	if (la->la_lvts[pin].lvt_active)
+		lvt = &la->la_lvts[pin];
+	else
+		lvt = &lvts[pin];
+
+	return (lvt_mode_impl(la, lvt, pin, value));
+}
+
+static uint32_t
+elvt_mode(struct lapic *la, u_int idx, uint32_t value)
+{
+	struct lvt *elvt;
+
+	KASSERT(idx <= APIC_ELVT_MAX,
+	    ("%s: idx %u out of range", __func__, idx));
+
+	elvt = &la->la_elvts[idx];
+	KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx));
+	KASSERT(elvt->lvt_edgetrigger,
+	    ("%s: ELVT%u is not edge triggered", __func__, idx));
+	KASSERT(elvt->lvt_activehi,
+	    ("%s: ELVT%u is not active high", __func__, idx));
+	return (lvt_mode_impl(la, elvt, idx, value));
+}
+
 /*
  * Map the local APIC and setup necessary interrupt vectors.
  */
-void
-lapic_init(vm_paddr_t addr)
+static void
+native_lapic_init(vm_paddr_t addr)
 {
 #ifdef SMP
 	uint64_t r, r1, r2, rx;
 #endif
+	uint32_t ver;
 	u_int regs[4];
 	int i, arat;
 
-	/* Map the local APIC and setup the spurious interrupt handler. */
+	/*
+	 * Enable x2APIC mode if possible. Map the local APIC
+	 * registers page.
+	 *
+	 * Keep the LAPIC registers page mapped uncached for x2APIC
+	 * mode too, so that the direct map page attribute is set to
+	 * uncached.  This is needed to work around CPU errata present
+	 * on all Intel processors.
+	 */
 	KASSERT(trunc_page(addr) == addr,
 	    ("local APIC not aligned on a page boundary"));
 	lapic_paddr = addr;
-	lapic = pmap_mapdev(addr, sizeof(lapic_t));
+	lapic_map = pmap_mapdev(addr, PAGE_SIZE);
+	if (x2apic_mode) {
+		native_lapic_enable_x2apic();
+		lapic_map = NULL;
+	}
+
+	/* Setup the spurious interrupt handler. */
 	setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL,
 	    GSEL_APIC);
 
@@ -247,15 +509,18 @@
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
-	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
-	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
-	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+	    SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
@@ -264,6 +529,9 @@
 			do_cpuid(0x06, regs);
 			if ((regs[0] & CPUTPM1_ARAT) != 0)
 				arat = 1;
+		} else if (cpu_vendor_id == CPU_VENDOR_AMD &&
+		    CPUID_TO_FAMILY(cpu_id) >= 0x12) {
+			arat = 1;
 		}
 		bzero(&lapic_et, sizeof(lapic_et));
 		lapic_et.et_name = "LAPIC";
@@ -272,8 +540,16 @@
 		lapic_et.et_quality = 600;
 		if (!arat) {
 			lapic_et.et_flags |= ET_FLAGS_C3STOP;
-			lapic_et.et_quality -= 200;
+			lapic_et.et_quality = 100;
 		}
+		if ((cpu_feature & CPUID_TSC) != 0 &&
+		    (cpu_feature2 & CPUID2_TSCDLT) != 0 &&
+		    tsc_is_invariant && tsc_freq != 0) {
+			lapic_timer_tsc_deadline = 1;
+			TUNABLE_INT_FETCH("hw.lapic_tsc_deadline",
+			    &lapic_timer_tsc_deadline);
+		}
+
 		lapic_et.et_frequency = 0;
 		/* We don't know frequency yet, so trying to guess. */
 		lapic_et.et_min_period = 0x00001000LL;
@@ -284,6 +560,29 @@
 		et_register(&lapic_et);
 	}
 
+	/*
+	 * Set lapic_eoi_suppression after lapic_enable(), to not
+	 * enable suppression in the hardware prematurely.  Note that
+	 * we enable suppression by default even when the system has
+	 * only one IO-APIC, since the EOI is otherwise broadcast to
+	 * all APIC agents, including CPUs.
+	 *
+	 * It seems that at least some KVM versions report
+	 * EOI_SUPPRESSION bit, but auto-EOI does not work.
+	 */
+	ver = lapic_read32(LAPIC_VERSION);
+	if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) {
+		lapic_eoi_suppression = 1;
+		if (vm_guest == VM_GUEST_KVM) {
+			if (bootverbose)
+				printf(
+		       "KVM -- disabling lapic eoi suppression\n");
+			lapic_eoi_suppression = 0;
+		}
+		TUNABLE_INT_FETCH("hw.lapic_eoi_suppression",
+		    &lapic_eoi_suppression);
+	}
+
 #ifdef SMP
 #define	LOOPS	100000
 	/*
@@ -299,20 +598,22 @@
 	 */
 	KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0,
 	    ("TSC not initialized"));
-	r = rdtsc();
-	for (rx = 0; rx < LOOPS; rx++) {
-		(void)lapic->icr_lo;
-		ia32_pause();
+	if (!x2apic_mode) {
+		r = rdtsc();
+		for (rx = 0; rx < LOOPS; rx++) {
+			(void)lapic_read_icr_lo();
+			ia32_pause();
+		}
+		r = rdtsc() - r;
+		r1 = tsc_freq * LOOPS;
+		r2 = r * 1000000;
+		lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
+		if (bootverbose) {
+			printf("LAPIC: ipi_wait() us multiplier %ju (r %ju "
+			    "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult,
+			    (uintmax_t)r, (uintmax_t)tsc_freq);
+		}
 	}
-	r = rdtsc() - r;
-	r1 = tsc_freq * LOOPS;
-	r2 = r * 1000000;
-	lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1;
-	if (bootverbose) {
-		printf("LAPIC: ipi_wait() us multiplier %ju (r %ju tsc %ju)\n",
-		    (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r,
-		    (uintmax_t)tsc_freq);
-	}
 #undef LOOPS
 #endif /* SMP */
 }
@@ -320,8 +621,8 @@
 /*
  * Create a local APIC instance.
  */
-void
-lapic_create(u_int apic_id, int boot_cpu)
+static void
+native_lapic_create(u_int apic_id, int boot_cpu)
 {
 	int i;
 
@@ -344,8 +645,12 @@
 		lapics[apic_id].la_lvts[i] = lvts[i];
 		lapics[apic_id].la_lvts[i].lvt_active = 0;
 	}
+	for (i = 0; i <= APIC_ELVT_MAX; i++) {
+		lapics[apic_id].la_elvts[i] = elvts[i];
+		lapics[apic_id].la_elvts[i].lvt_active = 0;
+	}
 	for (i = 0; i <= APIC_NUM_IOINTS; i++)
-	    lapics[apic_id].la_ioint_irqs[i] = -1;
+	    lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE;
 	lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL;
 	lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] =
 	    IRQ_TIMER;
@@ -363,41 +668,100 @@
 #endif
 }
 
+static inline uint32_t
+amd_read_ext_features(void)
+{
+	uint32_t version;
+
+	if (cpu_vendor_id != CPU_VENDOR_AMD)
+		return (0);
+	version = lapic_read32(LAPIC_VERSION);
+	if ((version & APIC_VER_AMD_EXT_SPACE) != 0)
+		return (lapic_read32(LAPIC_EXT_FEATURES));
+	else
+		return (0);
+}
+
+static inline uint32_t
+amd_read_elvt_count(void)
+{
+	uint32_t extf;
+	uint32_t count;
+
+	extf = amd_read_ext_features();
+	count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT;
+	count = min(count, APIC_ELVT_MAX + 1);
+	return (count);
+}
+
 /*
  * Dump contents of local APIC registers
  */
-void
-lapic_dump(const char* str)
+static void
+native_lapic_dump(const char* str)
 {
+	uint32_t version;
 	uint32_t maxlvt;
+	uint32_t extf;
+	int elvt_count;
+	int i;
 
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	version = lapic_read32(LAPIC_VERSION);
+	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	printf("cpu%d %s:\n", PCPU_GET(cpuid), str);
-	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n",
-	    lapic->id, lapic->version, lapic->ldr, lapic->dfr);
-	printf("  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
-	    lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr);
+	printf("     ID: 0x%08x   VER: 0x%08x LDR: 0x%08x DFR: 0x%08x",
+	    lapic_read32(LAPIC_ID), version,
+	    lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR));
+	if ((cpu_feature2 & CPUID2_X2APIC) != 0)
+		printf(" x2APIC: %d", x2apic_mode);
+	printf("\n  lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n",
+	    lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1),
+	    lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR));
 	printf("  timer: 0x%08x therm: 0x%08x err: 0x%08x",
-	    lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error);
+	    lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL),
+	    lapic_read32(LAPIC_LVT_ERROR));
 	if (maxlvt >= APIC_LVT_PMC)
-		printf(" pmc: 0x%08x", lapic->lvt_pcint);
+		printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT));
 	printf("\n");
 	if (maxlvt >= APIC_LVT_CMCI)
-		printf("   cmci: 0x%08x\n", lapic->lvt_cmci);
+		printf("   cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI));
+	extf = amd_read_ext_features();
+	if (extf != 0) {
+		printf("   AMD ext features: 0x%08x\n", extf);
+		elvt_count = amd_read_elvt_count();
+		for (i = 0; i < elvt_count; i++)
+			printf("   AMD elvt%d: 0x%08x\n", i,
+			    lapic_read32(LAPIC_EXT_LVT0 + i));
+	}
 }
 
-void
-lapic_setup(int boot)
+static void
+native_lapic_xapic_mode(void)
 {
+	register_t saveintr;
+
+	saveintr = intr_disable();
+	if (x2apic_mode)
+		native_lapic_enable_x2apic();
+	intr_restore(saveintr);
+}
+
+static void
+native_lapic_setup(int boot)
+{
 	struct lapic *la;
-	u_int32_t maxlvt;
+	uint32_t version;
+	uint32_t maxlvt;
 	register_t saveintr;
-	char buf[MAXCOMLEN + 1];
+	int elvt_count;
+	int i;
 
+	saveintr = intr_disable();
+
 	la = &lapics[lapic_id()];
 	KASSERT(la->la_present, ("missing APIC structure"));
-	saveintr = intr_disable();
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	version = lapic_read32(LAPIC_VERSION);
+	maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 
 	/* Initialize the TPR to allow all interrupts. */
 	lapic_set_tpr(0);
@@ -406,54 +770,103 @@
 	lapic_enable();
 
 	/* Program LINT[01] LVT entries. */
-	lapic->lvt_lint0 = lvt_mode(la, APIC_LVT_LINT0, lapic->lvt_lint0);
-	lapic->lvt_lint1 = lvt_mode(la, APIC_LVT_LINT1, lapic->lvt_lint1);
+	lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0,
+	    lapic_read32(LAPIC_LVT_LINT0)));
+	lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1,
+	    lapic_read32(LAPIC_LVT_LINT1)));
 
 	/* Program the PMC LVT entry if present. */
-	if (maxlvt >= APIC_LVT_PMC)
-		lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+	if (maxlvt >= APIC_LVT_PMC) {
+		lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+		    LAPIC_LVT_PCINT));
+	}
 
-	/* Program timer LVT and setup handler. */
-	la->lvt_timer_cache = lapic->lvt_timer =
-	    lvt_mode(la, APIC_LVT_TIMER, lapic->lvt_timer);
-	if (boot) {
-		snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid));
-		intrcnt_add(buf, &la->la_timer_count);
+	/* Program timer LVT. */
+	la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER,
+	    lapic_read32(LAPIC_LVT_TIMER));
+	la->lvt_timer_last = la->lvt_timer_base;
+	lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base);
+
+	/* Calibrate the timer parameters using BSP. */
+	if (boot && IS_BSP()) {
+		lapic_calibrate_initcount(la);
+		if (lapic_timer_tsc_deadline)
+			lapic_calibrate_deadline(la);
 	}
 
 	/* Setup the timer if configured. */
-	if (la->la_timer_mode != 0) {
+	if (la->la_timer_mode != LAT_MODE_UNDEF) {
 		KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor",
 		    lapic_id()));
-		lapic_timer_set_divisor(lapic_timer_divisor);
-		if (la->la_timer_mode == 1)
-			lapic_timer_periodic(la, la->la_timer_period, 1);
-		else
-			lapic_timer_oneshot(la, la->la_timer_period, 1);
+		switch (la->la_timer_mode) {
+		case LAT_MODE_PERIODIC:
+			lapic_timer_set_divisor(lapic_timer_divisor);
+			lapic_timer_periodic(la);
+			break;
+		case LAT_MODE_ONESHOT:
+			lapic_timer_set_divisor(lapic_timer_divisor);
+			lapic_timer_oneshot(la);
+			break;
+		case LAT_MODE_DEADLINE:
+			lapic_timer_deadline(la);
+			break;
+		default:
+			panic("corrupted la_timer_mode %p %d", la,
+			    la->la_timer_mode);
+		}
 	}
 
 	/* Program error LVT and clear any existing errors. */
-	lapic->lvt_error = lvt_mode(la, APIC_LVT_ERROR, lapic->lvt_error);
-	lapic->esr = 0;
+	lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR,
+	    lapic_read32(LAPIC_LVT_ERROR)));
+	lapic_write32(LAPIC_ESR, 0);
 
 	/* XXX: Thermal LVT */
 
 	/* Program the CMCI LVT entry if present. */
-	if (maxlvt >= APIC_LVT_CMCI)
-		lapic->lvt_cmci = lvt_mode(la, APIC_LVT_CMCI, lapic->lvt_cmci);
+	if (maxlvt >= APIC_LVT_CMCI) {
+		lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI,
+		    lapic_read32(LAPIC_LVT_CMCI)));
+	}
 
+	elvt_count = amd_read_elvt_count();
+	for (i = 0; i < elvt_count; i++) {
+		if (la->la_elvts[i].lvt_active)
+			lapic_write32(LAPIC_EXT_LVT0 + i,
+			    elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i)));
+	}
+
 	intr_restore(saveintr);
 }
 
-void
-lapic_reenable_pmc(void)
+static void
+native_lapic_intrcnt(void *dummy __unused)
 {
+	struct pcpu *pc;
+	struct lapic *la;
+	char buf[MAXCOMLEN + 1];
+
+	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+		la = &lapics[pc->pc_apic_id];
+		if (!la->la_present)
+		    continue;
+
+		snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid);
+		intrcnt_add(buf, &la->la_timer_count);
+	}
+}
+SYSINIT(native_lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, native_lapic_intrcnt,
+    NULL);
+
+static void
+native_lapic_reenable_pmc(void)
+{
 #ifdef HWPMC_HOOKS
 	uint32_t value;
 
-	value =  lapic->lvt_pcint;
+	value = lapic_read32(LAPIC_LVT_PCINT);
 	value &= ~APIC_LVT_M;
-	lapic->lvt_pcint = value;
+	lapic_write32(LAPIC_LVT_PCINT, value);
 #endif
 }
 
@@ -464,27 +877,32 @@
 	struct lapic *la;
 
 	la = &lapics[lapic_id()];
-	lapic->lvt_pcint = lvt_mode(la, APIC_LVT_PMC, lapic->lvt_pcint);
+	lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC,
+	    lapic_read32(LAPIC_LVT_PCINT)));
 }
 #endif
 
-int
-lapic_enable_pmc(void)
+static int
+native_lapic_enable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return (0);
 
 	/* Fail if the PMC LVT is not present. */
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return (0);
 
 	lvts[APIC_LVT_PMC].lvt_masked = 0;
 
+#ifdef EARLY_AP_STARTUP
+	MPASS(mp_ncpus == 1 || smp_started);
+	smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL);
+#else
 #ifdef SMP
 	/*
 	 * If hwpmc was loaded at boot time then the APs may not be
@@ -496,6 +914,7 @@
 	else
 #endif
 		lapic_update_pmc(NULL);
+#endif
 	return (1);
 #else
 	return (0);
@@ -502,18 +921,18 @@
 #endif
 }
 
-void
-lapic_disable_pmc(void)
+static void
+native_lapic_disable_pmc(void)
 {
 #ifdef HWPMC_HOOKS
 	u_int32_t maxlvt;
 
 	/* Fail if the local APIC is not present. */
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return;
 
 	/* Fail if the PMC LVT is not present. */
-	maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
 	if (maxlvt < APIC_LVT_PMC)
 		return;
 
@@ -527,45 +946,89 @@
 #endif
 }
 
+static void
+lapic_calibrate_initcount(struct lapic *la)
+{
+	u_long value;
+
+	/* Start off with a divisor of 2 (power on reset default). */
+	lapic_timer_divisor = 2;
+	/* Try to calibrate the local APIC timer. */
+	do {
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
+		DELAY(1000000);
+		value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
+		if (value != APIC_TIMER_MAX_COUNT)
+			break;
+		lapic_timer_divisor <<= 1;
+	} while (lapic_timer_divisor <= 128);
+	if (lapic_timer_divisor > 128)
+		panic("lapic: Divisor too big");
+	if (bootverbose) {
+		printf("lapic: Divisor %lu, Frequency %lu Hz\n",
+		    lapic_timer_divisor, value);
+	}
+	count_freq = value;
+}
+
+static void
+lapic_calibrate_deadline(struct lapic *la __unused)
+{
+
+	if (bootverbose) {
+		printf("lapic: deadline tsc mode, Frequency %ju Hz\n",
+		    (uintmax_t)tsc_freq);
+	}
+}
+
+static void
+lapic_change_mode(struct eventtimer *et, struct lapic *la,
+    enum lat_timer_mode newmode)
+{
+
+	if (la->la_timer_mode == newmode)
+		return;
+	switch (newmode) {
+	case LAT_MODE_PERIODIC:
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		et->et_frequency = count_freq;
+		break;
+	case LAT_MODE_DEADLINE:
+		et->et_frequency = tsc_freq;
+		break;
+	case LAT_MODE_ONESHOT:
+		lapic_timer_set_divisor(lapic_timer_divisor);
+		et->et_frequency = count_freq;
+		break;
+	default:
+		panic("lapic_change_mode %d", newmode);
+	}
+	la->la_timer_mode = newmode;
+	et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
+	et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
+}
+
 static int
 lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
 {
 	struct lapic *la;
-	u_long value;
 
 	la = &lapics[PCPU_GET(apic_id)];
-	if (et->et_frequency == 0) {
-		/* Start off with a divisor of 2 (power on reset default). */
-		lapic_timer_divisor = 2;
-		/* Try to calibrate the local APIC timer. */
-		do {
-			lapic_timer_set_divisor(lapic_timer_divisor);
-			lapic_timer_oneshot(la, APIC_TIMER_MAX_COUNT, 0);
-			DELAY(1000000);
-			value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer;
-			if (value != APIC_TIMER_MAX_COUNT)
-				break;
-			lapic_timer_divisor <<= 1;
-		} while (lapic_timer_divisor <= 128);
-		if (lapic_timer_divisor > 128)
-			panic("lapic: Divisor too big");
-		if (bootverbose)
-			printf("lapic: Divisor %lu, Frequency %lu Hz\n",
-			    lapic_timer_divisor, value);
-		et->et_frequency = value;
-		et->et_min_period = (0x00000002LLU << 32) / et->et_frequency;
-		et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency;
-	}
-	if (la->la_timer_mode == 0)
-		lapic_timer_set_divisor(lapic_timer_divisor);
 	if (period != 0) {
-		la->la_timer_mode = 1;
-		la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32;
-		lapic_timer_periodic(la, la->la_timer_period, 1);
+		lapic_change_mode(et, la, LAT_MODE_PERIODIC);
+		la->la_timer_period = ((uint32_t)et->et_frequency * period) >>
+		    32;
+		lapic_timer_periodic(la);
+	} else if (lapic_timer_tsc_deadline) {
+		lapic_change_mode(et, la, LAT_MODE_DEADLINE);
+		la->la_timer_period = (et->et_frequency * first) >> 32;
+		lapic_timer_deadline(la);
 	} else {
-		la->la_timer_mode = 2;
-		la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32;
-		lapic_timer_oneshot(la, la->la_timer_period, 1);
+		lapic_change_mode(et, la, LAT_MODE_ONESHOT);
+		la->la_timer_period = ((uint32_t)et->et_frequency * first) >>
+		    32;
+		lapic_timer_oneshot(la);
 	}
 	return (0);
 }
@@ -573,34 +1036,37 @@
 static int
 lapic_et_stop(struct eventtimer *et)
 {
-	struct lapic *la = &lapics[PCPU_GET(apic_id)];
+	struct lapic *la;
 
-	la->la_timer_mode = 0;
+	la = &lapics[PCPU_GET(apic_id)];
 	lapic_timer_stop(la);
+	la->la_timer_mode = LAT_MODE_UNDEF;
 	return (0);
 }
 
-void
-lapic_disable(void)
+static void
+native_lapic_disable(void)
 {
 	uint32_t value;
 
 	/* Software disable the local APIC. */
-	value = lapic->svr;
+	value = lapic_read32(LAPIC_SVR);
 	value &= ~APIC_SVR_SWEN;
-	lapic->svr = value;
+	lapic_write32(LAPIC_SVR, value);
 }
 
 static void
 lapic_enable(void)
 {
-	u_int32_t value;
+	uint32_t value;
 
 	/* Program the spurious vector to enable the local APIC. */
-	value = lapic->svr;
+	value = lapic_read32(LAPIC_SVR);
 	value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS);
-	value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT);
-	lapic->svr = value;
+	value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT;
+	if (lapic_eoi_suppression)
+		value |= APIC_SVR_EOI_SUPPRESSION;
+	lapic_write32(LAPIC_SVR, value);
 }
 
 /* Reset the local APIC on the BSP during resume. */
@@ -611,34 +1077,36 @@
 	lapic_setup(0);
 }
 
-int
-lapic_id(void)
+static int
+native_lapic_id(void)
 {
+	uint32_t v;
 
-	KASSERT(lapic != NULL, ("local APIC is not mapped"));
-	return (lapic->id >> APIC_ID_SHIFT);
+	KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped"));
+	v = lapic_read32(LAPIC_ID);
+	if (!x2apic_mode)
+		v >>= APIC_ID_SHIFT;
+	return (v);
 }
 
-int
-lapic_intr_pending(u_int vector)
+static int
+native_lapic_intr_pending(u_int vector)
 {
-	volatile u_int32_t *irr;
+	uint32_t irr;
 
 	/*
-	 * The IRR registers are an array of 128-bit registers each of
-	 * which only describes 32 interrupts in the low 32 bits..  Thus,
-	 * we divide the vector by 32 to get the 128-bit index.  We then
-	 * multiply that index by 4 to get the equivalent index from
-	 * treating the IRR as an array of 32-bit registers.  Finally, we
-	 * modulus the vector by 32 to determine the individual bit to
-	 * test.
+	 * The IRR registers are an array of registers each of which
+	 * only describes 32 interrupts in the low 32 bits.  Thus, we
+	 * divide the vector by 32 to get the register index.
+	 * Finally, we modulus the vector by 32 to determine the
+	 * individual bit to test.
 	 */
-	irr = &lapic->irr0;
-	return (irr[(vector / 32) * 4] & 1 << (vector % 32));
+	irr = lapic_read32(LAPIC_IRR0 + vector / 32);
+	return (irr & 1 << (vector % 32));
 }
 
-void
-lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+static void
+native_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 	struct lapic *la;
 
@@ -653,8 +1121,8 @@
 	la->la_cluster_id = cluster_id;
 }
 
-int
-lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
+static int
+native_lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked)
 {
 
 	if (pin > APIC_LVT_MAX)
@@ -676,8 +1144,8 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
+static int
+native_lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode)
 {
 	struct lvt *lvt;
 
@@ -732,8 +1200,8 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
+static int
+native_lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol)
 {
 
 	if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM)
@@ -757,8 +1225,9 @@
 	return (0);
 }
 
-int
-lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger)
+static int
+native_lapic_set_lvt_triggermode(u_int apic_id, u_int pin,
+     enum intr_trigger trigger)
 {
 
 	if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM)
@@ -786,25 +1255,25 @@
  * Adjust the TPR of the current CPU so that it blocks all interrupts below
  * the passed in vector.
  */
-void
+static void
 lapic_set_tpr(u_int vector)
 {
 #ifdef CHEAP_TPR
-	lapic->tpr = vector;
+	lapic_write32(LAPIC_TPR, vector);
 #else
-	u_int32_t tpr;
+	uint32_t tpr;
 
-	tpr = lapic->tpr & ~APIC_TPR_PRIO;
+	tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO;
 	tpr |= vector;
-	lapic->tpr = tpr;
+	lapic_write32(LAPIC_TPR, tpr);
 #endif
 }
 
-void
-lapic_eoi(void)
+static void
+native_lapic_eoi(void)
 {
 
-	lapic->eoi = 0;
+	lapic_write32_nofence(LAPIC_EOI, 0);
 }
 
 void
@@ -864,48 +1333,82 @@
 {
 
 	KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor));
-	KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) /
-	    sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor));
-	lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1];
+	KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors),
+		("lapic: invalid divisor %u", divisor));
+	lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]);
 }
 
 static void
-lapic_timer_oneshot(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot(struct lapic *la)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
-	value &= ~APIC_LVTT_TM;
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_ONE_SHOT;
-	if (enable_int)
-		value &= ~APIC_LVT_M;
-	lapic->lvt_timer = value;
-	lapic->icr_timer = count;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
-lapic_timer_periodic(struct lapic *la, u_int count, int enable_int)
+lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
+	value = la->lvt_timer_base;
 	value &= ~APIC_LVTT_TM;
+	value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, count);
+}
+
+static void
+lapic_timer_periodic(struct lapic *la)
+{
+	uint32_t value;
+
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
 	value |= APIC_LVTT_TM_PERIODIC;
-	if (enable_int)
-		value &= ~APIC_LVT_M;
-	lapic->lvt_timer = value;
-	lapic->icr_timer = count;
+	la->lvt_timer_last = value;
+	lapic_write32(LAPIC_LVT_TIMER, value);
+	lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period);
 }
 
 static void
+lapic_timer_deadline(struct lapic *la)
+{
+	uint32_t value;
+
+	value = la->lvt_timer_base;
+	value &= ~(APIC_LVTT_TM | APIC_LVT_M);
+	value |= APIC_LVTT_TM_TSCDLT;
+	if (value != la->lvt_timer_last) {
+		la->lvt_timer_last = value;
+		lapic_write32_nofence(LAPIC_LVT_TIMER, value);
+		if (!x2apic_mode)
+			mfence();
+	}
+	wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc());
+}
+
+static void
 lapic_timer_stop(struct lapic *la)
 {
-	u_int32_t value;
+	uint32_t value;
 
-	value = la->lvt_timer_cache;
-	value &= ~APIC_LVTT_TM;
-	value |= APIC_LVT_M;
-	lapic->lvt_timer = value;
+	if (la->la_timer_mode == LAT_MODE_DEADLINE) {
+		wrmsr(MSR_TSC_DEADLINE, 0);
+		mfence();
+	} else {
+		value = la->lvt_timer_base;
+		value &= ~APIC_LVTT_TM;
+		value |= APIC_LVT_M;
+		la->lvt_timer_last = value;
+		lapic_write32(LAPIC_LVT_TIMER, value);
+	}
 }
 
 void
@@ -922,13 +1425,13 @@
  * is called prior to lapic_setup() during boot, this just needs to unmask
  * this CPU's LVT_CMCI entry.
  */
-void
-lapic_enable_cmc(void)
+static void
+native_lapic_enable_cmc(void)
 {
 	u_int apic_id;
 
 #ifdef DEV_ATPIC
-	if (lapic == NULL)
+	if (!x2apic_mode && lapic_map == NULL)
 		return;
 #endif
 	apic_id = PCPU_GET(apic_id);
@@ -940,10 +1443,41 @@
 		printf("lapic%u: CMCI unmasked\n", apic_id);
 }
 
+static int
+native_lapic_enable_mca_elvt(void)
+{
+	u_int apic_id;
+	uint32_t value;
+	int elvt_count;
+
+#ifdef DEV_ATPIC
+	if (lapic_map == NULL)
+		return (-1);
+#endif
+
+	apic_id = PCPU_GET(apic_id);
+	KASSERT(lapics[apic_id].la_present,
+	    ("%s: missing APIC %u", __func__, apic_id));
+	elvt_count = amd_read_elvt_count();
+	if (elvt_count <= APIC_ELVT_MCA)
+		return (-1);
+
+	value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA);
+	if ((value & APIC_LVT_M) == 0) {
+		printf("AMD MCE Thresholding Extended LVT is already active\n");
+		return (-1);
+	}
+	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0;
+	lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1;
+	if (bootverbose)
+		printf("lapic%u: MCE Thresholding ELVT unmasked\n", apic_id);
+	return (APIC_ELVT_MCA);
+}
+
 void
 lapic_handle_error(void)
 {
-	u_int32_t esr;
+	uint32_t esr;
 
 	/*
 	 * Read the contents of the error status register.  Write to
@@ -951,15 +1485,15 @@
 	 * to update its value to indicate any errors that have
 	 * occurred since the previous write to the register.
 	 */
-	lapic->esr = 0;
-	esr = lapic->esr;
+	lapic_write32(LAPIC_ESR, 0);
+	esr = lapic_read32(LAPIC_ESR);
 
 	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
 	lapic_eoi();
 }
 
-u_int
-apic_cpuid(u_int apic_id)
+static u_int
+native_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return apic_cpuids[apic_id];
@@ -969,12 +1503,12 @@
 }
 
 /* Request a free IDT vector to be used by the specified IRQ. */
-u_int
-apic_alloc_vector(u_int apic_id, u_int irq)
+static u_int
+native_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 	u_int vector;
 
-	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+	KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
 
 	/*
 	 * Search for a free vector.  Currently we just use a very simple
@@ -982,7 +1516,7 @@
 	 */
 	mtx_lock_spin(&icu_lock);
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
-		if (lapics[apic_id].la_ioint_irqs[vector] != -1)
+		if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE)
 			continue;
 		lapics[apic_id].la_ioint_irqs[vector] = irq;
 		mtx_unlock_spin(&icu_lock);
@@ -998,8 +1532,8 @@
  * aligned on a boundary of 'align'.  If the request cannot be
  * satisfied, 0 is returned.
  */
-u_int
-apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+static u_int
+native_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 	u_int first, run, vector;
 
@@ -1008,7 +1542,7 @@
 	KASSERT(align >= count, ("align < count"));
 #ifdef INVARIANTS
 	for (run = 0; run < count; run++)
-		KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u",
+		KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u",
 		    irqs[run], run));
 #endif
 
@@ -1022,7 +1556,7 @@
 	for (vector = 0; vector < APIC_NUM_IOINTS; vector++) {
 
 		/* Vector is in use, end run. */
-		if (lapics[apic_id].la_ioint_irqs[vector] != -1) {
+		if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) {
 			run = 0;
 			first = 0;
 			continue;
@@ -1058,8 +1592,8 @@
  * which do not have the vector configured would report spurious interrupts
  * should it fire.
  */
-void
-apic_enable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1069,12 +1603,12 @@
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
-	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
-	    GSEL_APIC);
+	setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 }
 
-void
-apic_disable_vector(u_int apic_id, u_int vector)
+static void
+native_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry"));
@@ -1089,13 +1623,14 @@
 	 * We can not currently clear the idt entry because other cpus
 	 * may have a valid vector at this offset.
 	 */
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 #endif
 }
 
 /* Release an APIC vector when it's no longer in use. */
-void
-apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+static void
+native_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 	struct thread *td;
 
@@ -1102,7 +1637,7 @@
 	KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL &&
 	    vector <= APIC_IO_INTS + APIC_NUM_IOINTS,
 	    ("Vector %u does not map to an IRQ line", vector));
-	KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq));
+	KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq));
 	KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] ==
 	    irq, ("IRQ mismatch"));
 #ifdef KDTRACE_HOOKS
@@ -1123,7 +1658,7 @@
 		thread_unlock(td);
 	}
 	mtx_lock_spin(&icu_lock);
-	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1;
+	lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE;
 	mtx_unlock_spin(&icu_lock);
 	if (!rebooting) {
 		thread_lock(td);
@@ -1133,7 +1668,7 @@
 }
 
 /* Map an IDT vector (APIC) to an IRQ (interrupt source). */
-u_int
+static u_int
 apic_idt_to_irq(u_int apic_id, u_int vector)
 {
 	int irq;
@@ -1174,7 +1709,7 @@
 		db_printf("Interrupts bound to lapic %u\n", apic_id);
 		for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) {
 			irq = lapics[apic_id].la_ioint_irqs[i];
-			if (irq == -1 || irq == IRQ_SYSCALL)
+			if (irq == IRQ_FREE || irq == IRQ_SYSCALL)
 				continue;
 #ifdef KDTRACE_HOOKS
 			if (irq == IRQ_DTRACE_RET)
@@ -1187,7 +1722,7 @@
 			db_printf("vec 0x%2x -> ", i + APIC_IO_INTS);
 			if (irq == IRQ_TIMER)
 				db_printf("lapic timer\n");
-			else if (irq < NUM_IO_INTS) {
+			else if (irq < num_io_irqs) {
 				isrc = intr_lookup_source(irq);
 				if (isrc == NULL || verbose == 0)
 					db_printf("IRQ %u\n", irq);
@@ -1224,48 +1759,49 @@
 	uint32_t v;
 
 	db_printf("lapic ID = %d\n", lapic_id());
-	v = lapic->version;
+	v = lapic_read32(LAPIC_VERSION);
 	db_printf("version  = %d.%d\n", (v & APIC_VER_VERSION) >> 4,
 	    v & 0xf);
 	db_printf("max LVT  = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT);
-	v = lapic->svr;
+	v = lapic_read32(LAPIC_SVR);
 	db_printf("SVR      = %02x (%s)\n", v & APIC_SVR_VECTOR,
 	    v & APIC_SVR_ENABLE ? "enabled" : "disabled");
-	db_printf("TPR      = %02x\n", lapic->tpr);
+	db_printf("TPR      = %02x\n", lapic_read32(LAPIC_TPR));
 
-#define dump_field(prefix, index)					\
-	dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index,	\
+#define dump_field(prefix, regn, index)					\
+	dump_mask(__XSTRING(prefix ## index), 				\
+	    lapic_read32(LAPIC_ ## regn ## index),			\
 	    index * 32)
 
 	db_printf("In-service Interrupts:\n");
-	dump_field(isr, 0);
-	dump_field(isr, 1);
-	dump_field(isr, 2);
-	dump_field(isr, 3);
-	dump_field(isr, 4);
-	dump_field(isr, 5);
-	dump_field(isr, 6);
-	dump_field(isr, 7);
+	dump_field(isr, ISR, 0);
+	dump_field(isr, ISR, 1);
+	dump_field(isr, ISR, 2);
+	dump_field(isr, ISR, 3);
+	dump_field(isr, ISR, 4);
+	dump_field(isr, ISR, 5);
+	dump_field(isr, ISR, 6);
+	dump_field(isr, ISR, 7);
 
 	db_printf("TMR Interrupts:\n");
-	dump_field(tmr, 0);
-	dump_field(tmr, 1);
-	dump_field(tmr, 2);
-	dump_field(tmr, 3);
-	dump_field(tmr, 4);
-	dump_field(tmr, 5);
-	dump_field(tmr, 6);
-	dump_field(tmr, 7);
+	dump_field(tmr, TMR, 0);
+	dump_field(tmr, TMR, 1);
+	dump_field(tmr, TMR, 2);
+	dump_field(tmr, TMR, 3);
+	dump_field(tmr, TMR, 4);
+	dump_field(tmr, TMR, 5);
+	dump_field(tmr, TMR, 6);
+	dump_field(tmr, TMR, 7);
 
 	db_printf("IRR Interrupts:\n");
-	dump_field(irr, 0);
-	dump_field(irr, 1);
-	dump_field(irr, 2);
-	dump_field(irr, 3);
-	dump_field(irr, 4);
-	dump_field(irr, 5);
-	dump_field(irr, 6);
-	dump_field(irr, 7);
+	dump_field(irr, IRR, 0);
+	dump_field(irr, IRR, 1);
+	dump_field(irr, IRR, 2);
+	dump_field(irr, IRR, 3);
+	dump_field(irr, IRR, 4);
+	dump_field(irr, IRR, 5);
+	dump_field(irr, IRR, 6);
+	dump_field(irr, IRR, 7);
 
 #undef dump_field
 }
@@ -1391,20 +1927,18 @@
 	 * Local APIC must be registered before other PICs and pseudo PICs
 	 * for proper suspend/resume order.
 	 */
-#ifndef XEN
 	intr_register_pic(&lapic_pic);
-#endif
 
 	retval = best_enum->apic_setup_io();
 	if (retval != 0)
 		printf("%s: Failed to setup I/O APICs: returned %d\n",
 		    best_enum->apic_name, retval);
-#ifdef XEN
-	return;
-#endif
+
 	/*
-	 * Finish setting up the local APIC on the BSP once we know how to
-	 * properly program the LINT pins.
+	 * Finish setting up the local APIC on the BSP once we know
+	 * how to properly program the LINT pins.  In particular, this
+	 * enables EOI suppression mode if the LAPIC supports it and
+	 * the user did not disable it.
 	 */
 	lapic_setup(1);
 	if (bootverbose)
@@ -1411,9 +1945,13 @@
 		lapic_dump("BSP");
 
 	/* Enable the MSI "pic". */
-	msi_init();
+	init_ops.msi_init();
+
+#ifdef XENHVM
+	xen_intr_alloc_irqs();
+#endif
 }
-SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL);
+SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL);
 
 #ifdef SMP
 /*
@@ -1426,13 +1964,18 @@
  * Wait delay microseconds for IPI to be sent.  If delay is -1, we
  * wait forever.
  */
-int
-lapic_ipi_wait(int delay)
+static int
+native_lapic_ipi_wait(int delay)
 {
 	uint64_t rx;
 
+	/* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */
+	if (x2apic_mode)
+		return (1);
+
 	for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) {
-		if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE)
+		if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) ==
+		    APIC_DELSTAT_IDLE)
 			return (1);
 		ia32_pause();
 	}
@@ -1439,33 +1982,51 @@
 	return (0);
 }
 
-void
-lapic_ipi_raw(register_t icrlo, u_int dest)
+static void
+native_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
-	register_t value, saveintr;
+	uint64_t icr;
+	uint32_t vhi, vlo;
+	register_t saveintr;
 
 	/* XXX: Need more sanity checking of icrlo? */
-	KASSERT(lapic != NULL, ("%s called too early", __func__));
-	KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+	KASSERT(x2apic_mode || lapic_map != NULL,
+	    ("%s called too early", __func__));
+	KASSERT(x2apic_mode ||
+	    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 	    ("%s: invalid dest field", __func__));
 	KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0,
 	    ("%s: reserved bits set in ICR LO register", __func__));
 
 	/* Set destination in ICR HI register if it is being used. */
-	saveintr = intr_disable();
+	if (!x2apic_mode) {
+		saveintr = intr_disable();
+		icr = lapic_read_icr();
+	}
+
 	if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) {
-		value = lapic->icr_hi;
-		value &= ~APIC_ID_MASK;
-		value |= dest << APIC_ID_SHIFT;
-		lapic->icr_hi = value;
+		if (x2apic_mode) {
+			vhi = dest;
+		} else {
+			vhi = icr >> 32;
+			vhi &= ~APIC_ID_MASK;
+			vhi |= dest << APIC_ID_SHIFT;
+		}
+	} else {
+		vhi = 0;
 	}
 
 	/* Program the contents of the IPI and dispatch it. */
-	value = lapic->icr_lo;
-	value &= APIC_ICRLO_RESV_MASK;
-	value |= icrlo;
-	lapic->icr_lo = value;
-	intr_restore(saveintr);
+	if (x2apic_mode) {
+		vlo = icrlo;
+	} else {
+		vlo = icr;
+		vlo &= APIC_ICRLO_RESV_MASK;
+		vlo |= icrlo;
+	}
+	lapic_write_icr(vhi, vlo);
+	if (!x2apic_mode)
+		intr_restore(saveintr);
 }
 
 #define	BEFORE_SPIN	50000
@@ -1473,8 +2034,8 @@
 #define	AFTER_SPIN	50
 #endif
 
-void
-lapic_ipi_vectored(u_int vector, int dest)
+static void
+native_lapic_ipi_vectored(u_int vector, int dest)
 {
 	register_t icrlo, destfield;
 
@@ -1484,11 +2045,10 @@
 	icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT;
 
 	/*
-	 * IPI_STOP_HARD is just a "fake" vector used to send a NMI.
-	 * Use special rules regard NMI if passed, otherwise specify
-	 * the vector.
+	 * NMI IPIs are just fake vectors used to send an NMI.  Use special
+	 * rules regarding NMIs if passed, otherwise specify the vector.
 	 */
-	if (vector == IPI_STOP_HARD)
+	if (vector >= IPI_NMI_FIRST)
 		icrlo |= APIC_DELMODE_NMI;
 	else
 		icrlo |= vector | APIC_DELMODE_FIXED;
@@ -1504,7 +2064,8 @@
 		icrlo |= APIC_DEST_ALLESELF;
 		break;
 	default:
-		KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
+		KASSERT(x2apic_mode ||
+		    (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0,
 		    ("%s: invalid destination 0x%x", __func__, dest));
 		destfield = dest;
 	}
@@ -1541,10 +2102,70 @@
 		printf("APIC: IPI might be stuck\n");
 #else /* !needsattention */
 		/* Wait until the message is sent without a timeout. */
-		while (lapic->icr_lo & APIC_DELSTAT_PEND)
+		while (lapic_read_icr_lo() & APIC_DELSTAT_PEND)
 			ia32_pause();
 #endif /* needsattention */
 	}
 #endif /* DETECT_DEADLOCK */
 }
+
 #endif /* SMP */
+
+/*
+ * Since the IDT is shared by all CPUs, the IPI slot update needs to be
+ * globally visible.
+ *
+ * Consider the case where an IPI is generated immediately after allocation:
+ *     vector = lapic_ipi_alloc(ipifunc);
+ *     ipi_selected(other_cpus, vector);
+ *
+ * In xAPIC mode a write to ICR_LO has serializing semantics because the
+ * APIC page is mapped as an uncached region. In x2APIC mode there is an
+ * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
+ * the IDT slot update is globally visible before the IPI is delivered.
+ */
+static int
+native_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+	struct gate_descriptor *ip;
+	long func;
+	int idx, vector;
+
+	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+	    ("invalid ipifunc %p", ipifunc));
+
+	vector = -1;
+	mtx_lock_spin(&icu_lock);
+	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
+		ip = &idt[idx];
+		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+		if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
+		    (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
+			vector = idx;
+			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
+			break;
+		}
+	}
+	mtx_unlock_spin(&icu_lock);
+	return (vector);
+}
+
+static void
+native_lapic_ipi_free(int vector)
+{
+	struct gate_descriptor *ip;
+	long func;
+
+	KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST,
+	    ("%s: invalid vector %d", __func__, vector));
+
+	mtx_lock_spin(&icu_lock);
+	ip = &idt[vector];
+	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
+	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+	    func != (uintptr_t)&IDTVEC(rsvd_pti),
+	    ("invalid idtfunc %#lx", func));
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
+	mtx_unlock_spin(&icu_lock);
+}

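As a usage illustration for the dynamic IPI slots introduced above, a minimal sketch following the allocation pattern described in the comment; the demo_* names and the IDTVEC(demo_ipi) trampoline are hypothetical, while lapic_ipi_alloc(), lapic_ipi_free() and ipi_selected() are assumed to be the existing public wrappers around the native_* routines in this diff:

/* Headers are indicative, matching those used elsewhere in this commit. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <machine/smp.h>
#include <x86/apicvar.h>

extern inthand_t IDTVEC(demo_ipi);	/* hypothetical interrupt trampoline */

static int demo_ipi_vector = -1;

static void
demo_ipi_start(void)
{
	/* Claim a free slot in the dynamic IPI range (IPI_DYN_FIRST..LAST). */
	demo_ipi_vector = lapic_ipi_alloc(&IDTVEC(demo_ipi));
	if (demo_ipi_vector < 0)
		panic("demo: no free dynamic IPI vector");
	/*
	 * Per the comment above, the IDT slot update is globally visible
	 * before the IPI is delivered, so sending right away is safe.
	 */
	ipi_selected(all_cpus, demo_ipi_vector);
}

static void
demo_ipi_stop(void)
{
	/* Return the slot so it points at the reserved stub again. */
	if (demo_ipi_vector >= 0)
		lapic_ipi_free(demo_ipi_vector);
}
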
Modified: trunk/sys/x86/x86/mca.c
===================================================================
--- trunk/sys/x86/x86/mca.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mca.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mca.c 314667 2017-03-04 13:03:31Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mca.c 333159 2018-05-02 07:38:38Z kib $");
 
 #ifdef __amd64__
 #define	DEV_APIC
@@ -53,7 +53,7 @@
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
@@ -76,6 +76,11 @@
 	int	max_threshold;
 	time_t	last_intr;
 };
+
+struct amd_et_state {
+	int	cur_threshold;
+	time_t	last_intr;
+};
 #endif
 
 struct mca_internal {
@@ -93,22 +98,20 @@
     "Machine Check Architecture");
 
 static int mca_enabled = 1;
-TUNABLE_INT("hw.mca.enabled", &mca_enabled);
 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
     "Administrative toggle for machine check support");
 
 static int amd10h_L1TP = 1;
-TUNABLE_INT("hw.mca.amd10h_L1TP", &amd10h_L1TP);
 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
 
 static int intel6h_HSD131;
-TUNABLE_INT("hw.mca.intel6h_hsd131", &intel6h_HSD131);
 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
     "Administrative toggle for logging of spurious corrected errors");
 
 int workaround_erratum383;
-SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
+SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
+    &workaround_erratum383, 0,
     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
 
 static STAILQ_HEAD(, mca_internal) mca_freelist;
@@ -121,8 +124,18 @@
 static struct mtx mca_lock;
 
 #ifdef DEV_APIC
-static struct cmc_state **cmc_state;	/* Indexed by cpuid, bank */
+static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
+static struct amd_et_state *amd_et_state;	/* Indexed by cpuid. */
 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
+
+static int amd_elvt = -1;
+
+static inline bool
+amd_thresholding_supported(void)
+{
+	return (cpu_vendor_id == CPU_VENDOR_AMD &&
+	    CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16);
+}
 #endif
 
 static int
@@ -511,8 +524,8 @@
 	STAILQ_INSERT_TAIL(&mca_records, rec, link);
 	mca_count++;
 	mtx_unlock_spin(&mca_lock);
-	if (mode == CMCI)
-		taskqueue_enqueue_fast(mca_tq, &mca_refill_task);
+	if (mode == CMCI && !cold)
+		taskqueue_enqueue(mca_tq, &mca_refill_task);
 }
 
 #ifdef DEV_APIC
@@ -524,19 +537,15 @@
  * cmc_throttle seconds or the periodic scan.  If a periodic scan
  * finds that the threshold is too high, it is lowered.
  */
-static void
-cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+static int
+update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
+    int cur_threshold, int max_threshold)
 {
-	struct cmc_state *cc;
-	uint64_t ctl;
 	u_int delta;
-	int count, limit;
+	int limit;
 
-	/* Fetch the current limit for this bank. */
-	cc = &cmc_state[PCPU_GET(cpuid)][bank];
-	ctl = rdmsr(MSR_MC_CTL2(bank));
-	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
-	delta = (u_int)(time_uptime - cc->last_intr);
+	delta = (u_int)(time_uptime - last_intr);
+	limit = cur_threshold;
 
 	/*
 	 * If an interrupt was received less than cmc_throttle seconds
@@ -545,16 +554,11 @@
 	 * double the threshold up to the max.
 	 */
 	if (mode == CMCI && valid) {
-		limit = ctl & MC_CTL2_THRESHOLD;
 		if (delta < cmc_throttle && count >= limit &&
-		    limit < cc->max_threshold) {
-			limit = min(limit << 1, cc->max_threshold);
-			ctl &= ~MC_CTL2_THRESHOLD;
-			ctl |= limit;
-			wrmsr(MSR_MC_CTL2(bank), ctl);
+		    limit < max_threshold) {
+			limit = min(limit << 1, max_threshold);
 		}
-		cc->last_intr = time_uptime;
-		return;
+		return (limit);
 	}
 
 	/*
@@ -562,11 +566,11 @@
 	 * should be lowered.
 	 */
 	if (mode != POLLED)
-		return;
+		return (limit);
 
 	/* If a CMCI occurred recently, do nothing for now. */
 	if (delta < cmc_throttle)
-		return;
+		return (limit);
 
 	/*
 	 * Compute a new limit based on the average rate of events per
@@ -573,20 +577,70 @@
 	 * cmc_throttle seconds since the last interrupt.
 	 */
 	if (valid) {
-		count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
 		limit = count * cmc_throttle / delta;
 		if (limit <= 0)
 			limit = 1;
-		else if (limit > cc->max_threshold)
-			limit = cc->max_threshold;
-	} else
+		else if (limit > max_threshold)
+			limit = max_threshold;
+	} else {
 		limit = 1;
-	if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+	}
+	return (limit);
+}
+
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+	struct cmc_state *cc;
+	uint64_t ctl;
+	int cur_threshold, new_threshold;
+	int count;
+
+	/* Fetch the current limit for this bank. */
+	cc = &cmc_state[PCPU_GET(cpuid)][bank];
+	ctl = rdmsr(MSR_MC_CTL2(bank));
+	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+	cur_threshold = ctl & MC_CTL2_THRESHOLD;
+
+	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+	    cur_threshold, cc->max_threshold);
+
+	if (mode == CMCI && valid)
+		cc->last_intr = time_uptime;
+	if (new_threshold != cur_threshold) {
 		ctl &= ~MC_CTL2_THRESHOLD;
-		ctl |= limit;
+		ctl |= new_threshold;
 		wrmsr(MSR_MC_CTL2(bank), ctl);
 	}
 }
+
+static void
+amd_thresholding_update(enum scan_mode mode, int bank, int valid)
+{
+	struct amd_et_state *cc;
+	uint64_t misc;
+	int new_threshold;
+	int count;
+
+	KASSERT(bank == MC_AMDNB_BANK,
+	    ("%s: unexpected bank %d", __func__, bank));
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	misc = rdmsr(MSR_MC_MISC(bank));
+	count = (misc & MC_MISC_AMDNB_CNT_MASK) >> MC_MISC_AMDNB_CNT_SHIFT;
+	count = count - (MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold);
+
+	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
+	    cc->cur_threshold, MC_MISC_AMDNB_CNT_MAX);
+
+	cc->cur_threshold = new_threshold;
+	misc &= ~MC_MISC_AMDNB_CNT_MASK;
+	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+	    << MC_MISC_AMDNB_CNT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_OVERFLOW;
+	wrmsr(MSR_MC_MISC(bank), misc);
+	if (mode == CMCI && valid)
+		cc->last_intr = time_uptime;
+}
 #endif
 
 /*
@@ -600,7 +654,7 @@
  * count of the number of valid MC records found.
  */
 static int
-mca_scan(enum scan_mode mode)
+mca_scan(enum scan_mode mode, int *recoverablep)
 {
 	struct mca_record rec;
 	uint64_t mcg_cap, ucmask;
@@ -641,13 +695,19 @@
 		 * If this is a bank this CPU monitors via CMCI,
 		 * update the threshold.
 		 */
-		if (PCPU_GET(cmci_mask) & 1 << i)
-			cmci_update(mode, i, valid, &rec);
+		if (PCPU_GET(cmci_mask) & 1 << i) {
+			if (cmc_state != NULL)
+				cmci_update(mode, i, valid, &rec);
+			else
+				amd_thresholding_update(mode, i, valid);
+		}
 #endif
 	}
 	if (mode == POLLED)
 		mca_fill_freelist();
-	return (mode == MCE ? recoverable : count);
+	if (recoverablep != NULL)
+		*recoverablep = recoverable;
+	return (count);
 }
 
 /*
@@ -669,7 +729,7 @@
 	CPU_FOREACH(cpu) {
 		sched_bind(td, cpu);
 		thread_unlock(td);
-		count += mca_scan(POLLED);
+		count += mca_scan(POLLED, NULL);
 		thread_lock(td);
 		sched_unbind(td);
 	}
@@ -690,7 +750,7 @@
 mca_periodic_scan(void *arg)
 {
 
-	taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+	taskqueue_enqueue(mca_tq, &mca_scan_task);
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
 
@@ -704,7 +764,7 @@
 	if (error)
 		return (error);
 	if (i)
-		taskqueue_enqueue_fast(mca_tq, &mca_scan_task);
+		taskqueue_enqueue(mca_tq, &mca_scan_task);
 	return (0);
 }
 
@@ -717,6 +777,9 @@
 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
 	    taskqueue_thread_enqueue, &mca_tq);
 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
+
+	/* CMCIs during boot may have claimed items from the freelist. */
+	mca_fill_freelist();
 }
 SYSINIT(mca_createtq, SI_SUB_CONFIGURE, SI_ORDER_ANY, mca_createtq, NULL);
 
@@ -729,7 +792,11 @@
 
 	callout_reset(&mca_timer, mca_ticks * hz, mca_periodic_scan, NULL);
 }
+#ifdef EARLY_AP_STARTUP
+SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
+#else
 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
+#endif
 
 #ifdef DEV_APIC
 static void
@@ -747,6 +814,18 @@
 	    &cmc_throttle, 0, sysctl_positive_int, "I",
 	    "Interval in seconds to throttle corrected MC interrupts");
 }
+
+static void
+amd_thresholding_setup(void)
+{
+
+	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state),
+	    M_MCA, M_WAITOK | M_ZERO);
+	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	    &cmc_throttle, 0, sysctl_positive_int, "I",
+	    "Interval in seconds to throttle corrected MC interrupts");
+}
 #endif
 
 static void
@@ -785,6 +864,8 @@
 #ifdef DEV_APIC
 	if (mcg_cap & MCG_CAP_CMCI_P)
 		cmci_setup();
+	else if (amd_thresholding_supported())
+		amd_thresholding_setup();
 #endif
 }
 
@@ -859,6 +940,82 @@
 	ctl |= MC_CTL2_CMCI_EN | 1;
 	wrmsr(MSR_MC_CTL2(i), ctl);
 }
+
+static void
+amd_thresholding_start(struct amd_et_state *cc)
+{
+	uint64_t misc;
+
+	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
+	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+	misc &= ~MC_MISC_AMDNB_INT_MASK;
+	misc |= MC_MISC_AMDNB_INT_LVT;
+	misc &= ~MC_MISC_AMDNB_LVT_MASK;
+	misc |= (uint64_t)amd_elvt << MC_MISC_AMDNB_LVT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_CNT_MASK;
+	misc |= (uint64_t)(MC_MISC_AMDNB_CNT_MAX - cc->cur_threshold)
+	    << MC_MISC_AMDNB_CNT_SHIFT;
+	misc &= ~MC_MISC_AMDNB_OVERFLOW;
+	misc |= MC_MISC_AMDNB_CNTEN;
+
+	wrmsr(MSR_MC_MISC(MC_AMDNB_BANK), misc);
+}
+
+static void
+amd_thresholding_init(void)
+{
+	struct amd_et_state *cc;
+	uint64_t misc;
+
+	/* The counter must be valid and present. */
+	misc = rdmsr(MSR_MC_MISC(MC_AMDNB_BANK));
+	if ((misc & (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP)) !=
+	    (MC_MISC_AMDNB_VAL | MC_MISC_AMDNB_CNTP))
+		return;
+
+	/* The register should not be locked. */
+	if ((misc & MC_MISC_AMDNB_LOCK) != 0)
+		return;
+
+	/*
+	 * If the counter is enabled, then either the firmware or another CPU
+	 * has already claimed it.
+	 */
+	if ((misc & MC_MISC_AMDNB_CNTEN) != 0)
+		return;
+
+	/*
+	 * Configure an Extended Interrupt LVT register for reporting
+	 * counter overflows if that feature is supported and the first
+	 * extended register is available.
+	 */
+	amd_elvt = lapic_enable_mca_elvt();
+	if (amd_elvt < 0)
+		return;
+
+	/* Re-use Intel CMC support infrastructure. */
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	cc->cur_threshold = 1;
+	amd_thresholding_start(cc);
+
+	/* Mark the NB bank as monitored. */
+	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << MC_AMDNB_BANK);
+}
+
+static void
+amd_thresholding_resume(void)
+{
+	struct amd_et_state *cc;
+
+	/* Nothing to do if this CPU doesn't monitor the NB bank. */
+	if ((PCPU_GET(cmci_mask) & 1 << MC_AMDNB_BANK) == 0)
+		return;
+
+	cc = &amd_et_state[PCPU_GET(cpuid)];
+	cc->last_intr = 0;
+	cc->cur_threshold = 1;
+	amd_thresholding_start(cc);
+}
 #endif
 
 /*
@@ -884,7 +1041,7 @@
 		if (mcg_cap & MCG_CAP_CTL_P)
 			/* Enable MCA features. */
 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
-		if (PCPU_GET(cpuid) == 0 && boot)
+		if (IS_BSP() && boot)
 			mca_setup(mcg_cap);
 
 		/*
@@ -900,6 +1057,14 @@
 			if ((mask & (1UL << 5)) == 0)
 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
 		}
+
+		/*
+		 * cmci_monitor() must not be executed simultaneously
+		 * by several CPUs.
+		 */
+		if (boot)
+			mtx_lock_spin(&mca_lock);
+
 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
 			/* By default enable logging of all errors. */
 			ctl = 0xffffffffffffffffUL;
@@ -934,10 +1099,30 @@
 			/* Clear all errors. */
 			wrmsr(MSR_MC_STATUS(i), 0);
 		}
+		if (boot)
+			mtx_unlock_spin(&mca_lock);
 
 #ifdef DEV_APIC
-		if (PCPU_GET(cmci_mask) != 0 && boot)
+		/*
+		 * AMD processors from families 10h-16h provide support
+		 * for Machine Check Error Thresholding.  They implement
+		 * counters of MC errors that can be configured to generate
+		 * an interrupt when a counter overflows.  The counters are
+		 * all associated with Bank 4, and each of them covers a
+		 * group of errors reported via that bank.  At the moment
+		 * only the DRAM Error Threshold Group is supported.
+		 */
+		if (amd_thresholding_supported() &&
+		    (mcg_cap & MCG_CAP_COUNT) >= 4) {
+			if (boot)
+				amd_thresholding_init();
+			else
+				amd_thresholding_resume();
+		} else if (PCPU_GET(cmci_mask) != 0 && boot) {
 			lapic_enable_cmc();
+		}
 #endif
 	}
 
@@ -978,7 +1163,7 @@
 mca_intr(void)
 {
 	uint64_t mcg_status;
-	int old_count, recoverable;
+	int recoverable, count;
 
 	if (!(cpu_feature & CPUID_MCA)) {
 		/*
@@ -992,8 +1177,7 @@
 	}
 
 	/* Scan the banks and check for any non-recoverable errors. */
-	old_count = mca_count;
-	recoverable = mca_scan(MCE);
+	count = mca_scan(MCE, &recoverable);
 	mcg_status = rdmsr(MSR_MCG_STATUS);
 	if (!(mcg_status & MCG_STATUS_RIPV))
 		recoverable = 0;
@@ -1000,12 +1184,11 @@
 
 	if (!recoverable) {
 		/*
-		 * Wait for at least one error to be logged before
-		 * panic'ing.  Some errors will assert a machine check
-		 * on all CPUs, but only certain CPUs will find a valid
-		 * bank to log.
+		 * Only panic if the error was detected local to this CPU.
+		 * Some errors will assert a machine check on all CPUs, but
+		 * only certain CPUs will find a valid bank to log.
 		 */
-		while (mca_count == old_count)
+		while (count == 0)
 			cpu_spinwait();
 
 		panic("Unrecoverable machine check exception");
@@ -1027,7 +1210,7 @@
 	 * Serialize MCA bank scanning to prevent collisions from
 	 * sibling threads.
 	 */
-	count = mca_scan(CMCI);
+	count = mca_scan(CMCI, NULL);
 
 	/* If we found anything, log them to the console. */
 	if (count != 0) {

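To show the shared policy in isolation, a small self-contained sketch of the logic factored into update_threshold() above; the demo_* names, the constant and the sample numbers are illustrative only:

#include <stdio.h>

#define CMC_THROTTLE	60	/* seconds, mirrors the cmc_throttle default */

static int
demo_update_threshold(int interrupt, int delta, int count, int cur, int max)
{
	int limit = cur;

	if (interrupt) {
		/* Frequent interrupts: double the threshold, capped at max. */
		if (delta < CMC_THROTTLE && count >= limit && limit < max) {
			limit *= 2;
			if (limit > max)
				limit = max;
		}
		return (limit);
	}
	/* Polled scan: lower the threshold toward the observed event rate. */
	if (delta < CMC_THROTTLE)
		return (limit);
	limit = count * CMC_THROTTLE / delta;
	if (limit <= 0)
		limit = 1;
	else if (limit > max)
		limit = max;
	return (limit);
}

int
main(void)
{
	/* A burst of corrected errors within the throttle window... */
	printf("%d\n", demo_update_threshold(1, 10, 8, 8, 32));	/* prints 16 */
	/* ...and a later quiet period seen by the periodic poller. */
	printf("%d\n", demo_update_threshold(0, 600, 5, 16, 32));	/* prints 1 */
	return (0);
}
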
Added: trunk/sys/x86/x86/mp_watchdog.c
===================================================================
--- trunk/sys/x86/x86/mp_watchdog.c	                        (rev 0)
+++ trunk/sys/x86/x86/mp_watchdog.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,211 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2004 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/x86/mp_watchdog.c 303912 2016-08-10 13:38:44Z kib $
+ */
+
+#include "opt_mp_watchdog.h"
+#include "opt_sched.h"
+
+#ifdef SCHED_ULE
+#error MP_WATCHDOG cannot currently be used with SCHED_ULE
+#endif
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+#include <machine/mp_watchdog.h>
+
+/*
+ * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work
+ * from being scheduled there, and uses it as a "watchdog" to detect kernel
+ * failure on other CPUs.  This is made reasonable by inclusion of logical
+ * processors in Xeon hardware.  The watchdog is configured by setting the
+ * debug.watchdog sysctl/tunable to the CPU of interest.  A callout will then
+ * begin executing, resetting a timer that is gradually lowered by the watching
+ * thread.  If the timer reaches 0, the watchdog fires by either dropping
+ * directly to the debugger, or by sending an NMI IPI to the boot processor.
+ * This is a somewhat less efficient substitute for dedicated watchdog
+ * hardware, but can be quite an effective tool for debugging hangs.
+ *
+ * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but
+ * doesn't yet.
+ */
+static int	watchdog_cpu = -1;
+static int	watchdog_dontfire = 1;
+static int	watchdog_timer = -1;
+static int	watchdog_nmi = 1;
+
+SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RWTUN, &watchdog_nmi, 0,
+    "IPI the boot processor with an NMI to enter the debugger");
+
+static struct callout	watchdog_callout;
+
+static void watchdog_change(int wdcpu);
+
+/*
+ * Number of seconds before the watchdog will fire if the callout fails to
+ * reset the timer.
+ */
+#define	WATCHDOG_THRESHOLD	10
+
+static void
+watchdog_init(void *arg)
+{
+
+	callout_init(&watchdog_callout, 1);
+	if (watchdog_cpu != -1)
+		watchdog_change(watchdog_cpu);
+}
+
+/*
+ * This callout resets a timer until the watchdog kicks in.  It acquires some
+ * critical locks to make sure things haven't gotten wedged with those locks
+ * held.
+ */
+static void
+watchdog_function(void *arg)
+{
+
+	/*
+	 * Since the timer ran, we must not be wedged.  Acquire some critical
+	 * locks to make sure.  Then reset the timer.
+	 */
+	mtx_lock(&Giant);
+	watchdog_timer = WATCHDOG_THRESHOLD;
+	mtx_unlock(&Giant);
+	callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL);
+}
+SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL);
+
+static void
+watchdog_change(int wdcpu)
+{
+
+	if (wdcpu == -1 || wdcpu == 0xffffffff) {
+		/*
+		 * Disable the watchdog.
+		 */
+		watchdog_cpu = -1;
+		watchdog_dontfire = 1;
+		callout_stop(&watchdog_callout);
+		printf("watchdog stopped\n");
+	} else {
+		watchdog_timer = WATCHDOG_THRESHOLD;
+		watchdog_dontfire = 0;
+		watchdog_cpu = wdcpu;
+		callout_reset(&watchdog_callout, 1 * hz, watchdog_function,
+		    NULL);
+	}
+}
+
+/*
+ * This sysctl sets which CPU is the watchdog CPU.  Set to -1 or 0xffffffff
+ * to disable the watchdog.
+ */
+static int
+sysctl_watchdog(SYSCTL_HANDLER_ARGS)
+{
+	int error, temp;
+
+	temp = watchdog_cpu;
+	error = sysctl_handle_int(oidp, &temp, 0, req);
+	if (error)
+		return (error);
+
+	if (req->newptr != NULL)
+		watchdog_change(temp);
+	return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+    sysctl_watchdog, "I", "");
+
+/*
+ * Drop into the debugger by sending an IPI NMI to the boot processor.
+ */
+static void
+watchdog_ipi_nmi(void)
+{
+
+	/*
+	 * Deliver NMI to the boot processor.  Why not?
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI,
+	    boot_cpu_id);
+	lapic_ipi_wait(-1);
+}
+
+/*
+ * ap_watchdog() is called by the SMP idle loop code.  It works on the same
+ * premise as the disabling of logical processors: that if the CPU is
+ * idle, then it can ignore the world from then on, as nothing will be
+ * scheduled on it.  Leaving aside multi-runqueue schedulers (SCHED_ULE) and
+ * explicit process migration (sched_bind()), this is not an unreasonable
+ * assumption.
+ */
+void
+ap_watchdog(u_int cpuid)
+{
+	char old_pcomm[MAXCOMLEN + 1];
+	struct proc *p;
+
+	if (watchdog_cpu != cpuid)
+		return;
+
+	printf("watchdog started on cpu %d\n", cpuid);
+	p = curproc;
+	bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1);
+	snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid);
+	while (1) {
+		DELAY(1000000);				/* One second. */
+		if (watchdog_cpu != cpuid)
+			break;
+		atomic_subtract_int(&watchdog_timer, 1);
+		if (watchdog_timer < 4)
+			printf("Watchdog timer: %d\n", watchdog_timer);
+		if (watchdog_timer == 0 && watchdog_dontfire == 0) {
+			printf("Watchdog firing!\n");
+			watchdog_dontfire = 1;
+			if (watchdog_nmi)
+				watchdog_ipi_nmi();
+			else
+				kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog");
+		}
+	}
+	bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1);
+	printf("watchdog stopped on cpu %d\n", cpuid);
+}


Property changes on: trunk/sys/x86/x86/mp_watchdog.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/mp_x86.c
===================================================================
--- trunk/sys/x86/x86/mp_x86.c	                        (rev 0)
+++ trunk/sys/x86/x86/mp_x86.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,1640 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * Copyright (c) 2003, by Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 349958 2019-07-12 22:31:12Z jhb $");
+
+#ifdef __i386__
+#include "opt_apic.h"
+#endif
+#include "opt_cpu.h"
+#include "opt_kstack_pages.h"
+#include "opt_pmap.h"
+#include "opt_sched.h"
+#include "opt_smp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/cons.h>	/* cngetc() */
+#include <sys/cpuset.h>
+#ifdef GPROF 
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
+#include <x86/apicreg.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/cputypes.h>
+#include <x86/mca.h>
+#include <machine/md_var.h>
+#include <machine/pcb.h>
+#include <machine/psl.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <x86/ucode.h>
+
+/* lock region used by kernel profiling */
+int	mcount_lock;
+
+int	mp_naps;		/* # of Application Processors */
+int	boot_cpu_id = -1;	/* designated BSP */
+
+extern	struct pcpu __pcpu[];
+
+/* AP uses this during bootstrap.  Do not staticize.  */
+char *bootSTK;
+int bootAP;
+
+/* Free these after use */
+void *bootstacks[MAXCPU];
+void *dpcpu;
+
+struct pcb stoppcbs[MAXCPU];
+struct susppcb **susppcbs;
+
+#ifdef COUNT_IPIS
+/* Interrupt counts. */
+static u_long *ipi_preempt_counts[MAXCPU];
+static u_long *ipi_ast_counts[MAXCPU];
+u_long *ipi_invltlb_counts[MAXCPU];
+u_long *ipi_invlrng_counts[MAXCPU];
+u_long *ipi_invlpg_counts[MAXCPU];
+u_long *ipi_invlcache_counts[MAXCPU];
+u_long *ipi_rendezvous_counts[MAXCPU];
+static u_long *ipi_hardclock_counts[MAXCPU];
+#endif
+
+/* Default cpu_ops implementation. */
+struct cpu_ops cpu_ops;
+
+/*
+ * Local data and functions.
+ */
+
+static volatile cpuset_t ipi_stop_nmi_pending;
+
+volatile cpuset_t resuming_cpus;
+volatile cpuset_t toresume_cpus;
+
+/* used to hold the APs until we are ready to release them */
+struct mtx ap_boot_mtx;
+
+/* Set to 1 once we're ready to let the APs out of the pen. */
+volatile int aps_ready = 0;
+
+/*
+ * Store data from cpu_add() until later in the boot when we actually setup
+ * the APs.
+ */
+struct cpu_info cpu_info[MAX_APIC_ID + 1];
+int apic_cpuids[MAX_APIC_ID + 1];
+int cpu_apic_ids[MAXCPU];
+
+/* Holds pending bitmap based IPIs per CPU */
+volatile u_int cpu_ipi_pending[MAXCPU];
+
+static void	release_aps(void *dummy);
+static void	cpustop_handler_post(u_int cpu);
+
+static int	hyperthreading_allowed = 1;
+SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
+	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
+
+static struct topo_node topo_root;
+
+static int pkg_id_shift;
+static int core_id_shift;
+static int disabled_cpus;
+
+struct cache_info {
+	int	id_shift;
+	int	present;
+} static caches[MAX_CACHE_LEVELS];
+
+void
+mem_range_AP_init(void)
+{
+
+	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
+		mem_range_softc.mr_op->initAP(&mem_range_softc);
+}
+
+/*
+ * Round up to the next power of two, if necessary, and then
+ * take log2.
+ * Returns -1 if argument is zero.
+ */
+static __inline int
+mask_width(u_int x)
+{
+
+	return (fls(x << (1 - powerof2(x))) - 1);
+}
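mask_width() is effectively a ceil(log2(x)): assuming fls() is the 1-based
find-last-set from libkern and powerof2() treats zero as a power of two, as in
sys/param.h, a few sample values are:

	mask_width(1) == 0;	/* exact power of two */
	mask_width(2) == 1;
	mask_width(3) == 2;	/* rounded up to 4 */
	mask_width(6) == 3;	/* rounded up to 8 */
	mask_width(8) == 3;
	mask_width(0) == -1;	/* fls(0) is 0 */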
+
+/*
+ * Add a cache level to the cache topology description.
+ */
+static int
+add_deterministic_cache(int type, int level, int share_count)
+{
+
+	if (type == 0)
+		return (0);
+	if (type > 3) {
+		printf("unexpected cache type %d\n", type);
+		return (1);
+	}
+	if (type == 2) /* ignore instruction cache */
+		return (1);
+	if (level == 0 || level > MAX_CACHE_LEVELS) {
+		printf("unexpected cache level %d\n", level);
+		return (1);
+	}
+
+	if (caches[level - 1].present) {
+		printf("WARNING: multiple entries for L%u data cache\n", level);
+		printf("%u => %u\n", caches[level - 1].id_shift,
+		    mask_width(share_count));
+	}
+	caches[level - 1].id_shift = mask_width(share_count);
+	caches[level - 1].present = 1;
+
+	if (caches[level - 1].id_shift > pkg_id_shift) {
+		printf("WARNING: L%u data cache covers more "
+		    "APIC IDs than a package\n", level);
+		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
+		caches[level - 1].id_shift = pkg_id_shift;
+	}
+	if (caches[level - 1].id_shift < core_id_shift) {
+		printf("WARNING: L%u data cache covers less "
+		    "APIC IDs than a core\n", level);
+		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
+		caches[level - 1].id_shift = core_id_shift;
+	}
+
+	return (1);
+}
+
+/*
+ * Determine topology of processing units and caches for AMD CPUs.
+ * See:
+ *  - AMD CPUID Specification (Publication # 25481)
+ *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
+ *  - BKDG For AMD Family 10h Processors (Publication # 31116)
+ *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
+ *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
+ *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
+ */
+static void
+topo_probe_amd(void)
+{
+	u_int p[4];
+	uint64_t v;
+	int level;
+	int nodes_per_socket;
+	int share_count;
+	int type;
+	int i;
+
+	/* No multi-core capability. */
+	if ((amd_feature2 & AMDID2_CMP) == 0)
+		return;
+
+	/* For families 10h and newer. */
+	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
+	    AMDID_COREID_SIZE_SHIFT;
+
+	/* For 0Fh family. */
+	if (pkg_id_shift == 0)
+		pkg_id_shift =
+		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
+
+	/*
+	 * Families prior to 16h define the following value as
+	 * cores per compute unit and we don't really care about the AMD
+	 * compute units at the moment.  Perhaps we should treat them as
+	 * cores and cores within the compute units as hardware threads,
+	 * but that's up for debate.
+	 * Later families define the value as threads per compute unit,
+	 * so we are following AMD's nomenclature here.
+	 */
+	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
+	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
+		cpuid_count(0x8000001e, 0, p);
+		share_count = ((p[1] >> 8) & 0xff) + 1;
+		core_id_shift = mask_width(share_count);
+	}
+
+	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
+		for (i = 0; ; i++) {
+			cpuid_count(0x8000001d, i, p);
+			type = p[0] & 0x1f;
+			level = (p[0] >> 5) & 0x7;
+			share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+			if (!add_deterministic_cache(type, level, share_count))
+				break;
+		}
+	} else {
+		if (cpu_exthigh >= 0x80000005) {
+			cpuid_count(0x80000005, 0, p);
+			if (((p[2] >> 24) & 0xff) != 0) {
+				caches[0].id_shift = 0;
+				caches[0].present = 1;
+			}
+		}
+		if (cpu_exthigh >= 0x80000006) {
+			cpuid_count(0x80000006, 0, p);
+			if (((p[2] >> 16) & 0xffff) != 0) {
+				caches[1].id_shift = 0;
+				caches[1].present = 1;
+			}
+			if (((p[3] >> 18) & 0x3fff) != 0) {
+				nodes_per_socket = 1;
+				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
+					/*
+					 * Handle multi-node processors that
+					 * have multiple chips, each with its
+					 * own L3 cache, on the same die.
+					 */
+					v = rdmsr(0xc001100c);
+					nodes_per_socket = 1 + ((v >> 3) & 0x7);
+				}
+				caches[2].id_shift =
+				    pkg_id_shift - mask_width(nodes_per_socket);
+				caches[2].present = 1;
+			}
+		}
+	}
+}
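As a concrete, purely hypothetical decode of the leaf 0x8000001e path above: on
a family 17h part, EBX[15:8] == 1 means two threads share a compute unit, so:

	u_int ebx = 0x00000101;				/* hypothetical CPUID 0x8000001e %ebx */
	int share_count = ((ebx >> 8) & 0xff) + 1;	/* 2 threads per compute unit */
	int core_id_shift = mask_width(share_count);	/* 1: low APIC ID bit selects the SMT thread */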
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 1 and Leaf 4, if supported.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
+ *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0x4(void)
+{
+	u_int p[4];
+	int max_cores;
+	int max_logical;
+
+	/* Both zero and one here mean one logical processor per package. */
+	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
+	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
+	if (max_logical <= 1)
+		return;
+
+	if (cpu_high >= 0x4) {
+		cpuid_count(0x04, 0, p);
+		max_cores = ((p[0] >> 26) & 0x3f) + 1;
+	} else
+		max_cores = 1;
+
+	core_id_shift = mask_width(max_logical/max_cores);
+	KASSERT(core_id_shift >= 0,
+	    ("intel topo: max_cores > max_logical\n"));
+	pkg_id_shift = core_id_shift + mask_width(max_cores);
+}
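For example, on a hypothetical quad-core package with Hyper-Threading,
CPUID.1:EBX[23:16] reports 8 logical processors and leaf 4 reports 4 cores per
package, giving:

	int max_logical = 8, max_cores = 4;
	int core_id_shift = mask_width(max_logical / max_cores);	/* 1 */
	int pkg_id_shift = core_id_shift + mask_width(max_cores);	/* 3: APIC ID bits [2:0] stay inside the package */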
+
+/*
+ * Determine topology of processing units for Intel CPUs
+ * using CPUID Leaf 11, if supported.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
+ *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
+ *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
+ */
+static void
+topo_probe_intel_0xb(void)
+{
+	u_int p[4];
+	int bits;
+	int type;
+	int i;
+
+	/* Fall back if CPU leaf 11 doesn't really exist. */
+	cpuid_count(0x0b, 0, p);
+	if (p[1] == 0) {
+		topo_probe_intel_0x4();
+		return;
+	}
+
+	/* We only support three levels for now. */
+	for (i = 0; ; i++) {
+		cpuid_count(0x0b, i, p);
+
+		bits = p[0] & 0x1f;
+		type = (p[2] >> 8) & 0xff;
+
+		if (type == 0)
+			break;
+
+		/* TODO: check for duplicate (re-)assignment */
+		if (type == CPUID_TYPE_SMT)
+			core_id_shift = bits;
+		else if (type == CPUID_TYPE_CORE)
+			pkg_id_shift = bits;
+		else
+			printf("unknown CPU level type %d\n", type);
+	}
+
+	if (pkg_id_shift < core_id_shift) {
+		printf("WARNING: core covers more APIC IDs than a package\n");
+		core_id_shift = pkg_id_shift;
+	}
+}
+
+/*
+ * Determine topology of caches for Intel CPUs.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ *  - Intel 64 and IA-32 Architectures Software Developer's Manual
+ *    Volume 2A: Instruction Set Reference, A-M,
+ *    CPUID instruction
+ */
+static void
+topo_probe_intel_caches(void)
+{
+	u_int p[4];
+	int level;
+	int share_count;
+	int type;
+	int i;
+
+	if (cpu_high < 0x4) {
+		/*
+		 * Available cache level and sizes can be determined
+		 * via CPUID leaf 2, but that requires a huge table of hardcoded
+		 * values, so for now just assume L1 and L2 caches potentially
+		 * shared only by HTT processing units, if HTT is present.
+		 */
+		caches[0].id_shift = pkg_id_shift;
+		caches[0].present = 1;
+		caches[1].id_shift = pkg_id_shift;
+		caches[1].present = 1;
+		return;
+	}
+
+	for (i = 0; ; i++) {
+		cpuid_count(0x4, i, p);
+		type = p[0] & 0x1f;
+		level = (p[0] >> 5) & 0x7;
+		share_count = 1 + ((p[0] >> 14) & 0xfff);
+
+		if (!add_deterministic_cache(type, level, share_count))
+			break;
+	}
+}
+
+/*
+ * Determine topology of processing units and caches for Intel CPUs.
+ * See:
+ *  - Intel 64 Architecture Processor Topology Enumeration
+ */
+static void
+topo_probe_intel(void)
+{
+
+	/*
+	 * Note that 0x1 <= cpu_high < 4 case should be
+	 * compatible with topo_probe_intel_0x4() logic when
+	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
+	 * or it should trigger the fallback otherwise.
+	 */
+	if (cpu_high >= 0xb)
+		topo_probe_intel_0xb();
+	else if (cpu_high >= 0x1)
+		topo_probe_intel_0x4();
+
+	topo_probe_intel_caches();
+}
+
+/*
+ * Topology information is queried only on BSP, on which this
+ * code runs and for which it can query CPUID information.
+ * Then topology is extrapolated on all packages using an
+ * assumption that APIC ID to hardware component ID mapping is
+ * homogeneous.
+ * That doesn't necessarily imply that the topology is uniform.
+ */
+void
+topo_probe(void)
+{
+	static int cpu_topo_probed = 0;
+	struct x86_topo_layer {
+		int type;
+		int subtype;
+		int id_shift;
+	} topo_layers[MAX_CACHE_LEVELS + 3];
+	struct topo_node *parent;
+	struct topo_node *node;
+	int layer;
+	int nlayers;
+	int node_id;
+	int i;
+
+	if (cpu_topo_probed)
+		return;
+
+	CPU_ZERO(&logical_cpus_mask);
+
+	if (mp_ncpus <= 1)
+		; /* nothing */
+	else if (cpu_vendor_id == CPU_VENDOR_AMD)
+		topo_probe_amd();
+	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
+		topo_probe_intel();
+
+	KASSERT(pkg_id_shift >= core_id_shift,
+	    ("bug in APIC topology discovery"));
+
+	nlayers = 0;
+	bzero(topo_layers, sizeof(topo_layers));
+
+	topo_layers[nlayers].type = TOPO_TYPE_PKG;
+	topo_layers[nlayers].id_shift = pkg_id_shift;
+	if (bootverbose)
+		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
+	nlayers++;
+
+	/*
+	 * Consider all caches to be within a package/chip
+	 * and "in front" of all sub-components like
+	 * cores and hardware threads.
+	 */
+	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
+		if (caches[i].present) {
+			KASSERT(caches[i].id_shift <= pkg_id_shift,
+				("bug in APIC topology discovery"));
+			KASSERT(caches[i].id_shift >= core_id_shift,
+				("bug in APIC topology discovery"));
+
+			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
+			topo_layers[nlayers].subtype = i + 1;
+			topo_layers[nlayers].id_shift = caches[i].id_shift;
+			if (bootverbose)
+				printf("L%u cache ID shift: %u\n",
+				    topo_layers[nlayers].subtype,
+				    topo_layers[nlayers].id_shift);
+			nlayers++;
+		}
+	}
+
+	if (pkg_id_shift > core_id_shift) {
+		topo_layers[nlayers].type = TOPO_TYPE_CORE;
+		topo_layers[nlayers].id_shift = core_id_shift;
+		if (bootverbose)
+			printf("Core ID shift: %u\n",
+			    topo_layers[nlayers].id_shift);
+		nlayers++;
+	}
+
+	topo_layers[nlayers].type = TOPO_TYPE_PU;
+	topo_layers[nlayers].id_shift = 0;
+	nlayers++;
+
+	topo_init_root(&topo_root);
+	for (i = 0; i <= MAX_APIC_ID; ++i) {
+		if (!cpu_info[i].cpu_present)
+			continue;
+
+		parent = &topo_root;
+		for (layer = 0; layer < nlayers; ++layer) {
+			node_id = i >> topo_layers[layer].id_shift;
+			parent = topo_add_node_by_hwid(parent, node_id,
+			    topo_layers[layer].type,
+			    topo_layers[layer].subtype);
+		}
+	}
+
+	parent = &topo_root;
+	for (layer = 0; layer < nlayers; ++layer) {
+		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
+		node = topo_find_node_by_hwid(parent, node_id,
+		    topo_layers[layer].type,
+		    topo_layers[layer].subtype);
+		topo_promote_child(node);
+		parent = node;
+	}
+
+	cpu_topo_probed = 1;
+}
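Continuing the hypothetical pkg_id_shift == 3, core_id_shift == 1 example, the
per-layer hardware IDs derived from an APIC ID while the tree is built are
simple right shifts; for APIC ID 5 (binary 101):

	int apic_id = 5;
	int pkg_hwid  = apic_id >> 3;	/* 0: first package */
	int core_hwid = apic_id >> 1;	/* 2: third core in APIC ID order */
	int pu_hwid   = apic_id >> 0;	/* 5: second SMT thread of that core */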
+
+/*
+ * Assign logical CPU IDs to local APICs.
+ */
+void
+assign_cpu_ids(void)
+{
+	struct topo_node *node;
+	u_int smt_mask;
+
+	smt_mask = (1u << core_id_shift) - 1;
+
+	/*
+	 * Assign CPU IDs to local APIC IDs and disable any CPUs
+	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
+	 */
+	mp_ncpus = 0;
+	TOPO_FOREACH(node, &topo_root) {
+		if (node->type != TOPO_TYPE_PU)
+			continue;
+
+		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
+			cpu_info[node->hwid].cpu_hyperthread = 1;
+
+		if (resource_disabled("lapic", node->hwid)) {
+			if (node->hwid != boot_cpu_id)
+				cpu_info[node->hwid].cpu_disabled = 1;
+			else
+				printf("Cannot disable BSP, APIC ID = %d\n",
+				    node->hwid);
+		}
+
+		if (!hyperthreading_allowed &&
+		    cpu_info[node->hwid].cpu_hyperthread)
+			cpu_info[node->hwid].cpu_disabled = 1;
+
+		if (mp_ncpus >= MAXCPU)
+			cpu_info[node->hwid].cpu_disabled = 1;
+
+		if (cpu_info[node->hwid].cpu_disabled) {
+			disabled_cpus++;
+			continue;
+		}
+
+		cpu_apic_ids[mp_ncpus] = node->hwid;
+		apic_cpuids[node->hwid] = mp_ncpus;
+		topo_set_pu_id(node, mp_ncpus);
+		mp_ncpus++;
+	}
+
+	KASSERT(mp_maxid >= mp_ncpus - 1,
+	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
+	    mp_ncpus));
+}
+
+/*
+ * Print various information about the SMP system hardware and setup.
+ */
+void
+cpu_mp_announce(void)
+{
+	struct topo_node *node;
+	const char *hyperthread;
+	int pkg_count;
+	int cores_per_pkg;
+	int thrs_per_core;
+
+	printf("FreeBSD/SMP: ");
+	if (topo_analyze(&topo_root, 1, &pkg_count,
+	    &cores_per_pkg, &thrs_per_core)) {
+		printf("%d package(s)", pkg_count);
+		if (cores_per_pkg > 0)
+			printf(" x %d core(s)", cores_per_pkg);
+		if (thrs_per_core > 1)
+		    printf(" x %d hardware threads", thrs_per_core);
+	} else {
+		printf("Non-uniform topology");
+	}
+	printf("\n");
+
+	if (disabled_cpus) {
+		printf("FreeBSD/SMP Online: ");
+		if (topo_analyze(&topo_root, 0, &pkg_count,
+		    &cores_per_pkg, &thrs_per_core)) {
+			printf("%d package(s)", pkg_count);
+			if (cores_per_pkg > 0)
+				printf(" x %d core(s)", cores_per_pkg);
+			if (thrs_per_core > 1)
+			    printf(" x %d hardware threads", thrs_per_core);
+		} else {
+			printf("Non-uniform topology");
+		}
+		printf("\n");
+	}
+
+	if (!bootverbose)
+		return;
+
+	TOPO_FOREACH(node, &topo_root) {
+		switch (node->type) {
+		case TOPO_TYPE_PKG:
+			printf("Package HW ID = %u (%#x)\n",
+			    node->hwid, node->hwid);
+			break;
+		case TOPO_TYPE_CORE:
+			printf("\tCore HW ID = %u (%#x)\n",
+			    node->hwid, node->hwid);
+			break;
+		case TOPO_TYPE_PU:
+			if (cpu_info[node->hwid].cpu_hyperthread)
+				hyperthread = "/HT";
+			else
+				hyperthread = "";
+
+			if (node->subtype == 0)
+				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
+				    "(disabled)\n", hyperthread, node->hwid,
+				    node->hwid);
+			else if (node->id == 0)
+				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
+				    node->hwid, node->hwid);
+			else
+				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
+				    node->id, hyperthread, node->hwid,
+				    node->hwid);
+			break;
+		default:
+			/* ignored */
+			break;
+		}
+	}
+}
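With the same illustrative topology (one package, four cores, two threads per
core, nothing disabled), the summary line produced by the code above would read
roughly:

	FreeBSD/SMP: 1 package(s) x 4 core(s) x 2 hardware threads

The per-node dump that follows is only printed with bootverbose.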
+
+/*
+ * Add a scheduling group, a group of logical processors sharing
+ * a particular cache (and, thus having an affinity), to the scheduling
+ * topology.
+ * This function recursively works on lower level caches.
+ */
+static void
+x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
+{
+	struct topo_node *node;
+	int nchildren;
+	int ncores;
+	int i;
+
+	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
+	    ("x86topo_add_sched_group: bad type: %u", root->type));
+	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
+	cg_root->cg_count = root->cpu_count;
+	if (root->type == TOPO_TYPE_SYSTEM)
+		cg_root->cg_level = CG_SHARE_NONE;
+	else
+		cg_root->cg_level = root->subtype;
+
+	/*
+	 * Check how many core nodes we have under the given root node.
+	 * If we have multiple logical processors, but not multiple
+	 * cores, then those processors must be hardware threads.
+	 */
+	ncores = 0;
+	node = root;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CORE) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+
+		ncores++;
+		node = topo_next_nonchild_node(root, node);
+	}
+
+	if (cg_root->cg_level != CG_SHARE_NONE &&
+	    root->cpu_count > 1 && ncores < 2)
+		cg_root->cg_flags = CG_FLAG_SMT;
+
+	/*
+	 * Find out how many cache nodes we have under the given root node.
+	 * We ignore cache nodes that cover all the same processors as the
+	 * root node.  Also, we do not descend below found cache nodes.
+	 * That is, we count top-level "non-redundant" caches under the root
+	 * node.
+	 */
+	nchildren = 0;
+	node = root;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CACHE ||
+		    (root->type != TOPO_TYPE_SYSTEM &&
+		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+		nchildren++;
+		node = topo_next_nonchild_node(root, node);
+	}
+
+	cg_root->cg_child = smp_topo_alloc(nchildren);
+	cg_root->cg_children = nchildren;
+
+	/*
+	 * Now find again the same cache nodes as above and recursively
+	 * build scheduling topologies for them.
+	 */
+	node = root;
+	i = 0;
+	while (node != NULL) {
+		if (node->type != TOPO_TYPE_CACHE ||
+		    (root->type != TOPO_TYPE_SYSTEM &&
+		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
+			node = topo_next_node(root, node);
+			continue;
+		}
+		cg_root->cg_child[i].cg_parent = cg_root;
+		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
+		i++;
+		node = topo_next_nonchild_node(root, node);
+	}
+}
+
+/*
+ * Build the MI scheduling topology from the discovered hardware topology.
+ */
+struct cpu_group *
+cpu_topo(void)
+{
+	struct cpu_group *cg_root;
+
+	if (mp_ncpus <= 1)
+		return (smp_topo_none());
+
+	cg_root = smp_topo_alloc(1);
+	x86topo_add_sched_group(&topo_root, cg_root);
+	return (cg_root);
+}
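For the same illustrative topology, the cpu_group tree handed to the scheduler
would look roughly like this (a sketch, not program output): a CG_SHARE_NONE
root spanning all CPUs, one child per top-level cache, and per-core SMT leaf
groups:

	SYSTEM (CG_SHARE_NONE, 8 CPUs)
	  L3 group (cg_level 3, 8 CPUs)
	    L2 group (cg_level 2, 2 CPUs, CG_FLAG_SMT)   x 4, one per core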
+
+
+/*
+ * Add a logical CPU to the topology.
+ */
+void
+cpu_add(u_int apic_id, char boot_cpu)
+{
+
+	if (apic_id > MAX_APIC_ID) {
+		panic("SMP: APIC ID %d too high", apic_id);
+		return;
+	}
+	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
+	    apic_id));
+	cpu_info[apic_id].cpu_present = 1;
+	if (boot_cpu) {
+		KASSERT(boot_cpu_id == -1,
+		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
+		    boot_cpu_id));
+		boot_cpu_id = apic_id;
+		cpu_info[apic_id].cpu_bsp = 1;
+	}
+	if (mp_ncpus < MAXCPU) {
+		mp_ncpus++;
+		mp_maxid = mp_ncpus - 1;
+	}
+	if (bootverbose)
+		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
+		    "AP");
+}
+
+void
+cpu_mp_setmaxid(void)
+{
+
+	/*
+	 * mp_ncpus and mp_maxid should already be set by calls to cpu_add().
+	 * If there were no calls to cpu_add(), assume this is a UP system.
+	 */
+	if (mp_ncpus == 0)
+		mp_ncpus = 1;
+}
+
+int
+cpu_mp_probe(void)
+{
+
+	/*
+	 * Always record BSP in CPU map so that the mbuf init code works
+	 * correctly.
+	 */
+	CPU_SETOF(0, &all_cpus);
+	return (mp_ncpus > 1);
+}
+
+/*
+ * AP CPUs call this to initialize themselves.
+ */
+void
+init_secondary_tail(void)
+{
+	u_int cpuid;
+
+	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
+
+	/*
+	 * On real hardware, switch to x2apic mode if possible.  Do it
+	 * after aps_ready was signalled, to avoid manipulating the
+	 * mode while BSP might still want to send some IPI to us
+	 * (second startup IPI is ignored on modern hardware etc).
+	 */
+	lapic_xapic_mode();
+
+	/* Initialize the PAT MSR. */
+	pmap_init_pat();
+
+	/* set up CPU registers and state */
+	cpu_setregs();
+
+	/* set up SSE/NX */
+	initializecpu();
+
+	/* set up FPU state on the AP */
+#ifdef __amd64__
+	fpuinit();
+#else
+	npxinit(false);
+#endif
+
+	if (cpu_ops.cpu_init)
+		cpu_ops.cpu_init();
+
+	/* A quick check from sanity claus */
+	cpuid = PCPU_GET(cpuid);
+	if (PCPU_GET(apic_id) != lapic_id()) {
+		printf("SMP: cpuid = %d\n", cpuid);
+		printf("SMP: actual apic_id = %d\n", lapic_id());
+		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
+		panic("cpuid mismatch! boom!!");
+	}
+
+	/* Initialize curthread. */
+	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
+	PCPU_SET(curthread, PCPU_GET(idlethread));
+
+	mtx_lock_spin(&ap_boot_mtx);
+
+	mca_init();
+
+	/* Init the local APIC for IRQs */
+	lapic_setup(1);
+
+	/* Set memory range attributes for this CPU to match the BSP */
+	mem_range_AP_init();
+
+	smp_cpus++;
+
+	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
+	printf("SMP: AP CPU #%d Launched!\n", cpuid);
+
+	/* Determine if we are a logical CPU. */
+	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
+		CPU_SET(cpuid, &logical_cpus_mask);
+
+	if (bootverbose)
+		lapic_dump("AP");
+
+	if (smp_cpus == mp_ncpus) {
+		/* enable IPIs, TLB shootdown, freezes, etc. */
+		atomic_store_rel_int(&smp_started, 1);
+	}
+
+#ifdef __amd64__
+	/*
+	 * Enable global pages TLB extension.
+	 * This also implicitly flushes the TLB.
+	 */
+	load_cr4(rcr4() | CR4_PGE);
+	if (pmap_pcid_enabled)
+		load_cr4(rcr4() | CR4_PCIDE);
+	load_ds(_udatasel);
+	load_es(_udatasel);
+	load_fs(_ufssel);
+#endif
+
+	mtx_unlock_spin(&ap_boot_mtx);
+
+	/* Wait until all the APs are up. */
+	while (atomic_load_acq_int(&smp_started) == 0)
+		ia32_pause();
+
+#ifndef EARLY_AP_STARTUP
+	/* Start per-CPU event timers. */
+	cpu_initclocks_ap();
+#endif
+
+	sched_throw(NULL);
+
+	panic("scheduler returned us to %s", __func__);
+	/* NOTREACHED */
+}
+
+/*******************************************************************
+ * local functions and data
+ */
+
+/*
+ * We tell the I/O APIC code about all the CPUs we want to receive
+ * interrupts.  If we don't want certain CPUs to receive IRQs we
+ * can simply not tell the I/O APIC code about them in this function.
+ * We also do not tell it about the BSP since it tells itself about
+ * the BSP internally to work with UP kernels and on UP machines.
+ */
+void
+set_interrupt_apic_ids(void)
+{
+	u_int i, apic_id;
+
+	for (i = 0; i < MAXCPU; i++) {
+		apic_id = cpu_apic_ids[i];
+		if (apic_id == -1)
+			continue;
+		if (cpu_info[apic_id].cpu_bsp)
+			continue;
+		if (cpu_info[apic_id].cpu_disabled)
+			continue;
+
+		/* Don't let hyperthreads service interrupts. */
+		if (cpu_info[apic_id].cpu_hyperthread)
+			continue;
+
+		intr_add_cpu(i);
+	}
+}
+
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
+	/*
+	 * This attempts to follow the algorithm described in the
+	 * Intel Multiprocessor Specification v1.4 in section B.4.
+	 * For each IPI, we allow the local APIC ~20us to deliver the
+	 * IPI.  If that times out, we panic.
+	 */
+
+	/*
+	 * first we do an INIT IPI: this INIT IPI might be run, resetting
+	 * and running the target CPU. OR this INIT IPI might be latched (P5
+	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
+	 * ignored.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
+	lapic_ipi_wait(100);
+
+	/* Explicitly deassert the INIT IPI. */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
+	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
+	    apic_id);
+
+	DELAY(10000);		/* wait ~10ms */
+
+	/*
+	 * next we do a STARTUP IPI: the previous INIT IPI might still be
+	 * latched (P5 bug); this 1st STARTUP would then terminate
+	 * immediately, and the previously started INIT IPI would continue. OR
+	 * the previous INIT IPI has already run, and this STARTUP IPI will
+	 * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
+	 * will run.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+	    vector, apic_id);
+	if (!lapic_ipi_wait(100))
+		panic("Failed to deliver first STARTUP IPI to APIC %d",
+		    apic_id);
+	DELAY(200);		/* wait ~200us */
+
+	/*
+	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
+	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
+	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
+	 * recognized after hardware RESET or INIT IPI.
+	 */
+	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
+	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
+	    vector, apic_id);
+	if (!lapic_ipi_wait(100))
+		panic("Failed to deliver second STARTUP IPI to APIC %d",
+		    apic_id);
+
+	DELAY(200);		/* wait ~200us */
+}
+
+/*
+ * Send an IPI to the specified CPU, handling the bitmap logic.
+ */
+void
+ipi_send_cpu(int cpu, u_int ipi)
+{
+	u_int bitmap, old_pending, new_pending;
+
+	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
+
+	if (IPI_IS_BITMAPED(ipi)) {
+		bitmap = 1 << ipi;
+		ipi = IPI_BITMAP_VECTOR;
+		do {
+			old_pending = cpu_ipi_pending[cpu];
+			new_pending = old_pending | bitmap;
+		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
+		    old_pending, new_pending));	
+		if (old_pending)
+			return;
+	}
+	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
+}
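The compare-and-swap loop above coalesces bitmapped IPIs: every sender ORs its
bit into the target CPU's pending word, but only the sender that finds the word
empty actually raises the hardware vector. A stand-alone sketch of the same
idea with C11 atomics (hypothetical names, not the kernel API):

	#include <stdatomic.h>
	#include <stdbool.h>

	static _Atomic unsigned int pending;	/* one word per target CPU in the kernel */

	/* Returns true if the caller must deliver the single hardware IPI. */
	static bool
	post_bitmap_ipi(unsigned int bit)
	{
		unsigned int old = atomic_load(&pending), new;

		do {
			new = old | (1u << bit);
		} while (!atomic_compare_exchange_weak(&pending, &old, new));
		return (old == 0);
	}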
+
+void
+ipi_bitmap_handler(struct trapframe frame)
+{
+	struct trapframe *oldframe;
+	struct thread *td;
+	int cpu = PCPU_GET(cpuid);
+	u_int ipi_bitmap;
+
+	critical_enter();
+	td = curthread;
+	td->td_intr_nesting_level++;
+	oldframe = td->td_intr_frame;
+	td->td_intr_frame = &frame;
+	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
+	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
+#ifdef COUNT_IPIS
+		(*ipi_preempt_counts[cpu])++;
+#endif
+		sched_preempt(td);
+	}
+	if (ipi_bitmap & (1 << IPI_AST)) {
+#ifdef COUNT_IPIS
+		(*ipi_ast_counts[cpu])++;
+#endif
+		/* Nothing to do for AST */
+	}
+	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
+#ifdef COUNT_IPIS
+		(*ipi_hardclock_counts[cpu])++;
+#endif
+		hardclockintr();
+	}
+	td->td_intr_frame = oldframe;
+	td->td_intr_nesting_level--;
+	critical_exit();
+}
+
+/*
+ * Send an IPI to a set of CPUs.
+ */
+void
+ipi_selected(cpuset_t cpus, u_int ipi)
+{
+	int cpu;
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
+
+	while ((cpu = CPU_FFS(&cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &cpus);
+		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+		ipi_send_cpu(cpu, ipi);
+	}
+}
+
+/*
+ * Send an IPI to a specific CPU.
+ */
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
+
+	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
+	ipi_send_cpu(cpu, ipi);
+}
+
+/*
+ * Send an IPI to all CPUs EXCEPT myself.
+ */
+void
+ipi_all_but_self(u_int ipi)
+{
+	cpuset_t other_cpus;
+
+	other_cpus = all_cpus;
+	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	if (IPI_IS_BITMAPED(ipi)) {
+		ipi_selected(other_cpus, ipi);
+		return;
+	}
+
+	/*
+	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
+	 * of help in order to understand what the source is.
+	 * Set the mask of receiving CPUs for this purpose.
+	 */
+	if (ipi == IPI_STOP_HARD)
+		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
+
+	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
+	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
+}
+
+int
+ipi_nmi_handler(void)
+{
+	u_int cpuid;
+
+	/*
+	 * As long as there is not a simple way to know about an NMI's
+	 * source, if the bitmask for the current CPU is present in
+	 * the global pending bitword, an IPI_STOP_HARD has been issued
+	 * and should be handled.
+	 */
+	cpuid = PCPU_GET(cpuid);
+	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
+		return (1);
+
+	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
+	cpustop_handler();
+	return (0);
+}
+
+int nmi_kdb_lock;
+
+void
+nmi_call_kdb_smp(u_int type, struct trapframe *frame)
+{
+	int cpu;
+	bool call_post;
+
+	cpu = PCPU_GET(cpuid);
+	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
+		nmi_call_kdb(cpu, type, frame);
+		call_post = false;
+	} else {
+		savectx(&stoppcbs[cpu]);
+		CPU_SET_ATOMIC(cpu, &stopped_cpus);
+		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
+			ia32_pause();
+		call_post = true;
+	}
+	atomic_store_rel_int(&nmi_kdb_lock, 0);
+	if (call_post)
+		cpustop_handler_post(cpu);
+}
+
+/*
+ * Handle an IPI_STOP by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpustop_handler(void)
+{
+	u_int cpu;
+
+	cpu = PCPU_GET(cpuid);
+
+	savectx(&stoppcbs[cpu]);
+
+	/* Indicate that we are stopped */
+	CPU_SET_ATOMIC(cpu, &stopped_cpus);
+
+	/* Wait for restart */
+	while (!CPU_ISSET(cpu, &started_cpus))
+	    ia32_pause();
+
+	cpustop_handler_post(cpu);
+}
+
+static void
+cpustop_handler_post(u_int cpu)
+{
+
+	CPU_CLR_ATOMIC(cpu, &started_cpus);
+	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
+
+#if defined(__amd64__) && defined(DDB)
+	amd64_db_resume_dbreg();
+#endif
+
+	if (cpu == 0 && cpustop_restartfunc != NULL) {
+		cpustop_restartfunc();
+		cpustop_restartfunc = NULL;
+	}
+}
+
+/*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+	u_int cpu;
+
+	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
+	cpu = PCPU_GET(cpuid);
+	if (savectx(&susppcbs[cpu]->sp_pcb)) {
+#ifdef __amd64__
+		fpususpend(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxsuspend(susppcbs[cpu]->sp_fpususpend);
+#endif
+		/*
+		 * suspended_cpus is cleared shortly after each AP is restarted
+		 * by a Startup IPI, so that the BSP can proceed to restarting
+		 * the next AP.
+		 *
+		 * resuming_cpus gets cleared when the AP completes
+		 * initialization after having been released by the BSP.
+		 * resuming_cpus is probably not the best name for the
+		 * variable, because it is actually a set of processors that
+		 * haven't resumed yet and haven't necessarily started resuming.
+		 *
+		 * Note that suspended_cpus is meaningful only for ACPI suspend
+		 * as it's not really used for Xen suspend since the APs are
+		 * automatically restored to the running state and the correct
+		 * context.  For the same reason resumectx is never called in
+		 * that case.
+		 */
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
+		CPU_SET_ATOMIC(cpu, &resuming_cpus);
+
+		/*
+		 * Invalidate the cache after setting the global status bits.
+		 * The last AP to set its bit may end up being an Owner of the
+		 * corresponding cache line in MOESI protocol.  The AP may be
+		 * stopped before the cache line is written to the main memory.
+		 */
+		wbinvd();
+	} else {
+#ifdef __amd64__
+		fpuresume(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxresume(susppcbs[cpu]->sp_fpususpend);
+#endif
+		pmap_init_pat();
+		initializecpu();
+		PCPU_SET(switchtime, 0);
+		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we have restarted and restored the context. */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	}
+
+	/* Wait for resume directive */
+	while (!CPU_ISSET(cpu, &toresume_cpus))
+		ia32_pause();
+
+	/* Re-apply microcode updates. */
+	ucode_reload();
+
+	if (cpu_ops.cpu_resume)
+		cpu_ops.cpu_resume();
+#ifdef __amd64__
+	if (vmm_resume_p)
+		vmm_resume_p();
+#endif
+
+	/* Resume MCA and local APIC */
+	lapic_xapic_mode();
+	mca_resume();
+	lapic_setup(0);
+
+	/* Indicate that we are resumed */
+	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
+	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
+}
+
+
+void
+invlcache_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since wbinvd is a serializing instruction.  Without the
+	 * temporary, we'd wait for wbinvd to complete, then the read
+	 * would execute, then the dependent write, which must then
+	 * complete before return from interrupt.
+	 */
+	generation = smp_tlb_generation;
+	wbinvd();
+	PCPU_SET(smp_tlb_done, generation);
+}
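Without the temporary, the handler would effectively be the two lines below,
and the read of smp_tlb_generation could not start until the serializing
wbinvd retired, delaying the dependent store and the return from the
interrupt:

	wbinvd();
	PCPU_SET(smp_tlb_done, smp_tlb_generation);	/* read now ordered after wbinvd */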
+
+/*
+ * This is called once the rest of the system is up and running and we're
+ * ready to let the APs out of the pen.
+ */
+static void
+release_aps(void *dummy __unused)
+{
+
+	if (mp_ncpus == 1) 
+		return;
+	atomic_store_rel_int(&aps_ready, 1);
+	while (smp_started == 0)
+		ia32_pause();
+}
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
+
+#ifdef COUNT_IPIS
+/*
+ * Setup interrupt counters for IPI handlers.
+ */
+static void
+mp_ipi_intrcnt(void *dummy)
+{
+	char buf[64];
+	int i;
+
+	CPU_FOREACH(i) {
+		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
+		intrcnt_add(buf, &ipi_invltlb_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
+		intrcnt_add(buf, &ipi_invlrng_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
+		intrcnt_add(buf, &ipi_invlpg_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
+		intrcnt_add(buf, &ipi_invlcache_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
+		intrcnt_add(buf, &ipi_preempt_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
+		intrcnt_add(buf, &ipi_ast_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
+		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
+		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
+		intrcnt_add(buf, &ipi_hardclock_counts[i]);
+	}		
+}
+SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
+#endif
+
+/*
+ * Flush the TLB on other CPUs.
+ */
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+pmap_t smp_tlb_pmap;
+volatile uint32_t smp_tlb_generation;
+
+#ifdef __amd64__
+#define	read_eflags() read_rflags()
+#endif
+
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2)
+{
+	cpuset_t other_cpus;
+	volatile uint32_t *p_cpudone;
+	uint32_t generation;
+	int cpu;
+
+	/*
+	 * Check for other CPUs.  Return if none.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			return;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			return;
+	}
+
+	if (!(read_eflags() & PSL_I))
+		panic("%s: interrupts disabled", __func__);
+	mtx_lock_spin(&smp_ipi_mtx);
+	smp_tlb_addr1 = addr1;
+	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
+	generation = ++smp_tlb_generation;
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(vector);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+			    cpu, vector);
+			ipi_send_cpu(cpu, vector);
+		}
+	}
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+		while (*p_cpudone != generation)
+			ia32_pause();
+	}
+	mtx_unlock_spin(&smp_ipi_mtx);
+}
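The completion handshake is a single monotonically increasing generation
number: the initiator bumps it under smp_ipi_mtx, sends the IPIs, then spins
until every targeted CPU's pc_smp_tlb_done has caught up. A stripped-down
sketch of both sides (hypothetical names, C11 atomics instead of the kernel
primitives, and waiting on all other CPUs rather than just the targeted set):

	#include <stdatomic.h>

	#define NCPU 4

	static _Atomic unsigned int tlb_gen;		/* smp_tlb_generation */
	static _Atomic unsigned int tlb_done[NCPU];	/* pc_smp_tlb_done    */

	static void
	initiator(unsigned int self)
	{
		unsigned int gen = atomic_fetch_add(&tlb_gen, 1) + 1;

		/* ... send the shootdown IPIs to the other CPUs here ... */
		for (unsigned int cpu = 0; cpu < NCPU; cpu++) {
			if (cpu == self)
				continue;
			while (atomic_load(&tlb_done[cpu]) != gen)
				;		/* ia32_pause() in the kernel */
		}
	}

	static void
	handler(unsigned int cpu)
	{
		unsigned int gen = atomic_load(&tlb_gen);

		/* ... invalidate the TLB here ... */
		atomic_store(&tlb_done[cpu], gen);
	}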
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_global++;
+#endif
+	}
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_page++;
+#endif
+	}
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
+		    addr1, addr2);
+#ifdef COUNT_XINVLTLB_HITS
+		ipi_range++;
+		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+	}
+}
+
+void
+smp_cache_flush(void)
+{
+
+	if (smp_started) {
+		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
+		    0, 0);
+	}
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+void
+invltlb_handler(void)
+{
+	uint32_t generation;
+  
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since invalidating the TLB is a serializing operation.
+	 */
+	generation = smp_tlb_generation;
+	if (smp_tlb_pmap == kernel_pmap)
+		invltlb_glob();
+	else
+		invltlb();
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+
+	PCPU_SET(smp_tlb_done, generation);
+}


Property changes on: trunk/sys/x86/x86/mp_x86.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/mptable.c
===================================================================
--- trunk/sys/x86/x86/mptable.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable.c 262141 2014-02-18 01:15:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable.c 261087 2014-01-23 20:10:22Z jhb $");
 
 #include "opt_mptable_force_htt.h"
 #include <sys/param.h>
@@ -51,7 +51,7 @@
 #include <x86/mptable.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 #include <machine/md_var.h>
 #ifdef NEW_PCIB
 #include <machine/resource.h>
@@ -79,6 +79,13 @@
 typedef	void mptable_entry_handler(u_char *entry, void *arg);
 typedef	void mptable_extended_entry_handler(ext_entry_ptr entry, void *arg);
 
+/* descriptions of MP table entries */
+typedef struct BASETABLE_ENTRY {
+	uint8_t	type;
+	uint8_t	length;
+	uint8_t	name[16];
+}       basetable_entry;
+
 static basetable_entry basetable_entry_types[] =
 {
 	{0, 20, "Processor"},

Modified: trunk/sys/x86/x86/mptable_pci.c
===================================================================
--- trunk/sys/x86/x86/mptable_pci.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/mptable_pci.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/mptable_pci.c 280970 2015-04-01 21:48:54Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mptable_pci.c 294883 2016-01-27 02:23:54Z jhibbits $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -70,13 +70,13 @@
 #ifdef NEW_PCIB
 	mptable_pci_host_res_init(dev);
 #endif
-	device_add_child(dev, "pci", pcib_get_bus(dev));
+	device_add_child(dev, "pci", -1);
 	return (bus_generic_attach(dev));
 }
 
 #ifdef NEW_PCIB
 static int
-mptable_is_isa_range(u_long start, u_long end)
+mptable_is_isa_range(rman_res_t start, rman_res_t end)
 {
 
 	if (end >= 0x10000)
@@ -89,7 +89,7 @@
 }
 
 static int
-mptable_is_vga_range(u_long start, u_long end)
+mptable_is_vga_range(rman_res_t start, rman_res_t end)
 {
 	if (end >= 0x10000)
 		return (0);
@@ -102,7 +102,7 @@
 
 static struct resource *
 mptable_hostb_alloc_resource(device_t dev, device_t child, int type, int *rid,
-    u_long start, u_long end, u_long count, u_int flags)
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct mptable_hostb_softc *sc;
 
@@ -143,7 +143,7 @@
 
 static int
 mptable_hostb_adjust_resource(device_t dev, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 	struct mptable_hostb_softc *sc;
 

Modified: trunk/sys/x86/x86/msi.c
===================================================================
--- trunk/sys/x86/x86/msi.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/msi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -36,11 +36,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/msi.c 333126 2018-04-30 20:29:28Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/msi.c 344912 2019-03-08 01:04:19Z jhb $");
 
+#include "opt_acpi.h"
+
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -52,7 +55,8 @@
 #include <machine/md_var.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/iommu/iommu_intrmap.h>
 #include <machine/specialreg.h>
 #include <dev/pci/pcivar.h>
 
@@ -113,10 +117,11 @@
 	u_int msi_irq;			/* IRQ cookie. */
 	u_int msi_msix;			/* MSI-X message. */
 	u_int msi_vector:8;		/* IDT vector. */
-	u_int msi_cpu:8;		/* Local APIC ID. (g) */
+	u_int msi_cpu;			/* Local APIC ID. (g) */
 	u_int msi_count:8;		/* Messages in this group. (g) */
 	u_int msi_maxcount:8;		/* Alignment for this group. (g) */
-	int *msi_irqs;			/* Group's IRQ list. (g) */
+	u_int *msi_irqs;		/* Group's IRQ list. (g) */
+	u_int msi_remap_cookie;
 };
 
 static void	msi_create_source(void);
@@ -131,11 +136,27 @@
 		    enum intr_polarity pol);
 static int	msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
 
-struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source,
-		       msi_enable_intr, msi_disable_intr, msi_vector,
-		       msi_source_pending, NULL, NULL, msi_config_intr,
-		       msi_assign_cpu };
+struct pic msi_pic = {
+	.pic_enable_source = msi_enable_source,
+	.pic_disable_source = msi_disable_source,
+	.pic_eoi_source = msi_eoi_source,
+	.pic_enable_intr = msi_enable_intr,
+	.pic_disable_intr = msi_disable_intr,
+	.pic_vector = msi_vector,
+	.pic_source_pending = msi_source_pending,
+	.pic_suspend = NULL,
+	.pic_resume = NULL,
+	.pic_config_intr = msi_config_intr,
+	.pic_assign_cpu = msi_assign_cpu,
+	.pic_reprogram_pin = NULL,
+};
 
+u_int first_msi_irq;
+
+u_int num_msi_irqs = 512;
+SYSCTL_UINT(_machdep, OID_AUTO, num_msi_irqs, CTLFLAG_RDTUN, &num_msi_irqs, 0,
+    "Number of IRQs reserved for MSI and MSI-X interrupts");
+
 #ifdef SMP
 /**
  * Xen hypervisors prior to 4.6.0 do not properly handle updates to
@@ -153,7 +174,7 @@
 #endif
 
 static int msi_enabled;
-static int msi_last_irq;
+static u_int msi_last_irq;
 static struct mtx msi_lock;
 
 static void
@@ -314,6 +335,14 @@
 	}
 #endif
 
+	if (num_msi_irqs == 0)
+		return;
+
+	first_msi_irq = max(MINIMUM_MSI_INT, num_io_irqs);
+	if (num_msi_irqs > UINT_MAX - first_msi_irq)
+		panic("num_msi_irqs too high");
+	num_io_irqs = first_msi_irq + num_msi_irqs;
+
 	msi_enabled = 1;
 	intr_register_pic(&msi_pic);
 	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
@@ -326,11 +355,11 @@
 	u_int irq;
 
 	mtx_lock(&msi_lock);
-	if (msi_last_irq >= NUM_MSI_INTS) {
+	if (msi_last_irq >= num_msi_irqs) {
 		mtx_unlock(&msi_lock);
 		return;
 	}
-	irq = msi_last_irq + FIRST_MSI_INT;
+	irq = msi_last_irq + first_msi_irq;
 	msi_last_irq++;
 	mtx_unlock(&msi_lock);
 
@@ -348,8 +377,12 @@
 msi_alloc(device_t dev, int count, int maxcount, int *irqs)
 {
 	struct msi_intsrc *msi, *fsrc;
-	u_int cpu;
-	int cnt, i, *mirqs, vector;
+	u_int cpu, *mirqs;
+	int cnt, i, vector;
+#ifdef ACPI_DMAR
+	u_int cookies[count];
+	int error;
+#endif
 
 	if (!msi_enabled)
 		return (ENXIO);
@@ -363,7 +396,7 @@
 
 	/* Try to find 'count' free IRQs. */
 	cnt = 0;
-	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+	for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
@@ -382,7 +415,7 @@
 	/* Do we need to create some new sources? */
 	if (cnt < count) {
 		/* If we would exceed the max, give up. */
-		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
+		if (i + (count - cnt) > first_msi_irq + num_msi_irqs) {
 			mtx_unlock(&msi_lock);
 			free(mirqs, M_MSI);
 			return (ENXIO);
@@ -409,6 +442,24 @@
 		return (ENOSPC);
 	}
 
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	error = iommu_alloc_msi_intr(dev, cookies, count);
+	mtx_lock(&msi_lock);
+	if (error == EOPNOTSUPP)
+		error = 0;
+	if (error != 0) {
+		for (i = 0; i < count; i++)
+			apic_free_vector(cpu, vector + i, irqs[i]);
+		free(mirqs, M_MSI);
+		return (error);
+	}
+	for (i = 0; i < count; i++) {
+		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
+		msi->msi_remap_cookie = cookies[i];
+	}
+#endif
+
 	/* Assign IDT vectors and make these messages owned by 'dev'. */
 	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	for (i = 0; i < count; i++) {
@@ -430,7 +481,6 @@
 		bcopy(irqs, mirqs, count * sizeof(*mirqs));
 	fsrc->msi_irqs = mirqs;
 	mtx_unlock(&msi_lock);
-
 	return (0);
 }
 
@@ -474,6 +524,9 @@
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		KASSERT(msi->msi_first == first, ("message not in group"));
 		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
+#ifdef ACPI_DMAR
+		iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
+#endif
 		msi->msi_first = NULL;
 		msi->msi_dev = NULL;
 		apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
@@ -481,6 +534,11 @@
 	}
 
 	/* Clear out the first message. */
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
+	mtx_lock(&msi_lock);
+#endif
 	first->msi_first = NULL;
 	first->msi_dev = NULL;
 	apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
@@ -498,6 +556,11 @@
 msi_map(int irq, uint64_t *addr, uint32_t *data)
 {
 	struct msi_intsrc *msi;
+	int error;
+#ifdef ACPI_DMAR
+	struct msi_intsrc *msi1;
+	int i, k;
+#endif
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
@@ -525,10 +588,36 @@
 		msi = msi->msi_first;
 	}
 
-	*addr = INTEL_ADDR(msi);
-	*data = INTEL_DATA(msi);
+#ifdef ACPI_DMAR
+	if (!msi->msi_msix) {
+		for (k = msi->msi_count - 1, i = first_msi_irq; k > 0 &&
+		    i < first_msi_irq + num_msi_irqs; i++) {
+			if (i == msi->msi_irq)
+				continue;
+			msi1 = (struct msi_intsrc *)intr_lookup_source(i);
+			if (!msi1->msi_msix && msi1->msi_first == msi) {
+				mtx_unlock(&msi_lock);
+				iommu_map_msi_intr(msi1->msi_dev,
+				    msi1->msi_cpu, msi1->msi_vector,
+				    msi1->msi_remap_cookie, NULL, NULL);
+				k--;
+				mtx_lock(&msi_lock);
+			}
+		}
+	}
 	mtx_unlock(&msi_lock);
-	return (0);
+	error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
+	    msi->msi_vector, msi->msi_remap_cookie, addr, data);
+#else
+	mtx_unlock(&msi_lock);
+	error = EOPNOTSUPP;
+#endif
+	if (error == EOPNOTSUPP) {
+		*addr = INTEL_ADDR(msi);
+		*data = INTEL_DATA(msi);
+		error = 0;
+	}
+	return (error);
 }
 
 int
@@ -537,6 +626,10 @@
 	struct msi_intsrc *msi;
 	u_int cpu;
 	int i, vector;
+#ifdef ACPI_DMAR
+	u_int cookie;
+	int error;
+#endif
 
 	if (!msi_enabled)
 		return (ENXIO);
@@ -545,7 +638,7 @@
 	mtx_lock(&msi_lock);
 
 	/* Find a free IRQ. */
-	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
+	for (i = first_msi_irq; i < first_msi_irq + num_msi_irqs; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
@@ -558,7 +651,7 @@
 	}
 
 	/* Are all IRQs in use? */
-	if (i == FIRST_MSI_INT + NUM_MSI_INTS) {
+	if (i == first_msi_irq + num_msi_irqs) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
@@ -579,6 +672,22 @@
 		mtx_unlock(&msi_lock);
 		return (ENOSPC);
 	}
+
+	msi->msi_dev = dev;
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	error = iommu_alloc_msi_intr(dev, &cookie, 1);
+	mtx_lock(&msi_lock);
+	if (error == EOPNOTSUPP)
+		error = 0;
+	if (error != 0) {
+		msi->msi_dev = NULL;
+		apic_free_vector(cpu, vector, i);
+		return (error);
+	}
+	msi->msi_remap_cookie = cookie;
+#endif
+
 	if (bootverbose)
 		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_irq, cpu, vector);
@@ -585,7 +694,6 @@
 
 	/* Setup source. */
 	msi->msi_cpu = cpu;
-	msi->msi_dev = dev;
 	msi->msi_first = msi;
 	msi->msi_vector = vector;
 	msi->msi_msix = 1;
@@ -621,6 +729,11 @@
 	KASSERT(msi->msi_dev != NULL, ("unowned message"));
 
 	/* Clear out the message. */
+#ifdef ACPI_DMAR
+	mtx_unlock(&msi_lock);
+	iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
+	mtx_lock(&msi_lock);
+#endif
 	msi->msi_first = NULL;
 	msi->msi_dev = NULL;
 	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);

Modified: trunk/sys/x86/x86/nexus.c
===================================================================
--- trunk/sys/x86/x86/nexus.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/nexus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/nexus.c 221324 2011-05-02 14:13:12Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/nexus.c 340016 2018-11-01 18:34:26Z jhb $");
 
 /*
  * This code implements a `root nexus' for Intel Architecture
@@ -64,7 +64,6 @@
 #include <machine/vmparam.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
-#include <machine/pmap.h>
 
 #include <machine/metadata.h>
 #include <machine/nexusvar.h>
@@ -80,7 +79,7 @@
 #ifdef PC98
 #include <pc98/cbus/cbus.h>
 #else
-#include <x86/isa/isa.h>
+#include <isa/isareg.h>
 #endif
 #endif
 #include <sys/rtprio.h>
@@ -100,9 +99,10 @@
 static device_t nexus_add_child(device_t bus, u_int order, const char *name,
 				int unit);
 static	struct resource *nexus_alloc_resource(device_t, device_t, int, int *,
-					      u_long, u_long, u_long, u_int);
+					      rman_res_t, rman_res_t, rman_res_t,
+					      u_int);
 static	int nexus_adjust_resource(device_t, device_t, int, struct resource *,
-				  u_long, u_long);
+				  rman_res_t, rman_res_t);
 #ifdef SMP
 static	int nexus_bind_intr(device_t, device_t, struct resource *, int);
 #endif
@@ -115,6 +115,12 @@
 				    struct resource *);
 static	int nexus_deactivate_resource(device_t, device_t, int, int,
 				      struct resource *);
+static	int nexus_map_resource(device_t bus, device_t child, int type,
+    			       struct resource *r,
+			       struct resource_map_request *argsp,
+			       struct resource_map *map);
+static	int nexus_unmap_resource(device_t bus, device_t child, int type,
+				 struct resource *r, struct resource_map *map);
 static	int nexus_release_resource(device_t, device_t, int, int,
 				   struct resource *);
 static	int nexus_setup_intr(device_t, device_t, struct resource *, int flags,
@@ -123,9 +129,13 @@
 static	int nexus_teardown_intr(device_t, device_t, struct resource *,
 				void *);
 static struct resource_list *nexus_get_reslist(device_t dev, device_t child);
-static	int nexus_set_resource(device_t, device_t, int, int, u_long, u_long);
-static	int nexus_get_resource(device_t, device_t, int, int, u_long *, u_long *);
+static	int nexus_set_resource(device_t, device_t, int, int,
+			       rman_res_t, rman_res_t);
+static	int nexus_get_resource(device_t, device_t, int, int,
+			       rman_res_t *, rman_res_t *);
 static void nexus_delete_resource(device_t, device_t, int, int);
+static	int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t,
+			   cpuset_t *);
 #ifdef DEV_APIC
 static	int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
 static	int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
@@ -151,6 +161,8 @@
 	DEVMETHOD(bus_release_resource,	nexus_release_resource),
 	DEVMETHOD(bus_activate_resource, nexus_activate_resource),
 	DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource),
+	DEVMETHOD(bus_map_resource,	nexus_map_resource),
+	DEVMETHOD(bus_unmap_resource,	nexus_unmap_resource),
 	DEVMETHOD(bus_setup_intr,	nexus_setup_intr),
 	DEVMETHOD(bus_teardown_intr,	nexus_teardown_intr),
 #ifdef SMP
@@ -162,6 +174,7 @@
 	DEVMETHOD(bus_set_resource,	nexus_set_resource),
 	DEVMETHOD(bus_get_resource,	nexus_get_resource),
 	DEVMETHOD(bus_delete_resource,	nexus_delete_resource),
+	DEVMETHOD(bus_get_cpus,		nexus_get_cpus),
 
 	/* pcib interface */
 #ifdef DEV_APIC
@@ -214,7 +227,7 @@
 	irq_rman.rm_start = 0;
 	irq_rman.rm_type = RMAN_ARRAY;
 	irq_rman.rm_descr = "Interrupt request lines";
-	irq_rman.rm_end = NUM_IO_INTS - 1;
+	irq_rman.rm_end = num_io_irqs - 1;
 	if (rman_init(&irq_rman))
 		panic("nexus_init_resources irq_rman");
 
@@ -222,7 +235,7 @@
 	 * We search for regions of existing IRQs and add those to the IRQ
 	 * resource manager.
 	 */
-	for (irq = 0; irq < NUM_IO_INTS; irq++)
+	for (irq = 0; irq < num_io_irqs; irq++)
 		if (intr_lookup_source(irq) != NULL)
 			if (rman_manage_region(&irq_rman, irq, irq) != 0)
 				panic("nexus_init_resources irq_rman add");
@@ -260,11 +273,15 @@
 		panic("nexus_init_resources port_rman");
 
 	mem_rman.rm_start = 0;
-	mem_rman.rm_end = ~0ul;
+#ifndef PAE
+	mem_rman.rm_end = BUS_SPACE_MAXADDR;
+#else
+	mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1);
+#endif
 	mem_rman.rm_type = RMAN_ARRAY;
 	mem_rman.rm_descr = "I/O memory addresses";
 	if (rman_init(&mem_rman)
-	    || rman_manage_region(&mem_rman, 0, ~0))
+	    || rman_manage_region(&mem_rman, 0, mem_rman.rm_end))
 		panic("nexus_init_resources mem_rman");
 }
 
@@ -296,9 +313,9 @@
 	if (STAILQ_FIRST(rl))
 		retval += printf(" at");
 
-	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
-	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#lx");
-	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
+	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
+	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx");
+	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
 
 	return retval;
 }
@@ -360,7 +377,8 @@
  */
 static struct resource *
 nexus_alloc_resource(device_t bus, device_t child, int type, int *rid,
-		     u_long start, u_long end, u_long count, u_int flags)
+		     rman_res_t start, rman_res_t end, rman_res_t count,
+		     u_int flags)
 {
 	struct nexus_device *ndev = DEVTONX(child);
 	struct	resource *rv;
@@ -369,12 +387,13 @@
 	int needactivate = flags & RF_ACTIVE;
 
 	/*
-	 * If this is an allocation of the "default" range for a given RID, and
-	 * we know what the resources for this device are (ie. they aren't maintained
-	 * by a child bus), then work out the start/end values.
+	 * If this is an allocation of the "default" range for a given
+	 * RID, and we know what the resources for this device are
+	 * (i.e. they aren't maintained by a child bus), then work out
+	 * the start/end values.
 	 */
-	if ((start == 0UL) && (end == ~0UL) && (count == 1)) {
-		if (ndev == NULL)
+	if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) {
+		if (device_get_parent(child) != bus || ndev == NULL)
 			return(NULL);
 		rle = resource_list_find(&ndev->nx_resources, type, *rid);
 		if (rle == NULL)
@@ -390,7 +409,7 @@
 		return (NULL);
 
 	rv = rman_reserve_resource(rm, start, end, count, flags, child);
-	if (rv == 0)
+	if (rv == NULL)
 		return 0;
 	rman_set_rid(rv, *rid);
 
@@ -406,7 +425,7 @@
 
 static int
 nexus_adjust_resource(device_t bus, device_t child, int type,
-    struct resource *r, u_long start, u_long end)
+    struct resource *r, rman_res_t start, rman_res_t end)
 {
 	struct rman *rm;
 
@@ -422,12 +441,82 @@
 nexus_activate_resource(device_t bus, device_t child, int type, int rid,
 			struct resource *r)
 {
+	struct resource_map map;
+	int error;
+
+	error = rman_activate_resource(r);
+	if (error != 0)
+		return (error);
+
+	if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+	    (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+		error = nexus_map_resource(bus, child, type, r, NULL, &map);
+		if (error) {
+			rman_deactivate_resource(r);
+			return (error);
+		}
+
+		rman_set_mapping(r, &map);
+	}
+	return (0);
+}
+
+static int
+nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
+			  struct resource *r)
+{
+	struct resource_map map;
+	int error;
+
+	error = rman_deactivate_resource(r);
+	if (error)
+		return (error);
+
+	if (!(rman_get_flags(r) & RF_UNMAPPED) &&
+	    (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) {
+		rman_get_mapping(r, &map);
+		nexus_unmap_resource(bus, child, type, r, &map);
+	}
+	return (0);
+}
+
+static int
+nexus_map_resource(device_t bus, device_t child, int type, struct resource *r,
+    struct resource_map_request *argsp, struct resource_map *map)
+{
+	struct resource_map_request args;
+	rman_res_t end, length, start;
 #ifdef PC98
-	bus_space_handle_t bh;
 	int error;
 #endif
-	void *vaddr;
 
+	/* Resources must be active to be mapped. */
+	if (!(rman_get_flags(r) & RF_ACTIVE))
+		return (ENXIO);
+
+	/* Mappings are only supported on I/O and memory resources. */
+	switch (type) {
+	case SYS_RES_IOPORT:
+	case SYS_RES_MEMORY:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	resource_init_map_request(&args);
+	if (argsp != NULL)
+		bcopy(argsp, &args, imin(argsp->size, args.size));
+	start = rman_get_start(r) + args.offset;
+	if (args.length == 0)
+		length = rman_get_size(r);
+	else
+		length = args.length;
+	end = start + length - 1;
+	if (start > rman_get_end(r) || start < rman_get_start(r))
+		return (EINVAL);
+	if (end > rman_get_end(r) || end < start)
+		return (EINVAL);
+
 	/*
 	 * If this is a memory resource, map it into the kernel.
 	 */
@@ -435,58 +524,64 @@
 	case SYS_RES_IOPORT:
 #ifdef PC98
 		error = i386_bus_space_handle_alloc(X86_BUS_SPACE_IO,
-		    rman_get_start(r), rman_get_size(r), &bh);
+		    start, length, &map->r_bushandle);
 		if (error)
 			return (error);
-		rman_set_bushandle(r, bh);
 #else
-		rman_set_bushandle(r, rman_get_start(r));
+		map->r_bushandle = start;
 #endif
-		rman_set_bustag(r, X86_BUS_SPACE_IO);
+		map->r_bustag = X86_BUS_SPACE_IO;
+		map->r_size = length;
+		map->r_vaddr = NULL;
 		break;
 	case SYS_RES_MEMORY:
 #ifdef PC98
 		error = i386_bus_space_handle_alloc(X86_BUS_SPACE_MEM,
-		    rman_get_start(r), rman_get_size(r), &bh);
+		    start, length, &map->r_bushandle);
 		if (error)
 			return (error);
 #endif
-		vaddr = pmap_mapdev(rman_get_start(r), rman_get_size(r));
-		rman_set_virtual(r, vaddr);
-		rman_set_bustag(r, X86_BUS_SPACE_MEM);
+		map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr);
+		map->r_bustag = X86_BUS_SPACE_MEM;
+		map->r_size = length;
+
+		/*
+		 * PC-98 stores the virtual address as a member of the
+		 * structure in the handle.  On plain x86, the handle is
+		 * the virtual address.
+		 */
 #ifdef PC98
-		/* PC-98: the type of bus_space_handle_t is the structure. */
-		bh->bsh_base = (bus_addr_t) vaddr;
-		rman_set_bushandle(r, bh);
+		map->r_bushandle->bsh_base = (bus_addr_t)map->r_vaddr;
 #else
-		/* IBM-PC: the type of bus_space_handle_t is u_int */
-		rman_set_bushandle(r, (bus_space_handle_t) vaddr);
+		map->r_bushandle = (bus_space_handle_t)map->r_vaddr;
 #endif
+		break;
 	}
-	return (rman_activate_resource(r));
+	return (0);
 }
 
 static int
-nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
-			  struct resource *r)
+nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r,
+    struct resource_map *map)
 {
-
+
 	/*
 	 * If this is a memory resource, unmap it.
 	 */
-	if (type == SYS_RES_MEMORY) {
-		pmap_unmapdev((vm_offset_t)rman_get_virtual(r),
-		    rman_get_size(r));
-	}
+	switch (type) {
+	case SYS_RES_MEMORY:
+		pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size);
+		/* FALLTHROUGH */
+	case SYS_RES_IOPORT:
 #ifdef PC98
-	if (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT) {
-		bus_space_handle_t bh;
-
-		bh = rman_get_bushandle(r);
-		i386_bus_space_handle_free(rman_get_bustag(r), bh, bh->bsh_sz);
+		i386_bus_space_handle_free(map->r_bustag, map->r_bushandle,
+		    map->r_bushandle->bsh_sz);
+#endif
+		break;
+	default:
+		return (EINVAL);
 	}
-#endif
-	return (rman_deactivate_resource(r));
+	return (0);
 }
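
The nexus_map_resource()/nexus_unmap_resource() pair above backs the bus_map_resource(9) KPI that this sync brings in. A driver whose resources ultimately come from nexus can defer the implicit mapping and request its own, for example with a write-combining attribute. The fragment below is a minimal sketch of such a consumer; the names dev, res and the rid value are illustrative and not part of this change:

    /* Illustrative consumer of the new mapping KPI (driver attach fragment). */
    struct resource_map_request req;
    struct resource_map map;
    struct resource *res;
    int error, rid;

    rid = 0;
    res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
        RF_ACTIVE | RF_UNMAPPED);        /* allocate without a kernel mapping */
    if (res == NULL)
        return (ENXIO);

    resource_init_map_request(&req);
    req.memattr = VM_MEMATTR_WRITE_COMBINING;
    error = bus_map_resource(dev, SYS_RES_MEMORY, res, &req, &map);
    if (error != 0)
        return (error);

    /*
     * Registers are then reached through bus_space_read_4(map.r_bustag,
     * map.r_bushandle, off) and friends.
     */

    bus_unmap_resource(dev, SYS_RES_MEMORY, res, &map);
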
 
 static int
@@ -493,6 +588,7 @@
 nexus_release_resource(device_t bus, device_t child, int type, int rid,
 		       struct resource *r)
 {
+
 	if (rman_get_flags(r) & RF_ACTIVE) {
 		int error = bus_deactivate_resource(child, type, rid, r);
 		if (error)
@@ -518,7 +614,7 @@
 	if (irq == NULL)
 		panic("nexus_setup_intr: NULL irq resource!");
 
-	*cookiep = 0;
+	*cookiep = NULL;
 	if ((rman_get_flags(irq) & RF_SHAREABLE) == 0)
 		flags |= INTR_EXCL;
 
@@ -573,7 +669,8 @@
 }
 
 static int
-nexus_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count)
+nexus_set_resource(device_t dev, device_t child, int type, int rid,
+    rman_res_t start, rman_res_t count)
 {
 	struct nexus_device	*ndev = DEVTONX(child);
 	struct resource_list	*rl = &ndev->nx_resources;
@@ -584,7 +681,8 @@
 }
 
 static int
-nexus_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp)
+nexus_get_resource(device_t dev, device_t child, int type, int rid,
+    rman_res_t *startp, rman_res_t *countp)
 {
 	struct nexus_device	*ndev = DEVTONX(child);
 	struct resource_list	*rl = &ndev->nx_resources;
@@ -609,6 +707,24 @@
 	resource_list_delete(rl, type, rid);
 }
 
+static int
+nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
+    cpuset_t *cpuset)
+{
+
+	switch (op) {
+#ifdef SMP
+	case INTR_CPUS:
+		if (setsize != sizeof(cpuset_t))
+			return (EINVAL);
+		*cpuset = intr_cpus;
+		return (0);
+#endif
+	default:
+		return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
+	}
+}
+
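
The new nexus_get_cpus() method lets children discover which CPUs may receive interrupts. A consumer would go through the bus_get_cpus(9) wrapper; a minimal sketch with illustrative names:

    /* Ask the parent bus which CPUs are eligible interrupt targets. */
    cpuset_t intrcpus;

    if (bus_get_cpus(dev, INTR_CPUS, sizeof(intrcpus), &intrcpus) != 0)
        CPU_COPY(&all_cpus, &intrcpus);    /* fall back to all CPUs */
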
 /* Called from the MSI code to add new IRQs to the IRQ rman. */
 void
 nexus_add_irq(u_long irq)
@@ -689,11 +805,8 @@
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type(ELF_KERN_STR);  
-	if (kmdp != NULL)
-		smapbase = (struct bios_smap *)preload_search_info(kmdp,
-		    MODINFO_METADATA | MODINFOMD_SMAP);
-	else
-		smapbase = NULL;
+	smapbase = (struct bios_smap *)preload_search_info(kmdp,
+	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase != NULL) {
 		smapsize = *((u_int32_t *)smapbase - 1);
 		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

Added: trunk/sys/x86/x86/pvclock.c
===================================================================
--- trunk/sys/x86/x86/pvclock.c	                        (rev 0)
+++ trunk/sys/x86/x86/pvclock.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,204 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2009 Adrian Chadd
+ * Copyright (c) 2012 Spectra Logic Corporation
+ * Copyright (c) 2014 Bryan Venteicher
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/pvclock.c 278184 2015-02-04 08:33:04Z bryanv $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/atomic.h>
+#include <machine/pvclock.h>
+
+/*
+ * Last time; this guarantees a monotonically increasing clock for when
+ * a stable TSC is not provided.
+ */
+static volatile uint64_t pvclock_last_cycles;
+
+void
+pvclock_resume(void)
+{
+
+	atomic_store_rel_64(&pvclock_last_cycles, 0);
+}
+
+uint64_t
+pvclock_get_last_cycles(void)
+{
+
+	return (atomic_load_acq_64(&pvclock_last_cycles));
+}
+
+uint64_t
+pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t freq;
+
+	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
+
+	if (ti->tsc_shift < 0)
+		freq <<= -ti->tsc_shift;
+	else
+		freq >>= ti->tsc_shift;
+
+	return (freq);
+}
+
+/*
+ * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#if defined(__i386__)
+	{
+		uint32_t tmp1, tmp2;
+
+		/**
+		 * For i386, the formula looks like:
+		 *
+		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
+		 *   upper = mul_frac * (delta >> 32)
+		 *   product = lower + upper
+		 */
+		__asm__ (
+			"mul  %5       ; "
+			"mov  %4,%%eax ; "
+			"mov  %%edx,%4 ; "
+			"mul  %5       ; "
+			"xor  %5,%5    ; "
+			"add  %4,%%eax ; "
+			"adc  %5,%%edx ; "
+			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+			  "2" (mul_frac) );
+	}
+#elif defined(__amd64__)
+	{
+		unsigned long tmp;
+
+		__asm__ (
+			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+			: [lo]"=a" (product), [hi]"=d" (tmp)
+			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+	}
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+
+	return (product);
+}
+
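
The inline assembly above computes (delta * mul_frac) >> 32 with a full 64x32-bit multiply so the intermediate product cannot overflow 64 bits. On a compiler with 128-bit integer support the same operation can be written portably; the following is a sketch for illustration only, not part of the driver:

    /* Portable equivalent of pvclock_scale_delta(), for illustration. */
    static inline uint64_t
    scale_delta_portable(uint64_t delta, uint32_t mul_frac, int shift)
    {

        if (shift < 0)
            delta >>= -shift;
        else
            delta <<= shift;
        /* 64x32 -> 96-bit product; keep bits 95..32. */
        return ((unsigned __int128)delta * mul_frac >> 32);
    }
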
+static uint64_t
+pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t delta;
+
+	delta = rdtsc() - ti->tsc_timestamp;
+
+	return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
+	    ti->tsc_shift));
+}
+
+static void
+pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
+    uint64_t *cycles, uint8_t *flags)
+{
+	uint32_t version;
+
+	do {
+		version = ti->version;
+		rmb();
+		*cycles = ti->system_time + pvclock_get_nsec_offset(ti);
+		*flags = ti->flags;
+		rmb();
+	} while ((ti->version & 1) != 0 || ti->version != version);
+}
+
+static void
+pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec,
+    uint32_t *nsec)
+{
+	uint32_t version;
+
+	do {
+		version = wc->version;
+		rmb();
+		*sec = wc->sec;
+		*nsec = wc->nsec;
+		rmb();
+	} while ((wc->version & 1) != 0 || wc->version != version);
+}
+
+uint64_t
+pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t now, last;
+	uint8_t flags;
+
+	pvclock_read_time_info(ti, &now, &flags);
+
+	if (flags & PVCLOCK_FLAG_TSC_STABLE)
+		return (now);
+
+	/*
+	 * Enforce a monotonically increasing clock time across all VCPUs.
+	 * If our time is too old, use the last time and return. Otherwise,
+	 * try to update the last time.
+	 */
+	do {
+		last = atomic_load_acq_64(&pvclock_last_cycles);
+		if (last > now)
+			return (last);
+	} while (!atomic_cmpset_64(&pvclock_last_cycles, last, now));
+
+	return (now);
+}
+
+void
+pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
+{
+	uint32_t sec, nsec;
+
+	pvclock_read_wall_clock(wc, &sec, &nsec);
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}


Property changes on: trunk/sys/x86/x86/pvclock.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/stack_machdep.c
===================================================================
--- trunk/sys/x86/x86/stack_machdep.c	                        (rev 0)
+++ trunk/sys/x86/x86/stack_machdep.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,182 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 EMC Corporation
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/stack_machdep.c 337976 2018-08-17 16:04:59Z markj $");
+
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/stack.h>
+
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <x86/stack.h>
+
+#ifdef __i386__
+#define	PCB_FP(pcb)	((pcb)->pcb_ebp)
+#define	TF_FLAGS(tf)	((tf)->tf_eflags)
+#define	TF_FP(tf)	((tf)->tf_ebp)
+#define	TF_PC(tf)	((tf)->tf_eip)
+
+typedef struct i386_frame *x86_frame_t;
+#else
+#define	PCB_FP(pcb)	((pcb)->pcb_rbp)
+#define	TF_FLAGS(tf)	((tf)->tf_rflags)
+#define	TF_FP(tf)	((tf)->tf_rbp)
+#define	TF_PC(tf)	((tf)->tf_rip)
+
+typedef struct amd64_frame *x86_frame_t;
+#endif
+
+#ifdef STACK
+static struct stack *nmi_stack;
+static volatile struct thread *nmi_pending;
+
+#ifdef SMP
+static struct mtx nmi_lock;
+MTX_SYSINIT(nmi_lock, &nmi_lock, "stack_nmi", MTX_SPIN);
+#endif
+#endif
+
+static void
+stack_capture(struct thread *td, struct stack *st, register_t fp)
+{
+	x86_frame_t frame;
+	vm_offset_t callpc;
+
+	stack_zero(st);
+	frame = (x86_frame_t)fp;
+	while (1) {
+		if ((vm_offset_t)frame < td->td_kstack ||
+		    (vm_offset_t)frame >= td->td_kstack +
+		    td->td_kstack_pages * PAGE_SIZE)
+			break;
+		callpc = frame->f_retaddr;
+		if (!INKERNEL(callpc))
+			break;
+		if (stack_put(st, callpc) == -1)
+			break;
+		if (frame->f_frame <= frame)
+			break;
+		frame = frame->f_frame;
+	}
+}
+
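
stack_capture() above walks the saved frame-pointer chain: each frame begins with the caller's frame pointer followed by the return address, and the walk stops once the chain leaves the thread's kernel stack or stops growing toward higher addresses. The same layout can be demonstrated from userland when frame pointers are kept; a self-contained sketch, illustrative only:

    /* Walk our own frame-pointer chain; build with -fno-omit-frame-pointer. */
    #include <stdio.h>

    struct frame {
        struct frame *f_frame;    /* saved caller frame pointer */
        void *f_retaddr;          /* return address */
    };

    static void
    print_backtrace(void)
    {
        struct frame *fp = __builtin_frame_address(0);

        while (fp != NULL && fp->f_retaddr != NULL) {
            printf("return address %p\n", fp->f_retaddr);
            if (fp->f_frame <= fp)    /* chain must grow upward */
                break;
            fp = fp->f_frame;
        }
    }

    int
    main(void)
    {
        print_backtrace();
        return (0);
    }
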
+int
+stack_nmi_handler(struct trapframe *tf)
+{
+
+#ifdef STACK
+	/* Don't consume an NMI that wasn't meant for us. */
+	if (nmi_stack == NULL || curthread != nmi_pending)
+		return (0);
+
+	if (!TRAPF_USERMODE(tf) && (TF_FLAGS(tf) & PSL_I) != 0)
+		stack_capture(curthread, nmi_stack, TF_FP(tf));
+	else
+		/* We were running in usermode or had interrupts disabled. */
+		nmi_stack->depth = 0;
+
+	atomic_store_rel_ptr((long *)&nmi_pending, (long)NULL);
+	return (1);
+#else
+	return (0);
+#endif
+}
+
+void
+stack_save_td(struct stack *st, struct thread *td)
+{
+
+	if (TD_IS_SWAPPED(td))
+		panic("stack_save_td: swapped");
+	if (TD_IS_RUNNING(td))
+		panic("stack_save_td: running");
+
+	stack_capture(td, st, PCB_FP(td->td_pcb));
+}
+
+int
+stack_save_td_running(struct stack *st, struct thread *td)
+{
+
+#ifdef STACK
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	MPASS(TD_IS_RUNNING(td));
+
+	if (td == curthread) {
+		stack_save(st);
+		return (0);
+	}
+
+#ifdef SMP
+	mtx_lock_spin(&nmi_lock);
+
+	nmi_stack = st;
+	nmi_pending = td;
+	ipi_cpu(td->td_oncpu, IPI_TRACE);
+	while ((void *)atomic_load_acq_ptr((long *)&nmi_pending) != NULL)
+		cpu_spinwait();
+	nmi_stack = NULL;
+
+	mtx_unlock_spin(&nmi_lock);
+
+	if (st->depth == 0)
+		return (EAGAIN);
+#else /* !SMP */
+	KASSERT(0, ("curthread isn't running"));
+#endif /* SMP */
+	return (0);
+#else /* !STACK */
+	return (EOPNOTSUPP);
+#endif /* STACK */
+}
+
+void
+stack_save(struct stack *st)
+{
+	register_t fp;
+
+#ifdef __i386__
+	__asm __volatile("movl %%ebp,%0" : "=g" (fp));
+#else
+	__asm __volatile("movq %%rbp,%0" : "=g" (fp));
+#endif
+	stack_capture(curthread, st, fp);
+}


Property changes on: trunk/sys/x86/x86/stack_machdep.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/x86/tsc.c
===================================================================
--- trunk/sys/x86/x86/tsc.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/x86/tsc.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/x86/tsc.c 280973 2015-04-02 01:02:42Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/tsc.c 353007 2019-10-02 13:46:40Z kib $");
 
 #include "opt_compat.h"
 #include "opt_clock.h"
@@ -49,6 +49,7 @@
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/vmware.h>
+#include <dev/acpica/acpi_hpet.h>
 
 #include "cpufreq_if.h"
 
@@ -60,34 +61,28 @@
 
 SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
     &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
-TUNABLE_INT("kern.timecounter.invariant_tsc", &tsc_is_invariant);
 
 #ifdef SMP
 int	smp_tsc;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
     "Indicates whether the TSC is safe to use in SMP mode");
-TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc);
 
 int	smp_tsc_adjust = 0;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
     &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
-TUNABLE_INT("kern.timecounter.smp_tsc_adjust", &smp_tsc_adjust);
 #endif
 
 static int	tsc_shift = 1;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
     &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
-TUNABLE_INT("kern.timecounter.tsc_shift", &tsc_shift);
 
 static int	tsc_disabled;
 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
     "Disable x86 Time Stamp Counter");
-TUNABLE_INT("machdep.disable_tsc", &tsc_disabled);
 
 static int	tsc_skip_calibration;
 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
     &tsc_skip_calibration, 0, "Disable TSC frequency calibration");
-TUNABLE_INT("machdep.disable_tsc_calibration", &tsc_skip_calibration);
 
 static void tsc_freq_changed(void *arg, const struct cf_level *level,
     int status);
@@ -100,14 +95,22 @@
 static unsigned tsc_get_timecount_mfence(struct timecounter *tc);
 static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc);
 static void tsc_levels_changed(void *arg, int unit);
+static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+    struct timecounter *tc);
+#endif
 
 static struct timecounter tsc_timecounter = {
-	tsc_get_timecount,	/* get_timecount */
-	0,			/* no poll_pps */
-	~0u,			/* counter_mask */
-	0,			/* frequency */
-	"TSC",			/* name */
-	800,			/* quality (adjusted in code) */
+	.tc_get_timecount =		tsc_get_timecount,
+	.tc_counter_mask =		~0u,
+	.tc_name =			"TSC",
+	.tc_quality =			800,	/* adjusted in code */
+	.tc_fill_vdso_timehands = 	x86_tsc_vdso_timehands,
+#ifdef COMPAT_FREEBSD32
+	.tc_fill_vdso_timehands32 = 	x86_tsc_vdso_timehands32,
+#endif
 };
 
 static void
@@ -126,6 +129,40 @@
 	tsc_is_invariant = 1;
 }
 
+/*
+ * Calculate TSC frequency using information from the CPUID leaf 0x15
+ * 'Time Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15
+ * is not functional, as is the case on Skylake/Kabylake, try leaf 0x16
+ * 'Processor Frequency Information'.  Leaf 0x16 is described in the SDM
+ * as informational only, but if 0x15 did not work and TSC calibration
+ * is disabled, it is the best information we can get.  It should still
+ * be an improvement over parsing the CPU model name in
+ * tsc_freq_intel(), when that is available.
+ */
+static bool
+tsc_freq_cpuid(void)
+{
+	u_int regs[4];
+
+	if (cpu_high < 0x15)
+		return (false);
+	do_cpuid(0x15, regs);
+	if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
+		tsc_freq = (uint64_t)regs[2] * regs[1] / regs[0];
+		return (true);
+	}
+
+	if (cpu_high < 0x16)
+		return (false);
+	do_cpuid(0x16, regs);
+	if (regs[0] != 0) {
+		tsc_freq = (uint64_t)regs[0] * 1000000;
+		return (true);
+	}
+
+	return (false);
+}
+
 static void
 tsc_freq_intel(void)
 {
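
To make the leaf 0x15 arithmetic above concrete: CPUID.15H returns the crystal-to-TSC ratio as EBX/EAX and, when the processor enumerates it, the crystal frequency in Hz in ECX, so tsc_freq = ECX * EBX / EAX. With an illustrative 24 MHz crystal and a ratio of 88/2, that is 24,000,000 * 88 / 2 = 1,056,000,000 Hz. When only leaf 0x16 is usable, EAX reports the base frequency in MHz, hence the multiplication by 1000000.
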
@@ -250,18 +287,19 @@
 	}
 
 	if (tsc_skip_calibration) {
-		if (cpu_vendor_id == CPU_VENDOR_INTEL)
+		if (tsc_freq_cpuid())
+			;
+		else if (cpu_vendor_id == CPU_VENDOR_INTEL)
 			tsc_freq_intel();
-		return;
+	} else {
+		if (bootverbose)
+			printf("Calibrating TSC clock ... ");
+		tsc1 = rdtsc();
+		DELAY(1000000);
+		tsc2 = rdtsc();
+		tsc_freq = tsc2 - tsc1;
 	}
-
 	if (bootverbose)
-	        printf("Calibrating TSC clock ... ");
-	tsc1 = rdtsc();
-	DELAY(1000000);
-	tsc2 = rdtsc();
-	tsc_freq = tsc2 - tsc1;
-	if (bootverbose)
 		printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
 }
 
@@ -427,7 +465,7 @@
 }
 
 static int
-test_tsc(void)
+test_tsc(int adj_max_count)
 {
 	uint64_t *data, *tsc;
 	u_int i, size, adj;
@@ -441,12 +479,12 @@
 	for (i = 0, tsc = data; i < N; i++, tsc += size)
 		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
 	smp_tsc = 1;	/* XXX */
-	smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc,
-	    smp_no_rendevous_barrier, data);
-	if (!smp_tsc && adj < smp_tsc_adjust) {
+	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
+	    smp_no_rendezvous_barrier, data);
+	if (!smp_tsc && adj < adj_max_count) {
 		adj++;
-		smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc,
-		    smp_no_rendevous_barrier, data);
+		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
+		    smp_no_rendezvous_barrier, data);
 		goto retry;
 	}
 	free(data, M_TEMP);
@@ -481,19 +519,6 @@
 
 #undef N
 
-#else
-
-/*
- * The function is not called, it is provided to avoid linking failure
- * on uniprocessor kernel.
- */
-static int
-test_tsc(void)
-{
-
-	return (0);
-}
-
 #endif /* SMP */
 
 static void
@@ -529,17 +554,22 @@
 	}
 
 	/*
-	 * We cannot use the TSC if it stops incrementing while idle.
 	 * Intel CPUs without a C-state invariant TSC can stop the TSC
-	 * in either C2 or C3.
+	 * in either C2 or C3.  Disable use of C2 and C3 while using
+	 * the TSC as the timecounter.  The timecounter can be changed
+	 * to enable C2 and C3.
+	 *
+	 * Note that the TSC is used as the cputicker for computing
+	 * thread runtime regardless of the timecounter setting, so
+	 * using an alternate timecounter and enabling C2 or C3 can
+	 * result in incorrect runtimes for kernel idle threads (but not
+	 * for any non-idle threads).
 	 */
-	if (cpu_deepest_sleep >= 2 && cpu_vendor_id == CPU_VENDOR_INTEL &&
+	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
-		tsc_timecounter.tc_quality = -1000;
 		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
 		if (bootverbose)
-			printf("TSC timecounter disabled: C2/C3 may halt it.\n");
-		goto init;
+			printf("TSC timecounter disables C2 and C3.\n");
 	}
 
 	/*
@@ -549,9 +579,12 @@
 	 * non-zero value.  The TSC seems unreliable in virtualized SMP
 	 * environments, so it is set to a negative quality in those cases.
 	 */
+#ifdef SMP
 	if (mp_ncpus > 1)
-		tsc_timecounter.tc_quality = test_tsc();
-	else if (tsc_is_invariant)
+		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
+	else
+#endif /* SMP */
+	if (tsc_is_invariant)
 		tsc_timecounter.tc_quality = 1000;
 	max_freq >>= tsc_shift;
 
@@ -586,6 +619,32 @@
 }
 SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
 
+void
+resume_TSC(void)
+{
+#ifdef SMP
+	int quality;
+
+	/* If TSC was not good on boot, it is unlikely to become good now. */
+	if (tsc_timecounter.tc_quality < 0)
+		return;
+	/* Nothing to do with UP. */
+	if (mp_ncpus < 2)
+		return;
+
+	/*
+	 * If TSC was good, a single synchronization should be enough,
+	 * but honour smp_tsc_adjust if it's set.
+	 */
+	quality = test_tsc(MAX(smp_tsc_adjust, 1));
+	if (quality != tsc_timecounter.tc_quality) {
+		printf("TSC timecounter quality changed: %d -> %d\n",
+		    tsc_timecounter.tc_quality, quality);
+		tsc_timecounter.tc_quality = quality;
+	}
+#endif /* SMP */
+}
+
 /*
  * When cpufreq levels change, find out about the (new) max frequency.  We
  * use this to update CPU accounting in case it got a lower estimate at boot.
@@ -726,22 +785,27 @@
 	return (tsc_get_timecount_low(tc));
 }
 
-uint32_t
-cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+static uint32_t
+x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
 {
 
-	vdso_th->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
+	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+	vdso_th->th_x86_hpet_idx = 0xffffffff;
 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
-	return (timecounter == &tsc_timecounter);
+	return (1);
 }
 
 #ifdef COMPAT_FREEBSD32
-uint32_t
-cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+static uint32_t
+x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+    struct timecounter *tc)
 {
 
-	vdso_th32->th_x86_shift = (int)(intptr_t)timecounter->tc_priv;
+	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
+	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
+	vdso_th32->th_x86_hpet_idx = 0xffffffff;
 	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
-	return (timecounter == &tsc_timecounter);
+	return (1);
 }
 #endif

Added: trunk/sys/x86/x86/ucode.c
===================================================================
--- trunk/sys/x86/x86/ucode.c	                        (rev 0)
+++ trunk/sys/x86/x86/ucode.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,402 @@
+/* $MidnightBSD$ */
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/ucode.c 347700 2019-05-16 14:42:16Z markj $");
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <x86/specialreg.h>
+#include <machine/stdarg.h>
+#include <x86/ucode.h>
+#include <x86/x86_smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+
+static void	*ucode_intel_match(uint8_t *data, size_t *len);
+static int	ucode_intel_verify(struct ucode_intel_header *hdr,
+		    size_t resid);
+
+static struct ucode_ops {
+	const char *vendor;
+	int (*load)(void *, bool, uint64_t *, uint64_t *);
+	void *(*match)(uint8_t *, size_t *);
+} loaders[] = {
+	{
+		.vendor = INTEL_VENDOR_ID,
+		.load = ucode_intel_load,
+		.match = ucode_intel_match,
+	},
+};
+
+/* Selected microcode update data. */
+static void *early_ucode_data;
+static void *ucode_data;
+static struct ucode_ops *ucode_loader;
+
+/* Variables used for reporting success or failure. */
+enum {
+	NO_ERROR,
+	NO_MATCH,
+	VERIFICATION_FAILED,
+} ucode_error = NO_ERROR;
+static uint64_t ucode_nrev, ucode_orev;
+
+static void
+log_msg(void *arg __unused)
+{
+
+	if (ucode_nrev != 0) {
+		printf("CPU microcode: updated from %#jx to %#jx\n",
+		    (uintmax_t)ucode_orev, (uintmax_t)ucode_nrev);
+		return;
+	}
+
+	switch (ucode_error) {
+	case NO_MATCH:
+		printf("CPU microcode: no matching update found\n");
+		break;
+	case VERIFICATION_FAILED:
+		printf("CPU microcode: microcode verification failed\n");
+		break;
+	default:
+		break;
+	}
+}
+SYSINIT(ucode_log, SI_SUB_CPU, SI_ORDER_FIRST, log_msg, NULL);
+
+int
+ucode_intel_load(void *data, bool unsafe, uint64_t *nrevp, uint64_t *orevp)
+{
+	uint64_t nrev, orev;
+	uint32_t cpuid[4];
+
+	orev = rdmsr(MSR_BIOS_SIGN) >> 32;
+
+	/*
+	 * Perform update.  Flush caches first to work around seemingly
+	 * undocumented errata applying to some Broadwell CPUs.
+	 */
+	wbinvd();
+	if (unsafe)
+		wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+	else
+		wrmsr(MSR_BIOS_UPDT_TRIG, (uint64_t)(uintptr_t)data);
+	wrmsr(MSR_BIOS_SIGN, 0);
+
+	/*
+	 * Serialize instruction flow.
+	 */
+	do_cpuid(0, cpuid);
+
+	/*
+	 * Verify that the microcode revision changed.
+	 */
+	nrev = rdmsr(MSR_BIOS_SIGN) >> 32;
+	if (nrevp != NULL)
+		*nrevp = nrev;
+	if (orevp != NULL)
+		*orevp = orev;
+	if (nrev <= orev)
+		return (EEXIST);
+	return (0);
+}
+
+static int
+ucode_intel_verify(struct ucode_intel_header *hdr, size_t resid)
+{
+	uint32_t cksum, *data, size;
+	int i;
+
+	if (resid < sizeof(struct ucode_intel_header))
+		return (1);
+	size = hdr->total_size;
+	if (size == 0)
+		size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+		    sizeof(struct ucode_intel_header);
+
+	if (hdr->header_version != 1)
+		return (1);
+	if (size % 16 != 0)
+		return (1);
+	if (resid < size)
+		return (1);
+
+	cksum = 0;
+	data = (uint32_t *)hdr;
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		cksum += data[i];
+	if (cksum != 0)
+		return (1);
+	return (0);
+}
+
+static void *
+ucode_intel_match(uint8_t *data, size_t *len)
+{
+	struct ucode_intel_header *hdr;
+	struct ucode_intel_extsig_table *table;
+	struct ucode_intel_extsig *entry;
+	uint64_t platformid;
+	size_t resid;
+	uint32_t data_size, flags, regs[4], sig, total_size;
+	int i;
+
+	do_cpuid(1, regs);
+	sig = regs[0];
+
+	platformid = rdmsr(MSR_IA32_PLATFORM_ID);
+	flags = 1 << ((platformid >> 50) & 0x7);
+
+	for (resid = *len; resid > 0; data += total_size, resid -= total_size) {
+		hdr = (struct ucode_intel_header *)data;
+		if (ucode_intel_verify(hdr, resid) != 0) {
+			ucode_error = VERIFICATION_FAILED;
+			break;
+		}
+
+		data_size = hdr->data_size;
+		total_size = hdr->total_size;
+		if (data_size == 0)
+			data_size = UCODE_INTEL_DEFAULT_DATA_SIZE;
+		if (total_size == 0)
+			total_size = UCODE_INTEL_DEFAULT_DATA_SIZE +
+			    sizeof(struct ucode_intel_header);
+		if (data_size > total_size + sizeof(struct ucode_intel_header))
+			table = (struct ucode_intel_extsig_table *)
+			    ((uint8_t *)(hdr + 1) + data_size);
+		else
+			table = NULL;
+
+		if (hdr->processor_signature == sig) {
+			if ((hdr->processor_flags & flags) != 0) {
+				*len = data_size;
+				return (hdr + 1);
+			}
+		} else if (table != NULL) {
+			for (i = 0; i < table->signature_count; i++) {
+				entry = &table->entries[i];
+				if (entry->processor_signature == sig &&
+				    (entry->processor_flags & flags) != 0) {
+					*len = data_size;
+					return (hdr + 1);
+				}
+			}
+		}
+	}
+	return (NULL);
+}
+
+/*
+ * Release any memory backing unused microcode blobs back to the system.
+ * We copy the selected update and free the entire microcode file.
+ */
+static void
+ucode_release(void *arg __unused)
+{
+	char *name, *type;
+	caddr_t file;
+	int release;
+
+	if (early_ucode_data == NULL)
+		return;
+	release = 1;
+	TUNABLE_INT_FETCH("debug.ucode.release", &release);
+	if (!release)
+		return;
+
+restart:
+	file = 0;
+	for (;;) {
+		file = preload_search_next_name(file);
+		if (file == 0)
+			break;
+		type = (char *)preload_search_info(file, MODINFO_TYPE);
+		if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+			continue;
+
+		name = preload_search_info(file, MODINFO_NAME);
+		preload_delete_name(name);
+		goto restart;
+	}
+}
+SYSINIT(ucode_release, SI_SUB_KMEM + 1, SI_ORDER_ANY, ucode_release, NULL);
+
+void
+ucode_load_ap(int cpu)
+{
+#ifdef SMP
+	KASSERT(cpu_info[cpu_apic_ids[cpu]].cpu_present,
+	    ("cpu %d not present", cpu));
+
+	if (cpu_info[cpu_apic_ids[cpu]].cpu_hyperthread)
+		return;
+#endif
+
+	if (ucode_data != NULL)
+		(void)ucode_loader->load(ucode_data, false, NULL, NULL);
+}
+
+static void *
+map_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+	uintptr_t va;
+
+	for (va = free; va < free + len; va += PAGE_SIZE)
+		pmap_kenter(va, (vm_paddr_t)va);
+#else
+	(void)len;
+#endif
+	return ((void *)free);
+}
+
+static void
+unmap_ucode(uintptr_t free, size_t len)
+{
+#ifdef __i386__
+	uintptr_t va;
+
+	for (va = free; va < free + len; va += PAGE_SIZE)
+		pmap_kremove(va);
+#else
+	(void)free;
+	(void)len;
+#endif
+}
+
+/*
+ * Search for an applicable microcode update, and load it.  APs will load the
+ * selected update once they come online.
+ *
+ * "free" is the address of the next free physical page.  If a microcode update
+ * is selected, it will be copied to this region prior to loading in order to
+ * satisfy alignment requirements.
+ */
+size_t
+ucode_load_bsp(uintptr_t free)
+{
+	union {
+		uint32_t regs[4];
+		char vendor[13];
+	} cpuid;
+	uint8_t *addr, *fileaddr, *match;
+	char *type;
+	uint64_t nrev, orev;
+	caddr_t file;
+	size_t i, len;
+	int error;
+
+	KASSERT(free % PAGE_SIZE == 0, ("unaligned boundary %p", (void *)free));
+
+	do_cpuid(0, cpuid.regs);
+	cpuid.regs[0] = cpuid.regs[1];
+	cpuid.regs[1] = cpuid.regs[3];
+	cpuid.vendor[12] = '\0';
+	for (i = 0; i < nitems(loaders); i++)
+		if (strcmp(cpuid.vendor, loaders[i].vendor) == 0) {
+			ucode_loader = &loaders[i];
+			break;
+		}
+	if (ucode_loader == NULL)
+		return (0);
+
+	file = 0;
+	fileaddr = match = NULL;
+	for (;;) {
+		file = preload_search_next_name(file);
+		if (file == 0)
+			break;
+		type = (char *)preload_search_info(file, MODINFO_TYPE);
+		if (type == NULL || strcmp(type, "cpu_microcode") != 0)
+			continue;
+
+		fileaddr = preload_fetch_addr(file);
+		len = preload_fetch_size(file);
+		match = ucode_loader->match(fileaddr, &len);
+		if (match != NULL) {
+			addr = map_ucode(free, len);
+			/* We can't use memcpy() before ifunc resolution. */
+			for (i = 0; i < len; i++)
+				addr[i] = ((volatile uint8_t *)match)[i];
+			match = addr;
+
+			error = ucode_loader->load(match, false, &nrev, &orev);
+			if (error == 0) {
+				ucode_data = early_ucode_data = match;
+				ucode_nrev = nrev;
+				ucode_orev = orev;
+				return (len);
+			}
+			unmap_ucode(free, len);
+		}
+	}
+	if (fileaddr != NULL && ucode_error == NO_ERROR)
+		ucode_error = NO_MATCH;
+	return (0);
+}
+
+/*
+ * Reload microcode following an ACPI resume.
+ */
+void
+ucode_reload(void)
+{
+
+	ucode_load_ap(PCPU_GET(cpuid));
+}
+
+/*
+ * Replace an existing microcode update.
+ */
+void *
+ucode_update(void *newdata)
+{
+
+	newdata = (void *)atomic_swap_ptr((void *)&ucode_data,
+	    (uintptr_t)newdata);
+	if (newdata == early_ucode_data)
+		newdata = NULL;
+	return (newdata);
+}
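
ucode_load_bsp() only considers files the boot loader preloaded with a MODINFO type of "cpu_microcode". On FreeBSD-derived systems that is normally arranged with loader tunables rather than code; a typical loader.conf fragment would look roughly like this (tunable names as used by the FreeBSD microcode support, file path illustrative):

    cpu_microcode_load="YES"
    cpu_microcode_name="/boot/firmware/intel-ucode.bin"

The debug.ucode.release tunable checked in ucode_release() above can be set to 0 to keep the preloaded blob's memory from being released after the update has been applied.
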


Property changes on: trunk/sys/x86/x86/ucode.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/x86/x86_mem.c
===================================================================
--- trunk/sys/x86/x86/x86_mem.c	                        (rev 0)
+++ trunk/sys/x86/x86/x86_mem.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,729 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
+ * Copyright (c) 2017 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/x86/x86_mem.c 314591 2017-03-03 10:30:30Z kib $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+/*
+ * Pentium Pro+ memory range operations
+ *
+ * This code will probably be impenetrable without reference to the
+ * Intel Pentium Pro documentation or the x86-64 programmer's manual, vol. 2.
+ */
+
+static char *mem_owner_bios = "BIOS";
+
+#define	MR686_FIXMTRR	(1<<0)
+
+#define	mrwithin(mr, a)							\
+	(((a) >= (mr)->mr_base) && ((a) < ((mr)->mr_base + (mr)->mr_len)))
+#define	mroverlap(mra, mrb)						\
+	(mrwithin(mra, mrb->mr_base) || mrwithin(mrb, mra->mr_base))
+
+#define	mrvalid(base, len) 						\
+	((!(base & ((1 << 12) - 1))) &&	/* base is multiple of 4k */	\
+	    ((len) >= (1 << 12)) &&	/* length is >= 4k */		\
+	    powerof2((len)) &&		/* ... and power of two */	\
+	    !((base) & ((len) - 1)))	/* range is not discontinuous */
+
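
As a concrete check of the mrvalid() conditions above: a request for base 0xC0000000 and length 0x10000000 (256 MB) passes, since the base is 4 KB aligned, the length is a power of two no smaller than 4 KB, and the base is aligned to the length. A request for base 0xC0100000 and length 0x300000 (3 MB) fails because 3 MB is not a power of two. The addresses are illustrative only.
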
+#define	mrcopyflags(curr, new)						\
+	(((curr) & ~MDF_ATTRMASK) | ((new) & MDF_ATTRMASK))
+
+static int mtrrs_disabled;
+SYSCTL_INT(_machdep, OID_AUTO, disable_mtrrs, CTLFLAG_RDTUN,
+    &mtrrs_disabled, 0,
+    "Disable MTRRs.");
+
+static void	x86_mrinit(struct mem_range_softc *sc);
+static int	x86_mrset(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+static void	x86_mrAPinit(struct mem_range_softc *sc);
+static void	x86_mrreinit(struct mem_range_softc *sc);
+
+static struct mem_range_ops x86_mrops = {
+	x86_mrinit,
+	x86_mrset,
+	x86_mrAPinit,
+	x86_mrreinit
+};
+
+/* XXX for AP startup hook */
+static u_int64_t mtrrcap, mtrrdef;
+
+/* The bitmask for the PhysBase and PhysMask fields of the variable MTRRs. */
+static u_int64_t mtrr_physmask;
+
+static struct mem_range_desc *mem_range_match(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd);
+static void	x86_mrfetch(struct mem_range_softc *sc);
+static int	x86_mtrrtype(int flags);
+static int	x86_mrt2mtrr(int flags, int oldval);
+static int	x86_mtrrconflict(int flag1, int flag2);
+static void	x86_mrstore(struct mem_range_softc *sc);
+static void	x86_mrstoreone(void *arg);
+static struct mem_range_desc *x86_mtrrfixsearch(struct mem_range_softc *sc,
+		    u_int64_t addr);
+static int	x86_mrsetlow(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+static int	x86_mrsetvariable(struct mem_range_softc *sc,
+		    struct mem_range_desc *mrd, int *arg);
+
+/* ia32 MTRR type to memory range type conversion */
+static int x86_mtrrtomrt[] = {
+	MDF_UNCACHEABLE,
+	MDF_WRITECOMBINE,
+	MDF_UNKNOWN,
+	MDF_UNKNOWN,
+	MDF_WRITETHROUGH,
+	MDF_WRITEPROTECT,
+	MDF_WRITEBACK
+};
+
+#define	MTRRTOMRTLEN nitems(x86_mtrrtomrt)
+
+static int
+x86_mtrr2mrt(int val)
+{
+
+	if (val < 0 || val >= MTRRTOMRTLEN)
+		return (MDF_UNKNOWN);
+	return (x86_mtrrtomrt[val]);
+}
+
+/*
+ * x86 MTRR conflicts. Writeback and uncachable may overlap.
+ */
+static int
+x86_mtrrconflict(int flag1, int flag2)
+{
+
+	flag1 &= MDF_ATTRMASK;
+	flag2 &= MDF_ATTRMASK;
+	if ((flag1 & MDF_UNKNOWN) || (flag2 & MDF_UNKNOWN))
+		return (1);
+	if (flag1 == flag2 ||
+	    (flag1 == MDF_WRITEBACK && flag2 == MDF_UNCACHEABLE) ||
+	    (flag2 == MDF_WRITEBACK && flag1 == MDF_UNCACHEABLE))
+		return (0);
+	return (1);
+}
+
+/*
+ * Look for an exactly-matching range.
+ */
+static struct mem_range_desc *
+mem_range_match(struct mem_range_softc *sc, struct mem_range_desc *mrd)
+{
+	struct mem_range_desc *cand;
+	int i;
+
+	for (i = 0, cand = sc->mr_desc; i < sc->mr_ndesc; i++, cand++)
+		if ((cand->mr_base == mrd->mr_base) &&
+		    (cand->mr_len == mrd->mr_len))
+			return (cand);
+	return (NULL);
+}
+
+/*
+ * Ensure that the direct map region does not contain any mappings
+ * that span MTRRs of different types.  However, the fixed MTRRs can
+ * be ignored, because a large page mapping the first 1 MB of physical
+ * memory is a special case that the processor handles.  Invalidate
+ * any old TLB entries that might hold inconsistent memory type
+ * information.
+ */
+static void
+x86_mr_split_dmap(struct mem_range_softc *sc __unused)
+{
+#ifdef __amd64__
+	struct mem_range_desc *mrd;
+	int i;
+
+	i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+	mrd = sc->mr_desc + i;
+	for (; i < sc->mr_ndesc; i++, mrd++) {
+		if ((mrd->mr_flags & (MDF_ACTIVE | MDF_BOGUS)) == MDF_ACTIVE)
+			pmap_demote_DMAP(mrd->mr_base, mrd->mr_len, TRUE);
+	}
+#endif
+}
+
+/*
+ * Fetch the current mtrr settings from the current CPU (assumed to
+ * all be in sync in the SMP case).  Note that if we are here, we
+ * assume that MTRRs are enabled, and we may or may not have fixed
+ * MTRRs.
+ */
+static void
+x86_mrfetch(struct mem_range_softc *sc)
+{
+	struct mem_range_desc *mrd;
+	u_int64_t msrv;
+	int i, j, msr;
+
+	mrd = sc->mr_desc;
+
+	/* Get fixed-range MTRRs. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		msr = MSR_MTRR64kBase;
+		for (i = 0; i < (MTRR_N64K / 8); i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+		msr = MSR_MTRR16kBase;
+		for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+		msr = MSR_MTRR4kBase;
+		for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+			msrv = rdmsr(msr);
+			for (j = 0; j < 8; j++, mrd++) {
+				mrd->mr_flags =
+				    (mrd->mr_flags & ~MDF_ATTRMASK) |
+				    x86_mtrr2mrt(msrv & 0xff) | MDF_ACTIVE;
+				if (mrd->mr_owner[0] == 0)
+					strcpy(mrd->mr_owner, mem_owner_bios);
+				msrv = msrv >> 8;
+			}
+		}
+	}
+
+	/* Get remainder which must be variable MTRRs. */
+	msr = MSR_MTRRVarBase;
+	for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+		msrv = rdmsr(msr);
+		mrd->mr_flags = (mrd->mr_flags & ~MDF_ATTRMASK) |
+		    x86_mtrr2mrt(msrv & MTRR_PHYSBASE_TYPE);
+		mrd->mr_base = msrv & mtrr_physmask;
+		msrv = rdmsr(msr + 1);
+		mrd->mr_flags = (msrv & MTRR_PHYSMASK_VALID) ?
+		    (mrd->mr_flags | MDF_ACTIVE) :
+		    (mrd->mr_flags & ~MDF_ACTIVE);
+
+		/* Compute the range from the mask. Ick. */
+		mrd->mr_len = (~(msrv & mtrr_physmask) &
+		    (mtrr_physmask | 0xfff)) + 1;
+		if (!mrvalid(mrd->mr_base, mrd->mr_len))
+			mrd->mr_flags |= MDF_BOGUS;
+
+		/* If unclaimed and active, must be the BIOS. */
+		if ((mrd->mr_flags & MDF_ACTIVE) && (mrd->mr_owner[0] == 0))
+			strcpy(mrd->mr_owner, mem_owner_bios);
+	}
+}
+
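
The "Compute the range from the mask" step above recovers the range length by inverting the PhysMask field. With illustrative values on a CPU whose physical address space is 36 bits, mtrr_physmask is 0xFFFFFF000; a variable MTRR whose mask field reads 0xFC0000000 gives mr_len = (~0xFC0000000 & (0xFFFFFF000 | 0xFFF)) + 1 = 0x3FFFFFFF + 1 = 0x40000000, i.e. a 1 GB range.
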
+/*
+ * Return the MTRR memory type matching a region's flags
+ */
+static int
+x86_mtrrtype(int flags)
+{
+	int i;
+
+	flags &= MDF_ATTRMASK;
+
+	for (i = 0; i < MTRRTOMRTLEN; i++) {
+		if (x86_mtrrtomrt[i] == MDF_UNKNOWN)
+			continue;
+		if (flags == x86_mtrrtomrt[i])
+			return (i);
+	}
+	return (-1);
+}
+
+static int
+x86_mrt2mtrr(int flags, int oldval)
+{
+	int val;
+
+	if ((val = x86_mtrrtype(flags)) == -1)
+		return (oldval & 0xff);
+	return (val & 0xff);
+}
+
+/*
+ * Update running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrstore(struct mem_range_softc *sc)
+{
+
+	smp_rendezvous(NULL, x86_mrstoreone, NULL, sc);
+}
+
+/*
+ * Update the current CPU's MTRRs with those represented in the
+ * descriptor list.  Note that we do this wholesale rather than just
+ * stuffing one entry; this is simpler (but slower, of course).
+ */
+static void
+x86_mrstoreone(void *arg)
+{
+	struct mem_range_softc *sc = arg;
+	struct mem_range_desc *mrd;
+	u_int64_t omsrv, msrv;
+	int i, j, msr;
+	u_long cr0, cr4;
+
+	mrd = sc->mr_desc;
+
+	critical_enter();
+
+	/* Disable PGE. */
+	cr4 = rcr4();
+	load_cr4(cr4 & ~CR4_PGE);
+
+	/* Disable caches (CD = 1, NW = 0). */
+	cr0 = rcr0();
+	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
+
+	/* Flushes caches and TLBs. */
+	wbinvd();
+	invltlb();
+
+	/* Disable MTRRs (E = 0). */
+	wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) & ~MTRR_DEF_ENABLE);
+
+	/* Set fixed-range MTRRs. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		msr = MSR_MTRR64kBase;
+		for (i = 0; i < MTRR_N64K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+		msr = MSR_MTRR16kBase;
+		for (i = 0; i < MTRR_N16K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+		msr = MSR_MTRR4kBase;
+		for (i = 0; i < MTRR_N4K / 8; i++, msr++) {
+			msrv = 0;
+			omsrv = rdmsr(msr);
+			for (j = 7; j >= 0; j--) {
+				msrv = msrv << 8;
+				msrv |= x86_mrt2mtrr((mrd + j)->mr_flags,
+				    omsrv >> (j * 8));
+			}
+			wrmsr(msr, msrv);
+			mrd += 8;
+		}
+	}
+
+	/* Set remainder which must be variable MTRRs. */
+	msr = MSR_MTRRVarBase;
+	for (; mrd - sc->mr_desc < sc->mr_ndesc; msr += 2, mrd++) {
+		/* base/type register */
+		omsrv = rdmsr(msr);
+		if (mrd->mr_flags & MDF_ACTIVE) {
+			msrv = mrd->mr_base & mtrr_physmask;
+			msrv |= x86_mrt2mtrr(mrd->mr_flags, omsrv);
+		} else {
+			msrv = 0;
+		}
+		wrmsr(msr, msrv);
+
+		/* mask/active register */
+		if (mrd->mr_flags & MDF_ACTIVE) {
+			msrv = MTRR_PHYSMASK_VALID |
+			    rounddown2(mtrr_physmask, mrd->mr_len);
+		} else {
+			msrv = 0;
+		}
+		wrmsr(msr + 1, msrv);
+	}
+
+	/* Flush caches and TLBs. */
+	wbinvd();
+	invltlb();
+
+	/* Enable MTRRs. */
+	wrmsr(MSR_MTRRdefType, rdmsr(MSR_MTRRdefType) | MTRR_DEF_ENABLE);
+
+	/* Restore caches and PGE. */
+	load_cr0(cr0);
+	load_cr4(cr4);
+
+	critical_exit();
+}
+
+/*
+ * Hunt for the fixed MTRR referencing (addr)
+ */
+static struct mem_range_desc *
+x86_mtrrfixsearch(struct mem_range_softc *sc, u_int64_t addr)
+{
+	struct mem_range_desc *mrd;
+	int i;
+
+	for (i = 0, mrd = sc->mr_desc; i < MTRR_N64K + MTRR_N16K + MTRR_N4K;
+	     i++, mrd++)
+		if (addr >= mrd->mr_base &&
+		    addr < mrd->mr_base + mrd->mr_len)
+			return (mrd);
+	return (NULL);
+}
+
+/*
+ * Try to satisfy the given range request by manipulating the fixed
+ * MTRRs that cover low memory.
+ *
+ * Note that we try to be generous here; we'll bloat the range out to
+ * the next higher/lower boundary to avoid the consumer having to know
+ * too much about the mechanisms here.
+ *
+ * XXX note that this will have to be updated when we start supporting
+ * "busy" ranges.
+ */
+static int
+x86_mrsetlow(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+	struct mem_range_desc *first_md, *last_md, *curr_md;
+
+	/* Range check. */
+	if ((first_md = x86_mtrrfixsearch(sc, mrd->mr_base)) == NULL ||
+	    (last_md = x86_mtrrfixsearch(sc, mrd->mr_base + mrd->mr_len - 1))
+	    == NULL)
+		return (EINVAL);
+
+	/* Check that we aren't doing something risky. */
+	if ((mrd->mr_flags & MDF_FORCE) == 0) {
+		for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+			if ((curr_md->mr_flags & MDF_ATTRMASK) == MDF_UNKNOWN)
+				return (EACCES);
+		}
+	}
+
+	/* Set flags, clear set-by-firmware flag. */
+	for (curr_md = first_md; curr_md <= last_md; curr_md++) {
+		curr_md->mr_flags = mrcopyflags(curr_md->mr_flags &
+		    ~MDF_FIRMWARE, mrd->mr_flags);
+		bcopy(mrd->mr_owner, curr_md->mr_owner, sizeof(mrd->mr_owner));
+	}
+
+	return (0);
+}
+
+/*
+ * Modify/add a variable MTRR to satisfy the request.
+ *
+ * XXX needs to be updated to properly support "busy" ranges.
+ */
+static int
+x86_mrsetvariable(struct mem_range_softc *sc, struct mem_range_desc *mrd,
+    int *arg)
+{
+	struct mem_range_desc *curr_md, *free_md;
+	int i;
+
+	/*
+	 * Scan the currently active variable descriptors, look for
+	 * one we exactly match (straight takeover) and for possible
+	 * accidental overlaps.
+	 *
+	 * Keep track of the first empty variable descriptor in case
+	 * we can't perform a takeover.
+	 */
+	i = (sc->mr_cap & MR686_FIXMTRR) ? MTRR_N64K + MTRR_N16K + MTRR_N4K : 0;
+	curr_md = sc->mr_desc + i;
+	free_md = NULL;
+	for (; i < sc->mr_ndesc; i++, curr_md++) {
+		if (curr_md->mr_flags & MDF_ACTIVE) {
+			/* Exact match? */
+			if (curr_md->mr_base == mrd->mr_base &&
+			    curr_md->mr_len == mrd->mr_len) {
+
+				/* Whoops, owned by someone. */
+				if (curr_md->mr_flags & MDF_BUSY)
+					return (EBUSY);
+
+				/* Check that we aren't doing something risky */
+				if (!(mrd->mr_flags & MDF_FORCE) &&
+				    (curr_md->mr_flags & MDF_ATTRMASK) ==
+				    MDF_UNKNOWN)
+					return (EACCES);
+
+				/* Ok, just hijack this entry. */
+				free_md = curr_md;
+				break;
+			}
+
+			/* Non-exact overlap? */
+			if (mroverlap(curr_md, mrd)) {
+				/* Between conflicting region types? */
+				if (x86_mtrrconflict(curr_md->mr_flags,
+				    mrd->mr_flags))
+					return (EINVAL);
+			}
+		} else if (free_md == NULL) {
+			free_md = curr_md;
+		}
+	}
+
+	/* Got somewhere to put it? */
+	if (free_md == NULL)
+		return (ENOSPC);
+
+	/* Set up new descriptor. */
+	free_md->mr_base = mrd->mr_base;
+	free_md->mr_len = mrd->mr_len;
+	free_md->mr_flags = mrcopyflags(MDF_ACTIVE, mrd->mr_flags);
+	bcopy(mrd->mr_owner, free_md->mr_owner, sizeof(mrd->mr_owner));
+	return (0);
+}
+
+/*
+ * Handle requests to set memory range attributes by manipulating MTRRs.
+ */
+static int
+x86_mrset(struct mem_range_softc *sc, struct mem_range_desc *mrd, int *arg)
+{
+	struct mem_range_desc *targ;
+	int error;
+
+	switch (*arg) {
+	case MEMRANGE_SET_UPDATE:
+		/*
+		 * Make sure that what's being asked for is even
+		 * possible at all.
+		 */
+		if (!mrvalid(mrd->mr_base, mrd->mr_len) ||
+		    x86_mtrrtype(mrd->mr_flags) == -1)
+			return (EINVAL);
+
+#define	FIXTOP	\
+    ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000))
+
+		/* Are the "low memory" conditions applicable? */
+		if ((sc->mr_cap & MR686_FIXMTRR) != 0 &&
+		    mrd->mr_base + mrd->mr_len <= FIXTOP) {
+			if ((error = x86_mrsetlow(sc, mrd, arg)) != 0)
+				return (error);
+		} else {
+			/* It's time to play with variable MTRRs. */
+			if ((error = x86_mrsetvariable(sc, mrd, arg)) != 0)
+				return (error);
+		}
+		break;
+
+	case MEMRANGE_SET_REMOVE:
+		if ((targ = mem_range_match(sc, mrd)) == NULL)
+			return (ENOENT);
+		if (targ->mr_flags & MDF_FIXACTIVE)
+			return (EPERM);
+		if (targ->mr_flags & MDF_BUSY)
+			return (EBUSY);
+		targ->mr_flags &= ~MDF_ACTIVE;
+		targ->mr_owner[0] = 0;
+		break;
+
+	default:
+		return (EOPNOTSUPP);
+	}
+
+	x86_mr_split_dmap(sc);
+
+	/* Update the hardware. */
+	x86_mrstore(sc);
+
+	/* Refetch to see where we're at. */
+	x86_mrfetch(sc);
+	return (0);
+}
+
+/*
+ * Work out how many ranges we support, initialise storage for them,
+ * and fetch the initial settings.
+ */
+static void
+x86_mrinit(struct mem_range_softc *sc)
+{
+	struct mem_range_desc *mrd;
+	int i, nmdesc;
+
+	if (sc->mr_desc != NULL)
+		/* Already initialized. */
+		return;
+
+	nmdesc = 0;
+	mtrrcap = rdmsr(MSR_MTRRcap);
+	mtrrdef = rdmsr(MSR_MTRRdefType);
+
+	/* For now, bail out if MTRRs are not enabled. */
+	if (!(mtrrdef & MTRR_DEF_ENABLE)) {
+		if (bootverbose)
+			printf("CPU supports MTRRs but not enabled\n");
+		return;
+	}
+	nmdesc = mtrrcap & MTRR_CAP_VCNT;
+	if (bootverbose)
+		printf("Pentium Pro MTRR support enabled\n");
+
+	/*
+	 * Determine the size of the PhysMask and PhysBase fields in
+	 * the variable range MTRRs.
+	 */
+	mtrr_physmask = (((uint64_t)1 << cpu_maxphyaddr) - 1) &
+	    ~(uint64_t)0xfff;
+
+	/* If fixed MTRRs supported and enabled. */
+	if ((mtrrcap & MTRR_CAP_FIXED) && (mtrrdef & MTRR_DEF_FIXED_ENABLE)) {
+		sc->mr_cap = MR686_FIXMTRR;
+		nmdesc += MTRR_N64K + MTRR_N16K + MTRR_N4K;
+	}
+
+	sc->mr_desc = malloc(nmdesc * sizeof(struct mem_range_desc), M_MEMDESC,
+	    M_WAITOK | M_ZERO);
+	sc->mr_ndesc = nmdesc;
+
+	mrd = sc->mr_desc;
+
+	/* Populate the fixed MTRR entries' base/length. */
+	if (sc->mr_cap & MR686_FIXMTRR) {
+		for (i = 0; i < MTRR_N64K; i++, mrd++) {
+			mrd->mr_base = i * 0x10000;
+			mrd->mr_len = 0x10000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+		for (i = 0; i < MTRR_N16K; i++, mrd++) {
+			mrd->mr_base = i * 0x4000 + 0x80000;
+			mrd->mr_len = 0x4000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+		for (i = 0; i < MTRR_N4K; i++, mrd++) {
+			mrd->mr_base = i * 0x1000 + 0xc0000;
+			mrd->mr_len = 0x1000;
+			mrd->mr_flags = MDF_FIXBASE | MDF_FIXLEN |
+			    MDF_FIXACTIVE;
+		}
+	}
+
+	/*
+	 * Get current settings, anything set now is considered to
+	 * have been set by the firmware. (XXX has something already
+	 * played here?)
+	 */
+	x86_mrfetch(sc);
+	mrd = sc->mr_desc;
+	for (i = 0; i < sc->mr_ndesc; i++, mrd++) {
+		if (mrd->mr_flags & MDF_ACTIVE)
+			mrd->mr_flags |= MDF_FIRMWARE;
+	}
+
+	x86_mr_split_dmap(sc);
+}
+
+/*
+ * Initialise MTRRs on an AP after the BSP has run the init code.
+ */
+static void
+x86_mrAPinit(struct mem_range_softc *sc)
+{
+
+	x86_mrstoreone(sc);
+	wrmsr(MSR_MTRRdefType, mtrrdef);
+}
+
+/*
+ * Re-initialise running CPU(s) MTRRs to match the ranges in the descriptor
+ * list.
+ *
+ * Must be called with interrupts enabled.
+ */
+static void
+x86_mrreinit(struct mem_range_softc *sc)
+{
+
+	smp_rendezvous(NULL, (void (*)(void *))x86_mrAPinit, NULL, sc);
+}
+
+static void
+x86_mem_drvinit(void *unused)
+{
+
+	if (mtrrs_disabled)
+		return;
+	if (!(cpu_feature & CPUID_MTRR))
+		return;
+	mem_range_softc.mr_op = &x86_mrops;
+	x86_mrinit(&mem_range_softc);
+}
+SYSINIT(x86memdev, SI_SUB_CPU, SI_ORDER_ANY, x86_mem_drvinit, NULL);


Property changes on: trunk/sys/x86/x86/x86_mem.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
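
A note on the fixed-range bookkeeping in x86_mem.c above: x86_mrinit()
lays out the classic fixed MTRR map (64 KiB ranges from 0, 16 KiB ranges
from 0x80000, 4 KiB ranges from 0xc0000) and x86_mrset() routes any
request below FIXTOP through x86_mrsetlow().  The stand-alone sketch
below (not part of the commit) reproduces that arithmetic in user space,
assuming the usual MTRR_N64K/N16K/N4K counts of 8/16/64, and shows that
FIXTOP lands exactly on the 1 MiB boundary.

/*
 * fixtop.c -- user-space sketch of the fixed-range MTRR layout that
 * x86_mrinit() populates and the FIXTOP limit that x86_mrset() checks.
 * The MTRR_N* counts below are the usual Intel values (an assumption,
 * mirrored from the kernel headers).
 */
#include <stdio.h>

#define	MTRR_N64K	8	/* 64 KiB ranges starting at 0x00000 */
#define	MTRR_N16K	16	/* 16 KiB ranges starting at 0x80000 */
#define	MTRR_N4K	64	/* 4 KiB ranges starting at 0xc0000 */

#define	FIXTOP	\
    ((MTRR_N64K * 0x10000) + (MTRR_N16K * 0x4000) + (MTRR_N4K * 0x1000))

int
main(void)
{

	printf("64K block: 0x00000-0x%05x\n", MTRR_N64K * 0x10000 - 1);
	printf("16K block: 0x80000-0x%05x\n",
	    0x80000 + MTRR_N16K * 0x4000 - 1);
	printf(" 4K block: 0xc0000-0x%05x\n",
	    0xc0000 + MTRR_N4K * 0x1000 - 1);
	printf("FIXTOP   = 0x%x (1 MiB)\n", FIXTOP);
	return (0);
}
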
Modified: trunk/sys/x86/xen/hvm.c
===================================================================
--- trunk/sys/x86/xen/hvm.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/hvm.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/hvm.c 305672 2016-09-09 19:57:32Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -59,34 +59,8 @@
 #include <xen/interface/vcpu.h>
 
 /*--------------------------- Forward Declarations ---------------------------*/
-#ifdef SMP
-static driver_filter_t xen_smp_rendezvous_action;
-static driver_filter_t xen_invltlb;
-static driver_filter_t xen_invlpg;
-static driver_filter_t xen_invlrng;
-static driver_filter_t xen_invlcache;
-#ifdef __i386__
-static driver_filter_t xen_lazypmap;
-#endif
-static driver_filter_t xen_ipi_bitmap_handler;
-static driver_filter_t xen_cpustop_handler;
-static driver_filter_t xen_cpususpend_handler;
-static driver_filter_t xen_cpustophard_handler;
-static void xen_ipi_vectored(u_int vector, int dest);
-#endif
 static void xen_hvm_cpu_init(void);
 
-/*---------------------------- Extern Declarations ---------------------------*/
-#ifdef __i386__
-extern void pmap_lazyfix_action(void);
-#endif
-#ifdef __amd64__
-extern int pmap_pcid_enabled;
-#endif
-
-/*---------------------------------- Macros ----------------------------------*/
-#define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
-
 /*-------------------------------- Local Types -------------------------------*/
 enum xen_hvm_init_type {
 	XEN_HVM_INIT_COLD,
@@ -94,18 +68,11 @@
 	XEN_HVM_INIT_RESUME
 };
 
-struct xen_ipi_handler
-{
-	driver_filter_t	*filter;
-	const char	*description;
-};
-
 /*-------------------------------- Global Data -------------------------------*/
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 
 #ifdef SMP
 struct cpu_ops xen_hvm_cpu_ops = {
-	.ipi_vectored	= lapic_ipi_vectored,
 	.cpu_init	= xen_hvm_cpu_init,
 	.cpu_resume	= xen_hvm_cpu_init
 };
@@ -113,24 +80,6 @@
 
 static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support");
 
-#ifdef SMP
-static struct xen_ipi_handler xen_ipis[] = 
-{
-	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
-	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
-	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
-	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
-	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
-#ifdef __i386__
-	[IPI_TO_IDX(IPI_LAZYPMAP)]	= { xen_lazypmap,		"lp"  },
-#endif
-	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
-	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
-	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
-	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
-};
-#endif
-
 /**
  * If non-zero, the hypervisor has been configured to use a direct
  * IDT event callback for interrupt injection.
@@ -140,14 +89,10 @@
 /*------------------------------- Per-CPU Data -------------------------------*/
 DPCPU_DEFINE(struct vcpu_info, vcpu_local_info);
 DPCPU_DEFINE(struct vcpu_info *, vcpu_info);
-#ifdef SMP
-DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
-#endif
 
 /*------------------ Hypervisor Access Shared Memory Regions -----------------*/
-/** Hypercall table accessed via HYPERVISOR_*_op() methods. */
-char *hypercall_stubs;
 shared_info_t *HYPERVISOR_shared_info;
+start_info_t *HYPERVISOR_start_info;
 
 
 /*------------------------------ Sysctl tunables -----------------------------*/
@@ -156,207 +101,6 @@
 TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks);
 TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics);
 
-#ifdef SMP
-/*---------------------------- XEN PV IPI Handlers ---------------------------*/
-/*
- * This are C clones of the ASM functions found in apic_vector.s
- */
-static int
-xen_ipi_bitmap_handler(void *arg)
-{
-	struct trapframe *frame;
-
-	frame = arg;
-	ipi_bitmap_handler(*frame);
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_smp_rendezvous_action(void *arg)
-{
-#ifdef COUNT_IPIS
-	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	smp_rendezvous_action();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invltlb(void *arg)
-{
-
-	invltlb_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invltlb_pcid(void *arg)
-{
-
-	invltlb_pcid_handler();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlpg(void *arg)
-{
-
-	invlpg_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __amd64__
-static int
-xen_invlpg_pcid(void *arg)
-{
-
-	invlpg_pcid_handler();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_invlrng(void *arg)
-{
-
-	invlrng_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlcache(void *arg)
-{
-
-	invlcache_handler();
-	return (FILTER_HANDLED);
-}
-
-#ifdef __i386__
-static int
-xen_lazypmap(void *arg)
-{
-
-	pmap_lazyfix_action();
-	return (FILTER_HANDLED);
-}
-#endif
-
-static int
-xen_cpustop_handler(void *arg)
-{
-
-	cpustop_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_cpususpend_handler(void *arg)
-{
-
-	cpususpend_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_cpustophard_handler(void *arg)
-{
-
-	ipi_nmi_handler();
-	return (FILTER_HANDLED);
-}
-
-/* Xen PV IPI sender */
-static void
-xen_ipi_vectored(u_int vector, int dest)
-{
-	xen_intr_handle_t *ipi_handle;
-	int ipi_idx, to_cpu, self;
-
-	ipi_idx = IPI_TO_IDX(vector);
-	if (ipi_idx > nitems(xen_ipis))
-		panic("IPI out of range");
-
-	switch(dest) {
-	case APIC_IPI_DEST_SELF:
-		ipi_handle = DPCPU_GET(ipi_handle);
-		xen_intr_signal(ipi_handle[ipi_idx]);
-		break;
-	case APIC_IPI_DEST_ALL:
-		CPU_FOREACH(to_cpu) {
-			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-			xen_intr_signal(ipi_handle[ipi_idx]);
-		}
-		break;
-	case APIC_IPI_DEST_OTHERS:
-		self = PCPU_GET(cpuid);
-		CPU_FOREACH(to_cpu) {
-			if (to_cpu != self) {
-				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-				xen_intr_signal(ipi_handle[ipi_idx]);
-			}
-		}
-		break;
-	default:
-		to_cpu = apic_cpuid(dest);
-		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
-		xen_intr_signal(ipi_handle[ipi_idx]);
-		break;
-	}
-}
-
-/*---------------------- XEN diverged cpu operations -------------------------*/
-static void
-xen_cpu_ipi_init(int cpu)
-{
-	xen_intr_handle_t *ipi_handle;
-	const struct xen_ipi_handler *ipi;
-	device_t dev;
-	int idx, rc;
-
-	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
-	dev = pcpu_find(cpu)->pc_device;
-	KASSERT((dev != NULL), ("NULL pcpu device_t"));
-
-	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
-
-		if (ipi->filter == NULL) {
-			ipi_handle[idx] = NULL;
-			continue;
-		}
-
-		rc = xen_intr_alloc_and_bind_ipi(dev, cpu, ipi->filter,
-		    INTR_TYPE_TTY, &ipi_handle[idx]);
-		if (rc != 0)
-			panic("Unable to allocate a XEN IPI port");
-		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
-	}
-}
-
-static void
-xen_setup_cpus(void)
-{
-	int i;
-
-	if (!xen_hvm_domain() || !xen_vector_callback_enabled)
-		return;
-
-#ifdef __amd64__
-	if (pmap_pcid_enabled) {
-		xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = xen_invltlb_pcid;
-		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = xen_invlpg_pcid;
-	}
-#endif
-	CPU_FOREACH(i)
-		xen_cpu_ipi_init(i);
-
-	/* Set the xen pv ipi ops to replace the native ones */
-	cpu_ops.ipi_vectored = xen_ipi_vectored;
-}
-#endif
-
 /*---------------------- XEN Hypervisor Probe and Setup ----------------------*/
 static uint32_t
 xen_hvm_cpuid_base(void)
@@ -376,16 +120,21 @@
  * Allocate and fill in the hypcall page.
  */
 static int
-xen_hvm_init_hypercall_stubs(void)
+xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type)
 {
 	uint32_t base, regs[4];
 	int i;
 
+	if (xen_pv_domain()) {
+		/* hypercall page is already set in the PV case */
+		return (0);
+	}
+
 	base = xen_hvm_cpuid_base();
 	if (base == 0)
 		return (ENXIO);
 
-	if (hypercall_stubs == NULL) {
+	if (init_type == XEN_HVM_INIT_COLD) {
 		int major, minor;
 
 		do_cpuid(base + 1, regs);
@@ -417,18 +166,9 @@
 	 * Find the hypercall pages.
 	 */
 	do_cpuid(base + 2, regs);
-	
-	if (hypercall_stubs == NULL) {
-		size_t call_region_size;
 
-		call_region_size = regs[0] * PAGE_SIZE;
-		hypercall_stubs = malloc(call_region_size, M_XENHVM, M_NOWAIT);
-		if (hypercall_stubs == NULL)
-			panic("Unable to allocate Xen hypercall region");
-	}
-
 	for (i = 0; i < regs[0]; i++)
-		wrmsr(regs[1], vtophys(hypercall_stubs + i * PAGE_SIZE) + i);
+		wrmsr(regs[1], vtophys(&hypercall_page + i * PAGE_SIZE) + i);
 
 	return (0);
 }
@@ -438,6 +178,14 @@
 {
 	struct xen_add_to_physmap xatp;
 
+	if (xen_pv_domain()) {
+		/*
+		 * Already setup in the PV case, shared_info is passed inside
+		 * of the start_info struct at start of day.
+		 */
+		return;
+	}
+
 	if (HYPERVISOR_shared_info == NULL) {
 		HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT);
 		if (HYPERVISOR_shared_info == NULL)
@@ -516,6 +264,16 @@
 {
 	u_short disable_devs = 0;
 
+	if (xen_pv_domain()) {
+		/*
+		 * No emulated devices in the PV case, so no need to unplug
+		 * anything.
+		 */
+		if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0)
+			printf("PV devices cannot be disabled in PV guests\n");
+		return;
+	}
+
 	if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC)
 		return;
 
@@ -543,7 +301,7 @@
 	if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND)
 		return;
 
-	error = xen_hvm_init_hypercall_stubs();
+	error = xen_hvm_init_hypercall_stubs(init_type);
 
 	switch (init_type) {
 	case XEN_HVM_INIT_COLD:
@@ -550,11 +308,21 @@
 		if (error != 0)
 			return;
 
+		/*
+		 * If xen_domain_type is not set at this point
+		 * it means we are inside a (PV)HVM guest, because
+		 * for PVH the guest type is set much earlier
+		 * (see hammer_time_xen).
+		 */
+		if (!xen_domain()) {
+			xen_domain_type = XEN_HVM_DOMAIN;
+			vm_guest = VM_GUEST_XEN;
+		}
+
 		setup_xen_features();
 #ifdef SMP
 		cpu_ops = xen_hvm_cpu_ops;
 #endif
- 		vm_guest = VM_GUEST_XEN;
 		break;
 	case XEN_HVM_INIT_RESUME:
 		if (error != 0)
@@ -569,9 +337,15 @@
 	}
 
 	xen_vector_callback_enabled = 0;
-	xen_domain_type = XEN_HVM_DOMAIN;
+	xen_hvm_set_callback(NULL);
+
+	/*
+	 * On (PV)HVM domains we need to request that the hypervisor
+	 * fill the shared info page; for PVH guests the shared_info page
+	 * is passed inside the start_info struct and is already set, so
+	 * these functions are no-ops.
+	 */
 	xen_hvm_init_shared_info_page();
-	xen_hvm_set_callback(NULL);
 	xen_hvm_disable_emulated_devices();
 } 
 
@@ -603,6 +377,9 @@
 	struct pcpu *pc;
 	int i;
 
+	if (!xen_hvm_domain())
+		return;
+
 	/* Set vcpu_id to acpi_id */
 	CPU_FOREACH(i) {
 		pc = pcpu_find(i);
@@ -645,8 +422,5 @@
 }
 
 SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL);
-#ifdef SMP
-SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_FIRST, xen_setup_cpus, NULL);
-#endif
 SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL);
 SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);
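
The hvm.c change above drops the malloc()'d hypercall region in favour
of the linker-provided hypercall_page, but the MSR programming is
unchanged: CPUID leaf base+2 reports the number of hypercall pages in
%eax and the MSR index in %ebx, and each page is registered by writing
its physical address plus the page index to that MSR.  A minimal
user-space mock of that loop follows; do_cpuid(), vtophys() and wrmsr()
are fake stand-ins and the leaf values are invented for illustration.

/*
 * hypercall_msr_sketch.c -- mock of the MSR programming loop in
 * xen_hvm_init_hypercall_stubs().  All helpers here are stubs that only
 * print what the kernel would do; none of them touch real hardware.
 */
#include <stdio.h>
#include <stdint.h>

#define	PAGE_SIZE	4096

static char hypercall_page[2 * PAGE_SIZE];	/* stand-in for the linker symbol */

static void
do_cpuid(uint32_t leaf, uint32_t *regs)
{

	(void)leaf;		/* a real implementation executes CPUID */
	regs[0] = 2;		/* invented: two hypercall pages */
	regs[1] = 0x40000000;	/* invented: MSR index returned by Xen */
	regs[2] = regs[3] = 0;
}

static uint64_t
vtophys(const void *va)
{

	/* Fake virtual-to-physical translation. */
	return ((uint64_t)(uintptr_t)va);
}

static void
wrmsr(uint32_t msr, uint64_t val)
{

	printf("wrmsr(0x%08x, 0x%016llx)\n", msr, (unsigned long long)val);
}

int
main(void)
{
	uint32_t regs[4];
	uint32_t i;

	do_cpuid(/* base + */ 2, regs);
	for (i = 0; i < regs[0]; i++)
		wrmsr(regs[1], vtophys(hypercall_page + i * PAGE_SIZE) + i);
	return (0);
}
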

Added: trunk/sys/x86/xen/pv.c
===================================================================
--- trunk/sys/x86/xen/pv.c	                        (rev 0)
+++ trunk/sys/x86/xen/pv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,428 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pv.c 344378 2019-02-20 19:19:24Z kevans $");
+
+#include "opt_ddb.h"
+#include "opt_kstack_pages.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/reboot.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/boot.h>
+#include <sys/ctype.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_param.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <x86/init.h>
+#include <machine/pc/bios.h>
+#include <machine/smp.h>
+#include <machine/intr_machdep.h>
+#include <machine/metadata.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xen_pv.h>
+#include <xen/xen_msi.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <dev/xen/timer/timer.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/* Native initial function */
+extern u_int64_t hammer_time(u_int64_t, u_int64_t);
+/* Xen initial function */
+uint64_t hammer_time_xen(start_info_t *, uint64_t);
+
+#define MAX_E820_ENTRIES	128
+
+/*--------------------------- Forward Declarations ---------------------------*/
+static caddr_t xen_pv_parse_preload_data(u_int64_t);
+static void xen_pv_parse_memmap(caddr_t, vm_paddr_t *, int *);
+
+#ifdef SMP
+static int xen_pv_start_all_aps(void);
+#endif
+
+/*---------------------------- Extern Declarations ---------------------------*/
+#ifdef SMP
+/* Variables used by amd64 mp_machdep to start APs */
+extern char *doublefault_stack;
+extern char *mce_stack;
+extern char *nmi_stack;
+#endif
+
+/*
+ * Placed by the linker at the end of the bss section, which is the last
+ * section loaded by Xen before loading the symtab and strtab.
+ */
+extern uint32_t end;
+
+/*-------------------------------- Global Data -------------------------------*/
+/* Xen init_ops implementation. */
+struct init_ops xen_init_ops = {
+	.parse_preload_data		= xen_pv_parse_preload_data,
+	.early_clock_source_init	= xen_clock_init,
+	.early_delay			= xen_delay,
+	.parse_memmap			= xen_pv_parse_memmap,
+#ifdef SMP
+	.start_all_aps			= xen_pv_start_all_aps,
+#endif
+	.msi_init =			xen_msi_init,
+};
+
+static struct bios_smap xen_smap[MAX_E820_ENTRIES];
+
+/*-------------------------------- Xen PV init -------------------------------*/
+/*
+ * First function called by the Xen PVH boot sequence.
+ *
+ * Set some Xen global variables and prepare the environment so it is
+ * as similar as possible to what the native FreeBSD init function expects.
+ */
+uint64_t
+hammer_time_xen(start_info_t *si, uint64_t xenstack)
+{
+	uint64_t physfree;
+	uint64_t *PT4 = (u_int64_t *)xenstack;
+	uint64_t *PT3 = (u_int64_t *)(xenstack + PAGE_SIZE);
+	uint64_t *PT2 = (u_int64_t *)(xenstack + 2 * PAGE_SIZE);
+	int i;
+
+	xen_domain_type = XEN_PV_DOMAIN;
+	vm_guest = VM_GUEST_XEN;
+
+	if ((si == NULL) || (xenstack == 0)) {
+		xc_printf("ERROR: invalid start_info or xen stack, halting\n");
+		HYPERVISOR_shutdown(SHUTDOWN_crash);
+	}
+
+	xc_printf("FreeBSD PVH running on %s\n", si->magic);
+
+	/* We use 3 pages of xen stack for the boot pagetables */
+	physfree = xenstack + 3 * PAGE_SIZE - KERNBASE;
+
+	/* Setup Xen global variables */
+	HYPERVISOR_start_info = si;
+	HYPERVISOR_shared_info =
+	    (shared_info_t *)(si->shared_info + KERNBASE);
+
+	/*
+	 * Setup some misc global variables for Xen devices
+	 *
+	 * XXX: Devices that need these specific variables should
+	 *      be rewritten to fetch this info by themselves from the
+	 *      start_info page.
+	 */
+	xen_store = (struct xenstore_domain_interface *)
+	    (ptoa(si->store_mfn) + KERNBASE);
+	console_page = (char *)(ptoa(si->console.domU.mfn) + KERNBASE);
+
+	/*
+	 * Use the stack Xen gives us to build the page tables
+	 * as native FreeBSD expects to find them (created
+	 * by the boot trampoline).
+	 */
+	for (i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); i++) {
+		/*
+		 * Each slot of the level 4 pages points
+		 * to the same level 3 page
+		 */
+		PT4[i] = ((uint64_t)&PT3[0]) - KERNBASE;
+		PT4[i] |= PG_V | PG_RW | PG_U;
+
+		/*
+		 * Each slot of the level 3 pages points
+		 * to the same level 2 page
+		 */
+		PT3[i] = ((uint64_t)&PT2[0]) - KERNBASE;
+		PT3[i] |= PG_V | PG_RW | PG_U;
+
+		/*
+		 * The level 2 page slots are mapped with
+		 * 2MB pages for 1GB.
+		 */
+		PT2[i] = i * (2 * 1024 * 1024);
+		PT2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	}
+	load_cr3(((uint64_t)&PT4[0]) - KERNBASE);
+
+	/* Set the hooks for early functions that diverge from bare metal */
+	init_ops = xen_init_ops;
+	apic_ops = xen_apic_ops;
+
+	/* Now we can jump into the native init function */
+	return (hammer_time(0, physfree));
+}
+
+/*-------------------------------- PV specific -------------------------------*/
+#ifdef SMP
+static bool
+start_xen_ap(int cpu)
+{
+	struct vcpu_guest_context *ctxt;
+	int ms, cpus = mp_naps;
+	const size_t stacksize = kstack_pages * PAGE_SIZE;
+
+	/* allocate and set up an idle stack data page */
+	bootstacks[cpu] =
+	    (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
+	doublefault_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	mce_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	nmi_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	dpcpu =
+	    (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO);
+
+	bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8;
+	bootAP = cpu;
+
+	ctxt = malloc(sizeof(*ctxt), M_TEMP, M_WAITOK | M_ZERO);
+
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.rip = (unsigned long) init_secondary;
+	ctxt->user_regs.rsp = (unsigned long) bootSTK;
+
+	/* Set the AP to use the same page tables */
+	ctxt->ctrlreg[3] = KPML4phys;
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+		panic("unable to initialize AP#%d", cpu);
+
+	free(ctxt, M_TEMP);
+
+	/* Launch the vCPU */
+	if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+		panic("unable to start AP#%d", cpu);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return (true);
+		DELAY(1000);
+	}
+
+	return (false);
+}
+
+static int
+xen_pv_start_all_aps(void)
+{
+	int cpu;
+
+	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
+
+	for (cpu = 1; cpu < mp_ncpus; cpu++) {
+
+		/* attempt to start the Application Processor */
+		if (!start_xen_ap(cpu))
+			panic("AP #%d failed to start!", cpu);
+
+		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
+	}
+
+	return (mp_naps);
+}
+#endif /* SMP */
+
+/*
+ * Functions to convert the "extra" parameters passed by Xen
+ * into FreeBSD boot options.
+ */
+static void
+xen_pv_set_env(void)
+{
+	char *cmd_line_next, *cmd_line;
+	size_t env_size;
+
+	cmd_line = HYPERVISOR_start_info->cmd_line;
+	env_size = sizeof(HYPERVISOR_start_info->cmd_line);
+
+	/* Skip leading spaces */
+	for (; isspace(*cmd_line) && (env_size != 0); cmd_line++)
+		env_size--;
+
+	/* Replace ',' with '\0' */
+	for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;)
+		;
+
+	init_static_kenv(cmd_line, 0);
+}
+
+#ifdef DDB
+/*
+ * The way Xen loads the symtab is different from the native boot loader,
+ * because it's tailored for NetBSD. So we have to adapt and use the same
+ * method as NetBSD. Portions of the code below have been picked from NetBSD:
+ * sys/kern/kern_ksyms.c CVS Revision 1.71.
+ */
+static void
+xen_pv_parse_symtab(void)
+{
+	Elf_Ehdr *ehdr;
+	Elf_Shdr *shdr;
+	vm_offset_t sym_end;
+	uint32_t size;
+	int i, j;
+
+	size = end;
+	sym_end = HYPERVISOR_start_info->mod_start != 0 ?
+	    HYPERVISOR_start_info->mod_start :
+	    HYPERVISOR_start_info->mfn_list;
+
+	/*
+	 * Sanity-check the size; sym_end is just an upper boundary, but
+	 * it at least allows us to fail earlier.
+	 */
+	if ((vm_offset_t)&end + size > sym_end) {
+		xc_printf("Unable to load ELF symtab: size mismatch\n");
+		return;
+	}
+
+	ehdr = (Elf_Ehdr *)(&end + 1);
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) ||
+	    ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+	    ehdr->e_version > 1) {
+		xc_printf("Unable to load ELF symtab: invalid symbol table\n");
+		return;
+	}
+
+	shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff);
+	/* Find the symbol table and the corresponding string table. */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (shdr[i].sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr[i].sh_offset == 0)
+			continue;
+		ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset);
+		ksymtab_size = shdr[i].sh_size;
+		j = shdr[i].sh_link;
+		if (shdr[j].sh_offset == 0)
+			continue; /* Can this happen? */
+		kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset);
+		break;
+	}
+
+	if (ksymtab == 0 || kstrtab == 0) {
+		xc_printf(
+    "Unable to load ELF symtab: could not find symtab or strtab\n");
+		return;
+	}
+}
+#endif
+
+static caddr_t
+xen_pv_parse_preload_data(u_int64_t modulep)
+{
+	caddr_t		 kmdp;
+	vm_ooffset_t	 off;
+	vm_paddr_t	 metadata;
+	char             *envp;
+
+	if (HYPERVISOR_start_info->mod_start != 0) {
+		preload_metadata = (caddr_t)(HYPERVISOR_start_info->mod_start);
+
+		kmdp = preload_search_by_type("elf kernel");
+		if (kmdp == NULL)
+			kmdp = preload_search_by_type("elf64 kernel");
+		KASSERT(kmdp != NULL, ("unable to find kernel"));
+
+		/*
+		 * Xen has relocated the metadata and the modules,
+		 * so we need to recalculate its position. This is
+		 * done by saving the original modulep address and
+		 * then calculating the offset with mod_start,
+		 * which contains the relocated modulep address.
+		 */
+		metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t);
+		off = HYPERVISOR_start_info->mod_start - metadata;
+
+		preload_bootstrap_relocate(off);
+
+		boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
+		envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
+		if (envp != NULL)
+			envp += off;
+		init_static_kenv(envp, 0);
+	} else {
+		/* Parse the extra boot information given by Xen */
+		xen_pv_set_env();
+		boothowto |= boot_env_to_howto();
+		kmdp = NULL;
+	}
+
+#ifdef DDB
+	xen_pv_parse_symtab();
+#endif
+	return (kmdp);
+}
+
+static void
+xen_pv_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
+{
+	struct xen_memory_map memmap;
+	u_int32_t size;
+	int rc;
+
+	/* Fetch the E820 map from Xen */
+	memmap.nr_entries = MAX_E820_ENTRIES;
+	set_xen_guest_handle(memmap.buffer, xen_smap);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc)
+		panic("unable to fetch Xen E820 memory map");
+	size = memmap.nr_entries * sizeof(xen_smap[0]);
+
+	bios_add_smap_entries(xen_smap, size, physmap, physmap_idx);
+}


Property changes on: trunk/sys/x86/xen/pv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
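
About the page-table fill loop in hammer_time_xen() above: all 512
level-4 slots point at a single level-3 page, all 512 level-3 slots
point at a single level-2 page, and the level-2 page maps 1 GiB with
2 MiB superpages, so the low gigabyte of physical memory is reachable
both identity-mapped and at KERNBASE.  The user-space sketch below
repeats the same loop for illustration only; the PG_* bit values are the
standard amd64 ones, and the entries hold the arrays' own addresses
instead of real physical addresses.

/*
 * boot_pt_sketch.c -- illustration of the boot page-table construction
 * in hammer_time_xen(): every L4/L3 slot aliases the same lower-level
 * page and the L2 page covers 1 GiB with 2 MiB mappings.
 */
#include <stdio.h>
#include <stdint.h>

#define	PAGE_SIZE	4096
#define	NPTE		(PAGE_SIZE / sizeof(uint64_t))	/* 512 entries */

#define	PG_V	0x001	/* present */
#define	PG_RW	0x002	/* writable */
#define	PG_U	0x004	/* user accessible */
#define	PG_PS	0x080	/* 2 MiB superpage at L2 */

static uint64_t PT4[NPTE], PT3[NPTE], PT2[NPTE];

int
main(void)
{
	size_t i;

	for (i = 0; i < NPTE; i++) {
		/* Each L4 slot points to the same L3 page. */
		PT4[i] = (uint64_t)(uintptr_t)PT3 | PG_V | PG_RW | PG_U;
		/* Each L3 slot points to the same L2 page. */
		PT3[i] = (uint64_t)(uintptr_t)PT2 | PG_V | PG_RW | PG_U;
		/* L2 slots map 2 MiB pages, 1 GiB in total. */
		PT2[i] = (uint64_t)i * (2 * 1024 * 1024) |
		    PG_V | PG_RW | PG_PS | PG_U;
	}
	printf("L2 coverage: %zu MiB\n", NPTE * 2);
	printf("PT2[0]   = 0x%016llx\n", (unsigned long long)PT2[0]);
	printf("PT2[511] = 0x%016llx\n", (unsigned long long)PT2[NPTE - 1]);
	return (0);
}
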
Added: trunk/sys/x86/xen/pvcpu_enum.c
===================================================================
--- trunk/sys/x86/xen/pvcpu_enum.c	                        (rev 0)
+++ trunk/sys/x86/xen/pvcpu_enum.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,267 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org>
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/pvcpu_enum.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/smp.h>
+#include <sys/pcpu.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/hypervisor.h>
+
+#include <xen/interface/vcpu.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <contrib/dev/acpica/include/aclocal.h>
+#include <contrib/dev/acpica/include/actables.h>
+
+#include <dev/acpica/acpivar.h>
+
+static int xenpv_probe(void);
+static int xenpv_probe_cpus(void);
+static int xenpv_setup_local(void);
+static int xenpv_setup_io(void);
+
+static ACPI_TABLE_MADT *madt;
+static vm_paddr_t madt_physaddr;
+static vm_offset_t madt_length;
+
+static struct apic_enumerator xenpv_enumerator = {
+	"Xen PV",
+	xenpv_probe,
+	xenpv_probe_cpus,
+	xenpv_setup_local,
+	xenpv_setup_io
+};
+
+/*--------------------- Helper functions to parse MADT -----------------------*/
+
+/*
+ * Parse an interrupt source override for an ISA interrupt.
+ */
+static void
+madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
+{
+	enum intr_trigger trig;
+	enum intr_polarity pol;
+	int ret;
+
+	if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
+	    intr->GlobalIrq == 2) {
+		if (bootverbose)
+			printf("MADT: Skipping timer override\n");
+		return;
+	}
+
+	madt_parse_interrupt_values(intr, &trig, &pol);
+
+	/* Remap the IRQ if it is mapped to a different interrupt vector. */
+	if (intr->SourceIrq != intr->GlobalIrq && intr->GlobalIrq > 15 &&
+	    intr->SourceIrq == AcpiGbl_FADT.SciInterrupt)
+		/*
+		 * If the SCI is remapped to a non-ISA global interrupt,
+		 * then override the vector we use to setup.
+		 */
+		acpi_OverrideInterruptLevel(intr->GlobalIrq);
+
+	/* Register the IRQ with the polarity and trigger mode found. */
+	ret = xen_register_pirq(intr->GlobalIrq, trig, pol);
+	if (ret != 0)
+		panic("Unable to register interrupt override");
+}
+
+/*
+ * Call the handler routine for each entry in the MADT table.
+ */
+static void
+madt_walk_table(acpi_subtable_handler *handler, void *arg)
+{
+
+	acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length,
+	    handler, arg);
+}
+
+/*
+ * Parse interrupt entries.
+ */
+static void
+madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused)
+{
+
+	if (entry->Type == ACPI_MADT_TYPE_INTERRUPT_OVERRIDE)
+		madt_parse_interrupt_override(
+		    (ACPI_MADT_INTERRUPT_OVERRIDE *)entry);
+}
+
+/*---------------------------- Xen PV enumerator -----------------------------*/
+
+/*
+ * This enumerator will only be registered on PVH
+ */
+static int
+xenpv_probe(void)
+{
+	return (0);
+}
+
+/*
+ * Test each possible vCPU in order to find the number of vCPUs
+ */
+static int
+xenpv_probe_cpus(void)
+{
+#ifdef SMP
+	int i, ret;
+
+	for (i = 0; i < MAXCPU; i++) {
+		ret = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+		if (ret >= 0)
+			lapic_create((i * 2), (i == 0));
+	}
+#endif
+	return (0);
+}
+
+/*
+ * Initialize the vCPU id of the BSP
+ */
+static int
+xenpv_setup_local(void)
+{
+	PCPU_SET(vcpu_id, 0);
+	lapic_init(0);
+	return (0);
+}
+
+/*
+ * On PVH guests there's no IO APIC
+ */
+static int
+xenpv_setup_io(void)
+{
+
+	if (xen_initial_domain()) {
+		/*
+		 * NB: we could iterate over the MADT IOAPIC entries in order
+		 * to figure out the exact number of IOAPIC interrupts, but
+		 * this is legacy code so just keep using the previous
+		 * behaviour and assume a maximum of 256 interrupts.
+		 */
+		num_io_irqs = max(MINIMUM_MSI_INT - 1, num_io_irqs);
+
+		acpi_SetDefaultIntrModel(ACPI_INTR_APIC);
+	}
+	return (0);
+}
+
+void
+xenpv_register_pirqs(struct pic *pic __unused)
+{
+	unsigned int i;
+	int ret;
+
+	/* Map MADT */
+	madt_physaddr = acpi_find_table(ACPI_SIG_MADT);
+	madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT);
+	madt_length = madt->Header.Length;
+
+	/* Try to initialize ACPI so that we can access the FADT. */
+	ret = acpi_Startup();
+	if (ACPI_FAILURE(ret)) {
+		printf("MADT: ACPI Startup failed with %s\n",
+		    AcpiFormatException(ret));
+		printf("Try disabling either ACPI or apic support.\n");
+		panic("Using MADT but ACPI doesn't work");
+	}
+
+	/* Run through the table to see if there are any overrides. */
+	madt_walk_table(madt_parse_ints, NULL);
+
+	/*
+	 * If there was not an explicit override entry for the SCI,
+	 * force it to use level trigger and active-low polarity.
+	 */
+	if (!madt_found_sci_override) {
+		printf(
+"MADT: Forcing active-low polarity and level trigger for SCI\n");
+		ret = xen_register_pirq(AcpiGbl_FADT.SciInterrupt,
+		    INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW);
+		if (ret != 0)
+			panic("Unable to register SCI IRQ");
+	}
+
+	/* Register legacy ISA IRQs */
+	for (i = 1; i < 16; i++) {
+		if (intr_lookup_source(i) != NULL)
+			continue;
+		ret = xen_register_pirq(i, INTR_TRIGGER_EDGE,
+		    INTR_POLARITY_LOW);
+		if (ret != 0 && bootverbose)
+			printf("Unable to register legacy IRQ#%u: %d\n", i,
+			    ret);
+	}
+}
+
+static void
+xenpv_register(void *dummy __unused)
+{
+	if (xen_pv_domain()) {
+		apic_register_enumerator(&xenpv_enumerator);
+	}
+}
+SYSINIT(xenpv_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, xenpv_register, NULL);
+
+/*
+ * Setup per-CPU vCPU IDs
+ */
+static void
+xenpv_set_ids(void *dummy)
+{
+	struct pcpu *pc;
+	int i;
+
+	CPU_FOREACH(i) {
+		pc = pcpu_find(i);
+		pc->pc_vcpu_id = i;
+	}
+}
+SYSINIT(xenpv_set_ids, SI_SUB_CPU, SI_ORDER_MIDDLE, xenpv_set_ids, NULL);


Property changes on: trunk/sys/x86/xen/pvcpu_enum.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
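
Because a PVH guest has no firmware MADT processor entries,
xenpv_probe_cpus() above asks the hypervisor whether each possible vCPU
exists (VCPUOP_is_up) and registers every present one with a synthetic
APIC ID of twice its vCPU index, vCPU 0 being the BSP; xenpv_set_ids()
later records the plain index as pc_vcpu_id.  A tiny sketch of that
mapping follows, with is_up() as a fake stand-in for the hypercall and
an invented vCPU count.

/*
 * vcpu_enum_sketch.c -- sketch of the PVH vCPU enumeration: present
 * vCPUs get a synthetic APIC ID of 2 * index.  is_up() is a fake
 * stand-in for HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL).
 */
#include <stdio.h>
#include <stdbool.h>

#define	MAXCPU	8	/* illustration only; the kernel value is larger */

static int
is_up(int cpu)
{

	/* Pretend the guest was given four vCPUs. */
	return (cpu < 4 ? 0 : -1);
}

static void
lapic_create(int apic_id, bool boot_cpu)
{

	printf("apic_id %d%s\n", apic_id, boot_cpu ? " (BSP)" : "");
}

int
main(void)
{
	int i;

	for (i = 0; i < MAXCPU; i++) {
		if (is_up(i) >= 0)
			lapic_create(i * 2, i == 0);
	}
	return (0);
}
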
Added: trunk/sys/x86/xen/xen_apic.c
===================================================================
--- trunk/sys/x86/xen/xen_apic.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_apic.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,598 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_apic.c 334047 2018-05-22 14:36:46Z kib $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+
+#include <xen/xen-os.h>
+#include <xen/features.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/hvm.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/vcpu.h>
+
+/*--------------------------------- Macros -----------------------------------*/
+
+#define XEN_APIC_UNSUPPORTED \
+	panic("%s: not available in Xen PV port.", __func__)
+
+
+/*--------------------------- Forward Declarations ---------------------------*/
+#ifdef SMP
+static driver_filter_t xen_smp_rendezvous_action;
+static driver_filter_t xen_invltlb;
+static driver_filter_t xen_invlpg;
+static driver_filter_t xen_invlrng;
+static driver_filter_t xen_invlcache;
+static driver_filter_t xen_ipi_bitmap_handler;
+static driver_filter_t xen_cpustop_handler;
+static driver_filter_t xen_cpususpend_handler;
+static driver_filter_t xen_cpustophard_handler;
+#endif
+
+/*---------------------------------- Macros ----------------------------------*/
+#define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
+
+/*--------------------------------- Xen IPIs ---------------------------------*/
+#ifdef SMP
+struct xen_ipi_handler
+{
+	driver_filter_t	*filter;
+	const char	*description;
+};
+
+static struct xen_ipi_handler xen_ipis[] = 
+{
+	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
+	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
+	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
+	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
+	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
+	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
+	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
+	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
+	[IPI_TO_IDX(IPI_STOP_HARD)]	= { xen_cpustophard_handler,	"sth" },
+};
+#endif
+
+/*------------------------------- Per-CPU Data -------------------------------*/
+#ifdef SMP
+DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
+#endif
+
+/*------------------------------- Xen PV APIC --------------------------------*/
+
+static void
+xen_pv_lapic_create(u_int apic_id, int boot_cpu)
+{
+#ifdef SMP
+	cpu_add(apic_id, boot_cpu);
+#endif
+}
+
+static void
+xen_pv_lapic_init(vm_paddr_t addr)
+{
+
+}
+
+static void
+xen_pv_lapic_setup(int boot)
+{
+
+}
+
+static void
+xen_pv_lapic_dump(const char *str)
+{
+
+	printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
+}
+
+static void
+xen_pv_lapic_disable(void)
+{
+
+}
+
+static bool
+xen_pv_lapic_is_x2apic(void)
+{
+
+	return (false);
+}
+
+static void
+xen_pv_lapic_eoi(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_id(void)
+{
+
+	return (PCPU_GET(apic_id));
+}
+
+static int
+xen_pv_lapic_intr_pending(u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static u_int
+xen_pv_apic_cpuid(u_int apic_id)
+{
+#ifdef SMP
+	return (apic_cpuids[apic_id]);
+#else
+	return (0);
+#endif
+}
+
+static u_int
+xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static u_int
+xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static void
+xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_enable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static void
+xen_pv_lapic_disable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_reenable_pmc(void)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_enable_cmc(void)
+{
+
+}
+
+#ifdef SMP
+static void
+xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static void
+xen_pv_lapic_ipi_vectored(u_int vector, int dest)
+{
+	xen_intr_handle_t *ipi_handle;
+	int ipi_idx, to_cpu, self;
+
+	ipi_idx = IPI_TO_IDX(vector);
+	if (ipi_idx >= nitems(xen_ipis))
+		panic("IPI out of range");
+
+	switch(dest) {
+	case APIC_IPI_DEST_SELF:
+		ipi_handle = DPCPU_GET(ipi_handle);
+		xen_intr_signal(ipi_handle[ipi_idx]);
+		break;
+	case APIC_IPI_DEST_ALL:
+		CPU_FOREACH(to_cpu) {
+			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+			xen_intr_signal(ipi_handle[ipi_idx]);
+		}
+		break;
+	case APIC_IPI_DEST_OTHERS:
+		self = PCPU_GET(cpuid);
+		CPU_FOREACH(to_cpu) {
+			if (to_cpu != self) {
+				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+				xen_intr_signal(ipi_handle[ipi_idx]);
+			}
+		}
+		break;
+	default:
+		to_cpu = apic_cpuid(dest);
+		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
+		xen_intr_signal(ipi_handle[ipi_idx]);
+		break;
+	}
+}
+
+static int
+xen_pv_lapic_ipi_wait(int delay)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+#endif	/* SMP */
+
+static int
+xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (-1);
+}
+
+static void
+xen_pv_lapic_ipi_free(int vector)
+{
+
+	XEN_APIC_UNSUPPORTED;
+}
+
+static int
+xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+static int
+xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
+    enum intr_trigger trigger)
+{
+
+	XEN_APIC_UNSUPPORTED;
+	return (0);
+}
+
+/* Xen apic_ops implementation */
+struct apic_ops xen_apic_ops = {
+	.create			= xen_pv_lapic_create,
+	.init			= xen_pv_lapic_init,
+	.xapic_mode		= xen_pv_lapic_disable,
+	.is_x2apic		= xen_pv_lapic_is_x2apic,
+	.setup			= xen_pv_lapic_setup,
+	.dump			= xen_pv_lapic_dump,
+	.disable		= xen_pv_lapic_disable,
+	.eoi			= xen_pv_lapic_eoi,
+	.id			= xen_pv_lapic_id,
+	.intr_pending		= xen_pv_lapic_intr_pending,
+	.set_logical_id		= xen_pv_lapic_set_logical_id,
+	.cpuid			= xen_pv_apic_cpuid,
+	.alloc_vector		= xen_pv_apic_alloc_vector,
+	.alloc_vectors		= xen_pv_apic_alloc_vectors,
+	.enable_vector		= xen_pv_apic_enable_vector,
+	.disable_vector		= xen_pv_apic_disable_vector,
+	.free_vector		= xen_pv_apic_free_vector,
+	.enable_pmc		= xen_pv_lapic_enable_pmc,
+	.disable_pmc		= xen_pv_lapic_disable_pmc,
+	.reenable_pmc		= xen_pv_lapic_reenable_pmc,
+	.enable_cmc		= xen_pv_lapic_enable_cmc,
+#ifdef SMP
+	.ipi_raw		= xen_pv_lapic_ipi_raw,
+	.ipi_vectored		= xen_pv_lapic_ipi_vectored,
+	.ipi_wait		= xen_pv_lapic_ipi_wait,
+#endif
+	.ipi_alloc		= xen_pv_lapic_ipi_alloc,
+	.ipi_free		= xen_pv_lapic_ipi_free,
+	.set_lvt_mask		= xen_pv_lapic_set_lvt_mask,
+	.set_lvt_mode		= xen_pv_lapic_set_lvt_mode,
+	.set_lvt_polarity	= xen_pv_lapic_set_lvt_polarity,
+	.set_lvt_triggermode	= xen_pv_lapic_set_lvt_triggermode,
+};
+
+#ifdef SMP
+/*---------------------------- XEN PV IPI Handlers ---------------------------*/
+/*
+ * These are C clones of the ASM functions found in apic_vector.
+ */
+static int
+xen_ipi_bitmap_handler(void *arg)
+{
+	struct trapframe *frame;
+
+	frame = arg;
+	ipi_bitmap_handler(*frame);
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_smp_rendezvous_action(void *arg)
+{
+#ifdef COUNT_IPIS
+	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	smp_rendezvous_action();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb(void *arg)
+{
+
+	invltlb_handler();
+	return (FILTER_HANDLED);
+}
+
+#ifdef __amd64__
+static int
+xen_invltlb_invpcid(void *arg)
+{
+
+	invltlb_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_pcid(void *arg)
+{
+
+	invltlb_pcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invltlb_invpcid_pti(void *arg)
+{
+
+	invltlb_invpcid_pti_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_invpcid_handler(void *arg)
+{
+
+	invlpg_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlpg_pcid_handler(void *arg)
+{
+
+	invlpg_pcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_invpcid_handler(void *arg)
+{
+
+	invlrng_invpcid_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng_pcid_handler(void *arg)
+{
+
+	invlrng_pcid_handler();
+	return (FILTER_HANDLED);
+}
+#endif
+
+static int
+xen_invlpg(void *arg)
+{
+
+	invlpg_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlrng(void *arg)
+{
+
+	invlrng_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_invlcache(void *arg)
+{
+
+	invlcache_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustop_handler(void *arg)
+{
+
+	cpustop_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpususpend_handler(void *arg)
+{
+
+	cpususpend_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_cpustophard_handler(void *arg)
+{
+
+	ipi_nmi_handler();
+	return (FILTER_HANDLED);
+}
+
+/*----------------------------- XEN PV IPI setup -----------------------------*/
+/*
+ * These functions are provided outside of the Xen PV APIC implementation
+ * so that PVHVM guests can also use PV IPIs without having an actual Xen PV
+ * APIC, because on PVHVM there's an emulated LAPIC provided by Xen.
+ */
+static void
+xen_cpu_ipi_init(int cpu)
+{
+	xen_intr_handle_t *ipi_handle;
+	const struct xen_ipi_handler *ipi;
+	int idx, rc;
+
+	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
+
+	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
+
+		if (ipi->filter == NULL) {
+			ipi_handle[idx] = NULL;
+			continue;
+		}
+
+		rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
+		    INTR_TYPE_TTY, &ipi_handle[idx]);
+		if (rc != 0)
+			panic("Unable to allocate a XEN IPI port");
+		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
+	}
+}
+
+static void
+xen_setup_cpus(void)
+{
+	int i;
+
+	if (!xen_vector_callback_enabled)
+		return;
+
+#ifdef __amd64__
+	if (pmap_pcid_enabled) {
+		if (pti)
+			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+			    invpcid_works ? xen_invltlb_invpcid_pti :
+			    xen_invltlb_pcid;
+		else
+			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
+			    invpcid_works ? xen_invltlb_invpcid :
+			    xen_invltlb_pcid;
+		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ?
+		    xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler;
+		xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ?
+		    xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler;
+	}
+#endif
+	CPU_FOREACH(i)
+		xen_cpu_ipi_init(i);
+
+	/* Set the xen pv ipi ops to replace the native ones */
+	if (xen_hvm_domain())
+		apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
+}
+
+/* We need to setup IPIs before APs are started */
+SYSINIT(xen_setup_cpus, SI_SUB_SMP-1, SI_ORDER_FIRST, xen_setup_cpus, NULL);
+#endif /* SMP */


Property changes on: trunk/sys/x86/xen/xen_apic.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
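
The PV IPI sender in xen_apic.c above replaces every APIC IPI vector
with a per-CPU event channel: IPI_TO_IDX() turns the vector into an
index into xen_ipis[], and the per-CPU handle bound for that index is
signalled on the destination (self, all, all-but-self, or one CPU).
A compact user-space sketch of the dispatch follows; signal_cpu() stands
in for xen_intr_signal(), and the vector base, table size and handle
values are illustrative only.

/*
 * xen_ipi_sketch.c -- sketch of the dispatch in
 * xen_pv_lapic_ipi_vectored().  All names and numbers here are
 * illustrative stand-ins; nothing talks to a real hypervisor.
 */
#include <stdio.h>

#define	NCPU		4
#define	APIC_IPI_INTS	0xf0		/* illustrative vector base */
#define	NIPI		9		/* size of the xen_ipis[] table */
#define	IPI_TO_IDX(ipi)	((ipi) - APIC_IPI_INTS)

enum { DEST_SELF = -1, DEST_ALL = -2, DEST_OTHERS = -3 };

/* Stand-in for the per-CPU DPCPU ipi_handle[] arrays. */
static int ipi_handle[NCPU][NIPI];

static void
signal_cpu(int cpu, int idx)
{

	printf("signal cpu%d, ipi index %d (handle %d)\n",
	    cpu, idx, ipi_handle[cpu][idx]);
}

static void
ipi_vectored(int vector, int dest, int self)
{
	int idx, cpu;

	idx = IPI_TO_IDX(vector);
	if (idx < 0 || idx >= NIPI)
		return;		/* the kernel panics on this instead */

	switch (dest) {
	case DEST_SELF:
		signal_cpu(self, idx);
		break;
	case DEST_ALL:
		for (cpu = 0; cpu < NCPU; cpu++)
			signal_cpu(cpu, idx);
		break;
	case DEST_OTHERS:
		for (cpu = 0; cpu < NCPU; cpu++)
			if (cpu != self)
				signal_cpu(cpu, idx);
		break;
	default:
		signal_cpu(dest, idx);
		break;
	}
}

int
main(void)
{
	int cpu, idx;

	/* Fake per-CPU handles, as xen_cpu_ipi_init() would bind them. */
	for (cpu = 0; cpu < NCPU; cpu++)
		for (idx = 0; idx < NIPI; idx++)
			ipi_handle[cpu][idx] = cpu * NIPI + idx;

	ipi_vectored(APIC_IPI_INTS, DEST_OTHERS, /* self */ 0);
	return (0);
}
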
Modified: trunk/sys/x86/xen/xen_intr.c
===================================================================
--- trunk/sys/x86/xen/xen_intr.c	2020-02-08 19:29:01 UTC (rev 12309)
+++ trunk/sys/x86/xen/xen_intr.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -2,7 +2,7 @@
 /******************************************************************************
  * xen_intr.c
  *
- * Xen event and interrupt services for x86 PV and HVM guests.
+ * Xen event and interrupt services for x86 HVM guests.
  *
  * Copyright (c) 2002-2005, K A Fraser
  * Copyright (c) 2005, Intel Corporation <xiaofeng.ling at intel.com>
@@ -31,8 +31,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/xen/xen_intr.c 291647 2015-12-02 12:58:20Z royger $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_intr.c 342656 2018-12-31 22:09:08Z jhb $");
 
+#include "opt_ddb.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
@@ -49,22 +51,30 @@
 #include <vm/pmap.h>
 
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <x86/apicreg.h>
 #include <machine/smp.h>
 #include <machine/stdarg.h>
 
 #include <machine/xen/synch_bitops.h>
 #include <machine/xen/xen-os.h>
-#include <machine/xen/xenvar.h>
 
+#include <xen/xen-os.h>
 #include <xen/hypervisor.h>
 #include <xen/xen_intr.h>
 #include <xen/evtchn/evtchnvar.h>
 
 #include <dev/xen/xenpci/xenpcivar.h>
+#include <dev/pci/pcivar.h>
 
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
 static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services");
 
+static u_int first_evtchn_irq;
+
 /**
  * Per-cpu event channel processing state.
  */
@@ -96,7 +106,7 @@
  * Start the scan at port 0 by initializing the last scanned
  * location as the highest numbered event channel port.
  */
-DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
+static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
 	.last_processed_l1i = LONG_BIT - 1,
 	.last_processed_l2i = LONG_BIT - 1
 };
@@ -103,8 +113,12 @@
 
 DPCPU_DECLARE(struct vcpu_info *, vcpu_info);
 
-#define is_valid_evtchn(x)	((x) != 0)
+#define	XEN_EEXIST		17 /* Xen "already exists" error */
+#define	XEN_ALLOCATE_VECTOR	0 /* Allocate a vector for this event channel */
+#define	XEN_INVALID_EVTCHN	0 /* Invalid event channel */
 
+#define	is_valid_evtchn(x)	((x) != XEN_INVALID_EVTCHN)
+
 struct xenisrc {
 	struct intsrc	xi_intsrc;
 	enum evtchn_type xi_type;
@@ -113,13 +127,13 @@
 	evtchn_port_t	xi_port;
 	int		xi_pirq;
 	int		xi_virq;
+	void		*xi_cookie;
 	u_int		xi_close:1;	/* close on unbind? */
-	u_int		xi_needs_eoi:1;
-	u_int		xi_shared:1;	/* Shared with other domains. */
+	u_int		xi_activehi:1;
+	u_int		xi_edgetrigger:1;
+	u_int		xi_masked:1;
 };
 
-#define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
-
 static void	xen_intr_suspend(struct pic *);
 static void	xen_intr_resume(struct pic *, bool suspend_cancelled);
 static void	xen_intr_enable_source(struct intsrc *isrc);
@@ -137,6 +151,9 @@
 static void	xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi);
 static void	xen_intr_pirq_eoi_source(struct intsrc *isrc);
 static void	xen_intr_pirq_enable_intr(struct intsrc *isrc);
+static void	xen_intr_pirq_disable_intr(struct intsrc *isrc);
+static int	xen_intr_pirq_config_intr(struct intsrc *isrc,
+		     enum intr_trigger trig, enum intr_polarity pol);
 
 /**
  * PIC interface for all event channel port types except physical IRQs.
@@ -160,22 +177,25 @@
  * physical interrupt sources.
  */
 struct pic xen_intr_pirq_pic = {
+#ifdef __amd64__
+	.pic_register_sources = xenpv_register_pirqs,
+#endif
 	.pic_enable_source  = xen_intr_pirq_enable_source,
 	.pic_disable_source = xen_intr_pirq_disable_source,
 	.pic_eoi_source     = xen_intr_pirq_eoi_source,
 	.pic_enable_intr    = xen_intr_pirq_enable_intr,
-	.pic_disable_intr   = xen_intr_disable_intr,
+	.pic_disable_intr   = xen_intr_pirq_disable_intr,
 	.pic_vector         = xen_intr_vector,
 	.pic_source_pending = xen_intr_source_pending,
-	.pic_suspend        = xen_intr_suspend,
-	.pic_resume         = xen_intr_resume,
-	.pic_config_intr    = xen_intr_config_intr,
+	.pic_config_intr    = xen_intr_pirq_config_intr,
 	.pic_assign_cpu     = xen_intr_assign_cpu
 };
 
-static struct mtx	xen_intr_isrc_lock;
-static int		xen_intr_isrc_count;
-static struct xenisrc  *xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static struct mtx	 xen_intr_isrc_lock;
+static u_int		 xen_intr_auto_vector_count;
+static struct xenisrc	*xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
+static u_long		*xen_intr_pirq_eoi_map;
+static boolean_t	 xen_intr_pirq_eoi_map_enabled;
 
 /*------------------------- Private Functions --------------------------------*/
 /**
@@ -197,7 +217,7 @@
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
-	clear_bit(port, pcpu->evtchn_enabled);
+	xen_clear_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
@@ -219,7 +239,7 @@
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
-	set_bit(port, pcpu->evtchn_enabled);
+	xen_set_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
@@ -257,11 +277,11 @@
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held"));
 
-	for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx ++) {
+	for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) {
 		struct xenisrc *isrc;
 		u_int vector;
 
-		vector = FIRST_EVTCHN_INT + isrc_idx;
+		vector = first_evtchn_irq + isrc_idx;
 		isrc = (struct xenisrc *)intr_lookup_source(vector);
 		if (isrc != NULL
 		 && isrc->xi_type == EVTCHN_TYPE_UNBOUND) {
@@ -283,15 +303,14 @@
  *          object or NULL.
  */
 static struct xenisrc *
-xen_intr_alloc_isrc(enum evtchn_type type)
+xen_intr_alloc_isrc(enum evtchn_type type, int vector)
 {
 	static int warned;
 	struct xenisrc *isrc;
-	int vector;
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held"));
 
-	if (xen_intr_isrc_count > NR_EVENT_CHANNELS) {
+	if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) {
 		if (!warned) {
 			warned = 1;
 			printf("xen_intr_alloc: Event channels exhausted.\n");
@@ -298,12 +317,19 @@
 		}
 		return (NULL);
 	}
-	vector = FIRST_EVTCHN_INT + xen_intr_isrc_count;
-	xen_intr_isrc_count++;
 
+	if (type != EVTCHN_TYPE_PIRQ) {
+		vector = first_evtchn_irq + xen_intr_auto_vector_count;
+		xen_intr_auto_vector_count++;
+	}
+
+	KASSERT((intr_lookup_source(vector) == NULL),
+	    ("Trying to use an already allocated vector"));
+
 	mtx_unlock(&xen_intr_isrc_lock);
 	isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO);
-	isrc->xi_intsrc.is_pic = &xen_intr_pic;
+	isrc->xi_intsrc.is_pic =
+	    (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic;
 	isrc->xi_vector = vector;
 	isrc->xi_type = type;
 	intr_register_source(&isrc->xi_intsrc);
@@ -345,6 +371,7 @@
 	isrc->xi_cpu = 0;
 	isrc->xi_type = EVTCHN_TYPE_UNBOUND;
 	isrc->xi_port = 0;
+	isrc->xi_cookie = NULL;
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 }
@@ -372,7 +399,7 @@
  */
 static int
 xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port,
-    enum evtchn_type type, device_t intr_owner, driver_filter_t filter,
+    enum evtchn_type type, const char *intr_owner, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags,
     xen_intr_handle_t *port_handlep)
 {
@@ -381,8 +408,8 @@
 
 	*isrcp = NULL;
 	if (port_handlep == NULL) {
-		device_printf(intr_owner,
-			      "xen_intr_bind_isrc: Bad event handle\n");
+		printf("%s: xen_intr_bind_isrc: Bad event handle\n",
+		    intr_owner);
 		return (EINVAL);
 	}
 
@@ -389,7 +416,7 @@
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = xen_intr_find_unused_isrc(type);
 	if (isrc == NULL) {
-		isrc = xen_intr_alloc_isrc(type);
+		isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR);
 		if (isrc == NULL) {
 			mtx_unlock(&xen_intr_isrc_lock);
 			return (ENOSPC);
@@ -399,17 +426,37 @@
 	xen_intr_port_to_isrc[local_port] = isrc;
 	mtx_unlock(&xen_intr_isrc_lock);
 
-	error = intr_add_handler(device_get_nameunit(intr_owner),
-				 isrc->xi_vector, filter, handler, arg,
-				 flags|INTR_EXCL, port_handlep);
+	/* Assign the opaque handler (the event channel port) */
+	*port_handlep = &isrc->xi_vector;
+
+#ifdef SMP
+	if (type == EVTCHN_TYPE_PORT) {
+		/*
+		 * By default all interrupts are assigned to vCPU#0
+		 * unless specified otherwise, so shuffle them to balance
+		 * the interrupt load.
+		 */
+		xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu());
+	}
+#endif
+
+	if (filter == NULL && handler == NULL) {
+		/*
+		 * No filter/handler provided, leave the event channel
+		 * masked and without a valid handler, the caller is
+		 * in charge of setting that up.
+		 */
+		*isrcp = isrc;
+		return (0);
+	}
+
+	error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags,
+	    *port_handlep);
 	if (error != 0) {
-		device_printf(intr_owner,
-			      "xen_intr_bind_irq: intr_add_handler failed\n");
 		xen_intr_release_isrc(isrc);
 		return (error);
 	}
 	*isrcp = isrc;
-	evtchn_unmask_port(local_port);
 	return (0);
 }
 
@@ -426,13 +473,17 @@
 static struct xenisrc *
 xen_intr_isrc(xen_intr_handle_t handle)
 {
-	struct intr_handler *ih;
+	int vector;
 
-	ih = handle;
-	if (ih == NULL || ih->ih_event == NULL)
+	if (handle == NULL)
 		return (NULL);
 
-	return (ih->ih_event->ie_source);
+	vector = *(int *)handle;
+	KASSERT(vector >= first_evtchn_irq &&
+	    vector < (first_evtchn_irq + xen_intr_auto_vector_count),
+	    ("Xen interrupt vector is out of range"));
+
+	return ((struct xenisrc *)intr_lookup_source(vector));
 }
 
 /**
@@ -451,6 +502,11 @@
 xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh,
     u_int idx)
 {
+
+	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0]));
+	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0]));
+	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending));
+	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled));
 	return (sh->evtchn_pending[idx]
 	      & ~sh->evtchn_mask[idx]
 	      & pcpu->evtchn_enabled[idx]);
@@ -570,8 +626,10 @@
 static int
 xen_intr_init(void *dummy __unused)
 {
+	shared_info_t *s = HYPERVISOR_shared_info;
 	struct xen_intr_pcpu_data *pcpu;
-	int i;
+	struct physdev_pirq_eoi_gmfn eoi_gmfn;
+	int i, rc;
 
 	if (!xen_domain())
 		return (0);
@@ -579,25 +637,65 @@
 	mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF);
 
 	/*
-	 * Register interrupt count manually as we aren't
-	 * guaranteed to see a call to xen_intr_assign_cpu()
-	 * before our first interrupt. Also set the per-cpu
-	 * mask of CPU#0 to enable all, since by default
-	 * all event channels are bound to CPU#0.
+	 * Set the per-cpu mask of CPU#0 to enable all, since by default all
+	 * event channels are bound to CPU#0.
 	 */
 	CPU_FOREACH(i) {
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
 		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
-		       sizeof(pcpu->evtchn_enabled));
-		xen_intr_intrcnt_add(i);
+		    sizeof(pcpu->evtchn_enabled));
 	}
 
+	for (i = 0; i < nitems(s->evtchn_mask); i++)
+		atomic_store_rel_long(&s->evtchn_mask[i], ~0);
+
+	/* Try to register PIRQ EOI map */
+	xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO);
+	eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map));
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
+	if (rc != 0 && bootverbose)
+		printf("Xen interrupts: unable to register PIRQ EOI map\n");
+	else
+		xen_intr_pirq_eoi_map_enabled = true;
+
 	intr_register_pic(&xen_intr_pic);
+	if (xen_pv_domain() && xen_initial_domain())
+		intr_register_pic(&xen_intr_pirq_pic);
 
+	if (bootverbose)
+		printf("Xen interrupt system initialized\n");
+
 	return (0);
 }
-SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intr_init, NULL);
+SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
 
+static void
+xen_intrcnt_init(void *dummy __unused)
+{
+	unsigned int i;
+
+	if (!xen_domain())
+		return;
+
+	/*
+	 * Register interrupt count manually as we aren't guaranteed to see a
+	 * call to xen_intr_assign_cpu() before our first interrupt.
+	 */
+	CPU_FOREACH(i)
+		xen_intr_intrcnt_add(i);
+}
+SYSINIT(xen_intrcnt_init, SI_SUB_INTR, SI_ORDER_MIDDLE, xen_intrcnt_init, NULL);
+
+void
+xen_intr_alloc_irqs(void)
+{
+
+	if (num_io_irqs > UINT_MAX - NR_EVENT_CHANNELS)
+		panic("IRQ allocation overflow (num_msi_irqs too high?)");
+	first_evtchn_irq = num_io_irqs;
+	num_io_irqs += NR_EVENT_CHANNELS;
+}
+
 /*--------------------------- Common PIC Functions ---------------------------*/
 /**
  * Prepare this PIC for system suspension.
@@ -685,8 +783,8 @@
 		struct xen_intr_pcpu_data *pcpu;
 
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
-		memset(pcpu->evtchn_enabled,
-		       i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled));
+		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
+		    sizeof(pcpu->evtchn_enabled));
 	}
 
 	/* Mask all event channels. */
@@ -697,10 +795,10 @@
 	memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc));
 
 	/* Free unused isrcs and rebind VIRQs and IPIs */
-	for (isrc_idx = 0; isrc_idx < xen_intr_isrc_count; isrc_idx++) {
+	for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) {
 		u_int vector;
 
-		vector = FIRST_EVTCHN_INT + isrc_idx;
+		vector = first_evtchn_irq + isrc_idx;
 		isrc = (struct xenisrc *)intr_lookup_source(vector);
 		if (isrc != NULL) {
 			isrc->xi_port = 0;
@@ -712,7 +810,6 @@
 				xen_rebind_virq(isrc);
 				break;
 			default:
-				isrc->xi_cpu = 0;
 				break;
 			}
 		}
@@ -798,16 +895,13 @@
 	struct evtchn_bind_vcpu bind_vcpu;
 	struct xenisrc *isrc;
 	u_int to_cpu, vcpu_id;
-	int error;
+	int error, masked;
 
-#ifdef XENHVM
 	if (xen_vector_callback_enabled == 0)
 		return (EOPNOTSUPP);
-#endif
 
 	to_cpu = apic_cpuid(apic_id);
 	vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
-	xen_intr_intrcnt_add(to_cpu);
 
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = (struct xenisrc *)base_isrc;
@@ -816,6 +910,11 @@
 		return (EINVAL);
 	}
 
+	/*
+	 * Mask the event channel while binding it to prevent interrupt
+	 * delivery with an inconsistent state in isrc->xi_cpu.
+	 */
+	masked = evtchn_test_and_set_mask(isrc->xi_port);
 	if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) ||
 		(isrc->xi_type == EVTCHN_TYPE_IPI)) {
 		/*
@@ -826,18 +925,12 @@
 		evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 		isrc->xi_cpu = to_cpu;
 		evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
-		mtx_unlock(&xen_intr_isrc_lock);
-		return (0);
+		goto out;
 	}
 
 	bind_vcpu.port = isrc->xi_port;
 	bind_vcpu.vcpu = vcpu_id;
 
-	/*
-	 * Allow interrupts to be fielded on the new VCPU before
-	 * we ask the hypervisor to deliver them there.
-	 */
-	evtchn_cpu_unmask_port(to_cpu, isrc->xi_port);
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
 	if (isrc->xi_cpu != to_cpu) {
 		if (error == 0) {
@@ -844,11 +937,13 @@
 			/* Commit to new binding by removing the old one. */
 			evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 			isrc->xi_cpu = to_cpu;
-		} else {
-			/* Roll-back to previous binding. */
-			evtchn_cpu_mask_port(to_cpu, isrc->xi_port);
+			evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
 		}
 	}
+
+out:
+	if (masked == 0)
+		evtchn_unmask_port(isrc->xi_port);
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 #else
@@ -865,8 +960,21 @@
  *              acknowledgements.
  */
 static void
-xen_intr_disable_source(struct intsrc *isrc, int eoi)
+xen_intr_disable_source(struct intsrc *base_isrc, int eoi)
 {
+	struct xenisrc *isrc;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	/*
+	 * NB: checking if the event channel is already masked is
+	 * needed because the event channel user-space device
+	 * masks event channels on its filter as part of its
+	 * normal operation, and those shouldn't be automatically
+	 * unmasked by the generic interrupt code. The event channel
+	 * device will unmask them when needed.
+	 */
+	isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port);
 }
 
 /*
@@ -875,8 +983,14 @@
  * \param isrc  The interrupt source to unmask (if necessary).
  */
 static void
-xen_intr_enable_source(struct intsrc *isrc)
+xen_intr_enable_source(struct intsrc *base_isrc)
 {
+	struct xenisrc *isrc;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	if (isrc->xi_masked == 0)
+		evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
@@ -885,7 +999,7 @@
  * \param isrc  The interrupt source to EOI.
  */
 static void
-xen_intr_eoi_source(struct intsrc *isrc)
+xen_intr_eoi_source(struct intsrc *base_isrc)
 {
 }
 
@@ -916,7 +1030,11 @@
 	struct xenisrc *isrc;
 
 	isrc = (struct xenisrc *)base_isrc;
-	evtchn_mask_port(isrc->xi_port);
+
+	if (isrc->xi_edgetrigger == 0)
+		evtchn_mask_port(isrc->xi_port);
+	if (eoi == PIC_EOI)
+		xen_intr_pirq_eoi_source(base_isrc);
 }
 
 /*
@@ -930,7 +1048,9 @@
 	struct xenisrc *isrc;
 
 	isrc = (struct xenisrc *)base_isrc;
-	evtchn_unmask_port(isrc->xi_port);
+
+	if (isrc->xi_edgetrigger == 0)
+		evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
@@ -942,13 +1062,17 @@
 xen_intr_pirq_eoi_source(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc;
+	int error;
 
-	/* XXX Use shared page of flags for this. */
 	isrc = (struct xenisrc *)base_isrc;
-	if (isrc->xi_needs_eoi != 0) {
+
+	if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) {
 		struct physdev_eoi eoi = { .irq = isrc->xi_pirq };
 
-		(void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+		if (error != 0)
+			panic("Unable to EOI PIRQ#%d: %d\n",
+			    isrc->xi_pirq, error);
 	}
 }
 
@@ -958,10 +1082,118 @@
  * \param isrc  The interrupt source to enable.
  */
 static void
-xen_intr_pirq_enable_intr(struct intsrc *isrc)
+xen_intr_pirq_enable_intr(struct intsrc *base_isrc)
 {
+	struct xenisrc *isrc;
+	struct evtchn_bind_pirq bind_pirq;
+	struct physdev_irq_status_query irq_status;
+	int error;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	if (!xen_intr_pirq_eoi_map_enabled) {
+		irq_status.irq = isrc->xi_pirq;
+		error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query,
+		    &irq_status);
+		if (error)
+			panic("unable to get status of IRQ#%d", isrc->xi_pirq);
+
+		if (irq_status.flags & XENIRQSTAT_needs_eoi) {
+			/*
+			 * Since the dynamic PIRQ EOI map is not available,
+			 * mark the PIRQ as needing EOI unconditionally.
+			 */
+			xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map);
+		}
+	}
+
+	bind_pirq.pirq = isrc->xi_pirq;
+	bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE;
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+	if (error)
+		panic("unable to bind IRQ#%d", isrc->xi_pirq);
+
+	isrc->xi_port = bind_pirq.port;
+
+	mtx_lock(&xen_intr_isrc_lock);
+	KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL),
+	    ("trying to override an already setup event channel port"));
+	xen_intr_port_to_isrc[bind_pirq.port] = isrc;
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	evtchn_unmask_port(isrc->xi_port);
 }
 
+/*
+ * Disable an interrupt source.
+ *
+ * \param isrc  The interrupt source to disable.
+ */
+static void
+xen_intr_pirq_disable_intr(struct intsrc *base_isrc)
+{
+	struct xenisrc *isrc;
+	struct evtchn_close close;
+	int error;
+
+	isrc = (struct xenisrc *)base_isrc;
+
+	evtchn_mask_port(isrc->xi_port);
+
+	close.port = isrc->xi_port;
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+	if (error)
+		panic("unable to close event channel %d IRQ#%d",
+		    isrc->xi_port, isrc->xi_pirq);
+
+	mtx_lock(&xen_intr_isrc_lock);
+	xen_intr_port_to_isrc[isrc->xi_port] = NULL;
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	isrc->xi_port = 0;
+}
+
+/**
+ * Perform configuration of an interrupt source.
+ *
+ * \param isrc  The interrupt source to configure.
+ * \param trig  Edge or level.
+ * \param pol   Active high or low.
+ *
+ * \returns  0 if no events are pending, otherwise non-zero.
+ */
+static int
+xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig,
+    enum intr_polarity pol)
+{
+	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
+	struct physdev_setup_gsi setup_gsi;
+	int error;
+
+	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
+	    ("%s: Conforming trigger or polarity\n", __func__));
+
+	setup_gsi.gsi = isrc->xi_pirq;
+	setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1;
+	setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1;
+
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+	if (error == -XEN_EEXIST) {
+		if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) ||
+		    (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH)))
+			panic("unable to reconfigure interrupt IRQ#%d",
+			    isrc->xi_pirq);
+		error = 0;
+	}
+	if (error)
+		panic("unable to configure IRQ#%d\n", isrc->xi_pirq);
+
+	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+	return (0);
+}
+
 /*--------------------------- Public Functions -------------------------------*/
 /*------- API comments for these methods can be found in xen/xenintr.h -------*/
 int
@@ -972,8 +1204,9 @@
 	struct xenisrc *isrc;
 	int error;
 
-	error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, dev,
-		    filter, handler, arg, flags, port_handlep);
+	error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT,
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 	if (error != 0)
 		return (error);
 
@@ -1007,8 +1240,8 @@
 	}
 
 	error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT,
-				 dev, filter, handler, arg, flags,
-				 port_handlep);
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 	if (error != 0) {
 		evtchn_close_t close = { .port = alloc_unbound.port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1042,8 +1275,8 @@
 	}
 
 	error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port,
-				 EVTCHN_TYPE_PORT, dev, filter, handler,
-				 arg, flags, port_handlep);
+	    EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg,
+	    flags, port_handlep);
 	if (error) {
 		evtchn_close_t close = { .port = bind_interdomain.local_port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -1069,9 +1302,6 @@
 	struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
 	int error;
 
-	/* Ensure the target CPU is ready to handle evtchn interrupts. */
-	xen_intr_intrcnt_add(cpu);
-
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
 	if (error != 0) {
@@ -1082,8 +1312,9 @@
 		return (-error);
 	}
 
-	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, dev,
-				 filter, handler, arg, flags, port_handlep);
+	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
+	    device_get_nameunit(dev), filter, handler, arg, flags,
+	    port_handlep);
 
 #ifdef SMP
 	if (error == 0)
@@ -1122,19 +1353,17 @@
 }
 
 int
-xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu,
-    driver_filter_t filter, enum intr_type flags,
-    xen_intr_handle_t *port_handlep)
+xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter,
+    enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 #ifdef SMP
 	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
 	struct xenisrc *isrc;
 	struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
+	/* Same size as the one used by intr_handler->ih_name. */
+	char name[MAXCOMLEN + 1];
 	int error;
 
-	/* Ensure the target CPU is ready to handle evtchn interrupts. */
-	xen_intr_intrcnt_add(cpu);
-
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
 	if (error != 0) {
@@ -1145,12 +1374,10 @@
 		return (-error);
 	}
 
+	snprintf(name, sizeof(name), "cpu%u", cpu);
+
 	error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI,
-	                           dev, filter, NULL, NULL, flags,
-	                           port_handlep);
-	if (error == 0)
-		error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
-
+	    name, filter, NULL, NULL, flags, port_handlep);
 	if (error != 0) {
 		evtchn_close_t close = { .port = bind_ipi.port };
 
@@ -1182,6 +1409,101 @@
 }
 
 int
+xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol)
+{
+	struct physdev_map_pirq map_pirq;
+	struct xenisrc *isrc;
+	int error;
+
+	if (vector == 0)
+		return (EINVAL);
+
+	if (bootverbose)
+		printf("xen: register IRQ#%d\n", vector);
+
+	map_pirq.domid = DOMID_SELF;
+	map_pirq.type = MAP_PIRQ_TYPE_GSI;
+	map_pirq.index = vector;
+	map_pirq.pirq = vector;
+
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq);
+	if (error) {
+		printf("xen: unable to map IRQ#%d\n", vector);
+		return (error);
+	}
+
+	mtx_lock(&xen_intr_isrc_lock);
+	isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector);
+	mtx_unlock(&xen_intr_isrc_lock);
+	KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt"));
+	isrc->xi_pirq = vector;
+	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
+	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
+
+	return (0);
+}
+
+int
+xen_register_msi(device_t dev, int vector, int count)
+{
+	struct physdev_map_pirq msi_irq;
+	struct xenisrc *isrc;
+	int ret;
+
+	memset(&msi_irq, 0, sizeof(msi_irq));
+	msi_irq.domid = DOMID_SELF;
+	msi_irq.type = count == 1 ?
+	    MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI;
+	msi_irq.index = -1;
+	msi_irq.pirq = -1;
+	msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16);
+	msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev);
+	msi_irq.entry_nr = count;
+
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq);
+	if (ret != 0)
+		return (ret);
+	if (count != msi_irq.entry_nr) {
+		panic("unable to setup all requested MSI vectors "
+		    "(expected %d got %d)", count, msi_irq.entry_nr);
+	}
+
+	mtx_lock(&xen_intr_isrc_lock);
+	for (int i = 0; i < count; i++) {
+		isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i);
+		KASSERT(isrc != NULL,
+		    ("xen: unable to allocate isrc for interrupt"));
+		isrc->xi_pirq = msi_irq.pirq + i;
+		/* MSI interrupts are always edge triggered */
+		isrc->xi_edgetrigger = 1;
+	}
+	mtx_unlock(&xen_intr_isrc_lock);
+
+	return (0);
+}
+
+int
+xen_release_msi(int vector)
+{
+	struct physdev_unmap_pirq unmap;
+	struct xenisrc *isrc;
+	int ret;
+
+	isrc = (struct xenisrc *)intr_lookup_source(vector);
+	if (isrc == NULL)
+		return (ENXIO);
+
+	unmap.pirq = isrc->xi_pirq;
+	ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
+	if (ret != 0)
+		return (ret);
+
+	xen_intr_release_isrc(isrc);
+
+	return (0);
+}
+
+int
 xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...)
 {
 	char descr[MAXCOMLEN + 1];
@@ -1195,22 +1517,24 @@
 	va_start(ap, fmt);
 	vsnprintf(descr, sizeof(descr), fmt, ap);
 	va_end(ap);
-	return (intr_describe(isrc->xi_vector, port_handle, descr));
+	return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr));
 }
 
 void
 xen_intr_unbind(xen_intr_handle_t *port_handlep)
 {
-	struct intr_handler *handler;
 	struct xenisrc *isrc;
 
-	handler = *port_handlep;
+	KASSERT(port_handlep != NULL,
+	    ("NULL xen_intr_handle_t passed to xen_intr_unbind"));
+
+	isrc = xen_intr_isrc(*port_handlep);
 	*port_handlep = NULL;
-	isrc = xen_intr_isrc(handler);
 	if (isrc == NULL)
 		return;
 
-	intr_remove_handler(handler);
+	if (isrc->xi_cookie != NULL)
+		intr_remove_handler(isrc->xi_cookie);
 	xen_intr_release_isrc(isrc);
 }
 
@@ -1240,3 +1564,96 @@
 	
 	return (isrc->xi_port);
 }
+
+int
+xen_intr_add_handler(const char *name, driver_filter_t filter,
+    driver_intr_t handler, void *arg, enum intr_type flags,
+    xen_intr_handle_t handle)
+{
+	struct xenisrc *isrc;
+	int error;
+
+	isrc = xen_intr_isrc(handle);
+	if (isrc == NULL || isrc->xi_cookie != NULL)
+		return (EINVAL);
+
+	error = intr_add_handler(name, isrc->xi_vector, filter, handler, arg,
+	    flags|INTR_EXCL, &isrc->xi_cookie);
+	if (error != 0) {
+		printf(
+		    "%s: xen_intr_add_handler: intr_add_handler failed: %d\n",
+		    name, error);
+	}
+
+	return (error);
+}
+
+#ifdef DDB
+static const char *
+xen_intr_print_type(enum evtchn_type type)
+{
+	static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = {
+		[EVTCHN_TYPE_UNBOUND]	= "UNBOUND",
+		[EVTCHN_TYPE_PIRQ]	= "PIRQ",
+		[EVTCHN_TYPE_VIRQ]	= "VIRQ",
+		[EVTCHN_TYPE_IPI]	= "IPI",
+		[EVTCHN_TYPE_PORT]	= "PORT",
+	};
+
+	if (type >= EVTCHN_TYPE_COUNT)
+		return ("UNKNOWN");
+
+	return (evtchn_type_to_string[type]);
+}
+
+static void
+xen_intr_dump_port(struct xenisrc *isrc)
+{
+	struct xen_intr_pcpu_data *pcpu;
+	shared_info_t *s = HYPERVISOR_shared_info;
+	int i;
+
+	db_printf("Port %d Type: %s\n",
+	    isrc->xi_port, xen_intr_print_type(isrc->xi_type));
+	if (isrc->xi_type == EVTCHN_TYPE_PIRQ) {
+		db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d "
+		    "NeedsEOI: %d\n",
+		    isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger,
+		    !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map));
+	}
+	if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
+		db_printf("\tVirq: %d\n", isrc->xi_virq);
+
+	db_printf("\tMasked: %d Pending: %d\n",
+	    !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]),
+	    !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0]));
+
+	db_printf("\tPer-CPU Masks: ");
+	CPU_FOREACH(i) {
+		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
+		db_printf("cpu#%d: %d ", i,
+		    !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled));
+	}
+	db_printf("\n");
+}
+
+DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn)
+{
+	int i;
+
+	if (!xen_domain()) {
+		db_printf("Only available on Xen guests\n");
+		return;
+	}
+
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		struct xenisrc *isrc;
+
+		isrc = xen_intr_port_to_isrc[i];
+		if (isrc == NULL)
+			continue;
+
+		xen_intr_dump_port(isrc);
+	}
+}
+#endif /* DDB */
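
The interrupt handle returned to callers is now a pointer to the isrc vector rather than a struct intr_handler, and handler registration can be deferred via xen_intr_add_handler().  A minimal consumer sketch follows; it is not part of this commit, uses hypothetical names (example_softc, example_filter), and assumes the xen/xen_intr.h API (xen_intr_bind_local_port, xen_intr_describe, xen_intr_unbind):

struct example_softc {
	xen_intr_handle_t	 xen_intr_handle;
};

static driver_filter_t	example_filter;	/* hypothetical filter, defined elsewhere */

static int
example_bind(device_t dev, evtchn_port_t port, struct example_softc *sc)
{
	int error;

	/* The returned handle is now a pointer to the isrc's vector. */
	error = xen_intr_bind_local_port(dev, port, example_filter, NULL,
	    sc, INTR_TYPE_MISC | INTR_MPSAFE, &sc->xen_intr_handle);
	if (error != 0)
		return (error);

	/* The description is applied to the cookie stored in the isrc. */
	xen_intr_describe(sc->xen_intr_handle, "%s", device_get_nameunit(dev));
	return (0);
}

static void
example_unbind(struct example_softc *sc)
{

	/* Removes the handler (if any) and releases the event channel. */
	xen_intr_unbind(&sc->xen_intr_handle);
}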

Added: trunk/sys/x86/xen/xen_msi.c
===================================================================
--- trunk/sys/x86/xen/xen_msi.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_msi.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,134 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_msi.c 344912 2019-03-08 01:04:19Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/systm.h>
+#include <x86/apicreg.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <machine/specialreg.h>
+#include <dev/pci/pcivar.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+static struct mtx msi_lock;
+static u_int msi_last_irq;
+
+void
+xen_msi_init(void)
+{
+
+	MPASS(num_io_irqs > 0);
+	first_msi_irq = min(MINIMUM_MSI_INT, num_io_irqs);
+	if (num_msi_irqs > UINT_MAX - first_msi_irq)
+		panic("num_msi_irqs too high");
+	num_io_irqs = first_msi_irq + num_msi_irqs;
+
+	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
+}
+
+/*
+ * Try to allocate 'count' interrupt sources with contiguous IDT values.
+ */
+int
+xen_msi_alloc(device_t dev, int count, int maxcount, int *irqs)
+{
+	int i, ret = 0;
+
+	mtx_lock(&msi_lock);
+
+	/* If we would exceed the max, give up. */
+	if (msi_last_irq + count > num_msi_irqs) {
+		mtx_unlock(&msi_lock);
+		return (ENXIO);
+	}
+
+	/* Allocate MSI vectors */
+	for (i = 0; i < count; i++)
+		irqs[i] = first_msi_irq + msi_last_irq++;
+
+	mtx_unlock(&msi_lock);
+
+	ret = xen_register_msi(dev, irqs[0], count);
+	if (ret != 0)
+		return (ret);
+
+	for (i = 0; i < count; i++)
+		nexus_add_irq(irqs[i]);
+
+	return (0);
+}
+
+int
+xen_msi_release(int *irqs, int count)
+{
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		ret = xen_release_msi(irqs[i]);
+		if (ret != 0)
+			return (ret);
+	}
+
+	return (0);
+}
+
+int
+xen_msi_map(int irq, uint64_t *addr, uint32_t *data)
+{
+
+	return (0);
+}
+
+int
+xen_msix_alloc(device_t dev, int *irq)
+{
+
+	return (ENXIO);
+}
+
+int
+xen_msix_release(int irq)
+{
+
+	return (ENOENT);
+}
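
A rough usage sketch, not part of this commit: a PCI driver in a PV dom0 reaches the allocator above through the standard pci(9) interface.  The driver context is hypothetical and error handling is abbreviated.

static int
example_msi_attach(device_t dev)
{
	int count, error;

	count = pci_msi_count(dev);
	if (count == 0)
		return (ENXIO);

	/*
	 * pci_alloc_msi() ends up in nexus_xen_alloc_msi() ->
	 * xen_msi_alloc(): IRQ numbers are handed out contiguously
	 * starting at first_msi_irq, and xen_register_msi() maps the
	 * backing PIRQs with a single PHYSDEVOP_map_pirq hypercall.
	 */
	error = pci_alloc_msi(dev, &count);
	if (error != 0)
		return (error);

	device_printf(dev, "using %d MSI vector(s)\n", count);
	return (0);
}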


Property changes on: trunk/sys/x86/xen/xen_msi.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xen_nexus.c
===================================================================
--- trunk/sys/x86/xen/xen_nexus.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_nexus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,168 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_nexus.c 340016 2018-11-01 18:34:26Z jhb $");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include <x86/init.h>
+#include <machine/nexusvar.h>
+#include <machine/intr_machdep.h>
+
+#include <xen/xen-os.h>
+#include <xen/xen_intr.h>
+#include <xen/xen_msi.h>
+
+#include "pcib_if.h"
+
+/*
+ * Xen nexus(4) driver.
+ */
+static int
+nexus_xen_probe(device_t dev)
+{
+
+	if (!xen_pv_domain())
+		return (ENXIO);
+
+	return (BUS_PROBE_SPECIFIC);
+}
+
+static int
+nexus_xen_attach(device_t dev)
+{
+	int error;
+	device_t acpi_dev = NULL;
+
+	nexus_init_resources();
+	bus_generic_probe(dev);
+
+	if (xen_initial_domain()) {
+		/* Disable some ACPI devices that are not usable by Dom0 */
+		acpi_cpu_disabled = true;
+		acpi_hpet_disabled = true;
+		acpi_timer_disabled = true;
+
+		acpi_dev = BUS_ADD_CHILD(dev, 10, "acpi", 0);
+		if (acpi_dev == NULL)
+			panic("Unable to add ACPI bus to Xen Dom0");
+	}
+
+	error = bus_generic_attach(dev);
+	if (xen_initial_domain() && (error == 0))
+		acpi_install_wakeup_handler(device_get_softc(acpi_dev));
+
+	return (error);
+}
+
+static int
+nexus_xen_config_intr(device_t dev, int irq, enum intr_trigger trig,
+    enum intr_polarity pol)
+{
+	int ret;
+
+	/*
+	 * ISA and PCI intline IRQs are not preregistered on Xen, so
+	 * intercept calls to configure those and register them on the fly.
+	 */
+	if ((irq < first_msi_irq) && (intr_lookup_source(irq) == NULL)) {
+		ret = xen_register_pirq(irq, trig, pol);
+		if (ret != 0)
+			return (ret);
+		nexus_add_irq(irq);
+	}
+	return (intr_config_intr(irq, trig, pol));
+}
+
+static int
+nexus_xen_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+
+	return (xen_msix_alloc(dev, irq));
+}
+
+static int
+nexus_xen_release_msix(device_t pcib, device_t dev, int irq)
+{
+
+	return (xen_msix_release(irq));
+}
+
+static int
+nexus_xen_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
+{
+
+	return (xen_msi_alloc(dev, count, maxcount, irqs));
+}
+
+static int
+nexus_xen_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+
+	return (xen_msi_release(irqs, count));
+}
+
+static int
+nexus_xen_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data)
+{
+
+	return (xen_msi_map(irq, addr, data));
+}
+
+static device_method_t nexus_xen_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		nexus_xen_probe),
+	DEVMETHOD(device_attach,	nexus_xen_attach),
+
+	/* INTR */
+	DEVMETHOD(bus_config_intr,	nexus_xen_config_intr),
+
+	/* MSI */
+	DEVMETHOD(pcib_alloc_msi,	nexus_xen_alloc_msi),
+	DEVMETHOD(pcib_release_msi,	nexus_xen_release_msi),
+	DEVMETHOD(pcib_alloc_msix,	nexus_xen_alloc_msix),
+	DEVMETHOD(pcib_release_msix,	nexus_xen_release_msix),
+	DEVMETHOD(pcib_map_msi,		nexus_xen_map_msi),
+
+	{ 0, 0 }
+};
+
+DEFINE_CLASS_1(nexus, nexus_xen_driver, nexus_xen_methods, 1, nexus_driver);
+static devclass_t nexus_devclass;
+
+DRIVER_MODULE(nexus_xen, root, nexus_xen_driver, nexus_devclass, 0, 0);
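
For illustration only, the on-the-fly PIRQ registration done by nexus_xen_config_intr() is equivalent to the hand-unrolled sketch below for a single legacy GSI; the IRQ number (9, a common ACPI SCI) is an assumption, not taken from this commit.

static int
example_register_gsi(void)
{
	int error;

	/* Map GSI 9 to a PIRQ and register it as an interrupt source. */
	error = xen_register_pirq(9, INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW);
	if (error != 0)
		return (error);
	nexus_add_irq(9);

	/* The normal x86 interrupt code takes over from here. */
	return (intr_config_intr(9, INTR_TRIGGER_LEVEL, INTR_POLARITY_LOW));
}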


Property changes on: trunk/sys/x86/xen/xen_nexus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xen_pci_bus.c
===================================================================
--- trunk/sys/x86/xen/xen_pci_bus.c	                        (rev 0)
+++ trunk/sys/x86/xen/xen_pci_bus.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,91 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xen_pci_bus.c 275649 2014-12-09 18:03:25Z royger $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+
+#include <xen/xen-os.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_pci.h>
+
+#include "pcib_if.h"
+#include "pci_if.h"
+
+void
+xen_pci_enable_msi_method(device_t dev, device_t child, uint64_t address,
+     uint16_t data)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(child);
+	struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+	/* Enable MSI in the control register. */
+	msi->msi_ctrl |= PCIM_MSICTRL_MSI_ENABLE;
+	pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+	    msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_disable_msi_method(device_t dev, device_t child)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(child);
+	struct pcicfg_msi *msi = &dinfo->cfg.msi;
+
+	msi->msi_ctrl &= ~PCIM_MSICTRL_MSI_ENABLE;
+	pci_write_config(child, msi->msi_location + PCIR_MSI_CTRL,
+	    msi->msi_ctrl, 2);
+}
+
+void
+xen_pci_child_added_method(device_t dev, device_t child)
+{
+	struct pci_devinfo *dinfo;
+	struct physdev_pci_device_add add_pci;
+	int error;
+
+	dinfo = device_get_ivars(child);
+	KASSERT((dinfo != NULL),
+	    ("xen_pci_add_child_method called with NULL dinfo"));
+
+	bzero(&add_pci, sizeof(add_pci));
+	add_pci.seg = dinfo->cfg.domain;
+	add_pci.bus = dinfo->cfg.bus;
+	add_pci.devfn = (dinfo->cfg.slot << 3) | dinfo->cfg.func;
+	error = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add_pci);
+	if (error)
+		panic("unable to add device bus %u devfn %u error: %d\n",
+		    add_pci.bus, add_pci.devfn, error);
+}
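
A small worked example of the devfn packing used by xen_pci_child_added_method() above; the slot and function values are illustrative.

static inline uint32_t
example_pci_devfn(uint32_t slot, uint32_t func)
{

	/* slot 3, function 1 -> (3 << 3) | 1 == 0x19 */
	return ((slot << 3) | (func & 0x7));
}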


Property changes on: trunk/sys/x86/xen/xen_pci_bus.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/xen/xenpv.c
===================================================================
--- trunk/sys/x86/xen/xenpv.c	                        (rev 0)
+++ trunk/sys/x86/xen/xenpv.c	2020-02-08 19:32:41 UTC (rev 12310)
@@ -0,0 +1,203 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2014 Roger Pau Monné <roger.pau at citrix.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/x86/xen/xenpv.c 331017 2018-03-15 19:08:33Z kevans $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/pcpu.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+#include <sys/limits.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+
+#include <xen/xen-os.h>
+#include <xen/gnttab.h>
+
+#include "xenmem_if.h"
+
+/*
+ * Allocate unused physical memory above 4GB in order to map memory
+ * from foreign domains. We use memory starting at 4GB in order to
+ * prevent clashes with MMIO/ACPI regions.
+ *
+ * Since this is not possible on i386, just use any available memory
+ * chunk and hope we don't clash with anything else.
+ */
+#ifdef __amd64__
+#define LOW_MEM_LIMIT	0x100000000ul
+#else
+#define LOW_MEM_LIMIT	0
+#endif
+
+static devclass_t xenpv_devclass;
+
+static void
+xenpv_identify(driver_t *driver, device_t parent)
+{
+	if (!xen_domain())
+		return;
+
+	/* Make sure there's only one xenpv device. */
+	if (devclass_get_device(xenpv_devclass, 0))
+		return;
+
+	/*
+	 * The xenpv bus should be the last to attach in order
+	 * to properly detect if an ISA bus has already been added.
+	 */
+	if (BUS_ADD_CHILD(parent, UINT_MAX, "xenpv", 0) == NULL)
+		panic("Unable to attach xenpv bus.");
+}
+
+static int
+xenpv_probe(device_t dev)
+{
+
+	device_set_desc(dev, "Xen PV bus");
+	return (BUS_PROBE_NOWILDCARD);
+}
+
+static int
+xenpv_attach(device_t dev)
+{
+	device_t child;
+
+	/*
+	 * Let our child drivers identify any child devices that they
+	 * can find.  Once that is done, attach any devices that we
+	 * found.
+	 */
+	bus_generic_probe(dev);
+	bus_generic_attach(dev);
+
+	if (!devclass_get_device(devclass_find("isa"), 0)) {
+		child = BUS_ADD_CHILD(dev, 0, "isa", 0);
+		if (child == NULL)
+			panic("Failed to attach ISA bus.");
+		device_probe_and_attach(child);
+	}
+
+	return (0);
+}
+
+static struct resource *
+xenpv_alloc_physmem(device_t dev, device_t child, int *res_id, size_t size)
+{
+	struct resource *res;
+	vm_paddr_t phys_addr;
+	int error;
+
+	res = bus_alloc_resource(child, SYS_RES_MEMORY, res_id, LOW_MEM_LIMIT,
+	    ~0, size, RF_ACTIVE);
+	if (res == NULL)
+		return (NULL);
+
+	phys_addr = rman_get_start(res);
+	error = vm_phys_fictitious_reg_range(phys_addr, phys_addr + size,
+	    VM_MEMATTR_DEFAULT);
+	if (error) {
+		bus_release_resource(child, SYS_RES_MEMORY, *res_id, res);
+		return (NULL);
+	}
+
+	return (res);
+}
+
+static int
+xenpv_free_physmem(device_t dev, device_t child, int res_id, struct resource *res)
+{
+	vm_paddr_t phys_addr;
+	size_t size;
+
+	phys_addr = rman_get_start(res);
+	size = rman_get_size(res);
+
+	vm_phys_fictitious_unreg_range(phys_addr, phys_addr + size);
+	return (bus_release_resource(child, SYS_RES_MEMORY, res_id, res));
+}
+
+static device_method_t xenpv_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_identify,		xenpv_identify),
+	DEVMETHOD(device_probe,			xenpv_probe),
+	DEVMETHOD(device_attach,		xenpv_attach),
+	DEVMETHOD(device_suspend,		bus_generic_suspend),
+	DEVMETHOD(device_resume,		bus_generic_resume),
+
+	/* Bus interface */
+	DEVMETHOD(bus_add_child,		bus_generic_add_child),
+	DEVMETHOD(bus_alloc_resource,		bus_generic_alloc_resource),
+	DEVMETHOD(bus_release_resource,		bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),
+
+	/* Interface to allocate memory for foreign mappings */
+	DEVMETHOD(xenmem_alloc,			xenpv_alloc_physmem),
+	DEVMETHOD(xenmem_free,			xenpv_free_physmem),
+
+	DEVMETHOD_END
+};
+
+static driver_t xenpv_driver = {
+	"xenpv",
+	xenpv_methods,
+	0,
+};
+
+DRIVER_MODULE(xenpv, nexus, xenpv_driver, xenpv_devclass, 0, 0);
+
+struct resource *
+xenmem_alloc(device_t dev, int *res_id, size_t size)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (parent == NULL)
+		return (NULL);
+	return (XENMEM_ALLOC(parent, dev, res_id, size));
+}
+
+int
+xenmem_free(device_t dev, int res_id, struct resource *res)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (parent == NULL)
+		return (ENXIO);
+	return (XENMEM_FREE(parent, dev, res_id, res));
+}
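
To show how the new xenmem interface is meant to be consumed, here is an illustrative sketch (not part of this commit) of a xenpv child reserving a physical address hole for foreign mappings; the softc layout and the 32-page size are assumptions.

struct xmem_softc {
	struct resource	*res;
	int		 res_id;
	vm_paddr_t	 phys_base;
};

static int
example_reserve_hole(device_t dev, struct xmem_softc *sc)
{

	sc->res_id = 0;
	sc->res = xenmem_alloc(dev, &sc->res_id, 32 * PAGE_SIZE);
	if (sc->res == NULL)
		return (ENOMEM);

	/* Range is now registered as fictitious and safe to map. */
	sc->phys_base = rman_get_start(sc->res);
	return (0);
}

static void
example_release_hole(device_t dev, struct xmem_softc *sc)
{

	xenmem_free(dev, sc->res_id, sc->res);
	sc->res = NULL;
}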


Property changes on: trunk/sys/x86/xen/xenpv.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

From laffer1 at midnightbsd.org  Sat Feb  8 14:33:28 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:33:28 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12311] trunk/sys/x86/include: sync with
 FreeBSD 11-stable
Message-ID: <202002081933.018JXShK061961@stargazer.midnightbsd.org>

Revision: 12311
          http://svnweb.midnightbsd.org/src/?rev=12311
Author:   laffer1
Date:     2020-02-08 14:33:27 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/x86/include/_align.h
    trunk/sys/x86/include/_inttypes.h
    trunk/sys/x86/include/_limits.h
    trunk/sys/x86/include/_stdint.h
    trunk/sys/x86/include/_types.h
    trunk/sys/x86/include/acpica_machdep.h
    trunk/sys/x86/include/apicreg.h
    trunk/sys/x86/include/apm_bios.h
    trunk/sys/x86/include/bus.h
    trunk/sys/x86/include/busdma_impl.h
    trunk/sys/x86/include/elf.h
    trunk/sys/x86/include/endian.h
    trunk/sys/x86/include/fdt.h
    trunk/sys/x86/include/float.h
    trunk/sys/x86/include/fpu.h
    trunk/sys/x86/include/frame.h
    trunk/sys/x86/include/legacyvar.h
    trunk/sys/x86/include/mca.h
    trunk/sys/x86/include/metadata.h
    trunk/sys/x86/include/mptable.h
    trunk/sys/x86/include/ofw_machdep.h
    trunk/sys/x86/include/pci_cfgreg.h
    trunk/sys/x86/include/psl.h
    trunk/sys/x86/include/ptrace.h
    trunk/sys/x86/include/reg.h
    trunk/sys/x86/include/segments.h
    trunk/sys/x86/include/setjmp.h
    trunk/sys/x86/include/sigframe.h
    trunk/sys/x86/include/signal.h
    trunk/sys/x86/include/specialreg.h
    trunk/sys/x86/include/stdarg.h
    trunk/sys/x86/include/sysarch.h
    trunk/sys/x86/include/trap.h
    trunk/sys/x86/include/ucontext.h
    trunk/sys/x86/include/vdso.h
    trunk/sys/x86/include/vmware.h

Added Paths:
-----------
    trunk/sys/x86/include/apicvar.h
    trunk/sys/x86/include/cputypes.h
    trunk/sys/x86/include/dump.h
    trunk/sys/x86/include/ifunc.h
    trunk/sys/x86/include/init.h
    trunk/sys/x86/include/intr_machdep.h
    trunk/sys/x86/include/pvclock.h
    trunk/sys/x86/include/stack.h
    trunk/sys/x86/include/ucode.h
    trunk/sys/x86/include/x86_smp.h
    trunk/sys/x86/include/x86_var.h
    trunk/sys/x86/include/xen/
    trunk/sys/x86/include/xen/xen-os.h

Modified: trunk/sys/x86/include/_align.h
===================================================================
--- trunk/sys/x86/include/_align.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/_align.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -36,7 +36,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)param.h	5.8 (Berkeley) 6/28/91
- * $FreeBSD: stable/10/sys/x86/include/_align.h 215856 2010-11-26 10:59:20Z tijl $
+ * $FreeBSD: stable/11/sys/x86/include/_align.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _X86_INCLUDE__ALIGN_H_
@@ -47,7 +47,7 @@
  * for all data types (int, long, ...).   The result is unsigned int
  * and must be cast to any desired pointer type.
  */
-#define	_ALIGNBYTES	(sizeof(register_t) - 1)
-#define	_ALIGN(p)	(((uintptr_t)(p) + _ALIGNBYTES) & ~_ALIGNBYTES)
+#define	_ALIGNBYTES	(sizeof(__register_t) - 1)
+#define	_ALIGN(p)	(((__uintptr_t)(p) + _ALIGNBYTES) & ~_ALIGNBYTES)
 
 #endif /* !_X86_INCLUDE__ALIGN_H_ */

Modified: trunk/sys/x86/include/_inttypes.h
===================================================================
--- trunk/sys/x86/include/_inttypes.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/_inttypes.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -28,7 +28,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  *
  *	From: $NetBSD: int_fmtio.h,v 1.2 2001/04/26 16:25:21 kleink Exp $
- * $FreeBSD: stable/10/sys/x86/include/_inttypes.h 217157 2011-01-08 18:09:48Z tijl $
+ * $FreeBSD: stable/11/sys/x86/include/_inttypes.h 217157 2011-01-08 18:09:48Z tijl $
  */
 
 #ifndef _MACHINE_INTTYPES_H_

Modified: trunk/sys/x86/include/_limits.h
===================================================================
--- trunk/sys/x86/include/_limits.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/_limits.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)limits.h	8.3 (Berkeley) 1/4/94
- * $FreeBSD: stable/10/sys/x86/include/_limits.h 235939 2012-05-24 21:44:46Z obrien $
+ * $FreeBSD: stable/11/sys/x86/include/_limits.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_MACHINE__LIMITS_H_

Modified: trunk/sys/x86/include/_stdint.h
===================================================================
--- trunk/sys/x86/include/_stdint.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/_stdint.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -35,12 +35,14 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/_stdint.h 237517 2012-06-24 04:15:58Z andrew $
+ * $FreeBSD: stable/11/sys/x86/include/_stdint.h 301030 2016-05-31 08:38:24Z ed $
  */
 
 #ifndef _MACHINE__STDINT_H_
 #define	_MACHINE__STDINT_H_
 
+#include <machine/_limits.h>
+
 #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS)
 
 #define	INT8_C(c)		(c)
@@ -168,8 +170,8 @@
 #define	PTRDIFF_MAX	INT64_MAX
 
 /* Limits of sig_atomic_t. */
-#define	SIG_ATOMIC_MIN	LONG_MIN
-#define	SIG_ATOMIC_MAX	LONG_MAX
+#define	SIG_ATOMIC_MIN	__LONG_MIN
+#define	SIG_ATOMIC_MAX	__LONG_MAX
 
 /* Limit of size_t. */
 #define	SIZE_MAX	UINT64_MAX

Modified: trunk/sys/x86/include/_types.h
===================================================================
--- trunk/sys/x86/include/_types.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/_types.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -34,7 +34,7 @@
  *
  *	From: @(#)ansi.h	8.2 (Berkeley) 1/4/94
  *	From: @(#)types.h	8.3 (Berkeley) 1/5/94
- * $FreeBSD: stable/10/sys/x86/include/_types.h 287139 2015-08-25 19:18:38Z jkim $
+ * $FreeBSD: stable/11/sys/x86/include/_types.h 332135 2018-04-06 19:17:59Z kevans $
  */
 
 #ifndef _MACHINE__TYPES_H_
@@ -44,6 +44,8 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
+#include <machine/_limits.h>
+
 #define __NO_STRICT_ALIGNMENT
 
 /*
@@ -77,15 +79,19 @@
 #ifdef	__LP64__
 typedef	__int32_t	__clock_t;		/* clock()... */
 typedef	__int64_t	__critical_t;
+#ifndef _STANDALONE
 typedef	double		__double_t;
 typedef	float		__float_t;
+#endif
 typedef	__int64_t	__intfptr_t;
 typedef	__int64_t	__intptr_t;
 #else
 typedef	unsigned long	__clock_t;
 typedef	__int32_t	__critical_t;
+#ifndef _STANDALONE
 typedef	long double	__double_t;
 typedef	long double	__float_t;
+#endif
 typedef	__int32_t	__intfptr_t;
 typedef	__int32_t	__intptr_t;
 #endif
@@ -141,8 +147,6 @@
 #endif
 typedef	__uint32_t	__vm_size_t;
 #endif
-typedef	__int64_t	__vm_ooffset_t;
-typedef	__uint64_t	__vm_pindex_t;
 typedef	int		___wchar_t;
 
 #define	__WCHAR_MIN	__INT_MIN	/* min value for a wchar_t */

Modified: trunk/sys/x86/include/acpica_machdep.h
===================================================================
--- trunk/sys/x86/include/acpica_machdep.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/acpica_machdep.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/acpica_machdep.h 259073 2013-12-07 18:23:29Z peter $
+ * $FreeBSD: stable/11/sys/x86/include/acpica_machdep.h 298094 2016-04-16 03:44:50Z gjb $
  */
 
 /******************************************************************************
@@ -70,12 +70,20 @@
 	(Acq) = acpi_release_global_lock(&((GLptr)->GlobalLock));	\
 } while (0)
  
+enum intr_trigger;
+enum intr_polarity;
+
 void	acpi_SetDefaultIntrModel(int model);
 void	acpi_cpu_c1(void);
+void	acpi_cpu_idle_mwait(uint32_t mwait_hint);
 void	*acpi_map_table(vm_paddr_t pa, const char *sig);
 void	acpi_unmap_table(void *table);
 vm_paddr_t acpi_find_table(const char *sig);
+void	madt_parse_interrupt_values(void *entry,
+	    enum intr_trigger *trig, enum intr_polarity *pol);
 
+extern int madt_found_sci_override;
+
 #endif /* _KERNEL */
 
 #endif /* __ACPICA_MACHDEP_H__ */

Modified: trunk/sys/x86/include/apicreg.h
===================================================================
--- trunk/sys/x86/include/apicreg.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/apicreg.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/apicreg.h 262141 2014-02-18 01:15:32Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/apicreg.h 323608 2017-09-15 09:00:45Z kib $
  */
 
 #ifndef _X86_APICREG_H_
@@ -194,6 +194,81 @@
 
 typedef struct LAPIC lapic_t;
 
+enum LAPIC_REGISTERS {
+	LAPIC_ID	= 0x2,
+	LAPIC_VERSION	= 0x3,
+	LAPIC_TPR	= 0x8,
+	LAPIC_APR	= 0x9,
+	LAPIC_PPR	= 0xa,
+	LAPIC_EOI	= 0xb,
+	LAPIC_LDR	= 0xd,
+	LAPIC_DFR	= 0xe, /* Not in x2APIC */
+	LAPIC_SVR	= 0xf,
+	LAPIC_ISR0	= 0x10,
+	LAPIC_ISR1	= 0x11,
+	LAPIC_ISR2	= 0x12,
+	LAPIC_ISR3	= 0x13,
+	LAPIC_ISR4	= 0x14,
+	LAPIC_ISR5	= 0x15,
+	LAPIC_ISR6	= 0x16,
+	LAPIC_ISR7	= 0x17,
+	LAPIC_TMR0	= 0x18,
+	LAPIC_TMR1	= 0x19,
+	LAPIC_TMR2	= 0x1a,
+	LAPIC_TMR3	= 0x1b,
+	LAPIC_TMR4	= 0x1c,
+	LAPIC_TMR5	= 0x1d,
+	LAPIC_TMR6	= 0x1e,
+	LAPIC_TMR7	= 0x1f,
+	LAPIC_IRR0	= 0x20,
+	LAPIC_IRR1	= 0x21,
+	LAPIC_IRR2	= 0x22,
+	LAPIC_IRR3	= 0x23,
+	LAPIC_IRR4	= 0x24,
+	LAPIC_IRR5	= 0x25,
+	LAPIC_IRR6	= 0x26,
+	LAPIC_IRR7	= 0x27,
+	LAPIC_ESR	= 0x28,
+	LAPIC_LVT_CMCI	= 0x2f,
+	LAPIC_ICR_LO	= 0x30,
+	LAPIC_ICR_HI	= 0x31, /* Not in x2APIC */
+	LAPIC_LVT_TIMER	= 0x32,
+	LAPIC_LVT_THERMAL = 0x33,
+	LAPIC_LVT_PCINT	= 0x34,
+	LAPIC_LVT_LINT0	= 0x35,
+	LAPIC_LVT_LINT1	= 0x36,
+	LAPIC_LVT_ERROR	= 0x37,
+	LAPIC_ICR_TIMER	= 0x38,
+	LAPIC_CCR_TIMER	= 0x39,
+	LAPIC_DCR_TIMER	= 0x3e,
+	LAPIC_SELF_IPI	= 0x3f, /* Only in x2APIC */
+	LAPIC_EXT_FEATURES = 0x40, /* AMD */
+	LAPIC_EXT_CTRL	= 0x41, /* AMD */
+	LAPIC_EXT_SEOI	= 0x42, /* AMD */
+	LAPIC_EXT_IER0	= 0x48, /* AMD */
+	LAPIC_EXT_IER1	= 0x49, /* AMD */
+	LAPIC_EXT_IER2	= 0x4a, /* AMD */
+	LAPIC_EXT_IER3	= 0x4b, /* AMD */
+	LAPIC_EXT_IER4	= 0x4c, /* AMD */
+	LAPIC_EXT_IER5	= 0x4d, /* AMD */
+	LAPIC_EXT_IER6	= 0x4e, /* AMD */
+	LAPIC_EXT_IER7	= 0x4f, /* AMD */
+	LAPIC_EXT_LVT0	= 0x50, /* AMD */
+	LAPIC_EXT_LVT1	= 0x51, /* AMD */
+	LAPIC_EXT_LVT2	= 0x52, /* AMD */
+	LAPIC_EXT_LVT3	= 0x53, /* AMD */
+};
+
+#define	LAPIC_MEM_MUL	0x10
+
+/*
+ * Although some registers are available on AMD processors only,
+ * it's not a big waste to reserve them on all platforms.
+ * However, we need to watch out for this space being assigned for
+ * non-APIC purposes in the future processor models.
+ */
+#define	LAPIC_MEM_REGION ((LAPIC_EXT_LVT3 + 1) * LAPIC_MEM_MUL)
+
 /******************************************************************************
  * I/O APIC structure
  */
@@ -236,6 +311,7 @@
 #define APIC_VER_MAXLVT		0x00ff0000
 #define MAXLVTSHIFT		16
 #define APIC_VER_EOI_SUPPRESSION 0x01000000
+#define APIC_VER_AMD_EXT_SPACE	0x80000000
 
 /* fields in LDR */
 #define	APIC_LDR_RESERVED	0x00ffffff
@@ -340,11 +416,12 @@
 #define APIC_LVTT_VECTOR	0x000000ff
 #define APIC_LVTT_DS		0x00001000
 #define APIC_LVTT_M		0x00010000
-#define APIC_LVTT_TM		0x00020000
+#define APIC_LVTT_TM		0x00060000
 # define APIC_LVTT_TM_ONE_SHOT	0x00000000
 # define APIC_LVTT_TM_PERIODIC	0x00020000
+# define APIC_LVTT_TM_TSCDLT	0x00040000
+# define APIC_LVTT_TM_RSRV	0x00060000
 
-
 /* APIC timer current count */
 #define	APIC_TIMER_MAX_COUNT	0xffffffff
 
@@ -358,6 +435,13 @@
 #define APIC_TDCR_128		0x0a
 #define APIC_TDCR_1		0x0b
 
+/* Constants related to AMD Extended APIC Features Register */
+#define	APIC_EXTF_ELVT_MASK	0x00ff0000
+#define	APIC_EXTF_ELVT_SHIFT	16
+#define	APIC_EXTF_EXTID_CAP	0x00000004
+#define	APIC_EXTF_SEIO_CAP	0x00000002
+#define	APIC_EXTF_IER_CAP	0x00000001
+
 /* LVT table indices */
 #define	APIC_LVT_LINT0		0
 #define	APIC_LVT_LINT1		1
@@ -368,6 +452,13 @@
 #define	APIC_LVT_CMCI		6
 #define	APIC_LVT_MAX		APIC_LVT_CMCI
 
+/* AMD extended LVT constants, seem to be assigned by fiat */
+#define	APIC_ELVT_IBS		0 /* Instruction based sampling */
+#define	APIC_ELVT_MCA		1 /* MCE thresholding */
+#define	APIC_ELVT_DEI		2 /* Deferred error interrupt */
+#define	APIC_ELVT_SBI		3 /* Sideband interface */
+#define	APIC_ELVT_MAX		APIC_ELVT_SBI
+
 /******************************************************************************
  * I/O APIC defines
  */
@@ -379,6 +470,8 @@
 #define IOAPIC_WINDOW		0x10
 #define IOAPIC_EOIR		0x40
 
+#define	IOAPIC_WND_SIZE		0x50
+
 /* indexes into IO APIC */
 #define IOAPIC_ID		0x00
 #define IOAPIC_VER		0x01

Added: trunk/sys/x86/include/apicvar.h
===================================================================
--- trunk/sys/x86/include/apicvar.h	                        (rev 0)
+++ trunk/sys/x86/include/apicvar.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,487 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/apicvar.h 346817 2019-04-28 13:21:01Z dchagin $
+ */
+
+#ifndef _X86_APICVAR_H_
+#define _X86_APICVAR_H_
+
+/*
+ * Local && I/O APIC variable definitions.
+ */
+
+/*
+ * Layout of local APIC interrupt vectors:
+ *
+ *	0xff (255)  +-------------+
+ *                  |             | 15 (Spurious / IPIs / Local Interrupts)
+ *	0xf0 (240)  +-------------+
+ *                  |             | 14 (I/O Interrupts / Timer)
+ *	0xe0 (224)  +-------------+
+ *                  |             | 13 (I/O Interrupts)
+ *	0xd0 (208)  +-------------+
+ *                  |             | 12 (I/O Interrupts)
+ *	0xc0 (192)  +-------------+
+ *                  |             | 11 (I/O Interrupts)
+ *	0xb0 (176)  +-------------+
+ *                  |             | 10 (I/O Interrupts)
+ *	0xa0 (160)  +-------------+
+ *                  |             | 9 (I/O Interrupts)
+ *	0x90 (144)  +-------------+
+ *                  |             | 8 (I/O Interrupts / System Calls)
+ *	0x80 (128)  +-------------+
+ *                  |             | 7 (I/O Interrupts)
+ *	0x70 (112)  +-------------+
+ *                  |             | 6 (I/O Interrupts)
+ *	0x60 (96)   +-------------+
+ *                  |             | 5 (I/O Interrupts)
+ *	0x50 (80)   +-------------+
+ *                  |             | 4 (I/O Interrupts)
+ *	0x40 (64)   +-------------+
+ *                  |             | 3 (I/O Interrupts)
+ *	0x30 (48)   +-------------+
+ *                  |             | 2 (ATPIC Interrupts)
+ *	0x20 (32)   +-------------+
+ *                  |             | 1 (Exceptions, traps, faults, etc.)
+ *	0x10 (16)   +-------------+
+ *                  |             | 0 (Exceptions, traps, faults, etc.)
+ *	0x00 (0)    +-------------+
+ *
+ * Note: 0x80 needs to be handled specially and not allocated to an
+ * I/O device!
+ */
+
+#define	MAX_APIC_ID	0xfe
+#define	APIC_ID_ALL	0xff
+
+/* I/O Interrupts are used for external devices such as ISA, PCI, etc. */
+#define	APIC_IO_INTS	(IDT_IO_INTS + 16)
+#define	APIC_NUM_IOINTS	191
+
+/* The timer interrupt is used for clock handling and drives hardclock, etc. */
+#define	APIC_TIMER_INT	(APIC_IO_INTS + APIC_NUM_IOINTS)
+
+/*  
+ ********************* !!! WARNING !!! ******************************
+ * Each local apic has an interrupt receive fifo that is two entries deep
+ * for each interrupt priority class (higher 4 bits of interrupt vector).
+ * Once the fifo is full the APIC can no longer receive interrupts for this
+ * class and sending IPIs from other CPUs will be blocked.
+ * To avoid deadlocks there should be no more than two IPI interrupts
+ * pending at the same time.
+ * Currently this is guaranteed by dividing the IPIs in two groups that have 
+ * each at most one IPI interrupt pending. The first group is protected by the
+ * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user 
+ * at a time) The second group uses a single interrupt and a bitmap to avoid
+ * redundant IPI interrupts.
+ */ 
+
+/* Interrupts for local APIC LVT entries other than the timer. */
+#define	APIC_LOCAL_INTS	240
+#define	APIC_ERROR_INT	APIC_LOCAL_INTS
+#define	APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define	APIC_CMC_INT	(APIC_LOCAL_INTS + 2)
+#define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
+
+#define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
+#define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs */
+#define	IPI_INVLPG	(APIC_IPI_INTS + 2)
+#define	IPI_INVLRNG	(APIC_IPI_INTS + 3)
+#define	IPI_INVLCACHE	(APIC_IPI_INTS + 4)
+/* Vector to handle bitmap based IPIs */
+#define	IPI_BITMAP_VECTOR	(APIC_IPI_INTS + 5) 
+
+/* IPIs handled by IPI_BITMAP_VECTOR */
+#define	IPI_AST		0 	/* Generate software trap. */
+#define IPI_PREEMPT     1
+#define IPI_HARDCLOCK   2
+#define IPI_BITMAP_LAST IPI_HARDCLOCK
+#define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
+
+#define	IPI_STOP	(APIC_IPI_INTS + 6)	/* Stop CPU until restarted. */
+#define	IPI_SUSPEND	(APIC_IPI_INTS + 7)	/* Suspend CPU until restarted. */
+#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 8)
+#define	IPI_DYN_LAST	(253)			/* IPIs allocated at runtime */
+
+/*
+ * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since
+ * it is delivered using an NMI anyways.
+ */
+#define	IPI_NMI_FIRST	254
+#define	IPI_TRACE	254			/* Interrupt for tracing. */
+#define	IPI_STOP_HARD	255			/* Stop CPU with a NMI. */
+
+/*
+ * The spurious interrupt can share the priority class with the IPIs since
+ * it is not a normal interrupt. (Does not use the APIC's interrupt fifo)
+ */
+#define	APIC_SPURIOUS_INT 255
+
+#ifndef LOCORE
+
+#define	APIC_IPI_DEST_SELF	-1
+#define	APIC_IPI_DEST_ALL	-2
+#define	APIC_IPI_DEST_OTHERS	-3
+
+#define	APIC_BUS_UNKNOWN	-1
+#define	APIC_BUS_ISA		0
+#define	APIC_BUS_EISA		1
+#define	APIC_BUS_PCI		2
+#define	APIC_BUS_MAX		APIC_BUS_PCI
+
+#define	IRQ_EXTINT		-1
+#define	IRQ_NMI			-2
+#define	IRQ_SMI			-3
+#define	IRQ_DISABLED		-4
+
+/*
+ * An APIC enumerator is a pseudo bus driver that enumerates APIC's including
+ * CPU's and I/O APIC's.
+ */
+struct apic_enumerator {
+	const char *apic_name;
+	int (*apic_probe)(void);
+	int (*apic_probe_cpus)(void);
+	int (*apic_setup_local)(void);
+	int (*apic_setup_io)(void);
+	SLIST_ENTRY(apic_enumerator) apic_next;
+};
+
+inthand_t
+	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
+	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
+	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+	IDTVEC(spuriousint), IDTVEC(timerint),
+	IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
+	IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
+	IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
+	IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
+
+extern vm_paddr_t lapic_paddr;
+extern int apic_cpuids[];
+
+void	apic_register_enumerator(struct apic_enumerator *enumerator);
+void	*ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase);
+int	ioapic_disable_pin(void *cookie, u_int pin);
+int	ioapic_get_vector(void *cookie, u_int pin);
+void	ioapic_register(void *cookie);
+int	ioapic_remap_vector(void *cookie, u_int pin, int vector);
+int	ioapic_set_bus(void *cookie, u_int pin, int bus_type);
+int	ioapic_set_extint(void *cookie, u_int pin);
+int	ioapic_set_nmi(void *cookie, u_int pin);
+int	ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol);
+int	ioapic_set_triggermode(void *cookie, u_int pin,
+	    enum intr_trigger trigger);
+int	ioapic_set_smi(void *cookie, u_int pin);
+
+/*
+ * Struct containing pointers to APIC functions whose
+ * implementation is run time selectable.
+ */
+struct apic_ops {
+	void	(*create)(u_int, int);
+	void	(*init)(vm_paddr_t);
+	void	(*xapic_mode)(void);
+	bool	(*is_x2apic)(void);
+	void	(*setup)(int);
+	void	(*dump)(const char *);
+	void	(*disable)(void);
+	void	(*eoi)(void);
+	int	(*id)(void);
+	int	(*intr_pending)(u_int);
+	void	(*set_logical_id)(u_int, u_int, u_int);
+	u_int	(*cpuid)(u_int);
+
+	/* Vectors */
+	u_int	(*alloc_vector)(u_int, u_int);
+	u_int	(*alloc_vectors)(u_int, u_int *, u_int, u_int);
+	void	(*enable_vector)(u_int, u_int);
+	void	(*disable_vector)(u_int, u_int);
+	void	(*free_vector)(u_int, u_int, u_int);
+
+
+	/* PMC */
+	int	(*enable_pmc)(void);
+	void	(*disable_pmc)(void);
+	void	(*reenable_pmc)(void);
+
+	/* CMC */
+	void	(*enable_cmc)(void);
+
+	/* AMD ELVT */
+	int	(*enable_mca_elvt)(void);
+
+	/* IPI */
+	void	(*ipi_raw)(register_t, u_int);
+	void	(*ipi_vectored)(u_int, int);
+	int	(*ipi_wait)(int);
+	int	(*ipi_alloc)(inthand_t *ipifunc);
+	void	(*ipi_free)(int vector);
+
+	/* LVT */
+	int	(*set_lvt_mask)(u_int, u_int, u_char);
+	int	(*set_lvt_mode)(u_int, u_int, u_int32_t);
+	int	(*set_lvt_polarity)(u_int, u_int, enum intr_polarity);
+	int	(*set_lvt_triggermode)(u_int, u_int, enum intr_trigger);
+};
+
+extern struct apic_ops apic_ops;
+
+static inline void
+lapic_create(u_int apic_id, int boot_cpu)
+{
+
+	apic_ops.create(apic_id, boot_cpu);
+}
+
+static inline void
+lapic_init(vm_paddr_t addr)
+{
+
+	apic_ops.init(addr);
+}
+
+static inline void
+lapic_xapic_mode(void)
+{
+
+	apic_ops.xapic_mode();
+}
+
+static inline bool
+lapic_is_x2apic(void)
+{
+
+	return (apic_ops.is_x2apic());
+}
+
+static inline void
+lapic_setup(int boot)
+{
+
+	apic_ops.setup(boot);
+}
+
+static inline void
+lapic_dump(const char *str)
+{
+
+	apic_ops.dump(str);
+}
+
+static inline void
+lapic_disable(void)
+{
+
+	apic_ops.disable();
+}
+
+static inline void
+lapic_eoi(void)
+{
+
+	apic_ops.eoi();
+}
+
+static inline int
+lapic_id(void)
+{
+
+	return (apic_ops.id());
+}
+
+static inline int
+lapic_intr_pending(u_int vector)
+{
+
+	return (apic_ops.intr_pending(vector));
+}
+
+/* XXX: UNUSED */
+static inline void
+lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
+{
+
+	apic_ops.set_logical_id(apic_id, cluster, cluster_id);
+}
+
+static inline u_int
+apic_cpuid(u_int apic_id)
+{
+
+	return (apic_ops.cpuid(apic_id));
+}
+
+static inline u_int
+apic_alloc_vector(u_int apic_id, u_int irq)
+{
+
+	return (apic_ops.alloc_vector(apic_id, irq));
+}
+
+static inline u_int
+apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
+{
+
+	return (apic_ops.alloc_vectors(apic_id, irqs, count, align));
+}
+
+static inline void
+apic_enable_vector(u_int apic_id, u_int vector)
+{
+
+	apic_ops.enable_vector(apic_id, vector);
+}
+
+static inline void
+apic_disable_vector(u_int apic_id, u_int vector)
+{
+
+	apic_ops.disable_vector(apic_id, vector);
+}
+
+static inline void
+apic_free_vector(u_int apic_id, u_int vector, u_int irq)
+{
+
+	apic_ops.free_vector(apic_id, vector, irq);
+}
+
+static inline int
+lapic_enable_pmc(void)
+{
+
+	return (apic_ops.enable_pmc());
+}
+
+static inline void
+lapic_disable_pmc(void)
+{
+
+	apic_ops.disable_pmc();
+}
+
+static inline void
+lapic_reenable_pmc(void)
+{
+
+	apic_ops.reenable_pmc();
+}
+
+static inline void
+lapic_enable_cmc(void)
+{
+
+	apic_ops.enable_cmc();
+}
+
+static inline int
+lapic_enable_mca_elvt(void)
+{
+
+	return (apic_ops.enable_mca_elvt());
+}
+
+static inline void
+lapic_ipi_raw(register_t icrlo, u_int dest)
+{
+
+	apic_ops.ipi_raw(icrlo, dest);
+}
+
+static inline void
+lapic_ipi_vectored(u_int vector, int dest)
+{
+
+	apic_ops.ipi_vectored(vector, dest);
+}
+
+static inline int
+lapic_ipi_wait(int delay)
+{
+
+	return (apic_ops.ipi_wait(delay));
+}
+
+static inline int
+lapic_ipi_alloc(inthand_t *ipifunc)
+{
+
+	return (apic_ops.ipi_alloc(ipifunc));
+}
+
+static inline void
+lapic_ipi_free(int vector)
+{
+
+	return (apic_ops.ipi_free(vector));
+}
+
+static inline int
+lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
+{
+
+	return (apic_ops.set_lvt_mask(apic_id, lvt, masked));
+}
+
+static inline int
+lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
+{
+
+	return (apic_ops.set_lvt_mode(apic_id, lvt, mode));
+}
+
+static inline int
+lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
+{
+
+	return (apic_ops.set_lvt_polarity(apic_id, lvt, pol));
+}
+
+static inline int
+lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
+{
+
+	return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger));
+}
+
+void	lapic_handle_cmc(void);
+void	lapic_handle_error(void);
+void	lapic_handle_intr(int vector, struct trapframe *frame);
+void	lapic_handle_timer(struct trapframe *frame);
+
+int	ioapic_get_rid(u_int apic_id, uint16_t *ridp);
+
+extern int x2apic_mode;
+extern int lapic_eoi_suppression;
+
+#ifdef _SYS_SYSCTL_H_
+SYSCTL_DECL(_hw_apic);
+#endif
+
+#endif /* !LOCORE */
+#endif /* _X86_APICVAR_H_ */


Property changes on: trunk/sys/x86/include/apicvar.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
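
The apicvar.h added above routes every local APIC operation through the run-time selected apic_ops table, so callers go through the lapic_*()/apic_*() inline wrappers rather than poking the hardware directly, and its vector-layout comment ties IPI deadlock avoidance to the priority class in the upper four bits of a vector. The following is only a hedged sketch of a hypothetical consumer of those wrappers; the function name is illustrative and not part of this commit.

/*
 * Hedged sketch of a hypothetical consumer of the interfaces added above;
 * not part of this commit.
 */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>

static void
apic_id_report_example(void)
{
	u_int id, cpu;

	/* Both wrappers dispatch through the run-time selected apic_ops. */
	id = lapic_id();
	cpu = apic_cpuid(id);

	/*
	 * The upper four bits of a vector are its interrupt priority
	 * class; the two-entry-per-class APIC FIFO is why the header
	 * splits IPIs into the two groups described in the warning.
	 */
	printf("CPU%u: local APIC ID %u, IPI class %u\n",
	    cpu, id, IPI_RENDEZVOUS >> 4);
}
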
Modified: trunk/sys/x86/include/apm_bios.h
===================================================================
--- trunk/sys/x86/include/apm_bios.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/apm_bios.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -13,7 +13,7 @@
  *
  * Aug, 1994	Implemented on FreeBSD 1.1.5.1R (Toshiba AVS001WD)
  *
- * $FreeBSD: stable/10/sys/x86/include/apm_bios.h 215140 2010-11-11 19:36:21Z jkim $
+ * $FreeBSD: stable/11/sys/x86/include/apm_bios.h 215140 2010-11-11 19:36:21Z jkim $
  */
 
 #ifndef _X86_APM_BIOS_H_

Modified: trunk/sys/x86/include/bus.h
===================================================================
--- trunk/sys/x86/include/bus.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/bus.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -29,7 +29,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/bus.h 287126 2015-08-25 14:39:40Z marcel $
+ * $FreeBSD: stable/11/sys/x86/include/bus.h 286667 2015-08-12 15:26:32Z marcel $
  */
 
 /*	$NetBSD: bus.h,v 1.12 1997/10/01 08:25:15 fvdl Exp $	*/

Modified: trunk/sys/x86/include/busdma_impl.h
===================================================================
--- trunk/sys/x86/include/busdma_impl.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/busdma_impl.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/busdma_impl.h 259512 2013-12-17 13:49:35Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/busdma_impl.h 257308 2013-10-29 07:25:54Z kib $
  */
 
 #ifndef	__X86_BUSDMA_IMPL_H

Added: trunk/sys/x86/include/cputypes.h
===================================================================
--- trunk/sys/x86/include/cputypes.h	                        (rev 0)
+++ trunk/sys/x86/include/cputypes.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,50 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1993 Christopher G. Demetriou
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/cputypes.h 308433 2016-11-08 06:13:22Z jhb $
+ */
+
+#ifndef _X86_CPUTYPES_H_
+#define	_X86_CPUTYPES_H_
+
+/*
+ * Vendors of processor.
+ */
+#define	CPU_VENDOR_NSC		0x100b		/* NSC */
+#define	CPU_VENDOR_IBM		0x1014		/* IBM */
+#define	CPU_VENDOR_AMD		0x1022		/* AMD */
+#define	CPU_VENDOR_SIS		0x1039		/* SiS */
+#define	CPU_VENDOR_UMC		0x1060		/* UMC */
+#define	CPU_VENDOR_NEXGEN	0x1074		/* Nexgen */
+#define	CPU_VENDOR_CYRIX	0x1078		/* Cyrix */
+#define	CPU_VENDOR_IDT		0x111d		/* Centaur/IDT/VIA */
+#define	CPU_VENDOR_TRANSMETA	0x1279		/* Transmeta */
+#define	CPU_VENDOR_INTEL	0x8086		/* Intel */
+#define	CPU_VENDOR_RISE		0xdead2bad	/* Rise */
+#define	CPU_VENDOR_CENTAUR	CPU_VENDOR_IDT
+
+#endif /* !_X86_CPUTYPES_H_ */


Property changes on: trunk/sys/x86/include/cputypes.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/include/dump.h
===================================================================
--- trunk/sys/x86/include/dump.h	                        (rev 0)
+++ trunk/sys/x86/include/dump.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,88 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2014 EMC Corp.
+ * Author: Conrad Meyer <conrad.meyer at isilon.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/dump.h 276772 2015-01-07 01:01:39Z markj $
+ */
+
+#ifndef _MACHINE_DUMP_H_
+#define	_MACHINE_DUMP_H_
+
+#ifdef __amd64__
+#define	KERNELDUMP_ARCH_VERSION	KERNELDUMP_AMD64_VERSION
+#define	EM_VALUE		EM_X86_64
+#else
+#define	KERNELDUMP_ARCH_VERSION	KERNELDUMP_I386_VERSION
+#define	EM_VALUE		EM_386
+#endif
+
+/* 20 phys_avail entry pairs correspond to 10 pa's */
+#define	DUMPSYS_MD_PA_NPAIRS	10
+#define	DUMPSYS_NUM_AUX_HDRS	0
+
+static inline void
+dumpsys_pa_init(void)
+{
+
+	dumpsys_gen_pa_init();
+}
+
+static inline struct dump_pa *
+dumpsys_pa_next(struct dump_pa *p)
+{
+
+	return (dumpsys_gen_pa_next(p));
+}
+
+static inline void
+dumpsys_wbinv_all(void)
+{
+
+	dumpsys_gen_wbinv_all();
+}
+
+static inline void
+dumpsys_unmap_chunk(vm_paddr_t pa, size_t s, void *va)
+{
+
+	dumpsys_gen_unmap_chunk(pa, s, va);
+}
+
+static inline int
+dumpsys_write_aux_headers(struct dumperinfo *di)
+{
+
+	return (dumpsys_gen_write_aux_headers(di));
+}
+
+static inline int
+dumpsys(struct dumperinfo *di)
+{
+
+	return (dumpsys_generic(di));
+}
+
+#endif  /* !_MACHINE_DUMP_H_ */


Property changes on: trunk/sys/x86/include/dump.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/x86/include/elf.h
===================================================================
--- trunk/sys/x86/include/elf.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/elf.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/elf.h 247047 2013-02-20 17:39:52Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/elf.h 325810 2017-11-14 16:03:07Z jhb $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -101,8 +101,11 @@
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
 #define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
+#define	AT_EHDRFLAGS	24	/* e_flags field from elf hdr */
+#define	AT_HWCAP	25	/* CPU feature flags. */
+#define	AT_HWCAP2	26	/* CPU feature flags 2. */
 
-#define	AT_COUNT	24	/* Count of defined aux entry types. */
+#define	AT_COUNT	27	/* Count of defined aux entry types. */
 
 /*
  * Relocation types.
@@ -186,8 +189,11 @@
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
 #define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
+#define	AT_EHDRFLAGS	24	/* e_flags field from elf hdr */
+#define	AT_HWCAP	25	/* CPU feature flags. */
+#define	AT_HWCAP2	26	/* CPU feature flags 2. */
 
-#define	AT_COUNT	24	/* Count of defined aux entry types. */
+#define	AT_COUNT	27	/* Count of defined aux entry types. */
 
 /*
  * Relocation types.

Modified: trunk/sys/x86/include/endian.h
===================================================================
--- trunk/sys/x86/include/endian.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/endian.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)endian.h	7.8 (Berkeley) 4/3/91
- * $FreeBSD: stable/10/sys/x86/include/endian.h 233684 2012-03-29 23:31:48Z dim $
+ * $FreeBSD: stable/11/sys/x86/include/endian.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_ENDIAN_H_

Modified: trunk/sys/x86/include/fdt.h
===================================================================
--- trunk/sys/x86/include/fdt.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/fdt.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/fdt.h 266084 2014-05-14 19:18:58Z ian $
+ * $FreeBSD: stable/11/sys/x86/include/fdt.h 260327 2014-01-05 18:46:58Z nwhitehorn $
  */
 
 #ifndef _MACHINE_FDT_H_

Modified: trunk/sys/x86/include/float.h
===================================================================
--- trunk/sys/x86/include/float.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/float.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -11,7 +11,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)float.h	7.1 (Berkeley) 5/8/90
- * $FreeBSD: stable/10/sys/x86/include/float.h 235939 2012-05-24 21:44:46Z obrien $
+ * $FreeBSD: stable/11/sys/x86/include/float.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_FLOAT_H_

Modified: trunk/sys/x86/include/fpu.h
===================================================================
--- trunk/sys/x86/include/fpu.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/fpu.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.h	5.3 (Berkeley) 1/18/91
- * $FreeBSD: stable/10/sys/x86/include/fpu.h 279211 2015-02-23 18:38:41Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/fpu.h 274817 2014-11-21 20:53:17Z jhb $
  */
 
 /*

Modified: trunk/sys/x86/include/frame.h
===================================================================
--- trunk/sys/x86/include/frame.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/frame.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)frame.h	5.2 (Berkeley) 1/18/91
- * $FreeBSD: stable/10/sys/x86/include/frame.h 247047 2013-02-20 17:39:52Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/frame.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_FRAME_H_
@@ -65,7 +65,7 @@
 	int	tf_eip;
 	int	tf_cs;
 	int	tf_eflags;
-	/* below only when crossing rings (e.g. user to kernel) */
+	/* below only when crossing rings (user to kernel) */
 	int	tf_esp;
 	int	tf_ss;
 };
@@ -90,15 +90,24 @@
 	int	tf_eip;
 	int	tf_cs;
 	int	tf_eflags;
-	/* below only when crossing rings (e.g. user to kernel) */
+	/* below only when crossing rings (user (including vm86) to kernel) */
 	int	tf_esp;
 	int	tf_ss;
-	/* below only when switching out of VM86 mode */
+	/* below only when crossing from vm86 mode to kernel */
 	int	tf_vm86_es;
 	int	tf_vm86_ds;
 	int	tf_vm86_fs;
 	int	tf_vm86_gs;
 };
+
+/*
+ * This alias for the MI TRAPF_USERMODE() should be used when we don't
+ * care about user mode itself, but need to know if a frame has stack
+ * registers.  The difference is only logical, but on i386 the logic
+ * for using TRAPF_USERMODE() is complicated by sometimes treating vm86
+ * bioscall mode (which is a special ring 3 user mode) as kernel mode.
+ */
+#define	TF_HAS_STACKREGS(tf)	TRAPF_USERMODE(tf)
 #endif /* __i386__ */
 
 #ifdef __amd64__
@@ -137,6 +146,7 @@
 	register_t	tf_rip;
 	register_t	tf_cs;
 	register_t	tf_rflags;
+	/* the amd64 frame always has the stack registers */
 	register_t	tf_rsp;
 	register_t	tf_ss;
 };
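
For reference, the new TF_HAS_STACKREGS() alias only answers whether the CPU pushed tf_esp/tf_ss when it built the i386 frame. Below is a hedged sketch of an i386-only helper that relies on it; the function is hypothetical and not part of this commit.

/* Hypothetical i386 helper using TF_HAS_STACKREGS(); a sketch only. */
#include <sys/param.h>
#include <machine/cpu.h>	/* TRAPF_USERMODE() */
#include <machine/frame.h>

static register_t
trapframe_sp(struct trapframe *tf)
{
	/*
	 * tf_esp/tf_ss exist only when the trap crossed rings (user or
	 * vm86 to kernel); for kernel-mode traps the pre-trap stack
	 * pointer is simply the end of the fixed part of the frame.
	 */
	if (TF_HAS_STACKREGS(tf))
		return (tf->tf_esp);
	return ((register_t)(uintptr_t)&tf->tf_esp);
}
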

Added: trunk/sys/x86/include/ifunc.h
===================================================================
--- trunk/sys/x86/include/ifunc.h	                        (rev 0)
+++ trunk/sys/x86/include/ifunc.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,51 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015-2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib at FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/ifunc.h 339217 2018-10-07 00:40:56Z kib $
+ */
+
+#ifndef __X86_IFUNC_H
+#define	__X86_IFUNC_H
+
+#define	DEFINE_IFUNC(qual, ret_type, name, args, resolver_qual)		\
+    resolver_qual ret_type (*name##_resolver(void))args __used;		\
+    qual ret_type name args __attribute__((ifunc(#name "_resolver")));	\
+    resolver_qual ret_type (*name##_resolver(void))args
+
+#define	DEFINE_UIFUNC(qual, ret_type, name, args, resolver_qual)	\
+    resolver_qual ret_type (*name##_resolver(uint32_t, uint32_t,	\
+	uint32_t, uint32_t))args __used;				\
+    qual ret_type name args __attribute__((ifunc(#name "_resolver")));	\
+    resolver_qual ret_type (*name##_resolver(				\
+	uint32_t cpu_feature __unused,					\
+	uint32_t cpu_feature2 __unused,					\
+	uint32_t cpu_stdext_feature __unused,				\
+	uint32_t cpu_stdext_feature2 __unused))args
+
+#endif


Property changes on: trunk/sys/x86/include/ifunc.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
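
DEFINE_IFUNC()/DEFINE_UIFUNC() emit a GNU ifunc: the resolver runs once and the function pointer it returns becomes the implementation behind the declared name, with the UIFUNC variant handing the resolver the four CPUID feature words. The following is only a hedged usage sketch; the flush_range functions are hypothetical, and only the CLFLUSHOPT feature bit is taken from the tree.

/* Hedged sketch of DEFINE_UIFUNC() usage; the functions are hypothetical. */
#include <sys/types.h>
#include <x86/ifunc.h>
#include <machine/specialreg.h>

static void
flush_range_clflush(const void *p, size_t len)
{
	/* placeholder: would walk the range issuing CLFLUSH */
	(void)p; (void)len;
}

static void
flush_range_clflushopt(const void *p, size_t len)
{
	/* placeholder: would walk the range issuing CLFLUSHOPT */
	(void)p; (void)len;
}

DEFINE_UIFUNC(static, void, flush_range, (const void *, size_t), static)
{

	/* cpu_stdext_feature here is the resolver parameter, filled at boot. */
	return ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 ?
	    flush_range_clflushopt : flush_range_clflush);
}
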
Added: trunk/sys/x86/include/init.h
===================================================================
--- trunk/sys/x86/include/init.h	                        (rev 0)
+++ trunk/sys/x86/include/init.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,59 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2013 Roger Pau Monné <roger.pau at citrix.com>

+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/init.h 272310 2014-09-30 16:46:45Z royger $
+ */
+
+#ifndef __X86_INIT_H__
+#define __X86_INIT_H__
+/*
+ * Struct containing pointers to init functions whose
+ * implementation is run time selectable.  Selection can be made,
+ * for example, based on detection of a BIOS variant or
+ * hypervisor environment.
+ */
+struct init_ops {
+	caddr_t	(*parse_preload_data)(u_int64_t);
+	void	(*early_clock_source_init)(void);
+	void	(*early_delay)(int);
+	void	(*parse_memmap)(caddr_t, vm_paddr_t *, int *);
+	u_int	(*mp_bootaddress)(u_int);
+	int	(*start_all_aps)(void);
+	void	(*msi_init)(void);
+};
+
+extern struct init_ops init_ops;
+
+/* Knob to disable acpi_cpu devices */
+extern bool acpi_cpu_disabled;
+
+/* Knob to disable acpi_hpet device */
+extern bool acpi_hpet_disabled;
+
+/* Knob to disable acpi_timer device */
+extern bool acpi_timer_disabled;
+
+#endif /* __X86_INIT_H__ */


Property changes on: trunk/sys/x86/include/init.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
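
init_ops is filled in early by the platform (native BIOS boot versus a hypervisor environment such as Xen PVH) so the rest of the early boot path can stay common. The following is only a hedged sketch of overriding a single member; the hv_* names are hypothetical and not part of this commit.

/* Hedged sketch of overriding an init_ops member; not part of this commit. */
#include <sys/param.h>
#include <x86/init.h>

/* Hypothetical hypervisor-friendly delay that avoids the i8254. */
static void
hv_early_delay(int usec)
{
	/* placeholder: would spin on the TSC for 'usec' microseconds */
	(void)usec;
}

static void
hv_init_ops_override_example(void)
{
	/* Must run before anything uses init_ops.early_delay(). */
	init_ops.early_delay = hv_early_delay;
}
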
Added: trunk/sys/x86/include/intr_machdep.h
===================================================================
--- trunk/sys/x86/include/intr_machdep.h	                        (rev 0)
+++ trunk/sys/x86/include/intr_machdep.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,177 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2003 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/intr_machdep.h 342656 2018-12-31 22:09:08Z jhb $
+ */
+
+#ifndef __X86_INTR_MACHDEP_H__
+#define	__X86_INTR_MACHDEP_H__
+
+#ifdef _KERNEL
+
+/*
+ * Values used in determining the allocation of IRQ values among
+ * different types of I/O interrupts.  These values are used as
+ * indices into a interrupt source array to map I/O interrupts to a
+ * device interrupt source whether it be a pin on an interrupt
+ * controller or an MSI interrupt.  The 16 ISA IRQs are assigned fixed
+ * IDT vectors, but all other device interrupts allocate IDT vectors
+ * on demand.  Currently we have 191 IDT vectors available for device
+ * interrupts on each CPU.  On many systems with I/O APICs, a lot of
+ * the IRQs are not used, so the total number of IRQ values reserved
+ * can exceed the number of available IDT slots.
+ *
+ * The first 16 IRQs (0 - 15) are reserved for ISA IRQs.  Interrupt
+ * pins on I/O APICs for non-ISA interrupts use IRQ values starting at
+ * IRQ 17.  This layout matches the GSI numbering used by ACPI so that
+ * IRQ values returned by ACPI methods such as _CRS can be used
+ * directly by the ACPI bus driver.
+ *
+ * MSI interrupts allocate a block of interrupts starting at either
+ * the end of the I/O APIC range or 256, whichever is higher.  When
+ * running under the Xen Hypervisor, an additional range of IRQ values
+ * are available for binding to event channel events.  We use 256 as
+ * the minimum IRQ value for MSI interrupts to attempt to leave 255
+ * unused since 255 is used in PCI to indicate an invalid INTx IRQ.
+ */
+#define	MINIMUM_MSI_INT	256
+
+extern u_int first_msi_irq;
+extern u_int num_io_irqs;
+extern u_int num_msi_irqs;
+
+/*
+ * Default base address for MSI messages on x86 platforms.
+ */
+#define	MSI_INTEL_ADDR_BASE		0xfee00000
+
+#ifndef LOCORE
+
+typedef void inthand_t(void);
+
+#define	IDTVEC(name)	__CONCAT(X,name)
+
+struct intsrc;
+
+/*
+ * Methods that a PIC provides to mask/unmask a given interrupt source,
+ * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
+ * return the vector associated with this source.
+ */
+struct pic {
+	void (*pic_register_sources)(struct pic *);
+	void (*pic_enable_source)(struct intsrc *);
+	void (*pic_disable_source)(struct intsrc *, int);
+	void (*pic_eoi_source)(struct intsrc *);
+	void (*pic_enable_intr)(struct intsrc *);
+	void (*pic_disable_intr)(struct intsrc *);
+	int (*pic_vector)(struct intsrc *);
+	int (*pic_source_pending)(struct intsrc *);
+	void (*pic_suspend)(struct pic *);
+	void (*pic_resume)(struct pic *, bool suspend_cancelled);
+	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
+	    enum intr_polarity);
+	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
+	void (*pic_reprogram_pin)(struct intsrc *);
+	TAILQ_ENTRY(pic) pics;
+};
+
+/* Flags for pic_disable_source() */
+enum {
+	PIC_EOI,
+	PIC_NO_EOI,
+};
+
+/*
+ * An interrupt source.  The upper-layer code uses the PIC methods to
+ * control a given source.  The lower-layer PIC drivers can store additional
+ * private data in a given interrupt source such as an interrupt pin number
+ * or an I/O APIC pointer.
+ */
+struct intsrc {
+	struct pic *is_pic;
+	struct intr_event *is_event;
+	u_long *is_count;
+	u_long *is_straycount;
+	u_int is_index;
+	u_int is_handlers;
+};
+
+struct trapframe;
+
+#ifdef SMP
+extern cpuset_t intr_cpus;
+#endif
+extern struct mtx icu_lock;
+extern int elcr_found;
+#ifdef SMP
+extern int msix_disable_migration;
+#endif
+
+#ifndef DEV_ATPIC
+void	atpic_reset(void);
+#endif
+/* XXX: The elcr_* prototypes probably belong somewhere else. */
+int	elcr_probe(void);
+enum intr_trigger elcr_read_trigger(u_int irq);
+void	elcr_resume(void);
+void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
+#ifdef SMP
+void	intr_add_cpu(u_int cpu);
+#endif
+int	intr_add_handler(const char *name, int vector, driver_filter_t filter, 
+			 driver_intr_t handler, void *arg, enum intr_type flags, 
+			 void **cookiep);    
+#ifdef SMP
+int	intr_bind(u_int vector, u_char cpu);
+#endif
+int	intr_config_intr(int vector, enum intr_trigger trig,
+    enum intr_polarity pol);
+int	intr_describe(u_int vector, void *ih, const char *descr);
+void	intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
+u_int	intr_next_cpu(void);
+struct intsrc *intr_lookup_source(int vector);
+int	intr_register_pic(struct pic *pic);
+int	intr_register_source(struct intsrc *isrc);
+int	intr_remove_handler(void *cookie);
+void	intr_resume(bool suspend_cancelled);
+void	intr_suspend(void);
+void	intr_reprogram(void);
+void	intrcnt_add(const char *name, u_long **countp);
+void	nexus_add_irq(u_long irq);
+int	msi_alloc(device_t dev, int count, int maxcount, int *irqs);
+void	msi_init(void);
+int	msi_map(int irq, uint64_t *addr, uint32_t *data);
+int	msi_release(int *irqs, int count);
+int	msix_alloc(device_t dev, int *irq);
+int	msix_release(int irq);
+#ifdef XENHVM
+void	xen_intr_alloc_irqs(void);
+#endif
+
+#endif	/* !LOCORE */
+#endif	/* _KERNEL */
+#endif	/* !__X86_INTR_MACHDEP_H__ */


Property changes on: trunk/sys/x86/include/intr_machdep.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
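
Beyond the struct pic method table that each interrupt controller driver fills in, the header exports the helpers that MD code such as the nexus uses to attach a handler to an IRQ number from the layout described in its comment. The following is only a hedged sketch of that path; the mydev names are hypothetical.

/* Hedged sketch of attaching a handler through intr_add_handler(). */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <machine/intr_machdep.h>

static int
mydev_filter(void *arg)
{
	(void)arg;
	return (FILTER_HANDLED);	/* hypothetical device handling */
}

static int
mydev_attach_irq_example(int irq)
{
	void *cookie;

	/*
	 * 'irq' indexes the interrupt source array laid out above:
	 * 0-15 ISA, 17 and up I/O APIC pins, >= 256 MSI.
	 */
	return (intr_add_handler("mydev", irq, mydev_filter, NULL, NULL,
	    INTR_TYPE_MISC | INTR_MPSAFE, &cookie));
}
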
Modified: trunk/sys/x86/include/legacyvar.h
===================================================================
--- trunk/sys/x86/include/legacyvar.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/legacyvar.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/legacyvar.h 280970 2015-04-01 21:48:54Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/legacyvar.h 294883 2016-01-27 02:23:54Z jhibbits $
  */
 
 #ifndef _X86_LEGACYVAR_H_
@@ -57,9 +57,10 @@
 int	legacy_pcib_write_ivar(device_t dev, device_t child, int which,
     uintptr_t value);
 struct resource *legacy_pcib_alloc_resource(device_t dev, device_t child,
-    int type, int *rid, u_long start, u_long end, u_long count, u_int flags);
+    int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count,
+    u_int flags);
 int	legacy_pcib_adjust_resource(device_t dev, device_t child, int type,
-    struct resource *r, u_long start, u_long end);
+    struct resource *r, rman_res_t start, rman_res_t end);
 int	legacy_pcib_release_resource(device_t dev, device_t child, int type,
     int rid, struct resource *r);
 int	legacy_pcib_alloc_msi(device_t pcib, device_t dev, int count,

Modified: trunk/sys/x86/include/mca.h
===================================================================
--- trunk/sys/x86/include/mca.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/mca.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/mca.h 283927 2015-06-02 19:20:39Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/mca.h 281887 2015-04-23 14:22:20Z jhb $
  */
 
 #ifndef __X86_MCA_H__

Modified: trunk/sys/x86/include/metadata.h
===================================================================
--- trunk/sys/x86/include/metadata.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/metadata.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/metadata.h 294274 2016-01-18 15:52:07Z emaste $
+ * $FreeBSD: stable/11/sys/x86/include/metadata.h 293343 2016-01-07 19:47:26Z emaste $
  */
 
 #ifndef _MACHINE_METADATA_H_

Modified: trunk/sys/x86/include/mptable.h
===================================================================
--- trunk/sys/x86/include/mptable.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/mptable.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/mptable.h 259837 2013-12-24 19:10:56Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/mptable.h 259228 2013-12-11 21:19:04Z jhb $
  */
 
 #ifndef __MACHINE_MPTABLE_H__
@@ -31,10 +31,13 @@
 
 enum busTypes {
     NOBUS = 0,
+    CBUS = 1,
+    CBUSII = 2,
     EISA = 3,
     ISA = 6,
     MCA = 9,
     PCI = 13,
+    XPRESS = 18,
     MAX_BUSTYPE = 18,
     UNKNOWN_BUSTYPE = 0xff
 };
@@ -41,17 +44,17 @@
 
 /* MP Floating Pointer Structure */
 typedef struct MPFPS {
-	char    signature[4];
-	u_int32_t pap;
-	u_char  length;
-	u_char  spec_rev;
-	u_char  checksum;
-	u_char  config_type;
-	u_char  mpfb2;
-	u_char  mpfb3;
-	u_char  mpfb4;
-	u_char  mpfb5;
-}      *mpfps_t;
+	uint8_t	signature[4];
+	uint32_t pap;
+	uint8_t	length;
+	uint8_t	spec_rev;
+	uint8_t	checksum;
+	uint8_t	config_type;
+	uint8_t	mpfb2;
+	uint8_t	mpfb3;
+	uint8_t	mpfb4;
+	uint8_t	mpfb5;
+} __packed *mpfps_t;
 
 #define	MPFB2_IMCR_PRESENT	0x80
 #define	MPFB2_MUL_CLK_SRCS	0x40
@@ -58,20 +61,20 @@
 
 /* MP Configuration Table Header */
 typedef struct MPCTH {
-	char    signature[4];
-	u_short base_table_length;
-	u_char  spec_rev;
-	u_char  checksum;
-	u_char  oem_id[8];
-	u_char  product_id[12];
-	u_int32_t oem_table_pointer;
-	u_short oem_table_size;
-	u_short entry_count;
-	u_int32_t apic_address;
-	u_short extended_table_length;
-	u_char  extended_table_checksum;
-	u_char  reserved;
-}      *mpcth_t;
+	uint8_t	signature[4];
+	uint16_t base_table_length;
+	uint8_t	spec_rev;
+	uint8_t	checksum;
+	uint8_t	oem_id[8];
+	uint8_t	product_id[12];
+	uint32_t oem_table_pointer;
+	uint16_t oem_table_size;
+	uint16_t entry_count;
+	uint32_t apic_address;
+	uint16_t extended_table_length;
+	uint8_t	extended_table_checksum;
+	uint8_t	reserved;
+} __packed *mpcth_t;
 
 /* Base table entries */
 
@@ -82,44 +85,44 @@
 #define	MPCT_ENTRY_LOCAL_INT	4
 
 typedef struct PROCENTRY {
-	u_char  type;
-	u_char  apic_id;
-	u_char  apic_version;
-	u_char  cpu_flags;
-	u_int32_t cpu_signature;
-	u_int32_t feature_flags;
-	u_int32_t reserved1;
-	u_int32_t reserved2;
-}      *proc_entry_ptr;
+	uint8_t	type;
+	uint8_t	apic_id;
+	uint8_t	apic_version;
+	uint8_t	cpu_flags;
+	uint32_t cpu_signature;
+	uint32_t feature_flags;
+	uint32_t reserved1;
+	uint32_t reserved2;
+} __packed *proc_entry_ptr;
 
 #define PROCENTRY_FLAG_EN	0x01
 #define PROCENTRY_FLAG_BP	0x02
 
 typedef struct BUSENTRY {
-	u_char  type;
-	u_char  bus_id;
-	char    bus_type[6];
-}      *bus_entry_ptr;
+	uint8_t	type;
+	uint8_t	bus_id;
+	uint8_t	bus_type[6];
+} __packed *bus_entry_ptr;
 
 typedef struct IOAPICENTRY {
-	u_char  type;
-	u_char  apic_id;
-	u_char  apic_version;
-	u_char  apic_flags;
-	u_int32_t apic_address;
-}      *io_apic_entry_ptr;
+	uint8_t	type;
+	uint8_t	apic_id;
+	uint8_t	apic_version;
+	uint8_t	apic_flags;
+	uint32_t apic_address;
+} __packed *io_apic_entry_ptr;
 
 #define IOAPICENTRY_FLAG_EN	0x01
 
 typedef struct INTENTRY {
-	u_char  type;
-	u_char  int_type;
-	u_short int_flags;
-	u_char  src_bus_id;
-	u_char  src_bus_irq;
-	u_char  dst_apic_id;
-	u_char  dst_apic_int;
-}      *int_entry_ptr;
+	uint8_t	type;
+	uint8_t	int_type;
+	uint16_t int_flags;
+	uint8_t	src_bus_id;
+	uint8_t	src_bus_irq;
+	uint8_t	dst_apic_id;
+	uint8_t	dst_apic_int;
+} __packed *int_entry_ptr;
 
 #define	INTENTRY_TYPE_INT  	0
 #define	INTENTRY_TYPE_NMI	1
@@ -138,9 +141,9 @@
 /* Extended table entries */
 
 typedef	struct EXTENTRY {
-	u_char	type;
-	u_char	length;
-}      *ext_entry_ptr;
+	uint8_t	type;
+	uint8_t	length;
+} __packed *ext_entry_ptr;
 
 #define	MPCT_EXTENTRY_SAS	0x80
 #define	MPCT_EXTENTRY_BHD	0x81
@@ -147,13 +150,13 @@
 #define	MPCT_EXTENTRY_CBASM	0x82
 
 typedef struct SASENTRY {
-	u_char	type;
-	u_char	length;
-	u_char	bus_id;
-	u_char	address_type;
+	uint8_t	type;
+	uint8_t	length;
+	uint8_t	bus_id;
+	uint8_t	address_type;
 	uint64_t address_base;
 	uint64_t address_length;
-} __attribute__((__packed__)) *sas_entry_ptr;
+} __packed *sas_entry_ptr;
 
 #define	SASENTRY_TYPE_IO	0
 #define	SASENTRY_TYPE_MEMORY	1
@@ -160,23 +163,23 @@
 #define	SASENTRY_TYPE_PREFETCH	2
 
 typedef struct BHDENTRY {
-	u_char	type;
-	u_char	length;
-	u_char	bus_id;
-	u_char	bus_info;
-	u_char	parent_bus;
-	u_char	reserved[3];
-}      *bhd_entry_ptr;
+	uint8_t	type;
+	uint8_t	length;
+	uint8_t	bus_id;
+	uint8_t	bus_info;
+	uint8_t	parent_bus;
+	uint8_t	reserved[3];
+} __packed *bhd_entry_ptr;
 
 #define	BHDENTRY_INFO_SUBTRACTIVE_DECODE	0x1
 
 typedef struct CBASMENTRY {
-	u_char	type;
-	u_char	length;
-	u_char	bus_id;
-	u_char	address_mod;
-	u_int	predefined_range;
-}      *cbasm_entry_ptr;
+	uint8_t	type;
+	uint8_t	length;
+	uint8_t	bus_id;
+	uint8_t	address_mod;
+	uint32_t predefined_range;
+} __packed *cbasm_entry_ptr;
 
 #define	CBASMENTRY_ADDRESS_MOD_ADD		0x0
 #define	CBASMENTRY_ADDRESS_MOD_SUBTRACT		0x1
@@ -184,13 +187,6 @@
 #define	CBASMENTRY_RANGE_ISA_IO		0
 #define	CBASMENTRY_RANGE_VGA_IO		1
 
-/* descriptions of MP table entries */
-typedef struct BASETABLE_ENTRY {
-	u_char  type;
-	u_char  length;
-	char    name[16];
-}       basetable_entry;
-
 #ifdef _KERNEL
 struct mptable_hostb_softc {
 #ifdef NEW_PCIB
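
With the MP table structures now declared as fixed-width, __packed types, they can be overlaid directly on the BIOS-provided memory and validated with the MP specification checksum (all bytes of a structure sum to zero modulo 256). Below is only a hedged sketch of that check for the floating pointer structure; the helper name is hypothetical.

/* Hedged sketch of the MP spec checksum over the __packed MPFPS. */
#include <sys/types.h>
#include <x86/mptable.h>

static int
mpfps_checksum_ok_example(const struct MPFPS *fps)
{
	const uint8_t *p = (const uint8_t *)fps;
	size_t len = fps->length * 16;	/* 'length' counts 16-byte units */
	uint8_t sum = 0;

	while (len-- != 0)
		sum += *p++;
	return (sum == 0);		/* valid when the bytes sum to 0 */
}
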

Modified: trunk/sys/x86/include/ofw_machdep.h
===================================================================
--- trunk/sys/x86/include/ofw_machdep.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/ofw_machdep.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,13 +24,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/ofw_machdep.h 266084 2014-05-14 19:18:58Z ian $
+ * $FreeBSD: stable/11/sys/x86/include/ofw_machdep.h 287260 2015-08-28 15:41:09Z imp $
  */
 
 #ifndef _MACHINE_OFW_MACHDEP_H_
 #define _MACHINE_OFW_MACHDEP_H_
 
-#include <x86/bus.h>
+#include <machine/bus.h>
 #include <vm/vm.h>
 
 typedef	uint32_t	cell_t;

Modified: trunk/sys/x86/include/pci_cfgreg.h
===================================================================
--- trunk/sys/x86/include/pci_cfgreg.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/pci_cfgreg.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/pci_cfgreg.h 223440 2011-06-22 21:04:13Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/pci_cfgreg.h 294883 2016-01-27 02:23:54Z jhibbits $
  *
  */
 
@@ -47,7 +47,7 @@
 #define CONF2_ENABLE_CHK   0x0e
 #define CONF2_ENABLE_RES   0x0e
 
-u_long		hostb_alloc_start(int type, u_long start, u_long end, u_long count);
+rman_res_t	hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count);
 int		pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus);
 int		pci_cfgregopen(void);
 u_int32_t	pci_cfgregread(int bus, int slot, int func, int reg, int bytes);

Modified: trunk/sys/x86/include/psl.h
===================================================================
--- trunk/sys/x86/include/psl.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/psl.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)psl.h	5.2 (Berkeley) 1/18/91
- * $FreeBSD: stable/10/sys/x86/include/psl.h 258559 2013-11-25 15:58:48Z emaste $
+ * $FreeBSD: stable/11/sys/x86/include/psl.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_PSL_H_

Modified: trunk/sys/x86/include/ptrace.h
===================================================================
--- trunk/sys/x86/include/ptrace.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/ptrace.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ptrace.h	8.1 (Berkeley) 6/11/93
- * $FreeBSD: stable/10/sys/x86/include/ptrace.h 286311 2015-08-05 08:17:10Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/ptrace.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_PTRACE_H_

Added: trunk/sys/x86/include/pvclock.h
===================================================================
--- trunk/sys/x86/include/pvclock.h	                        (rev 0)
+++ trunk/sys/x86/include/pvclock.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,60 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2014, Bryan Venteicher <bryanv at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/pvclock.h 278184 2015-02-04 08:33:04Z bryanv $
+ */
+
+#ifndef X86_PVCLOCK
+#define X86_PVCLOCK
+
+struct pvclock_vcpu_time_info {
+	uint32_t	version;
+	uint32_t	pad0;
+	uint64_t	tsc_timestamp;
+	uint64_t	system_time;
+	uint32_t	tsc_to_system_mul;
+	int8_t		tsc_shift;
+	uint8_t		flags;
+	uint8_t		pad[2];
+};
+
+#define PVCLOCK_FLAG_TSC_STABLE		0x01
+#define PVCLOCK_FLAG_GUEST_PASUED	0x02
+
+struct pvclock_wall_clock {
+	uint32_t	version;
+	uint32_t	sec;
+	uint32_t	nsec;
+};
+
+void		pvclock_resume(void);
+uint64_t	pvclock_get_last_cycles(void);
+uint64_t	pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti);
+uint64_t	pvclock_get_timecount(struct pvclock_vcpu_time_info *ti);
+void		pvclock_get_wallclock(struct pvclock_wall_clock *wc,
+		    struct timespec *ts);
+
+#endif


Property changes on: trunk/sys/x86/include/pvclock.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
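
pvclock_vcpu_time_info is the per-vCPU structure shared by the Xen and KVM paravirtual clocks: the guest scales the TSC delta by tsc_shift, applies tsc_to_system_mul as a 32.32 fixed-point factor, adds system_time, and retries while the version field is odd or changes. The in-tree reader is pvclock_get_timecount(); the following is only a hedged sketch of the same calculation, assuming a 64-bit build for the 128-bit intermediate product.

/* Hedged sketch of the pvclock reading protocol; not part of this commit. */
#include <sys/types.h>
#include <machine/atomic.h>	/* rmb() */
#include <machine/cpufunc.h>	/* rdtsc() */
#include <x86/pvclock.h>

static uint64_t
pvclock_ns_example(volatile struct pvclock_vcpu_time_info *ti)
{
	uint32_t version;
	uint64_t delta, ns;
	int8_t shift;

	do {
		version = ti->version;
		rmb();
		delta = rdtsc() - ti->tsc_timestamp;
		shift = ti->tsc_shift;
		if (shift >= 0)
			delta <<= shift;
		else
			delta >>= -shift;
		/* 32.32 fixed-point multiply, keeping the full product. */
		ns = ti->system_time + (uint64_t)
		    (((unsigned __int128)delta * ti->tsc_to_system_mul) >> 32);
		rmb();
		/* Retry if the hypervisor was updating the structure. */
	} while ((version & 1) != 0 || ti->version != version);

	return (ns);
}
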
Modified: trunk/sys/x86/include/reg.h
===================================================================
--- trunk/sys/x86/include/reg.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/reg.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)reg.h	5.5 (Berkeley) 1/18/91
- * $FreeBSD: stable/10/sys/x86/include/reg.h 283910 2015-06-02 14:54:53Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/reg.h 338691 2018-09-14 23:21:52Z jhb $
  */
 
 #ifndef _MACHINE_REG_H_
@@ -205,6 +205,14 @@
 				/* Index 8-15: reserved */
 };
 
+#define	DBREG_DR6_RESERVED1	0xffff0ff0
+#define	DBREG_DR6_BMASK		0x000f
+#define	DBREG_DR6_B(i)		(1 << (i))
+#define	DBREG_DR6_BD		0x2000
+#define	DBREG_DR6_BS		0x4000
+#define	DBREG_DR6_BT		0x8000
+
+#define	DBREG_DR7_RESERVED1	0x0400
 #define	DBREG_DR7_LOCAL_ENABLE	0x01
 #define	DBREG_DR7_GLOBAL_ENABLE	0x02
 #define	DBREG_DR7_LEN_1		0x00	/* 1 byte length          */
@@ -235,6 +243,8 @@
 #undef __dbreg64
 
 #ifdef _KERNEL
+struct thread;
+
 /*
  * XXX these interfaces are MI, so they should be declared in a MI place.
  */
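
The new DBREG_DR6_* definitions allow the debug status register to be decoded symbolically instead of with magic masks. A small hedged sketch follows; the helper is hypothetical and not part of this commit.

/* Hedged sketch of decoding DR6 with the new DBREG_DR6_* bits. */
#include <sys/types.h>
#include <machine/reg.h>

static int
dbreg_matched_bp_example(const struct dbreg *d)
{
	u_long dr6 = d->dr[6];
	int i;

	if ((dr6 & DBREG_DR6_BS) != 0)
		return (-1);		/* single-step trap, not a match */
	for (i = 0; i < 4; i++)
		if ((dr6 & DBREG_DR6_B(i)) != 0)
			return (i);	/* debug register i matched */
	return (-1);
}
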

Modified: trunk/sys/x86/include/segments.h
===================================================================
--- trunk/sys/x86/include/segments.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/segments.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)segments.h	7.1 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/include/segments.h 255040 2013-08-29 19:52:18Z gibbs $
+ * $FreeBSD: stable/11/sys/x86/include/segments.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _X86_SEGMENTS_H_
@@ -47,11 +47,7 @@
  */
 #define	SEL_RPL_MASK	3		/* requester priv level */
 #define	ISPL(s)		((s)&3)		/* priority level of a selector */
-#ifdef XEN
-#define	SEL_KPL		1		/* kernel priority level */
-#else
 #define	SEL_KPL		0		/* kernel priority level */
-#endif
 #define	SEL_UPL		3		/* user priority level */
 #define	ISLDT(s)	((s)&SEL_LDT)	/* is it local or global */
 #define	SEL_LDT		4		/* local descriptor table */
@@ -220,7 +216,7 @@
 #define	IDT_DTRACE_RET	0x92	/* DTrace pid provider Interrupt Vector */
 #define	IDT_EVTCHN	0x93	/* Xen HVM Event Channel Interrupt Vector */
 
-#if defined(__i386__) || defined(__ia64__)
+#if defined(__i386__)
 /*
  * Entries in the Global Descriptor Table (GDT)
  * Note that each 4 entries share a single 32 byte L1 cache line.
@@ -245,11 +241,7 @@
 #define	GBIOSUTIL_SEL	16	/* BIOS interface (Utility) */
 #define	GBIOSARGS_SEL	17	/* BIOS interface (Arguments) */
 #define	GNDIS_SEL	18	/* For the NDIS layer */
-#ifdef XEN
-#define	NGDT		9
-#else
 #define	NGDT		19
-#endif
 
 /*
  * Entries in the Local Descriptor Table (LDT)
@@ -265,7 +257,7 @@
 #define	LBSDICALLS_SEL	16	/* BSDI system call gate */
 #define	NLDT		(LBSDICALLS_SEL + 1)
 
-#else /* !__i386__ && !__ia64__ */
+#else /* !__i386__ */
 /*
  * Entries in the Global Descriptor Table (GDT)
  */
@@ -283,6 +275,6 @@
 #define	GUSERLDT_SEL	11	/* LDT */
 /* slot 12 is second half of GUSERLDT_SEL */
 #define	NGDT 		13
-#endif /* __i386__ || __ia64__ */
+#endif /* __i386__ */
 
 #endif /* !_X86_SEGMENTS_H_ */

Modified: trunk/sys/x86/include/setjmp.h
===================================================================
--- trunk/sys/x86/include/setjmp.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/setjmp.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/setjmp.h 232275 2012-02-28 22:17:52Z tijl $
+ * $FreeBSD: stable/11/sys/x86/include/setjmp.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_SETJMP_H_

Modified: trunk/sys/x86/include/sigframe.h
===================================================================
--- trunk/sys/x86/include/sigframe.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/sigframe.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -26,7 +26,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/sigframe.h 247047 2013-02-20 17:39:52Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/sigframe.h 247047 2013-02-20 17:39:52Z kib $
  */
 
 #ifndef _X86_SIGFRAME_H_

Modified: trunk/sys/x86/include/signal.h
===================================================================
--- trunk/sys/x86/include/signal.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/signal.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -29,7 +29,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)signal.h	8.1 (Berkeley) 6/11/93
- * $FreeBSD: stable/10/sys/x86/include/signal.h 247047 2013-02-20 17:39:52Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/signal.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _X86_SIGNAL_H

Modified: trunk/sys/x86/include/specialreg.h
===================================================================
--- trunk/sys/x86/include/specialreg.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/specialreg.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)specialreg.h	7.1 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/include/specialreg.h 315928 2017-03-25 05:09:03Z grehan $
+ * $FreeBSD: stable/11/sys/x86/include/specialreg.h 354658 2019-11-12 19:35:46Z scottl $
  */
 
 #ifndef _MACHINE_SPECIALREG_H_
@@ -54,6 +54,7 @@
 #define	CR0_CD  0x40000000	/* Cache Disable */
 
 #define	CR3_PCID_SAVE 0x8000000000000000
+#define	CR3_PCID_MASK 0xfff
 
 /*
  * Bits in PPro special registers
@@ -74,6 +75,7 @@
 #define	CR4_PCIDE 0x00020000	/* Enable Context ID */
 #define	CR4_XSAVE 0x00040000	/* XSETBV/XGETBV */
 #define	CR4_SMEP 0x00100000	/* Supervisor-Mode Execution Prevention */
+#define	CR4_SMAP 0x00200000	/* Supervisor-Mode Access Prevention */
 
 /*
  * Bits in AMD64 special registers.  EFER is 64 bits wide.
@@ -322,6 +324,13 @@
 #define	AMDPM_CPB		0x00000200
 
 /*
+ * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions)
+ */
+#define	AMDFEID_CLZERO		0x00000001
+#define	AMDFEID_IRPERF		0x00000002
+#define	AMDFEID_XSAVEERPTR	0x00000004
+
+/*
  * AMD extended function 8000_0008h ecx info
  */
 #define	AMDID_CMP_CORES		0x000000ff
@@ -348,15 +357,21 @@
 #define	CPUID_STDEXT_MPX	0x00004000
 #define	CPUID_STDEXT_PQE	0x00008000
 #define	CPUID_STDEXT_AVX512F	0x00010000
+#define	CPUID_STDEXT_AVX512DQ	0x00020000
 #define	CPUID_STDEXT_RDSEED	0x00040000
 #define	CPUID_STDEXT_ADX	0x00080000
 #define	CPUID_STDEXT_SMAP	0x00100000
+#define	CPUID_STDEXT_AVX512IFMA	0x00200000
+#define	CPUID_STDEXT_PCOMMIT	0x00400000
 #define	CPUID_STDEXT_CLFLUSHOPT	0x00800000
+#define	CPUID_STDEXT_CLWB	0x01000000
 #define	CPUID_STDEXT_PROCTRACE	0x02000000
 #define	CPUID_STDEXT_AVX512PF	0x04000000
 #define	CPUID_STDEXT_AVX512ER	0x08000000
 #define	CPUID_STDEXT_AVX512CD	0x10000000
 #define	CPUID_STDEXT_SHA	0x20000000
+#define	CPUID_STDEXT_AVX512BW	0x40000000
+#define	CPUID_STDEXT_AVX512VL	0x80000000
 
 /*
  * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info
@@ -365,10 +380,42 @@
 #define	CPUID_STDEXT2_UMIP	0x00000004
 #define	CPUID_STDEXT2_PKU	0x00000008
 #define	CPUID_STDEXT2_OSPKE	0x00000010
+#define	CPUID_STDEXT2_WAITPKG	0x00000020
+#define	CPUID_STDEXT2_GFNI	0x00000100
 #define	CPUID_STDEXT2_RDPID	0x00400000
+#define	CPUID_STDEXT2_CLDEMOTE	0x02000000
+#define	CPUID_STDEXT2_MOVDIRI	0x08000000
+#define	CPUID_STDEXT2_MOVDIRI64B	0x10000000
 #define	CPUID_STDEXT2_SGXLC	0x40000000
 
 /*
+ * CPUID instruction 7 Structured Extended Features, leaf 0 edx info
+ */
+#define	CPUID_STDEXT3_MD_CLEAR	0x00000400
+#define	CPUID_STDEXT3_TSXFA	0x00002000
+#define	CPUID_STDEXT3_IBPB	0x04000000
+#define	CPUID_STDEXT3_STIBP	0x08000000
+#define	CPUID_STDEXT3_L1D_FLUSH	0x10000000
+#define	CPUID_STDEXT3_ARCH_CAP	0x20000000
+#define	CPUID_STDEXT3_CORE_CAP	0x40000000
+#define	CPUID_STDEXT3_SSBD	0x80000000
+
+/* MSR IA32_ARCH_CAP(ABILITIES) bits */
+#define	IA32_ARCH_CAP_RDCL_NO	0x00000001
+#define	IA32_ARCH_CAP_IBRS_ALL	0x00000002
+#define	IA32_ARCH_CAP_RSBA	0x00000004
+#define	IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY	0x00000008
+#define	IA32_ARCH_CAP_SSB_NO	0x00000010
+#define	IA32_ARCH_CAP_MDS_NO	0x00000020
+#define	IA32_ARCH_CAP_IF_PSCHANGE_MC_NO	0x00000040
+#define	IA32_ARCH_CAP_TSX_CTRL	0x00000080
+#define	IA32_ARCH_CAP_TAA_NO	0x00000100
+
+/* MSR IA32_TSX_CTRL bits */
+#define	IA32_TSX_CTRL_RTM_DISABLE	0x00000001
+#define	IA32_TSX_CTRL_TSX_CPUID_CLEAR	0x00000002
+
+/*
  * CPUID manufacturers identifiers
  */
 #define	AMD_VENDOR_ID		"AuthenticAMD"
@@ -396,6 +443,8 @@
 #define	MSR_EBL_CR_POWERON	0x02a
 #define	MSR_TEST_CTL		0x033
 #define	MSR_IA32_FEATURE_CONTROL 0x03a
+#define	MSR_IA32_SPEC_CTRL	0x048
+#define	MSR_IA32_PRED_CMD	0x049
 #define	MSR_BIOS_UPDT_TRIG	0x079
 #define	MSR_BBL_CR_D0		0x088
 #define	MSR_BBL_CR_D1		0x089
@@ -408,6 +457,9 @@
 #define	MSR_APERF		0x0e8
 #define	MSR_IA32_EXT_CONFIG	0x0ee	/* Undocumented. Core Solo/Duo only */
 #define	MSR_MTRRcap		0x0fe
+#define	MSR_IA32_ARCH_CAP	0x10a
+#define	MSR_IA32_FLUSH_CMD	0x10b
+#define	MSR_TSX_FORCE_ABORT	0x10f
 #define	MSR_BBL_CR_ADDR		0x116
 #define	MSR_BBL_CR_DECC		0x118
 #define	MSR_BBL_CR_CTL		0x119
@@ -414,6 +466,7 @@
 #define	MSR_BBL_CR_TRIG		0x11a
 #define	MSR_BBL_CR_BUSY		0x11b
 #define	MSR_BBL_CR_CTL3		0x11e
+#define	MSR_IA32_TSX_CTRL	0x122
 #define	MSR_SYSENTER_CS_MSR	0x174
 #define	MSR_SYSENTER_ESP_MSR	0x175
 #define	MSR_SYSENTER_EIP_MSR	0x176
@@ -467,6 +520,7 @@
 #define	MSR_DRAM_ENERGY_STATUS	0x619
 #define	MSR_PP0_ENERGY_STATUS	0x639
 #define	MSR_PP1_ENERGY_STATUS	0x641
+#define	MSR_TSC_DEADLINE	0x6e0	/* Writes are not serializing */
 
 /*
  * VMX MSRs
@@ -488,8 +542,10 @@
 #define	MSR_VMX_TRUE_ENTRY_CTLS	0x490
 
 /*
- * X2APIC MSRs
+ * X2APIC MSRs.
+ * Writes are not serializing.
  */
+#define	MSR_APIC_000		0x800
 #define	MSR_APIC_ID		0x802
 #define	MSR_APIC_VERSION	0x803
 #define	MSR_APIC_TPR		0x808
@@ -548,6 +604,21 @@
 #define	IA32_MISC_EN_XDD	0x0000000400000000ULL
 
 /*
+ * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel
+ * document 336996-001 Speculative Execution Side Channel Mitigations.
+ */
+/* MSR IA32_SPEC_CTRL */
+#define	IA32_SPEC_CTRL_IBRS	0x00000001
+#define	IA32_SPEC_CTRL_STIBP	0x00000002
+#define	IA32_SPEC_CTRL_SSBD	0x00000004
+
+/* MSR IA32_PRED_CMD */
+#define	IA32_PRED_CMD_IBPB_BARRIER	0x0000000000000001ULL
+
+/* MSR IA32_FLUSH_CMD */
+#define	IA32_FLUSH_CMD_L1D	0x00000001
+
+/*
  * PAT modes.
  */
 #define	PAT_UNCACHEABLE		0x00
@@ -697,6 +768,22 @@
 #define	MC_MISC_ADDRESS_MODE	0x00000000000001c0	/* If MCG_CAP_SER_P */
 #define	MC_CTL2_THRESHOLD	0x0000000000007fff
 #define	MC_CTL2_CMCI_EN		0x0000000040000000
+#define	MC_AMDNB_BANK		4
+#define	MC_MISC_AMDNB_VAL	0x8000000000000000	/* Counter presence valid */
+#define	MC_MISC_AMDNB_CNTP	0x4000000000000000	/* Counter present */
+#define	MC_MISC_AMDNB_LOCK	0x2000000000000000	/* Register locked */
+#define	MC_MISC_AMDNB_LVT_MASK	0x00f0000000000000	/* Extended LVT offset */
+#define	MC_MISC_AMDNB_LVT_SHIFT	52
+#define	MC_MISC_AMDNB_CNTEN	0x0008000000000000	/* Counter enabled */
+#define	MC_MISC_AMDNB_INT_MASK	0x0006000000000000	/* Interrupt type */
+#define	MC_MISC_AMDNB_INT_LVT	0x0002000000000000	/* Interrupt via Extended LVT */
+#define	MC_MISC_AMDNB_INT_SMI	0x0004000000000000	/* SMI */
+#define	MC_MISC_AMDNB_OVERFLOW	0x0001000000000000	/* Counter overflow */
+#define	MC_MISC_AMDNB_CNT_MASK	0x00000fff00000000	/* Counter value */
+#define	MC_MISC_AMDNB_CNT_SHIFT	32
+#define	MC_MISC_AMDNB_CNT_MAX	0xfff
+#define	MC_MISC_AMDNB_PTR_MASK	0x00000000ff000000	/* Pointer to additional registers */
+#define	MC_MISC_AMDNB_PTR_SHIFT	24
 
 /*
  * The following four 3-byte registers control the non-cacheable regions.
@@ -800,6 +887,7 @@
 #define	MSR_FSBASE	0xc0000100	/* base address of the %fs "segment" */
 #define	MSR_GSBASE	0xc0000101	/* base address of the %gs "segment" */
 #define	MSR_KGSBASE	0xc0000102	/* base address of the kernel %gs */
+#define	MSR_TSC_AUX	0xc0000103
 #define	MSR_PERFEVSEL0	0xc0010000
 #define	MSR_PERFEVSEL1	0xc0010001
 #define	MSR_PERFEVSEL2	0xc0010002
@@ -817,6 +905,8 @@
 #define	MSR_TOP_MEM	0xc001001a	/* boundary for ram below 4G */
 #define	MSR_TOP_MEM2	0xc001001d	/* boundary for ram above 4G */
 #define	MSR_NB_CFG1	0xc001001f	/* NB configuration 1 */
+#define	MSR_K8_UCODE_UPDATE 0xc0010020	/* update microcode */
+#define	MSR_MC0_CTL_MASK 0xc0010044
 #define	MSR_P_STATE_LIMIT 0xc0010061	/* P-state Current Limit Register */
 #define	MSR_P_STATE_CONTROL 0xc0010062	/* P-state Control Register */
 #define	MSR_P_STATE_STATUS 0xc0010063	/* P-state Status Register */
@@ -823,12 +913,12 @@
 #define	MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */
 #define	MSR_SMM_ADDR	0xc0010112	/* SMM TSEG base address */
 #define	MSR_SMM_MASK	0xc0010113	/* SMM TSEG address mask */
+#define	MSR_VM_CR	0xc0010114	/* SVM: feature control */
+#define	MSR_VM_HSAVE_PA 0xc0010117	/* SVM: host save area address */
+#define	MSR_AMD_CPUID07	0xc0011002	/* CPUID 07 %ebx override */
 #define	MSR_EXTFEATURES	0xc0011005	/* Extended CPUID Features override */
+#define	MSR_LS_CFG	0xc0011020
 #define	MSR_IC_CFG	0xc0011021	/* Instruction Cache Configuration */
-#define	MSR_K8_UCODE_UPDATE	0xc0010020	/* update microcode */
-#define	MSR_MC0_CTL_MASK	0xc0010044
-#define	MSR_VM_CR		0xc0010114 /* SVM: feature control */
-#define	MSR_VM_HSAVE_PA		0xc0010117 /* SVM: host save area address */
 
 /* MSR_VM_CR related */
 #define	VM_CR_SVMDIS		0x10	/* SVM: disabled by BIOS */
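
For context on the speculative-execution bits added above: a minimal, illustrative-only sketch of how IA32_SPEC_CTRL and IA32_PRED_CMD are typically driven once the corresponding CPUID_STDEXT3_* features have been detected.  The rdmsr()/wrmsr() accessors are the standard kernel ones from machine/cpufunc.h; the helper name and the bare enable flag are assumptions for the example, not part of this change.

#include <sys/types.h>
#include <machine/cpufunc.h>	/* rdmsr(), wrmsr() */
#include <machine/specialreg.h>

/*
 * Illustrative sketch only: toggle IBRS on the current CPU and issue an
 * IBPB barrier.  Real code first checks the CPUID_STDEXT3_* feature bits
 * defined above before touching these MSRs.
 */
static void
example_set_ibrs(int enable)
{
	uint64_t v;

	v = rdmsr(MSR_IA32_SPEC_CTRL);
	if (enable)
		v |= IA32_SPEC_CTRL_IBRS;
	else
		v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
	wrmsr(MSR_IA32_SPEC_CTRL, v);

	/* A one-shot indirect branch prediction barrier (IBPB). */
	wrmsr(MSR_IA32_PRED_CMD, IA32_PRED_CMD_IBPB_BARRIER);
}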

Added: trunk/sys/x86/include/stack.h
===================================================================
--- trunk/sys/x86/include/stack.h	                        (rev 0)
+++ trunk/sys/x86/include/stack.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,62 @@
+/* $MidnightBSD$ */
+/*-
+ * Mach Operating System
+ * Copyright (c) 1991,1990 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution at CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/stack.h 287645 2015-09-11 03:54:37Z markj $
+ */
+
+#ifndef _X86_STACK_H
+#define	_X86_STACK_H
+
+/*
+ * Stack trace.
+ */
+
+#ifdef __i386__
+struct i386_frame {
+	struct i386_frame	*f_frame;
+	u_int			f_retaddr;
+	u_int			f_arg0;
+};
+#endif
+
+#ifdef __amd64__
+struct amd64_frame {
+	struct amd64_frame	*f_frame;
+	u_long			f_retaddr;
+};
+
+struct i386_frame {
+	uint32_t		f_frame;
+	uint32_t		f_retaddr;
+	uint32_t		f_arg0;
+};
+#endif /* __amd64__ */
+
+#ifdef _KERNEL
+int	stack_nmi_handler(struct trapframe *);
+#endif
+
+#endif /* !_X86_STACK_H */


Property changes on: trunk/sys/x86/include/stack.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
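
The frame structures in the new stack.h are what the stack(9) unwinders consume; as a rough sketch (not code from this commit, and with far weaker sanity checks than the real walker), following the amd64 frame-pointer chain looks like this:

#include <sys/types.h>
#include <sys/systm.h>		/* printf() */
#include <machine/stack.h>

/* Illustrative only: follow the saved frame-pointer chain. */
static void
example_walk_frames(struct amd64_frame *fp, int maxdepth)
{

	while (fp != NULL && maxdepth-- > 0) {
		printf("return address %#lx\n", fp->f_retaddr);
		/* Each caller frame must sit at a strictly higher address. */
		if (fp->f_frame <= fp)
			break;
		fp = fp->f_frame;
	}
}
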
Modified: trunk/sys/x86/include/stdarg.h
===================================================================
--- trunk/sys/x86/include/stdarg.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/stdarg.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/stdarg.h 256105 2013-10-07 10:01:23Z phk $
+ * $FreeBSD: stable/11/sys/x86/include/stdarg.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_STDARG_H_

Modified: trunk/sys/x86/include/sysarch.h
===================================================================
--- trunk/sys/x86/include/sysarch.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/sysarch.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/sysarch.h 233209 2012-03-19 21:57:31Z tijl $
+ * $FreeBSD: stable/11/sys/x86/include/sysarch.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*

Modified: trunk/sys/x86/include/trap.h
===================================================================
--- trunk/sys/x86/include/trap.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/trap.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.h	5.4 (Berkeley) 5/9/91
- * $FreeBSD: stable/10/sys/x86/include/trap.h 262042 2014-02-17 12:57:13Z avg $
+ * $FreeBSD: stable/11/sys/x86/include/trap.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _MACHINE_TRAP_H_

Added: trunk/sys/x86/include/ucode.h
===================================================================
--- trunk/sys/x86/include/ucode.h	                        (rev 0)
+++ trunk/sys/x86/include/ucode.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,69 @@
+/* $MidnightBSD$ */
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mark Johnston under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/ucode.h 347700 2019-05-16 14:42:16Z markj $
+ */
+
+#ifndef _MACHINE_UCODE_H_
+#define	_MACHINE_UCODE_H_
+
+struct ucode_intel_header {
+	uint32_t	header_version;
+	int32_t		update_revision;
+	uint32_t	dat;
+	uint32_t	processor_signature;
+	uint32_t	checksum;
+	uint32_t	loader_revision;
+	uint32_t	processor_flags;
+#define	UCODE_INTEL_DEFAULT_DATA_SIZE		2000
+	uint32_t	data_size;
+	uint32_t	total_size;
+	uint32_t	reserved[3];
+};
+
+struct ucode_intel_extsig_table {
+	uint32_t	signature_count;
+	uint32_t	signature_table_checksum;
+	uint32_t	reserved[3];
+	struct ucode_intel_extsig {
+		uint32_t	processor_signature;
+		uint32_t	processor_flags;
+		uint32_t	checksum;
+	} entries[0];
+};
+
+int	ucode_intel_load(void *data, bool unsafe,
+	    uint64_t *nrevp, uint64_t *orevp);
+size_t	ucode_load_bsp(uintptr_t free);
+void	ucode_load_ap(int cpu);
+void	ucode_reload(void);
+void *	ucode_update(void *data);
+
+#endif /* _MACHINE_UCODE_H_ */


Property changes on: trunk/sys/x86/include/ucode.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
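
One detail worth spelling out about the Intel header above: by the Intel SDM convention, a data_size of zero means the legacy 2000-byte payload (hence UCODE_INTEL_DEFAULT_DATA_SIZE), which together with the 48-byte header gives a 2048-byte update; otherwise total_size is authoritative.  A hedged sketch, with an invented helper name:

#include <sys/types.h>
#include <x86/ucode.h>

/* Illustrative only: size of a whole Intel microcode update blob. */
static size_t
example_ucode_intel_size(const struct ucode_intel_header *hdr)
{

	if (hdr->data_size == 0)	/* legacy format: 2000 + 48 bytes */
		return (UCODE_INTEL_DEFAULT_DATA_SIZE + sizeof(*hdr));
	return (hdr->total_size);
}
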
Modified: trunk/sys/x86/include/ucontext.h
===================================================================
--- trunk/sys/x86/include/ucontext.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/ucontext.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -27,7 +27,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/ucontext.h 247047 2013-02-20 17:39:52Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/ucontext.h 295561 2016-02-12 07:38:19Z kib $
  */
 
 #ifndef _X86_UCONTEXT_H_
@@ -163,4 +163,9 @@
 } mcontext_t;
 #endif /* __amd64__ */
 
+#ifdef __LINT__
+typedef struct __mcontext {
+} mcontext_t;
+#endif /* __LINT__ */
+
 #endif /* !_X86_UCONTEXT_H_ */

Modified: trunk/sys/x86/include/vdso.h
===================================================================
--- trunk/sys/x86/include/vdso.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/vdso.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -1,8 +1,12 @@
 /* $MidnightBSD$ */
 /*-
  * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
+ * Copyright 2016 The FreeBSD Foundation.
  * All rights reserved.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -23,7 +27,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/vdso.h 237433 2012-06-22 07:06:40Z kib $
+ * $FreeBSD: stable/11/sys/x86/include/vdso.h 311376 2017-01-05 07:42:08Z sephe $
  */
 
 #ifndef _X86_VDSO_H
@@ -31,8 +35,13 @@
 
 #define	VDSO_TIMEHANDS_MD			\
 	uint32_t	th_x86_shift;		\
-	uint32_t	th_res[7];
+	uint32_t	th_x86_hpet_idx;	\
+	uint32_t	th_res[6];
 
+#define	VDSO_TH_ALGO_X86_TSC	VDSO_TH_ALGO_1
+#define	VDSO_TH_ALGO_X86_HPET	VDSO_TH_ALGO_2
+#define	VDSO_TH_ALGO_X86_HVTSC	VDSO_TH_ALGO_3	/* Hyper-V ref. TSC */
+
 #ifdef _KERNEL
 #ifdef COMPAT_FREEBSD32
 

Modified: trunk/sys/x86/include/vmware.h
===================================================================
--- trunk/sys/x86/include/vmware.h	2020-02-08 19:32:41 UTC (rev 12310)
+++ trunk/sys/x86/include/vmware.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/x86/include/vmware.h 278522 2015-02-10 16:34:42Z jhb $
+ * $FreeBSD: stable/11/sys/x86/include/vmware.h 278749 2015-02-14 09:00:12Z kib $
  */
 
 #ifndef _X86_VMWARE_H_
@@ -32,9 +32,14 @@
 
 #define	VMW_HVMAGIC		0x564d5868
 #define	VMW_HVPORT		0x5658
+
 #define	VMW_HVCMD_GETVERSION	10
 #define	VMW_HVCMD_GETHZ		45
+#define	VMW_HVCMD_GETVCPU_INFO	68
 
+#define	VMW_VCPUINFO_LEGACY_X2APIC	(1 << 3)
+#define	VMW_VCPUINFO_VCPU_RESERVED	(1 << 31)
+
 static __inline void
 vmware_hvcall(u_int cmd, u_int *p)
 {
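
The new VMW_HVCMD_GETVCPU_INFO command and VMW_VCPUINFO_* flags are consumed through vmware_hvcall(); the madt.c change later in this series uses them essentially as in this small, illustrative-only sketch:

	u_int regs[4];

	vmware_hvcall(VMW_HVCMD_GETVCPU_INFO, regs);
	if ((regs[0] & VMW_VCPUINFO_VCPU_RESERVED) == 0 &&
	    (regs[0] & VMW_VCPUINFO_LEGACY_X2APIC) != 0) {
		/* Interrupt redirection is present; x2APIC mode is usable. */
	}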

Added: trunk/sys/x86/include/x86_smp.h
===================================================================
--- trunk/sys/x86/include/x86_smp.h	                        (rev 0)
+++ trunk/sys/x86/include/x86_smp.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,108 @@
+/* $MidnightBSD$ */
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk at FreeBSD.org> wrote this file.  As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD: stable/11/sys/x86/include/x86_smp.h 329462 2018-02-17 18:00:01Z kib $
+ *
+ */
+
+#ifndef _X86_X86_SMP_H_
+#define	_X86_X86_SMP_H_
+
+#include <sys/bus.h>
+#include <machine/frame.h>
+#include <machine/intr_machdep.h>
+#include <x86/apicvar.h>
+#include <machine/pcb.h>
+
+struct pmap;
+
+/* global data in mp_x86.c */
+extern int mp_naps;
+extern int boot_cpu_id;
+extern struct pcb stoppcbs[];
+extern int cpu_apic_ids[];
+extern int bootAP;
+extern void *dpcpu;
+extern char *bootSTK;
+extern void *bootstacks[];
+extern volatile u_int cpu_ipi_pending[];
+extern volatile int aps_ready;
+extern struct mtx ap_boot_mtx;
+extern int cpu_logical;
+extern int cpu_cores;
+extern volatile uint32_t smp_tlb_generation;
+extern struct pmap *smp_tlb_pmap;
+extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+extern u_int xhits_gbl[];
+extern u_int xhits_pg[];
+extern u_int xhits_rng[];
+extern u_int ipi_global;
+extern u_int ipi_page;
+extern u_int ipi_range;
+extern u_int ipi_range_size;
+
+extern int nmi_kdb_lock;
+extern int nmi_is_broadcast;
+
+struct cpu_info {
+	int	cpu_present:1;
+	int	cpu_bsp:1;
+	int	cpu_disabled:1;
+	int	cpu_hyperthread:1;
+};
+extern struct cpu_info cpu_info[];
+
+#ifdef COUNT_IPIS
+extern u_long *ipi_invltlb_counts[MAXCPU];
+extern u_long *ipi_invlrng_counts[MAXCPU];
+extern u_long *ipi_invlpg_counts[MAXCPU];
+extern u_long *ipi_invlcache_counts[MAXCPU];
+extern u_long *ipi_rendezvous_counts[MAXCPU];
+#endif
+
+/* IPI handlers */
+inthand_t
+	IDTVEC(invltlb),	/* TLB shootdowns - global */
+	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
+	IDTVEC(invlrng),	/* TLB shootdowns - page range */
+	IDTVEC(invlcache),	/* Write back and invalidate cache */
+	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
+	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
+	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
+	IDTVEC(rendezvous);	/* handle CPU rendezvous */
+
+/* functions in x86_mp.c */
+void	assign_cpu_ids(void);
+void	cpu_add(u_int apic_id, char boot_cpu);
+void	cpustop_handler(void);
+void	cpususpend_handler(void);
+void	init_secondary_tail(void);
+void	invltlb_handler(void);
+void	invlpg_handler(void);
+void	invlrng_handler(void);
+void	invlcache_handler(void);
+void	init_secondary(void);
+void	ipi_startup(int apic_id, int vector);
+void	ipi_all_but_self(u_int ipi);
+void 	ipi_bitmap_handler(struct trapframe frame);
+void	ipi_cpu(int cpu, u_int ipi);
+int	ipi_nmi_handler(void);
+void	ipi_selected(cpuset_t cpus, u_int ipi);
+u_int	mp_bootaddress(u_int);
+void	set_interrupt_apic_ids(void);
+void	smp_cache_flush(void);
+void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap);
+void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
+	    vm_offset_t endva, struct pmap *pmap);
+void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
+void	mem_range_AP_init(void);
+void	topo_probe(void);
+void	ipi_send_cpu(int cpu, u_int ipi);
+
+#endif


Property changes on: trunk/sys/x86/include/x86_smp.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/x86/include/x86_var.h
===================================================================
--- trunk/sys/x86/include/x86_var.h	                        (rev 0)
+++ trunk/sys/x86/include/x86_var.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,161 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1995 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/x86_var.h 355094 2019-11-25 16:46:41Z kib $
+ */
+
+#ifndef _X86_X86_VAR_H_
+#define	_X86_X86_VAR_H_
+
+/*
+ * Miscellaneous machine-dependent declarations.
+ */
+
+extern	long	Maxmem;
+extern	u_int	basemem;
+extern	int	busdma_swi_pending;
+extern	u_int	cpu_exthigh;
+extern	u_int	cpu_feature;
+extern	u_int	cpu_feature2;
+extern	u_int	amd_feature;
+extern	u_int	amd_feature2;
+extern	u_int	amd_pminfo;
+extern	u_int	amd_extended_feature_extensions;
+extern	u_int	via_feature_rng;
+extern	u_int	via_feature_xcrypt;
+extern	u_int	cpu_clflush_line_size;
+extern	u_int	cpu_stdext_feature;
+extern	u_int	cpu_stdext_feature2;
+extern	u_int	cpu_stdext_feature3;
+extern	uint64_t cpu_ia32_arch_caps;
+extern	u_int	cpu_fxsr;
+extern	u_int	cpu_high;
+extern	u_int	cpu_id;
+extern	u_int	cpu_max_ext_state_size;
+extern	u_int	cpu_mxcsr_mask;
+extern	u_int	cpu_procinfo;
+extern	u_int	cpu_procinfo2;
+extern	char	cpu_vendor[];
+extern	u_int	cpu_vendor_id;
+extern	u_int	cpu_mon_mwait_flags;
+extern	u_int	cpu_mon_min_size;
+extern	u_int	cpu_mon_max_size;
+extern	u_int	cpu_maxphyaddr;
+extern	char	ctx_switch_xsave[];
+extern	u_int	hv_high;
+extern	char	hv_vendor[];
+extern	char	kstack[];
+extern	char	sigcode[];
+extern	int	szsigcode;
+extern	int	vm_page_dump_size;
+extern	int	workaround_erratum383;
+extern	int	_udatasel;
+extern	int	_ucodesel;
+extern	int	_ucode32sel;
+extern	int	_ufssel;
+extern	int	_ugssel;
+extern	int	use_xsave;
+extern	uint64_t xsave_mask;
+extern	int	pti;
+extern	int	hw_ibrs_active;
+extern	int	hw_mds_disable;
+extern	int	hw_ssb_active;
+extern	int	x86_taa_enable;
+
+struct	pcb;
+struct	thread;
+struct	reg;
+struct	fpreg;
+struct  dbreg;
+struct	dumperinfo;
+struct	trapframe;
+
+/*
+ * The interface type of the interrupt handler entry point cannot be
+ * expressed in C.  Use simplest non-variadic function type as an
+ * approximation.
+ */
+typedef void alias_for_inthand_t(void);
+
+/*
+ * Returns the maximum physical address that can be used with the
+ * current system.
+ */
+static __inline vm_paddr_t
+cpu_getmaxphyaddr(void)
+{
+#if defined(__i386__) && !defined(PAE)
+	return (0xffffffff);
+#else
+	return ((1ULL << cpu_maxphyaddr) - 1);
+#endif
+}
+
+void	*alloc_fpusave(int flags);
+void	busdma_swi(void);
+bool	cpu_mwait_usable(void);
+void	cpu_probe_amdc1e(void);
+void	cpu_setregs(void);
+void	dump_add_page(vm_paddr_t);
+void	dump_drop_page(vm_paddr_t);
+void	finishidentcpu(void);
+void	identify_cpu1(void);
+void	identify_cpu2(void);
+void	identify_hypervisor(void);
+void	initializecpu(void);
+void	initializecpucache(void);
+bool	fix_cpuid(void);
+void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
+int	is_physical_memory(vm_paddr_t addr);
+int	isa_nmi(int cd);
+void	handle_ibrs_entry(void);
+void	handle_ibrs_exit(void);
+void	hw_ibrs_recalculate(void);
+void	hw_mds_recalculate(void);
+void	hw_ssb_recalculate(bool all_cpus);
+void	x86_taa_recalculate(void);
+void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
+void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
+void	nmi_handle_intr(u_int type, struct trapframe *frame);
+void	pagecopy(void *from, void *to);
+void	printcpuinfo(void);
+int	pti_get_default(void);
+int	user_dbreg_trap(register_t dr6);
+int	minidumpsys(struct dumperinfo *);
+struct pcb *get_pcb_td(struct thread *td);
+
+#define	MSR_OP_ANDNOT		0x00000001
+#define	MSR_OP_OR		0x00000002
+#define	MSR_OP_WRITE		0x00000003
+#define	MSR_OP_LOCAL		0x10000000
+#define	MSR_OP_SCHED		0x20000000
+#define	MSR_OP_RENDEZVOUS	0x30000000
+void x86_msr_op(u_int msr, u_int op, uint64_t arg1);
+
+#endif


Property changes on: trunk/sys/x86/include/x86_var.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
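
The MSR_OP_* constants at the end of x86_var.h encode both the operation (and-not, or, write) and the scope on which x86_msr_op() applies it (the local CPU, scheduled on each CPU, or an SMP rendezvous).  A hedged usage sketch, modeled on how the speculation-control recalculation code tends to call it; the function name here is invented:

#include <machine/specialreg.h>
#include <x86/x86_var.h>

/* Illustrative only: set or clear IBRS on all CPUs via a rendezvous. */
static void
example_toggle_ibrs(int disable)
{

	x86_msr_op(MSR_IA32_SPEC_CTRL,
	    (disable ? MSR_OP_ANDNOT : MSR_OP_OR) | MSR_OP_RENDEZVOUS,
	    IA32_SPEC_CTRL_IBRS);
}
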
Added: trunk/sys/x86/include/xen/xen-os.h
===================================================================
--- trunk/sys/x86/include/xen/xen-os.h	                        (rev 0)
+++ trunk/sys/x86/include/xen/xen-os.h	2020-02-08 19:33:27 UTC (rev 12311)
@@ -0,0 +1,39 @@
+/* $MidnightBSD$ */
+/*****************************************************************************
+ * x86/xen/xen-os.h
+ *
+ * Random collection of macros and definition
+ *
+ * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team)
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * $FreeBSD: stable/11/sys/x86/include/xen/xen-os.h 289686 2015-10-21 10:44:07Z royger $
+ */
+
+#ifndef _MACHINE_X86_XEN_XEN_OS_H_
+#define _MACHINE_X86_XEN_XEN_OS_H_
+
+/* Everything below this point is not included by assembler (.S) files. */
+#ifndef __ASSEMBLY__
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _MACHINE_X86_XEN_XEN_OS_H_ */


Property changes on: trunk/sys/x86/include/xen/xen-os.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

From laffer1 at midnightbsd.org  Sat Feb  8 14:34:35 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:34:35 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12312] trunk/sys/x86: sync with FreeBSD
 11-stable
Message-ID: <202002081934.018JYZuq062031@stargazer.midnightbsd.org>

Revision: 12312
          http://svnweb.midnightbsd.org/src/?rev=12312
Author:   laffer1
Date:     2020-02-08 14:34:34 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/x86/acpica/OsdEnvironment.c
    trunk/sys/x86/acpica/acpi_apm.c
    trunk/sys/x86/acpica/acpi_wakeup.c
    trunk/sys/x86/acpica/madt.c
    trunk/sys/x86/acpica/srat.c
    trunk/sys/x86/bios/smbios.c
    trunk/sys/x86/bios/vpd.c
    trunk/sys/x86/cpufreq/est.c
    trunk/sys/x86/cpufreq/hwpstate.c
    trunk/sys/x86/cpufreq/p4tcc.c
    trunk/sys/x86/cpufreq/powernow.c
    trunk/sys/x86/cpufreq/smist.c

Modified: trunk/sys/x86/acpica/OsdEnvironment.c
===================================================================
--- trunk/sys/x86/acpica/OsdEnvironment.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/acpica/OsdEnvironment.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -27,10 +27,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/OsdEnvironment.c 281687 2015-04-18 08:01:12Z jkim $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/OsdEnvironment.c 316303 2017-03-30 20:18:34Z jkim $");
 
 #include <sys/types.h>
 #include <sys/bus.h>
+#include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
@@ -61,6 +62,16 @@
 {
 	long acpi_root;
 
+	if (TUNABLE_ULONG_FETCH("acpi.rsdp", &acpi_root))
+		return (acpi_root);
+
+	/*
+	 * The hints mechanism is unreliable (it fails if anybody ever
+	 * compiled in hints to the kernel). It has been replaced
+	 * by the tunable method, but is used here as a fallback to
+	 * retain maximum compatibility between old loaders and new
+	 * kernels. It can be removed after 11.0R.
+	 */
 	if (resource_long_value("acpi", 0, "rsdp", &acpi_root) == 0)
 		return (acpi_root);
 

Modified: trunk/sys/x86/acpica/acpi_apm.c
===================================================================
--- trunk/sys/x86/acpica/acpi_apm.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/acpica/acpi_apm.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_apm.c 228283 2011-12-05 16:08:18Z ed $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/acpi_apm.c 228283 2011-12-05 16:08:18Z ed $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/x86/acpica/acpi_wakeup.c
===================================================================
--- trunk/sys/x86/acpica/acpi_wakeup.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/acpica/acpi_wakeup.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -29,10 +29,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_wakeup.c 331910 2018-04-03 07:52:06Z avg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/acpi_wakeup.c 347700 2019-05-16 14:42:16Z markj $");
 
-#ifdef __i386__
-#include "opt_npx.h"
+#if defined(__amd64__)
+#define DEV_APIC
+#else
+#include "opt_apic.h"
 #endif
 
 #include <sys/param.h>
@@ -43,6 +45,7 @@
 #include <sys/memrange.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
+#include <sys/cons.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
@@ -50,14 +53,17 @@
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
+#include <machine/md_var.h>
 #include <x86/mca.h>
 #include <machine/pcb.h>
-#include <machine/pmap.h>
 #include <machine/specialreg.h>
-#include <machine/md_var.h>
+#include <x86/ucode.h>
 
+#ifdef DEV_APIC
+#include <x86/apicreg.h>
+#include <x86/apicvar.h>
+#endif
 #ifdef SMP
-#include <x86/apicreg.h>
 #include <machine/smp.h>
 #include <machine/vmparam.h>
 #endif
@@ -74,6 +80,7 @@
 
 extern int		acpi_resume_beep;
 extern int		acpi_reset_video;
+extern int		acpi_susp_bounce;
 
 #ifdef SMP
 extern struct susppcb	**susppcbs;
@@ -82,7 +89,7 @@
 static struct susppcb	**susppcbs;
 #endif
 
-static void		*acpi_alloc_wakeup_handler(void);
+static void		*acpi_alloc_wakeup_handler(void **);
 static void		acpi_stop_beep(void *);
 
 #ifdef SMP
@@ -91,18 +98,14 @@
 #endif
 
 #ifdef __amd64__
-#define ACPI_PAGETABLES	3
+#define	ACPI_WAKEPAGES	4
 #else
-#define ACPI_PAGETABLES	0
+#define	ACPI_WAKEPAGES	1
 #endif
 
-#define	WAKECODE_VADDR(sc)				\
-    ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE))
-#define	WAKECODE_PADDR(sc)				\
-    ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE))
 #define	WAKECODE_FIXUP(offset, type, val)	do {	\
 	type	*addr;					\
-	addr = (type *)(WAKECODE_VADDR(sc) + offset);	\
+	addr = (type *)(sc->acpi_wakeaddr + (offset));	\
 	*addr = val;					\
 } while (0)
 
@@ -119,7 +122,7 @@
 acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
 {
 	struct pcb *pcb;
-	int		vector = (WAKECODE_PADDR(sc) >> 12) & 0xff;
+	int		vector = (sc->acpi_wakephys >> 12) & 0xff;
 	int		apic_id = cpu_apic_ids[cpu];
 	int		ms;
 
@@ -162,7 +165,7 @@
 
 	/* setup a vector to our boot code */
 	*((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
-	*((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4;
+	*((volatile u_short *)WARMBOOT_SEG) = sc->acpi_wakephys >> 4;
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
@@ -191,6 +194,10 @@
 {
 	ACPI_STATUS	status;
 	struct pcb	*pcb;
+#ifdef __amd64__
+	struct pcpu *pc;
+	int i;
+#endif
 
 	if (sc->acpi_wakeaddr == 0ul)
 		return (-1);	/* couldn't alloc wake memory */
@@ -203,7 +210,7 @@
 	if (acpi_resume_beep != 0)
 		timer_spkr_acquire();
 
-	AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc), 0);
+	AcpiSetFirmwareWakingVector(sc->acpi_wakephys, 0);
 
 	intr_suspend();
 
@@ -211,7 +218,7 @@
 	if (savectx(pcb)) {
 #ifdef __amd64__
 		fpususpend(susppcbs[0]->sp_fpususpend);
-#elif defined(DEV_NPX)
+#else
 		npxsuspend(susppcbs[0]->sp_fpususpend);
 #endif
 #ifdef SMP
@@ -220,11 +227,23 @@
 			return (0);	/* couldn't sleep */
 		}
 #endif
+#ifdef __amd64__
+		hw_ibrs_active = 0;
+		hw_ssb_active = 0;
+		cpu_stdext_feature3 = 0;
+		CPU_FOREACH(i) {
+			pc = pcpu_find(i);
+			pc->pc_ibpb_set = 0;
+		}
+#endif
 
 		WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0));
 		WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0));
 
-#ifndef __amd64__
+#ifdef __amd64__
+		WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER) &
+		    ~(EFER_LMA));
+#else
 		WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4);
 #endif
 		WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb);
@@ -243,12 +262,21 @@
 			return (0);	/* couldn't sleep */
 		}
 
+		if (acpi_susp_bounce)
+			resumectx(pcb);
+
 		for (;;)
 			ia32_pause();
 	} else {
+		/*
+		 * Re-initialize console hardware as soon as possible.
+		 * No console output (e.g. printf) is allowed before
+		 * this point.
+		 */
+		cnresume();
 #ifdef __amd64__
 		fpuresume(susppcbs[0]->sp_fpususpend);
-#elif defined(DEV_NPX)
+#else
 		npxresume(susppcbs[0]->sp_fpususpend);
 #endif
 	}
@@ -267,10 +295,14 @@
 	if (!intr_enabled) {
 		/* Wakeup MD procedures in interrupt disabled context */
 		if (sleep_result == 1) {
+			ucode_reload();
 			pmap_init_pat();
 			initializecpu();
 			PCPU_SET(switchtime, 0);
 			PCPU_SET(switchticks, ticks);
+#ifdef DEV_APIC
+			lapic_xapic_mode();
+#endif
 #ifdef SMP
 			if (!CPU_EMPTY(&suspcpus))
 				acpi_wakeup_cpus(sc);
@@ -300,11 +332,12 @@
 }
 
 static void *
-acpi_alloc_wakeup_handler(void)
+acpi_alloc_wakeup_handler(void *wakepages[ACPI_WAKEPAGES])
 {
-	void		*wakeaddr;
 	int		i;
 
+	memset(wakepages, 0, ACPI_WAKEPAGES * sizeof(*wakepages));
+
 	/*
 	 * Specify the region for our wakeup code.  We want it in the low 1 MB
 	 * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA
@@ -312,18 +345,18 @@
 	 * and ROM area (0xa0000 and above).  The temporary page tables must be
 	 * page-aligned.
 	 */
-	wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF,
-	    M_WAITOK, 0x500, 0xa0000, PAGE_SIZE, 0ul);
-	if (wakeaddr == NULL) {
-		printf("%s: can't alloc wake memory\n", __func__);
-		return (NULL);
+	for (i = 0; i < ACPI_WAKEPAGES; i++) {
+		wakepages[i] = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT,
+		    0x500, 0xa0000, PAGE_SIZE, 0ul);
+		if (wakepages[i] == NULL) {
+			printf("%s: can't alloc wake memory\n", __func__);
+			goto freepages;
+		}
 	}
 	if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL,
 	    EVENTHANDLER_PRI_LAST) == NULL) {
 		printf("%s: can't register event handler\n", __func__);
-		contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE,
-		    M_DEVBUF);
-		return (NULL);
+		goto freepages;
 	}
 	susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK);
 	for (i = 0; i < mp_ncpus; i++) {
@@ -331,15 +364,23 @@
 		susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK);
 	}
 
-	return (wakeaddr);
+	return (wakepages);
+
+freepages:
+	for (i = 0; i < ACPI_WAKEPAGES; i++)
+		if (wakepages[i] != NULL)
+			contigfree(wakepages[i], PAGE_SIZE, M_DEVBUF);
+	return (NULL);
 }
 
 void
 acpi_install_wakeup_handler(struct acpi_softc *sc)
 {
-	static void	*wakeaddr = NULL;
+	static void	*wakeaddr;
+	void		*wakepages[ACPI_WAKEPAGES];
 #ifdef __amd64__
 	uint64_t	*pt4, *pt3, *pt2;
+	vm_paddr_t	pt4pa, pt3pa, pt2pa;
 	int		i;
 #endif
 
@@ -346,24 +387,33 @@
 	if (wakeaddr != NULL)
 		return;
 
-	wakeaddr = acpi_alloc_wakeup_handler();
-	if (wakeaddr == NULL)
+	if (acpi_alloc_wakeup_handler(wakepages) == NULL)
 		return;
 
+	wakeaddr = wakepages[0];
 	sc->acpi_wakeaddr = (vm_offset_t)wakeaddr;
 	sc->acpi_wakephys = vtophys(wakeaddr);
 
-	bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode));
+#ifdef __amd64__
+	pt4 = wakepages[1];
+	pt3 = wakepages[2];
+	pt2 = wakepages[3];
+	pt4pa = vtophys(pt4);
+	pt3pa = vtophys(pt3);
+	pt2pa = vtophys(pt2);
+#endif
 
+	bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode));
+
 	/* Patch GDT base address, ljmp targets. */
 	WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + bootgdt);
+	    sc->acpi_wakephys + bootgdt);
 	WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_32);
+	    sc->acpi_wakephys + wakeup_32);
 #ifdef __amd64__
 	WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_64);
-	WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys);
+	    sc->acpi_wakephys + wakeup_64);
+	WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa);
 #endif
 
 	/* Save pointers to some global data. */
@@ -375,12 +425,7 @@
 	WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir));
 #endif
 
-#else
-	/* Build temporary page tables below realmode code. */
-	pt4 = wakeaddr;
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t);
-
+#else /* __amd64__ */
 	/* Create the initial 1GB replicated page tables */
 	for (i = 0; i < 512; i++) {
 		/*
@@ -387,7 +432,7 @@
 		 * Each slot of the level 4 pages points
 		 * to the same level 3 page
 		 */
-		pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE);
+		pt4[i] = (uint64_t)pt3pa;
 		pt4[i] |= PG_V | PG_RW | PG_U;
 
 		/*
@@ -394,7 +439,7 @@
 		 * Each slot of the level 3 pages points
 		 * to the same level 2 page
 		 */
-		pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE));
+		pt3[i] = (uint64_t)pt2pa;
 		pt3[i] |= PG_V | PG_RW | PG_U;
 
 		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
@@ -401,7 +446,7 @@
 		pt2[i] = i * (2 * 1024 * 1024);
 		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
 	}
-#endif
+#endif /* !__amd64__ */
 
 	if (bootverbose)
 		device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n",

Modified: trunk/sys/x86/acpica/madt.c
===================================================================
--- trunk/sys/x86/acpica/madt.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/acpica/madt.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -26,12 +26,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/madt.c 288461 2015-10-01 20:54:19Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/madt.c 340016 2018-11-01 18:34:26Z jhb $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
@@ -39,7 +40,9 @@
 
 #include <x86/apicreg.h>
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
+#include <machine/md_var.h>
+#include <x86/vmware.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/aclocal.h>
@@ -59,7 +62,7 @@
 	u_int la_acpi_id;
 } lapics[MAX_APIC_ID + 1];
 
-static int madt_found_sci_override;
+int madt_found_sci_override;
 static ACPI_TABLE_MADT *madt;
 static vm_paddr_t madt_physaddr;
 static vm_offset_t madt_length;
@@ -104,7 +107,7 @@
 	madt_physaddr = acpi_find_table(ACPI_SIG_MADT);
 	if (madt_physaddr == 0)
 		return (ENXIO);
-	return (0);
+	return (-50);
 }
 
 /*
@@ -129,8 +132,86 @@
 static int
 madt_setup_local(void)
 {
+	ACPI_TABLE_DMAR *dmartbl;
+	vm_paddr_t dmartbl_physaddr;
+	const char *reason;
+	char *hw_vendor;
+	u_int p[4];
+	int user_x2apic;
+	bool bios_x2apic;
 
 	madt = pmap_mapbios(madt_physaddr, madt_length);
+	if ((cpu_feature2 & CPUID2_X2APIC) != 0) {
+		reason = NULL;
+
+		/*
+		 * Automatically detect several configurations where
+		 * x2APIC mode is known to cause troubles.  User can
+		 * override the setting with hw.x2apic_enable tunable.
+		 */
+		dmartbl_physaddr = acpi_find_table(ACPI_SIG_DMAR);
+		if (dmartbl_physaddr != 0) {
+			dmartbl = acpi_map_table(dmartbl_physaddr,
+			    ACPI_SIG_DMAR);
+			if ((dmartbl->Flags & ACPI_DMAR_X2APIC_OPT_OUT) != 0)
+				reason = "by DMAR table";
+			acpi_unmap_table(dmartbl);
+		}
+		if (vm_guest == VM_GUEST_VMWARE) {
+			vmware_hvcall(VMW_HVCMD_GETVCPU_INFO, p);
+			if ((p[0] & VMW_VCPUINFO_VCPU_RESERVED) != 0 ||
+			    (p[0] & VMW_VCPUINFO_LEGACY_X2APIC) == 0)
+				reason =
+				    "inside VMWare without intr redirection";
+		} else if (vm_guest == VM_GUEST_XEN) {
+			reason = "due to running under XEN";
+		} else if (vm_guest == VM_GUEST_NO &&
+		    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
+		    CPUID_TO_MODEL(cpu_id) == 0x2a) {
+			hw_vendor = kern_getenv("smbios.planar.maker");
+			/*
+			 * It seems that some Lenovo and ASUS
+			 * SandyBridge-based notebook BIOSes have a
+			 * bug which prevents booting AP in x2APIC
+			 * mode.  Since the only way to detect mobile
+			 * CPU is to check northbridge pci id, which
+			 * cannot be done that early, disable x2APIC
+			 * for all Lenovo and ASUS SandyBridge
+			 * machines.
+			 */
+			if (hw_vendor != NULL) {
+				if (!strcmp(hw_vendor, "LENOVO") ||
+				    !strcmp(hw_vendor,
+				    "ASUSTeK Computer Inc.")) {
+					reason =
+				    "for a suspected SandyBridge BIOS bug";
+				}
+				freeenv(hw_vendor);
+			}
+		}
+		bios_x2apic = lapic_is_x2apic();
+		if (reason != NULL && bios_x2apic) {
+			if (bootverbose)
+				printf("x2APIC should be disabled %s but "
+				    "already enabled by BIOS; enabling.\n",
+				     reason);
+			reason = NULL;
+		}
+		if (reason == NULL)
+			x2apic_mode = 1;
+		else if (bootverbose)
+			printf("x2APIC available but disabled %s\n", reason);
+		user_x2apic = x2apic_mode;
+		TUNABLE_INT_FETCH("hw.x2apic_enable", &user_x2apic);
+		if (user_x2apic != x2apic_mode) {
+			if (bios_x2apic && !user_x2apic)
+				printf("x2APIC disabled by tunable and "
+				    "enabled by BIOS; ignoring tunable.");
+			else
+				x2apic_mode = user_x2apic;
+		}
+	}
+
 	lapic_init(madt->Address);
 	printf("ACPI APIC Table: <%.*s %.*s>\n",
 	    (int)sizeof(madt->Header.OemId), madt->Header.OemId,
@@ -290,10 +371,6 @@
 			    apic->Id);
 		if (ioapics[apic->Id].io_apic != NULL)
 			panic("%s: Double APIC ID %u", __func__, apic->Id);
-		if (apic->GlobalIrqBase >= FIRST_MSI_INT) {
-			printf("MADT: Ignoring bogus I/O APIC ID %u", apic->Id);
-			break;
-		}
 		ioapics[apic->Id].io_apic = ioapic_create(apic->Address,
 		    apic->Id, apic->GlobalIrqBase);
 		ioapics[apic->Id].io_vector = apic->GlobalIrqBase;
@@ -396,41 +473,27 @@
 	return (0);
 }
 
-/*
- * Parse an interrupt source override for an ISA interrupt.
- */
-static void
-madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
+void
+madt_parse_interrupt_values(void *entry,
+    enum intr_trigger *trig, enum intr_polarity *pol)
 {
-	void *new_ioapic, *old_ioapic;
-	u_int new_pin, old_pin;
-	enum intr_trigger trig;
-	enum intr_polarity pol;
+	ACPI_MADT_INTERRUPT_OVERRIDE *intr;
 	char buf[64];
 
-	if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
-	    intr->GlobalIrq == 2) {
-		if (bootverbose)
-			printf("MADT: Skipping timer override\n");
-		return;
-	}
+	intr = entry;
+
 	if (bootverbose)
 		printf("MADT: Interrupt override: source %u, irq %u\n",
 		    intr->SourceIrq, intr->GlobalIrq);
 	KASSERT(intr->Bus == 0, ("bus for interrupt overrides must be zero"));
-	if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) {
-		printf("MADT: Could not find APIC for vector %u (IRQ %u)\n",
-		    intr->GlobalIrq, intr->SourceIrq);
-		return;
-	}
 
 	/*
 	 * Lookup the appropriate trigger and polarity modes for this
 	 * entry.
 	 */
-	trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq);
-	pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq);
-	
+	*trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq);
+	*pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq);
+
 	/*
 	 * If the SCI is identity mapped but has edge trigger and
 	 * active-hi polarity or the force_sci_lo tunable is set,
@@ -440,30 +503,57 @@
 		madt_found_sci_override = 1;
 		if (getenv_string("hw.acpi.sci.trigger", buf, sizeof(buf))) {
 			if (tolower(buf[0]) == 'e')
-				trig = INTR_TRIGGER_EDGE;
+				*trig = INTR_TRIGGER_EDGE;
 			else if (tolower(buf[0]) == 'l')
-				trig = INTR_TRIGGER_LEVEL;
+				*trig = INTR_TRIGGER_LEVEL;
 			else
 				panic(
 				"Invalid trigger %s: must be 'edge' or 'level'",
 				    buf);
 			printf("MADT: Forcing SCI to %s trigger\n",
-			    trig == INTR_TRIGGER_EDGE ? "edge" : "level");
+			    *trig == INTR_TRIGGER_EDGE ? "edge" : "level");
 		}
 		if (getenv_string("hw.acpi.sci.polarity", buf, sizeof(buf))) {
 			if (tolower(buf[0]) == 'h')
-				pol = INTR_POLARITY_HIGH;
+				*pol = INTR_POLARITY_HIGH;
 			else if (tolower(buf[0]) == 'l')
-				pol = INTR_POLARITY_LOW;
+				*pol = INTR_POLARITY_LOW;
 			else
 				panic(
 				"Invalid polarity %s: must be 'high' or 'low'",
 				    buf);
 			printf("MADT: Forcing SCI to active %s polarity\n",
-			    pol == INTR_POLARITY_HIGH ? "high" : "low");
+			    *pol == INTR_POLARITY_HIGH ? "high" : "low");
 		}
 	}
+}
 
+/*
+ * Parse an interrupt source override for an ISA interrupt.
+ */
+static void
+madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
+{
+	void *new_ioapic, *old_ioapic;
+	u_int new_pin, old_pin;
+	enum intr_trigger trig;
+	enum intr_polarity pol;
+
+	if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
+	    intr->GlobalIrq == 2) {
+		if (bootverbose)
+			printf("MADT: Skipping timer override\n");
+		return;
+	}
+
+	if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) {
+		printf("MADT: Could not find APIC for vector %u (IRQ %u)\n",
+		    intr->GlobalIrq, intr->SourceIrq);
+		return;
+	}
+
+	madt_parse_interrupt_values(intr, &trig, &pol);
+
 	/* Remap the IRQ if it is mapped to a different interrupt vector. */
 	if (intr->SourceIrq != intr->GlobalIrq) {
 		/*
@@ -510,7 +600,7 @@
 	if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS))
 		ioapic_set_triggermode(ioapic, pin,
 		    interrupt_trigger(nmi->IntiFlags, 0));
-	if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS))
+	if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS))
 		ioapic_set_polarity(ioapic, pin,
 		    interrupt_polarity(nmi->IntiFlags, 0));
 }
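
Operational note on the x2APIC hunk above: the hw.x2apic_enable override mentioned in its comment is a boot-time loader tunable, so forcing x2APIC off (or on, with "1") is a one-line /boot/loader.conf entry, for example:

	hw.x2apic_enable="0"

If the BIOS has already switched the local APIC into x2APIC mode, the tunable is ignored, as the printf in the hunk warns.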

Modified: trunk/sys/x86/acpica/srat.c
===================================================================
--- trunk/sys/x86/acpica/srat.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/acpica/srat.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -27,8 +27,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/srat.c 299485 2016-05-11 22:06:28Z vangyzen $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/srat.c 322996 2017-08-29 07:01:15Z mav $");
 
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
@@ -47,7 +49,7 @@
 #include <contrib/dev/acpica/include/actables.h>
 
 #include <machine/intr_machdep.h>
-#include <machine/apicvar.h>
+#include <x86/apicvar.h>
 
 #include <dev/acpica/acpivar.h>
 
@@ -64,11 +66,102 @@
 static ACPI_TABLE_SRAT *srat;
 static vm_paddr_t srat_physaddr;
 
-static int vm_domains[VM_PHYSSEG_MAX];
+static int domain_pxm[MAXMEMDOM];
+static int ndomain;
 
+static ACPI_TABLE_SLIT *slit;
+static vm_paddr_t slit_physaddr;
+static int vm_locality_table[MAXMEMDOM * MAXMEMDOM];
+
 static void	srat_walk_table(acpi_subtable_handler *handler, void *arg);
 
 /*
+ * SLIT parsing.
+ */
+
+static void
+slit_parse_table(ACPI_TABLE_SLIT *s)
+{
+	int i, j;
+	int i_domain, j_domain;
+	int offset = 0;
+	uint8_t e;
+
+	/*
+	 * This maps the SLIT data into the VM-domain centric view.
+	 * There may be sparse entries in the PXM namespace, so
+	 * remap them to a VM-domain ID and if it doesn't exist,
+	 * skip it.
+	 *
+	 * It should result in a packed 2d array of VM-domain
+	 * locality information entries.
+	 */
+
+	if (bootverbose)
+		printf("SLIT.Localities: %d\n", (int) s->LocalityCount);
+	for (i = 0; i < s->LocalityCount; i++) {
+		i_domain = acpi_map_pxm_to_vm_domainid(i);
+		if (i_domain < 0)
+			continue;
+
+		if (bootverbose)
+			printf("%d: ", i);
+		for (j = 0; j < s->LocalityCount; j++) {
+			j_domain = acpi_map_pxm_to_vm_domainid(j);
+			if (j_domain < 0)
+				continue;
+			e = s->Entry[i * s->LocalityCount + j];
+			if (bootverbose)
+				printf("%d ", (int) e);
+			/* 255 == "no locality information" */
+			if (e == 255)
+				vm_locality_table[offset] = -1;
+			else
+				vm_locality_table[offset] = e;
+			offset++;
+		}
+		if (bootverbose)
+			printf("\n");
+	}
+}
+
+/*
+ * Look for an ACPI System Locality Distance Information Table ("SLIT")
+ */
+static int
+parse_slit(void)
+{
+
+	if (resource_disabled("slit", 0)) {
+		return (-1);
+	}
+
+	slit_physaddr = acpi_find_table(ACPI_SIG_SLIT);
+	if (slit_physaddr == 0) {
+		return (-1);
+	}
+
+	/*
+	 * Make a pass over the table to populate the cpus[] and
+	 * mem_info[] tables.
+	 */
+	slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT);
+	slit_parse_table(slit);
+	acpi_unmap_table(slit);
+	slit = NULL;
+
+#ifdef VM_NUMA_ALLOC
+	/* Tell the VM about it! */
+	mem_locality = vm_locality_table;
+#endif
+	return (0);
+}
+
+/*
+ * SRAT parsing.
+ */
+
+/*
  * Returns true if a memory range overlaps with at least one range in
  * phys_avail[].
  */
@@ -78,7 +171,7 @@
 	int i;
 
 	for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) {
-		if (phys_avail[i + 1] < start)
+		if (phys_avail[i + 1] <= start)
 			continue;
 		if (phys_avail[i] < end)
 			return (1);
@@ -110,6 +203,12 @@
 			    "enabled" : "disabled");
 		if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED))
 			break;
+		if (cpu->ApicId > MAX_APIC_ID) {
+			printf("SRAT: Ignoring local APIC ID %u (too high)\n",
+			    cpu->ApicId);
+			break;
+		}
+
 		if (cpus[cpu->ApicId].enabled) {
 			printf("SRAT: Duplicate local APIC ID %u\n",
 			    cpu->ApicId);
@@ -128,6 +227,12 @@
 			    "enabled" : "disabled");
 		if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED))
 			break;
+		if (x2apic->ApicId > MAX_APIC_ID) {
+			printf("SRAT: Ignoring local APIC ID %u (too high)\n",
+			    x2apic->ApicId);
+			break;
+		}
+
 		KASSERT(!cpus[x2apic->ApicId].enabled,
 		    ("Duplicate local APIC ID %u", x2apic->ApicId));
 		cpus[x2apic->ApicId].domain = x2apic->ProximityDomain;
@@ -137,7 +242,7 @@
 		mem = (ACPI_SRAT_MEM_AFFINITY *)entry;
 		if (bootverbose)
 			printf(
-		    "SRAT: Found memory domain %d addr %jx len %jx: %s\n",
+		    "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n",
 			    mem->ProximityDomain, (uintmax_t)mem->BaseAddress,
 			    (uintmax_t)mem->Length,
 			    (mem->Flags & ACPI_SRAT_MEM_ENABLED) ?
@@ -146,7 +251,7 @@
 			break;
 		if (!overlaps_phys_avail(mem->BaseAddress,
 		    mem->BaseAddress + mem->Length)) {
-			printf("SRAT: Ignoring memory at addr %jx\n",
+			printf("SRAT: Ignoring memory at addr 0x%jx\n",
 			    (uintmax_t)mem->BaseAddress);
 			break;
 		}
@@ -243,7 +348,7 @@
 				address = mem_info[i].end + 1;
 		}
 	}
-	printf("SRAT: No memory region found for %jx - %jx\n",
+	printf("SRAT: No memory region found for 0x%jx - 0x%jx\n",
 	    (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]);
 	return (ENXIO);
 }
@@ -258,48 +363,47 @@
 	int i, j, slot;
 
 	/* Enumerate all the domains. */
-	vm_ndomains = 0;
+	ndomain = 0;
 	for (i = 0; i < num_mem; i++) {
 		/* See if this domain is already known. */
-		for (j = 0; j < vm_ndomains; j++) {
-			if (vm_domains[j] >= mem_info[i].domain)
+		for (j = 0; j < ndomain; j++) {
+			if (domain_pxm[j] >= mem_info[i].domain)
 				break;
 		}
-		if (j < vm_ndomains && vm_domains[j] == mem_info[i].domain)
+		if (j < ndomain && domain_pxm[j] == mem_info[i].domain)
 			continue;
 
-		/* Insert the new domain at slot 'j'. */
-		slot = j;
-		for (j = vm_ndomains; j > slot; j--)
-			vm_domains[j] = vm_domains[j - 1];
-		vm_domains[slot] = mem_info[i].domain;
-		vm_ndomains++;
-		if (vm_ndomains > MAXMEMDOM) {
-			vm_ndomains = 1;
+		if (ndomain >= MAXMEMDOM) {
+			ndomain = 1;
 			printf("SRAT: Too many memory domains\n");
 			return (EFBIG);
 		}
+
+		/* Insert the new domain at slot 'j'. */
+		slot = j;
+		for (j = ndomain; j > slot; j--)
+			domain_pxm[j] = domain_pxm[j - 1];
+		domain_pxm[slot] = mem_info[i].domain;
+		ndomain++;
 	}
 
-	/* Renumber each domain to its index in the sorted 'domains' list. */
-	for (i = 0; i < vm_ndomains; i++) {
+	/* Renumber each domain to its index in the sorted 'domain_pxm' list. */
+	for (i = 0; i < ndomain; i++) {
 		/*
 		 * If the domain is already the right value, no need
 		 * to renumber.
 		 */
-		if (vm_domains[i] == i)
+		if (domain_pxm[i] == i)
 			continue;
 
 		/* Walk the cpu[] and mem_info[] arrays to renumber. */
 		for (j = 0; j < num_mem; j++)
-			if (mem_info[j].domain == vm_domains[i])
+			if (mem_info[j].domain == domain_pxm[i])
 				mem_info[j].domain = i;
 		for (j = 0; j <= MAX_APIC_ID; j++)
-			if (cpus[j].enabled && cpus[j].domain == vm_domains[i])
+			if (cpus[j].enabled && cpus[j].domain == domain_pxm[i])
 				cpus[j].domain = i;
 	}
-	KASSERT(vm_ndomains > 0,
-	    ("renumber_domains: invalid final vm_ndomains setup"));
 
 	return (0);
 }
@@ -307,17 +411,17 @@
 /*
  * Look for an ACPI System Resource Affinity Table ("SRAT")
  */
-static void
-parse_srat(void *dummy)
+static int
+parse_srat(void)
 {
 	int error;
 
 	if (resource_disabled("srat", 0))
-		return;
+		return (-1);
 
 	srat_physaddr = acpi_find_table(ACPI_SIG_SRAT);
 	if (srat_physaddr == 0)
-		return;
+		return (-1);
 
 	/*
 	 * Make a pass over the table to populate the cpus[] and
@@ -331,15 +435,44 @@
 	if (error || check_domains() != 0 || check_phys_avail() != 0 ||
 	    renumber_domains() != 0) {
 		srat_physaddr = 0;
-		return;
+		return (-1);
 	}
 
+#ifdef VM_NUMA_ALLOC
 	/* Point vm_phys at our memory affinity table. */
+	vm_ndomains = ndomain;
 	mem_affinity = mem_info;
+#endif
+
+	return (0);
 }
-SYSINIT(parse_srat, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_srat, NULL);
 
 static void
+init_mem_locality(void)
+{
+	int i;
+
+	/*
+	 * For now, assume -1 == "no locality information for
+	 * this pairing".
+	 */
+	for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++)
+		vm_locality_table[i] = -1;
+}
+
+static void
+parse_acpi_tables(void *dummy)
+{
+
+	if (parse_srat() < 0)
+		return;
+	init_mem_locality();
+	(void) parse_slit();
+}
+SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables,
+    NULL);
+
+static void
 srat_walk_table(acpi_subtable_handler *handler, void *arg)
 {
 
@@ -348,7 +481,7 @@
 }
 
 /*
- * Setup per-CPU ACPI IDs.
+ * Setup per-CPU domain IDs.
  */
 static void
 srat_set_cpus(void *dummy)
@@ -369,6 +502,7 @@
 			panic("SRAT: CPU with APIC ID %u is not known",
 			    pc->pc_apic_id);
 		pc->pc_domain = cpu->domain;
+		CPU_SET(i, &cpuset_domain[cpu->domain]);
 		if (bootverbose)
 			printf("SRAT: CPU %u has memory domain %d\n", i,
 			    cpu->domain);
@@ -386,8 +520,8 @@
 {
 	int i;
 
-	for (i = 0; i < vm_ndomains; i++) {
-		if (vm_domains[i] == pxm)
+	for (i = 0; i < ndomain; i++) {
+		if (domain_pxm[i] == pxm)
 			return (i);
 	}
 
@@ -394,4 +528,13 @@
 	return (-1);
 }
 
+#else /* MAXMEMDOM == 1 */
+
+int
+acpi_map_pxm_to_vm_domainid(int pxm)
+{
+
+	return (-1);
+}
+
 #endif /* MAXMEMDOM > 1 */
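
The renumber_domains() rework above drops the vm_domains[] globals in
favour of a local domain_pxm[] array, but the algorithm is unchanged:
insert each distinct ACPI proximity-domain ID into a sorted array,
give up once MAXMEMDOM is exceeded, and then renumber every cpu and
memory entry to the ID's index in that array, which is what
acpi_map_pxm_to_vm_domainid() searches later.  A minimal userland
sketch of the sorted-insert step, with made-up names and a toy limit
standing in for MAXMEMDOM:

#include <stdio.h>

#define TOY_MAXDOM	8		/* toy stand-in for MAXMEMDOM */

/* Insert pxm into dom[] keeping it sorted; return the new count or -1. */
static int
collect_domain(int dom[], int ndom, int pxm)
{
	int i, j;

	for (i = 0; i < ndom; i++)
		if (dom[i] >= pxm)
			break;
	if (i < ndom && dom[i] == pxm)
		return (ndom);		/* already known */
	if (ndom >= TOY_MAXDOM)
		return (-1);		/* too many domains */
	for (j = ndom; j > i; j--)	/* shift up, insert at slot i */
		dom[j] = dom[j - 1];
	dom[i] = pxm;
	return (ndom + 1);
}

int
main(void)
{
	int dom[TOY_MAXDOM], sample[] = { 3, 1, 3, 0, 1 };
	int i, ndom = 0;

	for (i = 0; i < 5; i++)
		ndom = collect_domain(dom, ndom, sample[i]);
	for (i = 0; i < ndom; i++)	/* prints PXMs 0, 1, 3 in order */
		printf("VM domain %d <- PXM %d\n", i, dom[i]);
	return (0);
}

After the insertion pass, domain_pxm[i] holds the proximity-domain ID
that VM domain i stands for, so the renumbering loop only has to map
each recorded domain value back to its array index.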

Modified: trunk/sys/x86/bios/smbios.c
===================================================================
--- trunk/sys/x86/bios/smbios.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/bios/smbios.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/bios/smbios.c 241073 2012-09-30 15:42:20Z kevlo $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/bios/smbios.c 241073 2012-09-30 15:42:20Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/x86/bios/vpd.c
===================================================================
--- trunk/sys/x86/bios/vpd.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/bios/vpd.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/bios/vpd.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/bios/vpd.c 227309 2011-11-07 15:43:11Z ed $");
 
 /*
  * VPD decoder for IBM systems (Thinkpads)

Modified: trunk/sys/x86/cpufreq/est.c
===================================================================
--- trunk/sys/x86/cpufreq/est.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/cpufreq/est.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/est.c 260473 2014-01-09 10:44:27Z mav $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/est.c 259197 2013-12-10 20:25:43Z mav $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/x86/cpufreq/hwpstate.c
===================================================================
--- trunk/sys/x86/cpufreq/hwpstate.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/cpufreq/hwpstate.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -45,7 +45,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/hwpstate.c 326638 2017-12-06 21:40:24Z jkim $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/hwpstate.c 326637 2017-12-06 21:39:01Z jkim $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/x86/cpufreq/p4tcc.c
===================================================================
--- trunk/sys/x86/cpufreq/p4tcc.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/cpufreq/p4tcc.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/p4tcc.c 250487 2013-05-10 22:43:27Z hiren $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/p4tcc.c 250487 2013-05-10 22:43:27Z hiren $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/x86/cpufreq/powernow.c
===================================================================
--- trunk/sys/x86/cpufreq/powernow.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/cpufreq/powernow.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/powernow.c 305615 2016-09-08 15:06:28Z pfg $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/powernow.c 305614 2016-09-08 15:05:25Z pfg $");
 
 #include <sys/param.h>
 #include <sys/bus.h>

Modified: trunk/sys/x86/cpufreq/smist.c
===================================================================
--- trunk/sys/x86/cpufreq/smist.c	2020-02-08 19:33:27 UTC (rev 12311)
+++ trunk/sys/x86/cpufreq/smist.c	2020-02-08 19:34:34 UTC (rev 12312)
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/x86/cpufreq/smist.c 187597 2009-01-22 20:29:07Z jkim $");
+__FBSDID("$FreeBSD: stable/11/sys/x86/cpufreq/smist.c 297793 2016-04-10 23:07:00Z pfg $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -224,7 +224,7 @@
 		bus_dma_tag_destroy(tag);
 		device_printf(dev, "can't load mem\n");
 		return (ENXIO);
-	};
+	}
 	DPRINT(dev, "taking ownership over BIOS return %d\n", cb_data.result);
 	bus_dmamap_unload(tag, map);
 	bus_dmamem_free(tag, cb_data.buf, map);


From laffer1 at midnightbsd.org  Sat Feb  8 14:35:05 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:35:05 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12313] trunk/sys/xdr: sync with FreeBSD
 11-stable
Message-ID: <202002081935.018JZ5Fa062440@stargazer.midnightbsd.org>

Revision: 12313
          http://svnweb.midnightbsd.org/src/?rev=12313
Author:   laffer1
Date:     2020-02-08 14:35:04 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/xdr/xdr.c
    trunk/sys/xdr/xdr_array.c
    trunk/sys/xdr/xdr_mbuf.c
    trunk/sys/xdr/xdr_mem.c
    trunk/sys/xdr/xdr_reference.c
    trunk/sys/xdr/xdr_sizeof.c

Modified: trunk/sys/xdr/xdr.c
===================================================================
--- trunk/sys/xdr/xdr.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*	$NetBSD: xdr.c,v 1.22 2000/07/06 03:10:35 christos Exp $	*/
 
 /*
@@ -34,7 +35,7 @@
 static char *sccsid = "@(#)xdr.c	2.1 88/07/29 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr.c 319614 2017-06-06 07:21:33Z delphij $");
 
 /*
  * xdr.c, Generic XDR routines implementation.

Modified: trunk/sys/xdr/xdr_array.c
===================================================================
--- trunk/sys/xdr/xdr_array.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr_array.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*	$NetBSD: xdr_array.c,v 1.12 2000/01/22 22:19:18 mycroft Exp $	*/
 
 /*
@@ -34,7 +35,7 @@
 static char *sccsid = "@(#)xdr_array.c	2.1 88/07/29 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_array.c 177633 2008-03-26 15:23:12Z dfr $");
 
 /*
  * xdr_array.c, Generic XDR routines implementation.

Modified: trunk/sys/xdr/xdr_mbuf.c
===================================================================
--- trunk/sys/xdr/xdr_mbuf.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr_mbuf.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
  * Authors: Doug Rabson <dfr at rabson.org>
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_mbuf.c 248318 2013-03-15 10:21:18Z glebius $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/xdr/xdr_mem.c
===================================================================
--- trunk/sys/xdr/xdr_mem.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr_mem.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*	$NetBSD: xdr_mem.c,v 1.15 2000/01/22 22:19:18 mycroft Exp $	*/
 
 /*
@@ -34,7 +35,7 @@
 static char *sccsid = "@(#)xdr_mem.c	2.1 88/07/29 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_mem.c 297975 2016-04-14 17:06:37Z pfg $");
 
 /*
  * xdr_mem.h, XDR implementation using memory buffers.

Modified: trunk/sys/xdr/xdr_reference.c
===================================================================
--- trunk/sys/xdr/xdr_reference.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr_reference.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*	$NetBSD: xdr_reference.c,v 1.13 2000/01/22 22:19:18 mycroft Exp	$ */
 
 /*
@@ -34,7 +35,7 @@
 static char *sccsid = "@(#)xdr_reference.c	2.1 88/07/29 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_reference.c 177633 2008-03-26 15:23:12Z dfr $");
 
 /*
  * xdr_reference.c, Generic XDR routines implementation.

Modified: trunk/sys/xdr/xdr_sizeof.c
===================================================================
--- trunk/sys/xdr/xdr_sizeof.c	2020-02-08 19:34:34 UTC (rev 12312)
+++ trunk/sys/xdr/xdr_sizeof.c	2020-02-08 19:35:04 UTC (rev 12313)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*
  * Sun RPC is a product of Sun Microsystems, Inc. and is provided for
  * unrestricted use provided that this legend is included on all tape
@@ -36,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/11/sys/xdr/xdr_sizeof.c 177633 2008-03-26 15:23:12Z dfr $");
 
 #include <sys/param.h>
 #include <sys/systm.h>


From laffer1 at midnightbsd.org  Sat Feb  8 14:35:49 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:35:49 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12314] trunk/sys/vm: sync with FreeBSD
 11-stable
Message-ID: <202002081935.018JZnSh062792@stargazer.midnightbsd.org>

Revision: 12314
          http://svnweb.midnightbsd.org/src/?rev=12314
Author:   laffer1
Date:     2020-02-08 14:35:48 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/vm/_vm_radix.h
    trunk/sys/vm/default_pager.c
    trunk/sys/vm/device_pager.c
    trunk/sys/vm/memguard.c
    trunk/sys/vm/memguard.h
    trunk/sys/vm/phys_pager.c
    trunk/sys/vm/pmap.h
    trunk/sys/vm/redzone.c
    trunk/sys/vm/redzone.h
    trunk/sys/vm/sg_pager.c
    trunk/sys/vm/swap_pager.c
    trunk/sys/vm/swap_pager.h
    trunk/sys/vm/uma.h
    trunk/sys/vm/uma_core.c
    trunk/sys/vm/uma_dbg.c
    trunk/sys/vm/uma_dbg.h
    trunk/sys/vm/uma_int.h
    trunk/sys/vm/vm.h
    trunk/sys/vm/vm_extern.h
    trunk/sys/vm/vm_fault.c
    trunk/sys/vm/vm_glue.c
    trunk/sys/vm/vm_init.c
    trunk/sys/vm/vm_kern.c
    trunk/sys/vm/vm_kern.h
    trunk/sys/vm/vm_map.c
    trunk/sys/vm/vm_map.h
    trunk/sys/vm/vm_meter.c
    trunk/sys/vm/vm_mmap.c
    trunk/sys/vm/vm_object.c
    trunk/sys/vm/vm_object.h
    trunk/sys/vm/vm_page.c
    trunk/sys/vm/vm_page.h
    trunk/sys/vm/vm_pageout.c
    trunk/sys/vm/vm_pageout.h
    trunk/sys/vm/vm_pager.c
    trunk/sys/vm/vm_pager.h
    trunk/sys/vm/vm_param.h
    trunk/sys/vm/vm_phys.c
    trunk/sys/vm/vm_phys.h
    trunk/sys/vm/vm_radix.c
    trunk/sys/vm/vm_radix.h
    trunk/sys/vm/vm_reserv.c
    trunk/sys/vm/vm_reserv.h
    trunk/sys/vm/vm_unix.c
    trunk/sys/vm/vm_zeroidle.c
    trunk/sys/vm/vnode_pager.c
    trunk/sys/vm/vnode_pager.h

Added Paths:
-----------
    trunk/sys/vm/vm_domain.c
    trunk/sys/vm/vm_domain.h
    trunk/sys/vm/vm_swapout.c
    trunk/sys/vm/vm_swapout_dummy.c

Modified: trunk/sys/vm/_vm_radix.h
===================================================================
--- trunk/sys/vm/_vm_radix.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/_vm_radix.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $
+ * $FreeBSD: stable/11/sys/vm/_vm_radix.h 321513 2017-07-26 06:52:45Z kib $
  */
 
 #ifndef __VM_RADIX_H_
@@ -37,20 +37,6 @@
  */
 struct vm_radix {
 	uintptr_t	rt_root;
-	uint8_t		rt_flags;
 };
 
-#define	RT_INSERT_INPROG	0x01
-#define	RT_TRIE_MODIFIED	0x02
-
-#ifdef _KERNEL
-
-static __inline boolean_t
-vm_radix_is_empty(struct vm_radix *rtree)
-{
-
-	return (rtree->rt_root == 0);
-}
-
-#endif /* _KERNEL */
 #endif /* !__VM_RADIX_H_ */

Modified: trunk/sys/vm/default_pager.c
===================================================================
--- trunk/sys/vm/default_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/default_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,18 +28,10 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * The default pager is responsible for supplying backing store to unbacked
- * storage.  The backing store is usually swap so we just fall through to
- * the swap routines.  However, since swap metadata has not been assigned,
- * the swap routines assign and manage the swap backing store through the
- * vm_page->swapblk field.  The object is only converted when the page is 
- * physically freed after having been cleaned and even then vm_page->swapblk
- * is maintained whenever a resident page also has swap backing store.
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/default_pager.c 315473 2017-03-18 05:38:10Z alc $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -54,14 +46,16 @@
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
-static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
-    vm_ooffset_t, struct ucred *);
-static void default_pager_dealloc(vm_object_t);
-static int default_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void default_pager_putpages(vm_object_t, vm_page_t *, int, 
-		boolean_t, int *);
-static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *, 
-		int *);
+static vm_object_t	default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
+			    vm_ooffset_t, struct ucred *);
+static void		default_pager_dealloc(vm_object_t);
+static int		default_pager_getpages(vm_object_t, vm_page_t *, int,
+			    int *, int *);
+static void		default_pager_putpages(vm_object_t, vm_page_t *, int, 
+			    boolean_t, int *);
+static boolean_t	default_pager_haspage(vm_object_t, vm_pindex_t, int *, 
+			    int *);
+
 /*
  * pagerops for OBJT_DEFAULT - "default pager".
  *
@@ -84,7 +78,7 @@
 };
 
 /*
- * no_pager_alloc just returns an initialized object.
+ * Return an initialized object.
  */
 static vm_object_t
 default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -102,51 +96,41 @@
 	object = vm_object_allocate(OBJT_DEFAULT,
 	    OFF_TO_IDX(round_page(offset + size)));
 	if (cred != NULL) {
-		VM_OBJECT_WLOCK(object);
 		object->cred = cred;
 		object->charge = size;
-		VM_OBJECT_WUNLOCK(object);
 	}
 	return (object);
 }
 
 /*
- * deallocate resources associated with default objects.   The default objects
- * have no special resources allocated to them, but the vm_page's being used
- * in this object might.  Still, we do not have to do anything - we will free
- * the swapblk in the underlying vm_page's when we free the vm_page or
- * garbage collect the vm_page cache list.
+ * Deallocate resources associated with the object.
  */
 static void
-default_pager_dealloc(object)
-	vm_object_t object;
+default_pager_dealloc(vm_object_t object)
 {
-	/*
-	 * OBJT_DEFAULT objects have no special resources allocated to them.
-	 */
+
+	/* Reserved swap is released by vm_object_destroy(). */
 	object->type = OBJT_DEAD;
 }
 
 /*
- * Load pages from backing store.  Since OBJT_DEFAULT is converted to
- * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never
- * see a vm_page with assigned swap here.
+ * Load pages from backing store.
  */
 static int
-default_pager_getpages(object, m, count, reqpage)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	int reqpage;
+default_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead)
 {
-	return VM_PAGER_FAIL;
+
+	/*
+	 * Since an OBJT_DEFAULT object is converted to OBJT_SWAP by the first
+	 * call to the putpages method, this function will never be called on
+	 * a vm_page with assigned swap.
+	 */
+	return (VM_PAGER_FAIL);
 }
 
 /*
- * Store pages to backing store.  We should assign swap and initiate
- * I/O.  We do not actually convert the object to OBJT_SWAP here.  The
- * object will be converted when the written-out vm_page_t is moved from the
- * cache to the free list.
+ * Store pages to backing store.
  */
 static void
 default_pager_putpages(vm_object_t object, vm_page_t *m, int count,
@@ -153,28 +137,20 @@
     int flags, int *rtvals)
 {
 
+	/* The swap pager will convert the object to OBJT_SWAP. */
 	swappagerops.pgo_putpages(object, m, count, flags, rtvals);
 }
 
 /*
- * Tell us whether the backing store for the requested (object,index) is
- * synchronized.  i.e. tell us whether we can throw the page away and 
- * reload it later.  So, for example, if we are in the process of writing
- * the page to its backing store, or if no backing store has been assigned,
- * it is not yet synchronized.
- *
- * It is possible to have fully-synchronized swap assigned without the
- * object having been converted.  We just call swap_pager_haspage() to
- * deal with it since it must already deal with it plus deal with swap
- * meta-data structures.
+ * Tell us whether the requested (object,index) is available from the object's
+ * backing store.
  */
 static boolean_t
-default_pager_haspage(object, pindex, before, after)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *before;
-	int *after;
+default_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
-	return FALSE;
+
+	/* An OBJT_DEFAULT object has no backing store. */
+	return (FALSE);
 }
 

Modified: trunk/sys/vm/device_pager.c
===================================================================
--- trunk/sys/vm/device_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/device_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/device_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -47,6 +47,7 @@
 #include <sys/mman.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -60,10 +61,12 @@
 static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void dev_pager_dealloc(vm_object_t);
-static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static void dev_pager_free_page(vm_object_t object, vm_page_t m);
+static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx,
+    int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last);
 
 /* list of device pager objects */
 static struct pagerlst dev_pager_object_list;
@@ -85,6 +88,7 @@
 	.pgo_getpages =	dev_pager_getpages,
 	.pgo_putpages =	dev_pager_putpages,
 	.pgo_haspage =	dev_pager_haspage,
+	.pgo_populate =	dev_pager_populate,
 };
 
 static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -128,6 +132,8 @@
 
 	if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE)
 		return (NULL);
+	KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL,
+	    ("populate on unmanaged device pager"));
 
 	/*
 	 * Offset should be page aligned.
@@ -135,8 +141,18 @@
 	if (foff & PAGE_MASK)
 		return (NULL);
 
+	/*
+	 * Treat the mmap(2) file offset as an unsigned value for a
+	 * device mapping.  This, in effect, allows a user to pass all
+	 * possible off_t values as the mapping cookie to the driver.  At
+	 * this point, we know that both foff and size are a multiple
+	 * of the page size.  Do a check to avoid wrap.
+	 */
 	size = round_page(size);
-	pindex = OFF_TO_IDX(foff + size);
+	pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+	if (pindex > OBJ_MAX_SIZE || pindex < UOFF_TO_IDX(foff) ||
+	    pindex < UOFF_TO_IDX(size))
+		return (NULL);
 
 	if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
 		return (NULL);
@@ -169,6 +185,11 @@
 			 */
 			if (pindex > object->size)
 				object->size = pindex;
+			KASSERT(object->type == tp,
+			    ("Inconsistent device pager type %p %d",
+			    object, tp));
+			KASSERT(object->un_pager.devp.ops == ops,
+			    ("Inconsistent devops %p %p", object, ops));
 		} else {
 			object = object1;
 			object1 = NULL;
@@ -175,12 +196,14 @@
 			object->handle = handle;
 			TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
 			    pager_object_list);
-			KASSERT(object->type == tp,
-		("Inconsistent device pager type %p %d", object, tp));
+			if (ops->cdev_pg_populate != NULL)
+				vm_object_set_flag(object, OBJ_POPULATE);
 		}
 	} else {
 		if (pindex > object->size)
 			object->size = pindex;
+		KASSERT(object->type == tp,
+		    ("Inconsistent device pager type %p %d", object, tp));
 	}
 	mtx_unlock(&dev_pager_mtx);
 	if (object1 != NULL) {
@@ -256,34 +279,35 @@
 }
 
 static int
-dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int reqpage)
+dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
-	int error, i;
+	int error;
 
+	/* Since our haspage reports zero after/before, the count is 1. */
+	KASSERT(count == 1, ("%s: count %d", __func__, count));
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->un_pager.devp.ops->cdev_pg_fault == NULL)
+		return (VM_PAGER_FAIL);
 	error = object->un_pager.devp.ops->cdev_pg_fault(object,
-	    IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]);
+	    IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]);
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
-	for (i = 0; i < count; i++) {
-		if (i != reqpage) {
-			vm_page_lock(ma[i]);
-			vm_page_free(ma[i]);
-			vm_page_unlock(ma[i]);
-		}
-	}
-
 	if (error == VM_PAGER_OK) {
 		KASSERT((object->type == OBJT_DEVICE &&
-		     (ma[reqpage]->oflags & VPO_UNMANAGED) != 0) ||
+		     (ma[0]->oflags & VPO_UNMANAGED) != 0) ||
 		    (object->type == OBJT_MGTDEVICE &&
-		     (ma[reqpage]->oflags & VPO_UNMANAGED) == 0),
-		    ("Wrong page type %p %p", ma[reqpage], object));
+		     (ma[0]->oflags & VPO_UNMANAGED) == 0),
+		    ("Wrong page type %p %p", ma[0], object));
 		if (object->type == OBJT_DEVICE) {
 			TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
-			    ma[reqpage], plinks.q);
+			    ma[0], plinks.q);
 		}
+		if (rbehind)
+			*rbehind = 0;
+		if (rahead)
+			*rahead = 0;
 	}
 
 	return (error);
@@ -290,6 +314,18 @@
 }
 
 static int
+dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->un_pager.devp.ops->cdev_pg_populate == NULL)
+		return (VM_PAGER_FAIL);
+	return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx,
+	    fault_type, max_prot, first, last));
+}
+
+static int
 old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
@@ -355,8 +391,7 @@
 		 */
 		page = vm_page_getfake(paddr, memattr);
 		VM_OBJECT_WLOCK(object);
-		if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
-			panic("old_dev_pager_fault: invalid page replacement");
+		vm_page_replace_checked(page, object, (*mres)->pindex, *mres);
 		vm_page_lock(*mres);
 		vm_page_free(*mres);
 		vm_page_unlock(*mres);
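
The dev_pager_alloc() hunk above starts treating the mmap(2) offset as
an unsigned cookie and rejects any request whose page-index sum wraps
(sg_pager.c below gets the same check).  A standalone sketch of that
wrap test, using made-up macro names and a deliberately narrow 32-bit
index type so the overflow is easy to demonstrate:

#include <stdint.h>
#include <stdio.h>

#define	TOY_PAGE_SHIFT	12		/* assumed 4 KB pages */
#define	TOY_OFF_TO_IDX(off)	((uint32_t)((uint64_t)(off) >> TOY_PAGE_SHIFT))

/* Compute the page index past the mapping; fail if the sum wrapped. */
static int
toy_dev_pindex(uint64_t foff, uint64_t size, uint32_t *pindexp)
{
	uint32_t pindex;

	pindex = TOY_OFF_TO_IDX(foff) + TOY_OFF_TO_IDX(size);
	if (pindex < TOY_OFF_TO_IDX(foff) || pindex < TOY_OFF_TO_IDX(size))
		return (-1);		/* page index wrapped */
	*pindexp = pindex;
	return (0);
}

int
main(void)
{
	uint32_t pindex;

	/* A ~16 TB cookie offset plus a 1 GB mapping wraps a 32-bit index. */
	if (toy_dev_pindex(0xFFFFFFFFULL << TOY_PAGE_SHIFT, 1ULL << 30,
	    &pindex) != 0)
		printf("rejected: page index would wrap\n");
	return (0);
}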

Modified: trunk/sys/vm/memguard.c
===================================================================
--- trunk/sys/vm/memguard.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/memguard.c 331017 2018-03-15 19:08:33Z kevans $");
 
 /*
  * MemGuard is a simple replacement allocator for debugging only
@@ -50,6 +50,7 @@
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
@@ -68,9 +69,9 @@
  * reserved for MemGuard.
  */
 static u_int vm_memguard_divisor;
-SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &vm_memguard_divisor,
-    0, "(kmem_size/memguard_divisor) == memguard submap size");     
+    0, "(kmem_size/memguard_divisor) == memguard submap size");
 
 /*
  * Short description (ks_shortdesc) of memory type to monitor.
@@ -131,8 +132,7 @@
 #define MG_GUARD_ALLLARGE	0x002
 #define MG_GUARD_NOFREE		0x004
 static int memguard_options = MG_GUARD_AROUND;
-TUNABLE_INT("vm.memguard.options", &memguard_options);
-SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW,
+SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RWTUN,
     &memguard_options, 0,
     "MemGuard options:\n"
     "\t0x001 - add guard pages around each allocation\n"
@@ -148,8 +148,7 @@
 
 static u_int memguard_frequency;
 static u_long memguard_frequency_hits;
-TUNABLE_INT("vm.memguard.frequency", &memguard_frequency);
-SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RW,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RWTUN,
     &memguard_frequency, 0, "Times in 100000 that MemGuard will randomly run");
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD,
     &memguard_frequency_hits, 0, "# times MemGuard randomly chose");
@@ -165,6 +164,7 @@
 	u_long mem_pgs, parent_size;
 
 	vm_memguard_divisor = 10;
+	/* CTLFLAG_RDTUN doesn't work during the early boot process. */
 	TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor);
 
 	parent_size = vm_map_max(parent_map) - vm_map_min(parent_map) +
@@ -180,7 +180,7 @@
 	 * This prevents memguard's page promotions from completely
 	 * using up memory, since most malloc(9) calls are sub-page.
 	 */
-	mem_pgs = cnt.v_page_count;
+	mem_pgs = vm_cnt.v_page_count;
 	memguard_physlimit = (mem_pgs / vm_memguard_divisor) * PAGE_SIZE;
 	/*
 	 * We want as much KVA as we can take safely.  Use at most our

Modified: trunk/sys/vm/memguard.h
===================================================================
--- trunk/sys/vm/memguard.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
+ * $FreeBSD: stable/11/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
  */
 
 #ifndef _VM_MEMGUARD_H_

Modified: trunk/sys/vm/phys_pager.c
===================================================================
--- trunk/sys/vm/phys_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/phys_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/phys_pager.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,6 +42,7 @@
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 
 /* list of phys pager objects */
@@ -99,6 +100,7 @@
 				object = object1;
 				object1 = NULL;
 				object->handle = handle;
+				vm_object_set_flag(object, OBJ_POPULATE);
 				TAILQ_INSERT_TAIL(&phys_pager_object_list,
 				    object, pager_object_list);
 			}
@@ -110,6 +112,7 @@
 		vm_object_deallocate(object1);
 	} else {
 		object = vm_object_allocate(OBJT_PHYS, pindex);
+		vm_object_set_flag(object, OBJ_POPULATE);
 	}
 
 	return (object);
@@ -134,7 +137,8 @@
  * Fill as many pages as vm_fault has allocated for us.
  */
 static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
 	int i;
 
@@ -149,35 +153,98 @@
 		    ("phys_pager_getpages: partially valid page %p", m[i]));
 		KASSERT(m[i]->dirty == 0,
 		    ("phys_pager_getpages: dirty page %p", m[i]));
-		/* The requested page must remain busy, the others not. */
-		if (i == reqpage) {
-			vm_page_lock(m[i]);
-			vm_page_flash(m[i]);
-			vm_page_unlock(m[i]);
-		} else
-			vm_page_xunbusy(m[i]);
 	}
+	if (rbehind)
+		*rbehind = 0;
+	if (rahead)
+		*rahead = 0;
 	return (VM_PAGER_OK);
 }
 
-static void
-phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
-    int *rtvals)
-{
-
-	panic("phys_pager_putpage called");
-}
-
 /*
  * Implement a pretty aggressive clustered getpages strategy.  Hint that
  * everything in an entire 4MB window should be prefaulted at once.
  *
- * XXX 4MB (1024 slots per page table page) is convenient for x86,
+ * 4MB (1024 slots per page table page) is convenient for x86,
  * but may not be for other arches.
  */
 #ifndef PHYSCLUSTER
 #define PHYSCLUSTER 1024
 #endif
+static int phys_pager_cluster = PHYSCLUSTER;
+SYSCTL_INT(_vm, OID_AUTO, phys_pager_cluster, CTLFLAG_RWTUN,
+    &phys_pager_cluster, 0,
+    "prefault window size for phys pager");
+
+/*
+ * Max hint to vm_page_alloc() about the further allocation needs
+ * inside the phys_pager_populate() loop.  The number of bits used to
+ * implement VM_ALLOC_COUNT() determines the hard limit on this value.
+ * That limit is currently 65535.
+ */
+#define	PHYSALLOC	16
+
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+    int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
+    vm_pindex_t *last)
+{
+	vm_page_t m;
+	vm_pindex_t base, end, i;
+	int ahead;
+
+	base = rounddown(pidx, phys_pager_cluster);
+	end = base + phys_pager_cluster - 1;
+	if (end >= object->size)
+		end = object->size - 1;
+	if (*first > base)
+		base = *first;
+	if (end > *last)
+		end = *last;
+	*first = base;
+	*last = end;
+
+	for (i = base; i <= end; i++) {
+retry:
+		m = vm_page_lookup(object, i);
+		if (m == NULL) {
+			ahead = MIN(end - i, PHYSALLOC);
+			m = vm_page_alloc(object, i, VM_ALLOC_NORMAL |
+			    VM_ALLOC_ZERO | VM_ALLOC_WAITFAIL |
+			    VM_ALLOC_COUNT(ahead));
+			if (m == NULL)
+				goto retry;
+			if ((m->flags & PG_ZERO) == 0)
+				pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+		} else if (vm_page_xbusied(m)) {
+			vm_page_lock(m);
+			VM_OBJECT_WUNLOCK(object);
+			vm_page_busy_sleep(m, "physb", true);
+			VM_OBJECT_WLOCK(object);
+			goto retry;
+		} else {
+			vm_page_xbusy(m);
+			if (m->valid != VM_PAGE_BITS_ALL)
+				vm_page_zero_invalid(m, TRUE);
+		}
+
+		KASSERT(m->valid == VM_PAGE_BITS_ALL,
+		    ("phys_pager_populate: partially valid page %p", m));
+		KASSERT(m->dirty == 0,
+		    ("phys_pager_populate: dirty page %p", m));
+	}
+	return (VM_PAGER_OK);
+}
+
+static void
+phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
+    int *rtvals)
+{
+
+	panic("phys_pager_putpage called");
+}
+
 static boolean_t
 phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
@@ -184,8 +251,8 @@
 {
 	vm_pindex_t base, end;
 
-	base = pindex & (~(PHYSCLUSTER - 1));
-	end = base + (PHYSCLUSTER - 1);
+	base = rounddown(pindex, phys_pager_cluster);
+	end = base + phys_pager_cluster - 1;
 	if (before != NULL)
 		*before = pindex - base;
 	if (after != NULL)
@@ -200,4 +267,5 @@
 	.pgo_getpages =	phys_pager_getpages,
 	.pgo_putpages =	phys_pager_putpages,
 	.pgo_haspage =	phys_pager_haspage,
+	.pgo_populate =	phys_pager_populate,
 };
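
The new phys_pager_populate() above prefaults a phys_pager_cluster
sized window around the faulting index, clipped to the object size and
to the [*first, *last] bounds supplied by the fault handler.  A small
userland sketch of just that window arithmetic (toy names, no VM
calls):

#include <stdio.h>

#define	TOY_CLUSTER	1024		/* stands in for phys_pager_cluster */
#define	toy_rounddown(x, y)	(((x) / (y)) * (y))

/* Clip a cluster-aligned window around pidx to the object and caller bounds. */
static void
toy_populate_window(unsigned long pidx, unsigned long objsize,
    unsigned long *first, unsigned long *last)
{
	unsigned long base, end;

	base = toy_rounddown(pidx, TOY_CLUSTER);
	end = base + TOY_CLUSTER - 1;
	if (end >= objsize)
		end = objsize - 1;
	if (*first > base)
		base = *first;
	if (end > *last)
		end = *last;
	*first = base;
	*last = end;
}

int
main(void)
{
	unsigned long first = 1000, last = 5000;

	/* Fault at index 2600 in a 4000-page object: window is 2048..3071. */
	toy_populate_window(2600, 4000, &first, &last);
	printf("populate %lu .. %lu\n", first, last);
	return (0);
}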

Modified: trunk/sys/vm/pmap.h
===================================================================
--- trunk/sys/vm/pmap.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/pmap.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/pmap.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -101,10 +101,22 @@
 /*
  * Flags for pmap_enter().  The bits in the low-order byte are reserved
  * for the protection code (vm_prot_t) that describes the fault type.
+ * Bits 24 through 31 are reserved for the pmap's internal use.
  */
-#define	PMAP_ENTER_NOSLEEP	0x0100
-#define	PMAP_ENTER_WIRED	0x0200
+#define	PMAP_ENTER_NOSLEEP	0x00000100
+#define	PMAP_ENTER_WIRED	0x00000200
+#define	PMAP_ENTER_RESERVED	0xFF000000
 
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced().  This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages.  Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field.  An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define	PMAP_TS_REFERENCED_MAX	5
+
 void		 pmap_activate(struct thread *td);
 void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    int advice);
@@ -142,6 +154,8 @@
 void		 pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
 void		 pmap_qenter(vm_offset_t, vm_page_t *, int);
 void		 pmap_qremove(vm_offset_t, int);
+vm_offset_t	 pmap_quick_enter_page(vm_page_t);
+void		 pmap_quick_remove_page(vm_offset_t);
 void		 pmap_release(pmap_t);
 void		 pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
 void		 pmap_remove_all(vm_page_t m);
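
The pmap.h hunk above widens the pmap_enter() flag word and documents
its layout: the low byte carries the vm_prot_t fault type, bits 8 and
9 carry PMAP_ENTER_NOSLEEP and PMAP_ENTER_WIRED, and the top byte is
reserved for the machine-dependent pmap.  A small sketch of packing
and unpacking such a word (constants copied from the hunk, toy names,
and an assumed example protection value):

#include <stdio.h>

#define	TOY_PROT_MASK		0x000000FFu	/* low byte: vm_prot_t */
#define	TOY_ENTER_NOSLEEP	0x00000100u
#define	TOY_ENTER_WIRED		0x00000200u
#define	TOY_ENTER_RESERVED	0xFF000000u	/* pmap-internal bits */

int
main(void)
{
	unsigned int flags;

	/* Pack an assumed write-protection code (0x02) with a wired entry. */
	flags = 0x02 | TOY_ENTER_WIRED;

	/* The MD pmap can pull the pieces back apart with simple masks. */
	printf("prot 0x%02x wired %d nosleep %d reserved 0x%02x\n",
	    flags & TOY_PROT_MASK,
	    (flags & TOY_ENTER_WIRED) != 0,
	    (flags & TOY_ENTER_NOSLEEP) != 0,
	    (flags & TOY_ENTER_RESERVED) >> 24);
	return (0);
}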

Modified: trunk/sys/vm/redzone.c
===================================================================
--- trunk/sys/vm/redzone.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/redzone.c 267992 2014-06-28 03:56:17Z hselasky $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,8 +42,7 @@
 SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem,
     0, "Extra memory allocated by redzone");     
 static int redzone_panic = 0;
-TUNABLE_INT("vm.redzone.panic", &redzone_panic);
-SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0,
+SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RWTUN, &redzone_panic, 0,
     "Panic when buffer corruption is detected");     
 
 #define	REDZONE_CHSIZE	(16)

Modified: trunk/sys/vm/redzone.h
===================================================================
--- trunk/sys/vm/redzone.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
+ * $FreeBSD: stable/11/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
  */
 
 #ifndef	_VM_REDZONE_H_

Modified: trunk/sys/vm/sg_pager.c
===================================================================
--- trunk/sys/vm/sg_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/sg_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/sg_pager.c 331017 2018-03-15 19:08:33Z kevans $");
 
 /*
  * This pager manages OBJT_SG objects.  These objects are backed by
@@ -39,6 +39,8 @@
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/sglist.h>
+#include <sys/vmmeter.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
@@ -50,7 +52,7 @@
 static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void sg_pager_dealloc(vm_object_t);
-static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static void sg_pager_putpages(vm_object_t, vm_page_t *, int, 
 		boolean_t, int *);
 static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *,
@@ -97,8 +99,9 @@
 	 * to map beyond that.
 	 */
 	size = round_page(size);
-	pindex = OFF_TO_IDX(foff + size);
-	if (pindex > npages)
+	pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+	if (pindex > npages || pindex < UOFF_TO_IDX(foff) ||
+	    pindex < UOFF_TO_IDX(size))
 		return (NULL);
 
 	/*
@@ -136,7 +139,8 @@
 }
 
 static int
-sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
 	struct sglist *sg;
 	vm_page_t m_paddr, page;
@@ -146,11 +150,13 @@
 	size_t space;
 	int i;
 
+	/* Since our haspage reports zero after/before, the count is 1. */
+	KASSERT(count == 1, ("%s: count %d", __func__, count));
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	sg = object->handle;
 	memattr = object->memattr;
 	VM_OBJECT_WUNLOCK(object);
-	offset = m[reqpage]->pindex;
+	offset = m[0]->pindex;
 
 	/*
 	 * Lookup the physical address of the requested page.  An initial
@@ -179,7 +185,7 @@
 	}
 
 	/* Return a fake page for the requested page. */
-	KASSERT(!(m[reqpage]->flags & PG_FICTITIOUS),
+	KASSERT(!(m[0]->flags & PG_FICTITIOUS),
 	    ("backing page for SG is fake"));
 
 	/* Construct a new fake page. */
@@ -186,19 +192,18 @@
 	page = vm_page_getfake(paddr, memattr);
 	VM_OBJECT_WLOCK(object);
 	TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q);
-
-	/* Free the original pages and insert this fake page into the object. */
-	for (i = 0; i < count; i++) {
-		if (i == reqpage &&
-		    vm_page_replace(page, object, offset) != m[i])
-			panic("sg_pager_getpages: invalid place replacement");
-		vm_page_lock(m[i]);
-		vm_page_free(m[i]);
-		vm_page_unlock(m[i]);
-	}
-	m[reqpage] = page;
+	vm_page_replace_checked(page, object, offset, m[0]);
+	vm_page_lock(m[0]);
+	vm_page_free(m[0]);
+	vm_page_unlock(m[0]);
+	m[0] = page;
 	page->valid = VM_PAGE_BITS_ALL;
 
+	if (rbehind)
+		*rbehind = 0;
+	if (rahead)
+		*rahead = 0;
+
 	return (VM_PAGER_OK);
 }
 

Modified: trunk/sys/vm/swap_pager.c
===================================================================
--- trunk/sys/vm/swap_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/swap_pager.c 350355 2019-07-26 10:36:07Z kib $");
 
 #include "opt_swap.h"
 #include "opt_vm.h"
@@ -87,10 +87,12 @@
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
+#include <sys/pctrie.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/blist.h>
@@ -120,7 +122,7 @@
  * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
-#define MAX_PAGEOUT_CLUSTER 16
+#define	MAX_PAGEOUT_CLUSTER	32
 #endif
 
 #if !defined(SWB_NPAGES)
@@ -127,22 +129,17 @@
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
+#define	SWAP_META_PAGES		PCTRIE_COUNT
+
 /*
- * The swblock structure maps an object and a small, fixed-size range
- * of page indices to disk addresses within a swap area.
- * The collection of these mappings is implemented as a hash table.
- * Unused disk addresses within a swap area are allocated and managed
- * using a blist.
+ * A swblk structure maps each page index within a
+ * SWAP_META_PAGES-aligned and sized range to the address of an
+ * on-disk swap block (or SWAPBLK_NONE). The collection of these
+ * mappings for an entire vm object is implemented as a pc-trie.
  */
-#define SWAP_META_PAGES		(SWB_NPAGES * 2)
-#define SWAP_META_MASK		(SWAP_META_PAGES - 1)
-
-struct swblock {
-	struct swblock	*swb_hnext;
-	vm_object_t	swb_object;
-	vm_pindex_t	swb_index;
-	int		swb_count;
-	daddr_t		swb_pages[SWAP_META_PAGES];
+struct swblk {
+	vm_pindex_t	p;
+	daddr_t		d[SWAP_META_PAGES];
 };
 
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
@@ -151,7 +148,7 @@
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 int swap_pager_avail;
-static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 
 static vm_ooffset_t swap_total;
 SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
@@ -160,7 +157,7 @@
 SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
     "Amount of swap storage needed to back all allocated anonymous memory.");
 static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
+SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 static unsigned long swzone;
@@ -210,7 +207,7 @@
 	mtx_lock(&sw_dev_mtx);
 	r = swap_reserved + incr;
 	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
-		s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
+		s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count;
 		s *= PAGE_SIZE;
 	} else
 		s = 0;
@@ -223,16 +220,14 @@
 	mtx_unlock(&sw_dev_mtx);
 
 	if (res) {
-		PROC_LOCK(curproc);
 		UIDINFO_VMSIZE_LOCK(uip);
 		if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
-		    uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
+		    uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
 		    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
 			res = 0;
 		else
 			uip->ui_vmsize += incr;
 		UIDINFO_VMSIZE_UNLOCK(uip);
-		PROC_UNLOCK(curproc);
 		if (!res) {
 			mtx_lock(&sw_dev_mtx);
 			swap_reserved -= incr;
@@ -314,12 +309,10 @@
 	racct_sub_cred(cred, RACCT_SWAP, decr);
 }
 
-static void swapdev_strategy(struct buf *, struct swdevt *sw);
-
 #define SWM_FREE	0x02	/* free, period			*/
 #define SWM_POP		0x04	/* pop out			*/
 
-int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
+static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static int nsw_rcount;		/* free read buffers			*/
 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
@@ -327,17 +320,17 @@
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
-static struct swblock **swhash;
-static int swhash_mask;
-static struct mtx swhash_mtx;
+static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
+    "Maximum running async swap ops");
+static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
+    "Swap Fragmentation Info");
 
-static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
 static struct sx sw_alloc_sx;
 
-
-SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
-	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
-
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
@@ -348,9 +341,9 @@
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
-static struct mtx sw_alloc_mtx;	/* protect list manipulation */
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
-static uma_zone_t	swap_zone;
+static uma_zone_t swblk_zone;
+static uma_zone_t swpctrie_zone;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
@@ -361,7 +354,10 @@
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
-static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
+    int *);
+static int	swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+    int *, pgo_getpages_iodone_t, void *);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
@@ -374,6 +370,7 @@
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
+	.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async)		*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
@@ -391,7 +388,7 @@
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
-static int	swapongeom(struct thread *, struct vnode *);
+static int	swapongeom(struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
@@ -404,22 +401,28 @@
 /*
  * Metadata functions
  */
-static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
-static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
+static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
+static void *
+swblk_trie_alloc(struct pctrie *ptree)
+{
+
+	return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
+	    M_USE_RESERVE : 0)));
+}
+
 static void
-swp_pager_free_nrpage(vm_page_t m)
+swblk_trie_free(struct pctrie *ptree, void *node)
 {
 
-	vm_page_lock(m);
-	if (m->wire_count == 0)
-		vm_page_free(m);
-	vm_page_unlock(m);
+	uma_zfree(swpctrie_zone, node);
 }
 
+PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
+
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
@@ -448,33 +451,6 @@
 }
 
 /*
- * SWP_PAGER_HASH() -	hash swap meta data
- *
- *	This is an helper function which hashes the swapblk given
- *	the object and page index.  It returns a pointer to a pointer
- *	to the object, or a pointer to a NULL pointer if it could not
- *	find a swapblk.
- */
-static struct swblock **
-swp_pager_hash(vm_object_t object, vm_pindex_t index)
-{
-	struct swblock **pswap;
-	struct swblock *swap;
-
-	index &= ~(vm_pindex_t)SWAP_META_MASK;
-	pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
-	while ((swap = *pswap) != NULL) {
-		if (swap->swb_object == object &&
-		    swap->swb_index == index
-		) {
-			break;
-		}
-		pswap = &swap->swb_hnext;
-	}
-	return (pswap);
-}
-
-/*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
@@ -491,9 +467,9 @@
 
 	for (i = 0; i < NOBJLISTS; ++i)
 		TAILQ_INIT(&swap_pager_object_list[i]);
-	mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 	sx_init(&sw_alloc_sx, "swspsx");
+	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
 /*
@@ -539,21 +515,25 @@
 	mtx_unlock(&pbuf_mtx);
 
 	/*
-	 * Initialize our zone.  Right now I'm just guessing on the number
-	 * we need based on the number of pages in the system.  Each swblock
-	 * can hold 32 pages, so this is probably overkill.  This reservation
-	 * is typically limited to around 32MB by default.
+	 * Initialize our zone, taking the user's requested size or
+	 * estimating the number we need based on the number of pages
+	 * in the system.
 	 */
-	n = cnt.v_page_count / 2;
-	if (maxswzone && n > maxswzone / sizeof(struct swblock))
-		n = maxswzone / sizeof(struct swblock);
+	n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
+	    vm_cnt.v_page_count / 2;
+	swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
+	    pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	if (swpctrie_zone == NULL)
+		panic("failed to create swap pctrie zone.");
+	swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
+	    NULL, NULL, _Alignof(struct swblk) - 1,
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	if (swblk_zone == NULL)
+		panic("failed to create swap blk zone.");
 	n2 = n;
-	swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
-	if (swap_zone == NULL)
-		panic("failed to create swap_zone.");
 	do {
-		if (uma_zone_reserve_kva(swap_zone, n))
+		if (uma_zone_reserve_kva(swblk_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
@@ -561,25 +541,50 @@
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
-	if (n2 != n)
-		printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
+
+	/*
+	 * Often uma_zone_reserve_kva() cannot reserve exactly the
+	 * requested size.  Account for the difference when
+	 * calculating swap_maxpages.
+	 */
+	n = uma_zone_get_max(swblk_zone);
+
+	if (n < n2)
+		printf("Swap blk zone entries changed from %lu to %lu.\n",
+		    n2, n);
 	swap_maxpages = n * SWAP_META_PAGES;
-	swzone = n * sizeof(struct swblock);
-	n2 = n;
+	swzone = n * sizeof(struct swblk);
+	if (!uma_zone_reserve_kva(swpctrie_zone, n))
+		printf("Cannot reserve swap pctrie zone, "
+		    "reduce kern.maxswzone.\n");
+}
 
+static vm_object_t
+swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
+    vm_ooffset_t offset)
+{
+	vm_object_t object;
+
+	if (cred != NULL) {
+		if (!swap_reserve_by_cred(size, cred))
+			return (NULL);
+		crhold(cred);
+	}
+
 	/*
-	 * Initialize our meta-data hash table.  The swapper does not need to
-	 * be quite as efficient as the VM system, so we do not use an
-	 * oversized hash table.
-	 *
-	 * 	n: 		size of hash table, must be power of 2
-	 *	swhash_mask:	hash table index mask
+	 * The un_pager.swp.swp_blks trie is initialized by
+	 * vm_object_allocate() to ensure the correct order of
+	 * visibility to other threads.
 	 */
-	for (n = 1; n < n2 / 8; n *= 2)
-		;
-	swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
-	swhash_mask = n - 1;
-	mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
+	object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset +
+	    PAGE_MASK + size));
+
+	object->handle = handle;
+	if (cred != NULL) {
+		object->cred = cred;
+		object->charge = size;
+	}
+	return (object);
 }
 
 /*
@@ -587,13 +592,11 @@
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
- *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
- *	and then converting it with swp_pager_meta_build().
+ *	OBJT_SWAP object.
  *
- *	This routine may block in vm_object_allocate() and create a named
- *	object lookup race, so we must interlock.
- *
- * MPSAFE
+ *	This routine must ensure that no live duplicate is created for
+ *	the named object request, which is protected against by
+ *	holding the sw_alloc_sx lock in case handle != NULL.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -600,11 +603,8 @@
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
-	vm_pindex_t pindex;
 
-	pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
-	if (handle) {
-		mtx_lock(&Giant);
+	if (handle != NULL) {
 		/*
 		 * Reference existing named region or allocate new one.  There
 		 * should not be a race here against swp_pager_meta_build()
@@ -614,40 +614,16 @@
 		sx_xlock(&sw_alloc_sx);
 		object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 		if (object == NULL) {
-			if (cred != NULL) {
-				if (!swap_reserve_by_cred(size, cred)) {
-					sx_xunlock(&sw_alloc_sx);
-					mtx_unlock(&Giant);
-					return (NULL);
-				}
-				crhold(cred);
+			object = swap_pager_alloc_init(handle, cred, size,
+			    offset);
+			if (object != NULL) {
+				TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
+				    object, pager_object_list);
 			}
-			object = vm_object_allocate(OBJT_DEFAULT, pindex);
-			VM_OBJECT_WLOCK(object);
-			object->handle = handle;
-			if (cred != NULL) {
-				object->cred = cred;
-				object->charge = size;
-			}
-			swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-			VM_OBJECT_WUNLOCK(object);
 		}
 		sx_xunlock(&sw_alloc_sx);
-		mtx_unlock(&Giant);
 	} else {
-		if (cred != NULL) {
-			if (!swap_reserve_by_cred(size, cred))
-				return (NULL);
-			crhold(cred);
-		}
-		object = vm_object_allocate(OBJT_DEFAULT, pindex);
-		VM_OBJECT_WLOCK(object);
-		if (cred != NULL) {
-			object->cred = cred;
-			object->charge = size;
-		}
-		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-		VM_OBJECT_WUNLOCK(object);
+		object = swap_pager_alloc_init(handle, cred, size, offset);
 	}
 	return (object);
 }
@@ -666,17 +642,22 @@
 swap_pager_dealloc(vm_object_t object)
 {
 
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
+
 	/*
 	 * Remove from list right away so lookups will fail if we block for
 	 * pageout completion.
 	 */
 	if (object->handle != NULL) {
-		mtx_lock(&sw_alloc_mtx);
-		TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
-		mtx_unlock(&sw_alloc_mtx);
+		VM_OBJECT_WUNLOCK(object);
+		sx_xlock(&sw_alloc_sx);
+		TAILQ_REMOVE(NOBJLIST(object->handle), object,
+		    pager_object_list);
+		sx_xunlock(&sw_alloc_sx);
+		VM_OBJECT_WLOCK(object);
 	}
 
-	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
@@ -763,11 +744,8 @@
 			mtx_unlock(&sw_dev_mtx);
 			if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
 			    unmapped_buf_allowed) {
-				bp->b_kvaalloc = bp->b_data;
 				bp->b_data = unmapped_buf;
-				bp->b_kvabase = unmapped_buf;
 				bp->b_offset = 0;
-				bp->b_flags |= B_UNMAPPED;
 			} else {
 				pmap_qenter((vm_offset_t)bp->b_data,
 				    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
@@ -815,6 +793,36 @@
 }
 
 /*
+ * SYSCTL_SWAP_FRAGMENTATION() -	report swap device free space fragmentation
+ */
+static int
+sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct swdevt *sp;
+	const char *devname;
+	int error;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	mtx_lock(&sw_dev_mtx);
+	TAILQ_FOREACH(sp, &swtailq, sw_list) {
+		if (vn_isdisk(sp->sw_vp, NULL))
+			devname = devtoname(sp->sw_vp->v_rdev);
+		else
+			devname = "[file]";
+		sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
+		blist_stats(sp->sw_blist, &sbuf);
+	}
+	mtx_unlock(&sw_dev_mtx);
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
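
The handler above follows the usual sbuf-on-sysctl pattern: wire the old buffer, build the report under sw_dev_mtx, then hand the sbuf back to the sysctl layer.  As a hedged illustration only (the matching declaration is presumably added elsewhere in swap_pager.c and is not part of this hunk), such a handler is hooked up roughly as:

    SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation,
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
        sysctl_swap_fragmentation, "A", "Swap Fragmentation Info");

after which "sysctl vm.swap_fragmentation" prints the per-device blist free-space summary produced by blist_stats().
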
+
+/*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
@@ -906,16 +914,19 @@
 	 * If destroysource is set, we remove the source object from the
 	 * swap_pager internal queue now.
 	 */
-	if (destroysource) {
-		if (srcobject->handle != NULL) {
-			mtx_lock(&sw_alloc_mtx);
-			TAILQ_REMOVE(
-			    NOBJLIST(srcobject->handle),
-			    srcobject,
-			    pager_object_list
-			);
-			mtx_unlock(&sw_alloc_mtx);
-		}
+	if (destroysource && srcobject->handle != NULL) {
+		vm_object_pip_add(srcobject, 1);
+		VM_OBJECT_WUNLOCK(srcobject);
+		vm_object_pip_add(dstobject, 1);
+		VM_OBJECT_WUNLOCK(dstobject);
+		sx_xlock(&sw_alloc_sx);
+		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
+		    pager_object_list);
+		sx_xunlock(&sw_alloc_sx);
+		VM_OBJECT_WLOCK(dstobject);
+		vm_object_pip_wakeup(dstobject);
+		VM_OBJECT_WLOCK(srcobject);
+		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
@@ -970,7 +981,7 @@
 	/*
 	 * Free left over swap blocks in source.
 	 *
-	 * We have to revert the type to OBJT_DEFAULT so we do not accidently
+	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
 	 * double-remove the object from the swap queues.
 	 */
 	if (destroysource) {
@@ -993,22 +1004,21 @@
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
- *	store exists before and after the requested page within a reasonable
- *	distance.  We do not try to restrict it to the swap device stripe
- *	(that is handled in getpages/putpages).  It probably isn't worth
- *	doing here.
+ *	store exists before and after the requested page.
  */
 static boolean_t
-swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
+swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
-	daddr_t blk0;
+	daddr_t blk, blk0;
+	int i;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
+
 	/*
 	 * do we have good backing store at the requested index ?
 	 */
 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
-
 	if (blk0 == SWAPBLK_NONE) {
 		if (before)
 			*before = 0;
@@ -1021,11 +1031,7 @@
 	 * find backwards-looking contiguous good backing store
 	 */
 	if (before != NULL) {
-		int i;
-
-		for (i = 1; i < (SWB_NPAGES/2); ++i) {
-			daddr_t blk;
-
+		for (i = 1; i < SWB_NPAGES; i++) {
 			if (i > pindex)
 				break;
 			blk = swp_pager_meta_ctl(object, pindex - i, 0);
@@ -1032,7 +1038,7 @@
 			if (blk != blk0 - i)
 				break;
 		}
-		*before = (i - 1);
+		*before = i - 1;
 	}
 
 	/*
@@ -1039,16 +1045,12 @@
 	 * find forward-looking contiguous good backing store
 	 */
 	if (after != NULL) {
-		int i;
-
-		for (i = 1; i < (SWB_NPAGES/2); ++i) {
-			daddr_t blk;
-
+		for (i = 1; i < SWB_NPAGES; i++) {
 			blk = swp_pager_meta_ctl(object, pindex + i, 0);
 			if (blk != blk0 + i)
 				break;
 		}
-		*after = (i - 1);
+		*after = i - 1;
 	}
 	return (TRUE);
 }
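
The rewritten scan above drops the old SWB_NPAGES/2 cap and the per-iteration declarations, but the idea is unchanged: count how many consecutive page indices map to consecutive blocks on the swap device.  A stand-alone sketch of that idea, with an invented lookup() callback standing in for swp_pager_meta_ctl() and the limit parameter standing in for SWB_NPAGES:

    static void
    contig_backing_store(daddr_t (*lookup)(vm_pindex_t), vm_pindex_t pindex,
        int limit, int *before, int *after)
    {
            daddr_t blk0 = lookup(pindex);
            int i;

            /* Backwards: stop at index 0 or at the first non-adjacent block. */
            for (i = 1; i < limit; i++)
                    if (i > pindex || lookup(pindex - i) != blk0 - i)
                            break;
            *before = i - 1;

            /* Forwards: stop at the first non-adjacent block. */
            for (i = 1; i < limit; i++)
                    if (lookup(pindex + i) != blk0 + i)
                            break;
            *after = i - 1;
    }

The daddr_t and vm_pindex_t types are the kernel's; the sketch assumes blk0 != SWAPBLK_NONE, since the function above already returns early in that case.
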
@@ -1080,134 +1082,130 @@
 }
 
 /*
- * SWAP_PAGER_GETPAGES() - bring pages in from swap
+ * swap_pager_getpages() - bring pages in from swap
  *
- *	Attempt to retrieve (m, count) pages from backing store, but make
- *	sure we retrieve at least m[reqpage].  We try to load in as large
- *	a chunk surrounding m[reqpage] as is contiguous in swap and which
- *	belongs to the same object.
+ *	Attempt to page in the pages in array "ma" of length "count".  The
+ *	caller may optionally specify that additional pages preceding and
+ *	succeeding the specified range be paged in.  The number of such pages
+ *	is returned in the "rbehind" and "rahead" parameters, and they will
+ *	be in the inactive queue upon return.
  *
- *	The code is designed for asynchronous operation and
- *	immediate-notification of 'reqpage' but tends not to be
- *	used that way.  Please do not optimize-out this algorithmic
- *	feature, I intend to improve on it in the future.
- *
- *	The parent has a single vm_object_pip_add() reference prior to
- *	calling us and we should return with the same.
- *
- *	The parent has BUSY'd the pages.  We should return with 'm'
- *	left busy, but the others adjusted.
+ *	The pages in "ma" must be busied and will remain busied upon return.
  */
 static int
-swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
 	struct buf *bp;
-	vm_page_t mreq;
-	int i;
-	int j;
+	vm_page_t bm, mpred, msucc, p;
+	vm_pindex_t pindex;
 	daddr_t blk;
+	int i, maxahead, maxbehind, reqcount;
 
-	mreq = m[reqpage];
+	reqcount = count;
 
-	KASSERT(mreq->object == object,
-	    ("swap_pager_getpages: object mismatch %p/%p",
-	    object, mreq->object));
+	/*
+	 * Determine the final number of read-behind pages and
+	 * allocate them BEFORE releasing the object lock.  Otherwise,
+	 * there can be a problematic race with vm_object_split().
+	 * Specifically, vm_object_split() might first transfer pages
+	 * that precede ma[0] in the current object to a new object,
+	 * and then this function incorrectly recreates those pages as
+	 * read-behind pages in the current object.
+	 */
+	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
+		return (VM_PAGER_FAIL);
 
 	/*
-	 * Calculate range to retrieve.  The pages have already been assigned
-	 * their swapblks.  We require a *contiguous* range but we know it to
-	 * not span devices.   If we do not supply it, bad things
-	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
-	 * loops are set up such that the case(s) are handled implicitly.
-	 *
-	 * The swp_*() calls must be made with the object locked.
+	 * Clip the readahead and readbehind ranges to exclude resident pages.
 	 */
-	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
-
-	for (i = reqpage - 1; i >= 0; --i) {
-		daddr_t iblk;
-
-		iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
-		if (blk != iblk + (reqpage - i))
-			break;
+	if (rahead != NULL) {
+		KASSERT(reqcount - 1 <= maxahead,
+		    ("page count %d extends beyond swap block", reqcount));
+		*rahead = imin(*rahead, maxahead - (reqcount - 1));
+		pindex = ma[reqcount - 1]->pindex;
+		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
+		if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
+			*rahead = msucc->pindex - pindex - 1;
 	}
-	++i;
+	if (rbehind != NULL) {
+		*rbehind = imin(*rbehind, maxbehind);
+		pindex = ma[0]->pindex;
+		mpred = TAILQ_PREV(ma[0], pglist, listq);
+		if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
+			*rbehind = pindex - mpred->pindex - 1;
+	}
 
-	for (j = reqpage + 1; j < count; ++j) {
-		daddr_t jblk;
+	bm = ma[0];
+	for (i = 0; i < count; i++)
+		ma[i]->oflags |= VPO_SWAPINPROG;
 
-		jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
-		if (blk != jblk - (j - reqpage))
-			break;
-	}
-
 	/*
-	 * free pages outside our collection range.   Note: we never free
-	 * mreq, it must remain busy throughout.
+	 * Allocate readahead and readbehind pages.
 	 */
-	if (0 < i || j < count) {
-		int k;
-
-		for (k = 0; k < i; ++k)
-			swp_pager_free_nrpage(m[k]);
-		for (k = j; k < count; ++k)
-			swp_pager_free_nrpage(m[k]);
+	if (rbehind != NULL) {
+		for (i = 1; i <= *rbehind; i++) {
+			p = vm_page_alloc(object, ma[0]->pindex - i,
+			    VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			p->oflags |= VPO_SWAPINPROG;
+			bm = p;
+		}
+		*rbehind = i - 1;
 	}
+	if (rahead != NULL) {
+		for (i = 0; i < *rahead; i++) {
+			p = vm_page_alloc(object,
+			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			p->oflags |= VPO_SWAPINPROG;
+		}
+		*rahead = i;
+	}
+	if (rbehind != NULL)
+		count += *rbehind;
+	if (rahead != NULL)
+		count += *rahead;
 
-	/*
-	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq
-	 * still busy, but the others unbusied.
-	 */
-	if (blk == SWAPBLK_NONE)
-		return (VM_PAGER_FAIL);
+	vm_object_pip_add(object, count);
 
-	/*
-	 * Getpbuf() can sleep.
-	 */
+	pindex = bm->pindex;
+	blk = swp_pager_meta_ctl(object, pindex, 0);
+	KASSERT(blk != SWAPBLK_NONE,
+	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
+
 	VM_OBJECT_WUNLOCK(object);
-	/*
-	 * Get a swap buffer header to perform the IO
-	 */
 	bp = getpbuf(&nsw_rcount);
+	/* Pages cannot leave the object while busy. */
+	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
+		MPASS(p->pindex == bm->pindex + i);
+		bp->b_pages[i] = p;
+	}
+
 	bp->b_flags |= B_PAGING;
-
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
-	bp->b_blkno = blk - (reqpage - i);
-	bp->b_bcount = PAGE_SIZE * (j - i);
-	bp->b_bufsize = PAGE_SIZE * (j - i);
-	bp->b_pager.pg_reqpage = reqpage - i;
+	bp->b_blkno = blk;
+	bp->b_bcount = PAGE_SIZE * count;
+	bp->b_bufsize = PAGE_SIZE * count;
+	bp->b_npages = count;
+	bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
+	bp->b_pgafter = rahead != NULL ? *rahead : 0;
 
-	VM_OBJECT_WLOCK(object);
-	{
-		int k;
-
-		for (k = i; k < j; ++k) {
-			bp->b_pages[k - i] = m[k];
-			m[k]->oflags |= VPO_SWAPINPROG;
-		}
-	}
-	bp->b_npages = j - i;
-
 	PCPU_INC(cnt.v_swapin);
-	PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
+	PCPU_ADD(cnt.v_swappgsin, count);
 
 	/*
-	 * We still hold the lock on mreq, and our automatic completion routine
-	 * does not remove it.
-	 */
-	vm_object_pip_add(object, bp->b_npages);
-	VM_OBJECT_WUNLOCK(object);
-
-	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
-	 * The other pages in our m[] array are also released on completion,
+	 * The other pages in our ma[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1216,13 +1214,13 @@
 	swp_pager_strategy(bp);
 
 	/*
-	 * wait for the page we want to complete.  VPO_SWAPINPROG is always
+	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
-	 * is set in the meta-data.
+	 * is set in the metadata for each page in the request.
 	 */
 	VM_OBJECT_WLOCK(object);
-	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
-		mreq->oflags |= VPO_SWAPSLEEP;
+	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
+		ma[0]->oflags |= VPO_SWAPSLEEP;
 		PCPU_INC(cnt.v_intrans);
 		if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
 		    "swread", hz * 20)) {
@@ -1233,16 +1231,14 @@
 	}
 
 	/*
-	 * mreq is left busied after completion, but all the other pages
-	 * are freed.  If we had an unrecoverable read error the page will
-	 * not be valid.
+	 * If we had an unrecoverable read error pages will not be valid.
 	 */
-	if (mreq->valid != VM_PAGE_BITS_ALL) {
-		return (VM_PAGER_ERROR);
-	} else {
-		return (VM_PAGER_OK);
-	}
+	for (i = 0; i < reqcount; i++)
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			return (VM_PAGER_ERROR);
 
+	return (VM_PAGER_OK);
+
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
@@ -1252,6 +1248,39 @@
 }
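
A short worked example of the clipping performed above on the optional read-ahead count (all numbers invented): with reqcount = 2, maxahead = 7 and a caller-supplied *rahead of 8, the swap block allows at most maxahead - (reqcount - 1) = 6 extra pages; if the next resident page sits 4 indices past the last requested page, i.e. msucc->pindex - pindex - 1 == 4, then *rahead is clipped further to 4.  The read-behind count is clipped the same way against maxbehind and the nearest preceding resident page, and both counts are reduced once more by how many pages vm_page_alloc() actually manages to allocate.
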
 
 /*
+ * 	swap_pager_getpages_async():
+ *
+ *	Right now this is emulation of asynchronous operation on top of
+ *	swap_pager_getpages().
+ */
+static int
+swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+	int r, error;
+
+	r = swap_pager_getpages(object, ma, count, rbehind, rahead);
+	VM_OBJECT_WUNLOCK(object);
+	switch (r) {
+	case VM_PAGER_OK:
+		error = 0;
+		break;
+	case VM_PAGER_ERROR:
+		error = EIO;
+		break;
+	case VM_PAGER_FAIL:
+		error = EINVAL;
+		break;
+	default:
+		panic("unhandled swap_pager_getpages() error %d", r);
+	}
+	(iodone)(arg, ma, count, error);
+	VM_OBJECT_WLOCK(object);
+
+	return (r);
+}
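
A minimal caller-side sketch, assuming only the pgo_getpages_iodone_t parameter order visible in the call above; the callback name and body are invented:

    static void
    example_iodone(void *arg, vm_page_t *ma, int count, int error)
    {

            /* "arg" carries whatever context the caller handed in. */
            if (error != 0)
                    printf("swap pagein of %d page(s) failed: %d\n",
                        count, error);
    }

Note that the emulation drops the object lock around the callback, so the callback runs unlocked and must not assume otherwise.
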
+
+/*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
@@ -1273,17 +1302,17 @@
 *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
-void
-swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+static void
+swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
 	int i, n;
 	boolean_t sync;
 
-	if (count && m[0]->object != object) {
+	if (count && ma[0]->object != object) {
 		panic("swap_pager_putpages: object mismatch %p/%p",
 		    object,
-		    m[0]->object
+		    ma[0]->object
 		);
 	}
 
@@ -1307,39 +1336,6 @@
 	/*
 	 * Step 2
 	 *
-	 * Update nsw parameters from swap_async_max sysctl values.
-	 * Do not let the sysop crash the machine with bogus numbers.
-	 */
-	mtx_lock(&pbuf_mtx);
-	if (swap_async_max != nsw_wcount_async_max) {
-		int n;
-
-		/*
-		 * limit range
-		 */
-		if ((n = swap_async_max) > nswbuf / 2)
-			n = nswbuf / 2;
-		if (n < 1)
-			n = 1;
-		swap_async_max = n;
-
-		/*
-		 * Adjust difference ( if possible ).  If the current async
-		 * count is too low, we may not be able to make the adjustment
-		 * at this time.
-		 */
-		n -= nsw_wcount_async_max;
-		if (nsw_wcount_async + n >= 0) {
-			nsw_wcount_async += n;
-			nsw_wcount_async_max += n;
-			wakeup(&nsw_wcount_async);
-		}
-	}
-	mtx_unlock(&pbuf_mtx);
-
-	/*
-	 * Step 3
-	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
@@ -1394,7 +1390,7 @@
 
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
-			vm_page_t mreq = m[i+j];
+			vm_page_t mreq = ma[i+j];
 
 			swp_pager_meta_build(
 			    mreq->object,
@@ -1402,8 +1398,6 @@
 			    blk + j
 			);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
-			rtvals[i+j] = VM_PAGER_OK;
-
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
@@ -1419,6 +1413,16 @@
 		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
 
 		/*
+		 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
+		 * can call the async completion routine at the end of a
+		 * synchronous I/O operation.  Otherwise, our caller would
+		 * perform duplicate unbusy and wakeup operations on the page
+		 * and object, respectively.
+		 */
+		for (j = 0; j < n; j++)
+			rtvals[i + j] = VM_PAGER_PEND;
+
+		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1427,10 +1431,6 @@
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
-
-			for (j = 0; j < n; ++j)
-				rtvals[i+j] = VM_PAGER_PEND;
-			/* restart outter loop */
 			continue;
 		}
 
@@ -1443,14 +1443,10 @@
 		swp_pager_strategy(bp);
 
 		/*
-		 * Wait for the sync I/O to complete, then update rtvals.
-		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
-		 * our async completion routine at the end, thus avoiding a
-		 * double-free.
+		 * Wait for the sync I/O to complete.
 		 */
 		bwait(bp, PVM, "swwrt");
-		for (j = 0; j < n; ++j)
-			rtvals[i+j] = VM_PAGER_PEND;
+
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
@@ -1491,12 +1487,10 @@
 	/*
 	 * remove the mapping for kernel virtual
 	 */
-	if ((bp->b_flags & B_UNMAPPED) != 0) {
-		bp->b_data = bp->b_kvaalloc;
-		bp->b_kvabase = bp->b_kvaalloc;
-		bp->b_flags &= ~B_UNMAPPED;
-	} else
+	if (buf_mapped(bp))
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+	else
+		bp->b_data = bp->b_kvabase;
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
@@ -1529,33 +1523,11 @@
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
-				 * When reading, reqpage needs to stay
-				 * locked for the parent, but all other
-				 * pages can be freed.  We still want to
-				 * wakeup the parent waiting on the page,
-				 * though.  ( also: pg_reqpage can be -1 and
-				 * not match anything ).
-				 *
-				 * We have to wake specifically requested pages
-				 * up too because we cleared VPO_SWAPINPROG and
-				 * someone may be waiting for that.
-				 *
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
-				if (i != bp->b_pager.pg_reqpage)
-					swp_pager_free_nrpage(m);
-				else {
-					vm_page_lock(m);
-					vm_page_flash(m);
-					vm_page_unlock(m);
-				}
-				/*
-				 * If i == bp->b_pager.pg_reqpage, do not wake
-				 * the page up.  The caller needs to.
-				 */
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
@@ -1562,7 +1534,7 @@
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
-				vm_page_dirty(m);
+				MPASS(m->dirty == VM_PAGE_BITS_ALL);
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
@@ -1577,54 +1549,33 @@
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
-			 *
-			 * If not the requested page then deactivate it.
-			 *
-			 * Note that the requested page, reqpage, is left
-			 * busied, but we still have to wake it up.  The
-			 * other pages are released (unbusied) by
-			 * vm_page_xunbusy().
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
-			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
-			/*
-			 * We have to wake specifically requested pages
-			 * up too because we cleared VPO_SWAPINPROG and
-			 * could be waiting for it in getpages.  However,
-			 * be sure to not unbusy getpages specifically
-			 * requested page - getpages expects it to be
-			 * left busy.
-			 */
-			if (i != bp->b_pager.pg_reqpage) {
-				vm_page_lock(m);
-				vm_page_deactivate(m);
-				vm_page_unlock(m);
-				vm_page_xunbusy(m);
-			} else {
-				vm_page_lock(m);
-				vm_page_flash(m);
-				vm_page_unlock(m);
-			}
+			m->valid = VM_PAGE_BITS_ALL;
+			if (i < bp->b_pgbefore ||
+			    i >= bp->b_npages - bp->b_pgafter)
+				vm_page_readahead_finish(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
+			 * A page is only written to swap after a period of
+			 * inactivity.  Therefore, we do not expect it to be
+			 * reused.
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
+			vm_page_lock(m);
+			vm_page_deactivate_noreuse(m);
+			vm_page_unlock(m);
 			vm_page_sunbusy(m);
-			if (vm_page_count_severe()) {
-				vm_page_lock(m);
-				vm_page_try_to_cache(m);
-				vm_page_unlock(m);
-			}
 		}
 	}
 
@@ -1661,51 +1612,17 @@
 }
 
 /*
- *	swap_pager_isswapped:
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
  *
- *	Return 1 if at least one page in the given object is paged
- *	out to the given swap device.
+ *	This routine dissociates the page at the given index within an object
+ *	from its backing store, paging it in if it does not reside in memory.
+ *	If the page is paged in, it is marked dirty and placed in the laundry
+ *	queue.  The page is marked dirty because it no longer has backing
+ *	store.  It is placed in the laundry queue because it has not been
+ *	accessed recently.  Otherwise, it would already reside in memory.
  *
- *	This routine may not sleep.
- */
-int
-swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
-{
-	daddr_t index = 0;
-	int bcount;
-	int i;
-
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (object->type != OBJT_SWAP)
-		return (0);
-
-	mtx_lock(&swhash_mtx);
-	for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
-		struct swblock *swap;
-
-		if ((swap = *swp_pager_hash(object, index)) != NULL) {
-			for (i = 0; i < SWAP_META_PAGES; ++i) {
-				if (swp_pager_isondev(swap->swb_pages[i], sp)) {
-					mtx_unlock(&swhash_mtx);
-					return (1);
-				}
-			}
-		}
-		index += SWAP_META_PAGES;
-	}
-	mtx_unlock(&swhash_mtx);
-	return (0);
-}
-
-/*
- * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
- *
- *	This routine dissociates the page at the given index within a
- *	swap block from its backing store, paging it in if necessary.
- *	If the page is paged in, it is placed in the inactive queue,
- *	since it had its backing store ripped out from under it.
- *	We also attempt to swap in all other pages in the swap block,
- *	we only guarantee that the one at the specified index is
+ *	We also attempt to swap in all other pages in the swap block.
+ *	However, we only guarantee that the one at the specified index is
  *	paged in.
  *
  *	XXX - The code to page the whole block in doesn't work, so we
@@ -1719,7 +1636,7 @@
 	vm_object_pip_add(object, 1);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 	if (m->valid == VM_PAGE_BITS_ALL) {
-		vm_object_pip_subtract(object, 1);
+		vm_object_pip_wakeup(object);
 		vm_page_dirty(m);
 		vm_page_lock(m);
 		vm_page_activate(m);
@@ -1729,12 +1646,12 @@
 		return;
 	}
 
-	if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
+	if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK)
 		panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
-	vm_object_pip_subtract(object, 1);
+	vm_object_pip_wakeup(object);
 	vm_page_dirty(m);
 	vm_page_lock(m);
-	vm_page_deactivate(m);
+	vm_page_launder(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 	vm_pager_page_unswapped(m);
@@ -1753,50 +1670,56 @@
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
-	struct swblock *swap;
-	vm_object_t locked_obj, object;
-	vm_pindex_t pindex;
-	int i, j, retries;
+	struct swblk *sb;
+	vm_object_t object;
+	vm_pindex_t pi;
+	int i, retries;
 
-	GIANT_REQUIRED;
+	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 
 	retries = 0;
-	locked_obj = NULL;
 full_rescan:
-	mtx_lock(&swhash_mtx);
-	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
-restart:
-		for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
-			object = swap->swb_object;
-			pindex = swap->swb_index;
-			for (j = 0; j < SWAP_META_PAGES; ++j) {
-				if (!swp_pager_isondev(swap->swb_pages[j], sp))
+	mtx_lock(&vm_object_list_mtx);
+	TAILQ_FOREACH(object, &vm_object_list, object_list) {
+		if (object->type != OBJT_SWAP)
+			continue;
+		mtx_unlock(&vm_object_list_mtx);
+		/* Depends on type-stability. */
+		VM_OBJECT_WLOCK(object);
+
+		/*
+		 * Dead objects are eventually terminated on their own.
+		 */
+		if ((object->flags & OBJ_DEAD) != 0)
+			goto next_obj;
+
+		/*
+		 * Sync with fences placed after pctrie
+		 * initialization.  We must not access pctrie below
+		 * unless we checked that our object is swap and not
+		 * dead.
+		 */
+		atomic_thread_fence_acq();
+		if (object->type != OBJT_SWAP)
+			goto next_obj;
+
+		for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+		    &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
+			pi = sb->p + SWAP_META_PAGES;
+			for (i = 0; i < SWAP_META_PAGES; i++) {
+				if (sb->d[i] == SWAPBLK_NONE)
 					continue;
-				if (locked_obj != object) {
-					if (locked_obj != NULL)
-						VM_OBJECT_WUNLOCK(locked_obj);
-					locked_obj = object;
-					if (!VM_OBJECT_TRYWLOCK(object)) {
-						mtx_unlock(&swhash_mtx);
-						/* Depends on type-stability. */
-						VM_OBJECT_WLOCK(object);
-						mtx_lock(&swhash_mtx);
-						goto restart;
-					}
-				}
-				MPASS(locked_obj == object);
-				mtx_unlock(&swhash_mtx);
-				swp_pager_force_pagein(object, pindex + j);
-				mtx_lock(&swhash_mtx);
-				goto restart;
+				if (swp_pager_isondev(sb->d[i], sp))
+					swp_pager_force_pagein(object,
+					    sb->p + i);
 			}
 		}
+next_obj:
+		VM_OBJECT_WUNLOCK(object);
+		mtx_lock(&vm_object_list_mtx);
 	}
-	mtx_unlock(&swhash_mtx);
-	if (locked_obj != NULL) {
-		VM_OBJECT_WUNLOCK(locked_obj);
-		locked_obj = NULL;
-	}
+	mtx_unlock(&vm_object_list_mtx);
+
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
@@ -1839,94 +1762,120 @@
 static void
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
-	static volatile int exhausted;
-	struct swblock *swap;
-	struct swblock **pswap;
-	int idx;
+	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
+	struct swblk *sb, *sb1;
+	vm_pindex_t modpi, rdpi;
+	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
+
 	/*
 	 * Convert default object to swap object if necessary
 	 */
 	if (object->type != OBJT_SWAP) {
+		pctrie_init(&object->un_pager.swp.swp_blks);
+
+		/*
+		 * Ensure that swap_pager_swapoff()'s iteration over
+		 * object_list does not see a garbage pctrie.
+		 */
+		atomic_thread_fence_rel();
+
 		object->type = OBJT_SWAP;
-		object->un_pager.swp.swp_bcount = 0;
-
-		if (object->handle != NULL) {
-			mtx_lock(&sw_alloc_mtx);
-			TAILQ_INSERT_TAIL(
-			    NOBJLIST(object->handle),
-			    object,
-			    pager_object_list
-			);
-			mtx_unlock(&sw_alloc_mtx);
-		}
+		KASSERT(object->handle == NULL, ("default pager with handle"));
 	}
 
-	/*
-	 * Locate hash entry.  If not found create, but if we aren't adding
-	 * anything just return.  If we run out of space in the map we wait
-	 * and, since the hash table may have changed, retry.
-	 */
-retry:
-	mtx_lock(&swhash_mtx);
-	pswap = swp_pager_hash(object, pindex);
-
-	if ((swap = *pswap) == NULL) {
-		int i;
-
+	rdpi = rounddown(pindex, SWAP_META_PAGES);
+	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
+	if (sb == NULL) {
 		if (swapblk == SWAPBLK_NONE)
-			goto done;
-
-		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
-		    (curproc == pageproc ? M_USE_RESERVE : 0));
-		if (swap == NULL) {
-			mtx_unlock(&swhash_mtx);
+			return;
+		for (;;) {
+			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
+			    pageproc ? M_USE_RESERVE : 0));
+			if (sb != NULL) {
+				sb->p = rdpi;
+				for (i = 0; i < SWAP_META_PAGES; i++)
+					sb->d[i] = SWAPBLK_NONE;
+				if (atomic_cmpset_int(&swblk_zone_exhausted,
+				    1, 0))
+					printf("swblk zone ok\n");
+				break;
+			}
 			VM_OBJECT_WUNLOCK(object);
-			if (uma_zone_exhausted(swap_zone)) {
-				if (atomic_cmpset_int(&exhausted, 0, 1))
-					printf("swap zone exhausted, "
+			if (uma_zone_exhausted(swblk_zone)) {
+				if (atomic_cmpset_int(&swblk_zone_exhausted,
+				    0, 1))
+					printf("swap blk zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
-				pause("swzonex", 10);
+				pause("swzonxb", 10);
 			} else
-				VM_WAIT;
+				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
-			goto retry;
+			sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			if (sb != NULL)
+				/*
+				 * Somebody swapped out a nearby page,
+				 * allocating swblk at the rdpi index,
+				 * while we dropped the object lock.
+				 */
+				goto allocated;
 		}
+		for (;;) {
+			error = SWAP_PCTRIE_INSERT(
+			    &object->un_pager.swp.swp_blks, sb);
+			if (error == 0) {
+				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+				    1, 0))
+					printf("swpctrie zone ok\n");
+				break;
+			}
+			VM_OBJECT_WUNLOCK(object);
+			if (uma_zone_exhausted(swpctrie_zone)) {
+				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+				    0, 1))
+					printf("swap pctrie zone exhausted, "
+					    "increase kern.maxswzone\n");
+				vm_pageout_oom(VM_OOM_SWAPZ);
+				pause("swzonxp", 10);
+			} else
+				uma_zwait(swpctrie_zone);
+			VM_OBJECT_WLOCK(object);
+			sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			if (sb1 != NULL) {
+				uma_zfree(swblk_zone, sb);
+				sb = sb1;
+				goto allocated;
+			}
+		}
+	}
+allocated:
+	MPASS(sb->p == rdpi);
 
-		if (atomic_cmpset_int(&exhausted, 1, 0))
-			printf("swap zone ok\n");
+	modpi = pindex % SWAP_META_PAGES;
+	/* Delete prior contents of metadata. */
+	if (sb->d[modpi] != SWAPBLK_NONE)
+		swp_pager_freeswapspace(sb->d[modpi], 1);
+	/* Enter block into metadata. */
+	sb->d[modpi] = swapblk;
 
-		swap->swb_hnext = NULL;
-		swap->swb_object = object;
-		swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
-		swap->swb_count = 0;
-
-		++object->un_pager.swp.swp_bcount;
-
-		for (i = 0; i < SWAP_META_PAGES; ++i)
-			swap->swb_pages[i] = SWAPBLK_NONE;
-	}
-
 	/*
-	 * Delete prior contents of metadata
+	 * Free the swblk if we end up with an empty page run.
 	 */
-	idx = pindex & SWAP_META_MASK;
-
-	if (swap->swb_pages[idx] != SWAPBLK_NONE) {
-		swp_pager_freeswapspace(swap->swb_pages[idx], 1);
-		--swap->swb_count;
+	if (swapblk == SWAPBLK_NONE) {
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				break;
+		}
+		if (i == SWAP_META_PAGES) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			uma_zfree(swblk_zone, sb);
+		}
 	}
-
-	/*
-	 * Enter block into metadata
-	 */
-	swap->swb_pages[idx] = swapblk;
-	if (swapblk != SWAPBLK_NONE)
-		++swap->swb_count;
-done:
-	mtx_unlock(&swhash_mtx);
 }
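
The release fence above pairs with the acquire fence in swap_pager_swapoff()'s scan of vm_object_list: the pctrie must be fully initialized before the change of object->type to OBJT_SWAP can be observed, and a reader that sees OBJT_SWAP must also see the initialized pctrie.  A user-space C11 analogue of the same ordering argument (names invented; shown only for illustration):

    #include <stdatomic.h>

    struct obj {
            _Atomic int      type;          /* 0 = default, 1 = "swap" */
            void            *trie_root;     /* stands in for the pctrie */
    };

    static void
    publish(struct obj *o, void *root)
    {
            o->trie_root = root;                    /* initialize the trie */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&o->type, 1, memory_order_relaxed);
    }

    static void *
    observe(struct obj *o)
    {
            if (atomic_load_explicit(&o->type, memory_order_relaxed) != 1)
                    return (NULL);
            atomic_thread_fence(memory_order_acquire);
            return (o->trie_root);                  /* initialized by now */
    }

In the kernel the reader additionally rechecks the type under the object lock after the fence, as the comment in swap_pager_swapoff() above describes.
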
 
 /*
@@ -1940,41 +1889,39 @@
  *	with resident pages.
  */
 static void
-swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
+swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
 {
+	struct swblk *sb;
+	vm_pindex_t last;
+	int i;
+	bool empty;
 
-	VM_OBJECT_ASSERT_LOCKED(object);
-	if (object->type != OBJT_SWAP)
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->type != OBJT_SWAP || count == 0)
 		return;
 
-	while (count > 0) {
-		struct swblock **pswap;
-		struct swblock *swap;
-
-		mtx_lock(&swhash_mtx);
-		pswap = swp_pager_hash(object, index);
-
-		if ((swap = *pswap) != NULL) {
-			daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
-
-			if (v != SWAPBLK_NONE) {
-				swp_pager_freeswapspace(v, 1);
-				swap->swb_pages[index & SWAP_META_MASK] =
-					SWAPBLK_NONE;
-				if (--swap->swb_count == 0) {
-					*pswap = swap->swb_hnext;
-					uma_zfree(swap_zone, swap);
-					--object->un_pager.swp.swp_bcount;
-				}
-			}
-			--count;
-			++index;
-		} else {
-			int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
-			count -= n;
-			index += n;
+	last = pindex + count - 1;
+	for (;;) {
+		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+		    rounddown(pindex, SWAP_META_PAGES));
+		if (sb == NULL || sb->p > last)
+			break;
+		empty = true;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] == SWAPBLK_NONE)
+				continue;
+			if (pindex <= sb->p + i && sb->p + i <= last) {
+				swp_pager_freeswapspace(sb->d[i], 1);
+				sb->d[i] = SWAPBLK_NONE;
+			} else
+				empty = false;
 		}
-		mtx_unlock(&swhash_mtx);
+		pindex = sb->p + SWAP_META_PAGES;
+		if (empty) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    sb->p);
+			uma_zfree(swblk_zone, sb);
+		}
 	}
 }
 
@@ -1987,9 +1934,8 @@
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
-	struct swblock **pswap, *swap;
-	vm_pindex_t index;
-	daddr_t v;
+	struct swblk *sb;
+	vm_pindex_t pindex;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1996,27 +1942,15 @@
 	if (object->type != OBJT_SWAP)
 		return;
 
-	index = 0;
-	while (object->un_pager.swp.swp_bcount != 0) {
-		mtx_lock(&swhash_mtx);
-		pswap = swp_pager_hash(object, index);
-		if ((swap = *pswap) != NULL) {
-			for (i = 0; i < SWAP_META_PAGES; ++i) {
-				v = swap->swb_pages[i];
-				if (v != SWAPBLK_NONE) {
-					--swap->swb_count;
-					swp_pager_freeswapspace(v, 1);
-				}
-			}
-			if (swap->swb_count != 0)
-				panic(
-				    "swap_pager_meta_free_all: swb_count != 0");
-			*pswap = swap->swb_hnext;
-			uma_zfree(swap_zone, swap);
-			--object->un_pager.swp.swp_bcount;
+	for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+	    &object->un_pager.swp.swp_blks, pindex)) != NULL;) {
+		pindex = sb->p + SWAP_META_PAGES;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				swp_pager_freeswapspace(sb->d[i], 1);
 		}
-		mtx_unlock(&swhash_mtx);
-		index += SWAP_META_PAGES;
+		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
+		uma_zfree(swblk_zone, sb);
 	}
 }
 
@@ -2030,9 +1964,6 @@
  *	was invalid.  This routine will automatically free any invalid
  *	meta-data swapblks.
  *
- *	It is not possible to store invalid swapblks in the swap meta data
- *	(other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
- *
  *	When acting on a busy resident page and paging is in progress, we
  *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
@@ -2043,44 +1974,90 @@
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
-	struct swblock **pswap;
-	struct swblock *swap;
+	struct swblk *sb;
 	daddr_t r1;
-	int idx;
+	int i;
 
-	VM_OBJECT_ASSERT_LOCKED(object);
+	if ((flags & (SWM_FREE | SWM_POP)) != 0)
+		VM_OBJECT_ASSERT_WLOCKED(object);
+	else
+		VM_OBJECT_ASSERT_LOCKED(object);
+
 	/*
-	 * The meta data only exists of the object is OBJT_SWAP
+	 * The meta data only exists if the object is OBJT_SWAP
 	 * and even then might not be allocated yet.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
-	r1 = SWAPBLK_NONE;
-	mtx_lock(&swhash_mtx);
-	pswap = swp_pager_hash(object, pindex);
+	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+	    rounddown(pindex, SWAP_META_PAGES));
+	if (sb == NULL)
+		return (SWAPBLK_NONE);
+	r1 = sb->d[pindex % SWAP_META_PAGES];
+	if (r1 == SWAPBLK_NONE)
+		return (SWAPBLK_NONE);
+	if ((flags & (SWM_FREE | SWM_POP)) != 0) {
+		sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				break;
+		}
+		if (i == SWAP_META_PAGES) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    rounddown(pindex, SWAP_META_PAGES));
+			uma_zfree(swblk_zone, sb);
+		}
+	}
+	if ((flags & SWM_FREE) != 0) {
+		swp_pager_freeswapspace(r1, 1);
+		r1 = SWAPBLK_NONE;
+	}
+	return (r1);
+}
 
-	if ((swap = *pswap) != NULL) {
-		idx = pindex & SWAP_META_MASK;
-		r1 = swap->swb_pages[idx];
+/*
+ * Returns the least page index which is greater than or equal to the
+ * parameter pindex and for which there is a swap block allocated.
+ * Returns object's size if the object's type is not swap or if there
+ * are no allocated swap blocks for the object after the requested
+ * pindex.
+ */
+vm_pindex_t
+swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
+{
+	struct swblk *sb;
+	int i;
 
-		if (r1 != SWAPBLK_NONE) {
-			if (flags & SWM_FREE) {
-				swp_pager_freeswapspace(r1, 1);
-				r1 = SWAPBLK_NONE;
-			}
-			if (flags & (SWM_FREE|SWM_POP)) {
-				swap->swb_pages[idx] = SWAPBLK_NONE;
-				if (--swap->swb_count == 0) {
-					*pswap = swap->swb_hnext;
-					uma_zfree(swap_zone, swap);
-					--object->un_pager.swp.swp_bcount;
-				}
-			}
+	VM_OBJECT_ASSERT_LOCKED(object);
+	if (object->type != OBJT_SWAP)
+		return (object->size);
+
+	sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+	    rounddown(pindex, SWAP_META_PAGES));
+	if (sb == NULL)
+		return (object->size);
+	if (sb->p < pindex) {
+		for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				return (sb->p + i);
 		}
+		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+		    roundup(pindex, SWAP_META_PAGES));
+		if (sb == NULL)
+			return (object->size);
 	}
-	mtx_unlock(&swhash_mtx);
-	return (r1);
+	for (i = 0; i < SWAP_META_PAGES; i++) {
+		if (sb->d[i] != SWAPBLK_NONE)
+			return (sb->p + i);
+	}
+
+	/*
+	 * We get here if a swblk is present in the trie but it
+	 * doesn't map any blocks.
+	 */
+	MPASS(0);
+	return (object->size);
 }
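
A hedged usage sketch for the new helper (the prototype is added to swap_pager.h further below): walking every page index of an object that has an allocated swap block, with the object read-locked as the assertion requires.  The loop body is a placeholder.

    vm_pindex_t pi;

    VM_OBJECT_RLOCK(object);
    for (pi = swap_pager_find_least(object, 0); pi < object->size;
        pi = swap_pager_find_least(object, pi + 1)) {
            /* Page index "pi" is backed by a swap block. */
    }
    VM_OBJECT_RUNLOCK(object);
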
 
 /*
@@ -2110,16 +2087,13 @@
 	if (error)
 		return (error);
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
-	if (swap_zone == NULL) {
+	if (swblk_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
@@ -2134,7 +2108,7 @@
 	vp = nd.ni_vp;
 
 	if (vn_isdisk(vp, &error)) {
-		error = swapongeom(td, vp);
+		error = swapongeom(vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
@@ -2148,9 +2122,7 @@
 	if (error)
 		vrele(vp);
 done:
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
@@ -2157,15 +2129,16 @@
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
- * message and return -1; otherwise, return 0.
+ * message.
  */
-static int
-swapon_check_swzone(unsigned long npages)
+static void
+swapon_check_swzone(void)
 {
-	unsigned long maxpages;
+	unsigned long maxpages, npages;
 
+	npages = swap_total / PAGE_SIZE;
 	/* absolute maximum we can handle assuming 100% efficiency */
-	maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES;
+	maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
 
 	/* recommend using no more than half that amount */
 	if (npages > maxpages / 2) {
@@ -2174,9 +2147,7 @@
 		    npages, maxpages / 2);
 		printf("warning: increase kern.maxswzone "
 		    "or reduce amount of swap.\n");
-		return (-1);
 	}
-	return (0);
 }
 
 static void
@@ -2212,7 +2183,6 @@
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
-	sp->sw_flags = 0;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
@@ -2244,7 +2214,7 @@
 	nswapdev++;
 	swap_pager_avail += nblks - 2;
 	swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
-	swapon_check_swzone(swap_total / PAGE_SIZE);
+	swapon_check_swzone();
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 }
@@ -2280,10 +2250,7 @@
 	if (error)
 		return (error);
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
@@ -2305,9 +2272,7 @@
 	}
 	error = swapoff_one(sp, td->td_ucred);
 done:
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
@@ -2319,7 +2284,7 @@
 	int error;
 #endif
 
-	mtx_assert(&Giant, MA_OWNED);
+	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 #ifdef MAC
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
@@ -2335,10 +2300,8 @@
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
-	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
-	    nblks + nswap_lowat) {
+	if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat)
 		return (ENOMEM);
-	}
 
 	/*
 	 * Prevent further allocations on this device.
@@ -2378,10 +2341,7 @@
 	const char *devname;
 	int error;
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
@@ -2401,9 +2361,7 @@
 	}
 	mtx_unlock(&sw_dev_mtx);
 
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 }
 
 void
@@ -2472,19 +2430,14 @@
 
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
-SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
+SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
+    sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
- * vmspace_swap_count() - count the approximate swap usage in pages for a
- *			  vmspace.
- *
- *	The map must be locked.
- *
- *	Swap usage is determined by taking the proportional swap used by
- *	VM objects backing the VM map.  To make up for fractional losses,
- *	if the VM object has any swap use at all the associated map entries
- *	count for at least 1 swap page.
+ * Count the approximate swap usage in pages for a vmspace.  Swap
+ * blocks that are shadowed or not yet copied on write are not
+ * counted.  The map must be locked.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
@@ -2492,23 +2445,38 @@
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t object;
-	long count, n;
+	struct swblk *sb;
+	vm_pindex_t e, pi;
+	long count;
+	int i;
 
 	map = &vmspace->vm_map;
 	count = 0;
 
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
-		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
-		    (object = cur->object.vm_object) != NULL) {
-			VM_OBJECT_WLOCK(object);
-			if (object->type == OBJT_SWAP &&
-			    object->un_pager.swp.swp_bcount != 0) {
-				n = (cur->end - cur->start) / PAGE_SIZE;
-				count += object->un_pager.swp.swp_bcount *
-				    SWAP_META_PAGES * n / object->size + 1;
+		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+			continue;
+		object = cur->object.vm_object;
+		if (object == NULL || object->type != OBJT_SWAP)
+			continue;
+		VM_OBJECT_RLOCK(object);
+		if (object->type != OBJT_SWAP)
+			goto unlock;
+		pi = OFF_TO_IDX(cur->offset);
+		e = pi + OFF_TO_IDX(cur->end - cur->start);
+		for (;; pi = sb->p + SWAP_META_PAGES) {
+			sb = SWAP_PCTRIE_LOOKUP_GE(
+			    &object->un_pager.swp.swp_blks, pi);
+			if (sb == NULL || sb->p >= e)
+				break;
+			for (i = 0; i < SWAP_META_PAGES; i++) {
+				if (sb->p + i < e &&
+				    sb->d[i] != SWAPBLK_NONE)
+					count++;
 			}
-			VM_OBJECT_WUNLOCK(object);
 		}
+unlock:
+		VM_OBJECT_RUNLOCK(object);
 	}
 	return (count);
 }
@@ -2554,8 +2522,9 @@
 }
 
 /*
- * Remove a reference from the g_consumer. Post a close event if
- * all referneces go away.
+ * Remove a reference from the g_consumer.  Post a close event if all
+ * references go away, since the function might be called from the
+ * biodone context.
  */
 static void
 swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
@@ -2628,7 +2597,7 @@
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
-	if ((bp->b_flags & B_UNMAPPED) != 0) {
+	if (!buf_mapped(bp)) {
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;
 		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
@@ -2678,22 +2647,19 @@
 	cp = sw->sw_id;
 	sw->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
-	/* XXX: direct call when Giant untangled */
+
+	/*
+	 * swapgeom_close() may be called from the biodone context,
+	 * where we cannot perform topology changes.  Delegate the
+	 * work to the events thread.
+	 */
 	if (cp != NULL)
 		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
-
-struct swh0h0 {
-	struct cdev *dev;
-	struct vnode *vp;
-	int	error;
-};
-
-static void
-swapongeom_ev(void *arg, int flags)
+static int
+swapongeom_locked(struct cdev *dev, struct vnode *vp)
 {
-	struct swh0h0 *swh;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;
@@ -2701,20 +2667,15 @@
 	u_long nblks;
 	int error;
 
-	swh = arg;
-	swh->error = 0;
-	pp = g_dev_getprovider(swh->dev);
-	if (pp == NULL) {
-		swh->error = ENODEV;
-		return;
-	}
+	pp = g_dev_getprovider(dev);
+	if (pp == NULL)
+		return (ENODEV);
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
-			swh->error = EBUSY;
-			return;
+			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
@@ -2721,44 +2682,41 @@
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
-	cp->index = 1;		/* Number of active I/Os, plus one for being active. */
+	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
 	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	/*
-	 * XXX: Everytime you think you can improve the margin for
+	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
-	if (error) {
+	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
-		swh->error = error;
-		return;
+		return (error);
 	}
 	nblks = pp->mediasize / DEV_BSIZE;
-	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
-	    swapgeom_close, dev2udev(swh->dev),
+	swaponsomething(vp, cp, nblks, swapgeom_strategy,
+	    swapgeom_close, dev2udev(dev),
 	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
-	swh->error = 0;
+	return (0);
 }
 
 static int
-swapongeom(struct thread *td, struct vnode *vp)
+swapongeom(struct vnode *vp)
 {
 	int error;
-	struct swh0h0 swh;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-
-	swh.dev = vp->v_rdev;
-	swh.vp = vp;
-	swh.error = 0;
-	/* XXX: direct call when Giant untangled */
-	error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
-	if (!error)
-		error = swh.error;
+	if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) {
+		error = ENOENT;
+	} else {
+		g_topology_lock();
+		error = swapongeom_locked(vp->v_rdev, vp);
+		g_topology_unlock();
+	}
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
@@ -2833,3 +2791,40 @@
 	    NODEV, 0);
 	return (0);
 }
+
+static int
+sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
+{
+	int error, new, n;
+
+	new = nsw_wcount_async_max;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (new > nswbuf / 2 || new < 1)
+		return (EINVAL);
+
+	mtx_lock(&pbuf_mtx);
+	while (nsw_wcount_async_max != new) {
+		/*
+		 * Adjust difference.  If the current async count is too low,
+		 * we will need to squeeze our update slowly in.  Sleep with a
+		 * higher priority than getpbuf() to finish faster.
+		 */
+		n = new - nsw_wcount_async_max;
+		if (nsw_wcount_async + n >= 0) {
+			nsw_wcount_async += n;
+			nsw_wcount_async_max += n;
+			wakeup(&nsw_wcount_async);
+		} else {
+			nsw_wcount_async_max -= nsw_wcount_async;
+			nsw_wcount_async = 0;
+			msleep(&nsw_wcount_async, &pbuf_mtx, PSWP,
+			    "swpsysctl", 0);
+		}
+	}
+	mtx_unlock(&pbuf_mtx);
+
+	return (0);
+}
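
This handler replaces the "Step 2" block deleted from swap_pager_putpages() above, so the nsw_wcount_async_max adjustment now happens when the sysctl is written rather than on every pageout pass.  As a hedged illustration only (the matching declaration is presumably elsewhere in the file and not part of this hunk), the hookup looks roughly like:

    SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
        CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
        "Maximum running async swap ops");

after which e.g. "sysctl vm.swap_async_max=8" performs the adjustment under pbuf_mtx, sleeping if the outstanding async count must drain first.
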

Modified: trunk/sys/vm/swap_pager.h
===================================================================
--- trunk/sys/vm/swap_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)swap_pager.h	7.1 (Berkeley) 12/5/90
- * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $
+ * $FreeBSD: stable/11/sys/vm/swap_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_VM_SWAP_PAGER_H_
@@ -74,15 +74,14 @@
 
 #ifdef _KERNEL
 
-extern int swap_pager_full;
 extern int swap_pager_avail;
 
 struct xswdev;
 int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
+vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex);
 void swap_pager_freespace(vm_object_t, vm_pindex_t, vm_size_t);
 void swap_pager_swap_init(void);
-int swap_pager_isswapped(vm_object_t, struct swdevt *);
 int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
 void swap_pager_status(int *total, int *used);
 void swapoff_all(void);

Modified: trunk/sys/vm/uma.h
===================================================================
--- trunk/sys/vm/uma.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $
+ * $FreeBSD: stable/11/sys/vm/uma.h 338389 2018-08-29 17:58:01Z markj $
  *
  */
 
@@ -263,8 +263,8 @@
 					 * information in the vm_page.
 					 */
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
-#define	UMA_ZONE_REFCNT		0x0400	/* Allocate refcnts in slabs */
-#define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets */
+#define	UMA_ZONE_NOBUCKET	0x0400	/* Do not use buckets. */
+#define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets. */
 #define	UMA_ZONE_CACHESPREAD	0x1000	/*
 					 * Spread memory start locations across
 					 * all possible cache lines.  May
@@ -277,7 +277,7 @@
 					 * mini-dumps.
 					 */
 #define	UMA_ZONE_PCPU		0x8000	/*
-					 * Allocates mp_ncpus slabs sized to
+					 * Allocates mp_maxid + 1 slabs sized to
 					 * sizeof(struct pcpu).
 					 */
 
@@ -288,7 +288,7 @@
  */
 #define	UMA_ZONE_INHERIT						\
     (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
-    UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
+    UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
@@ -367,6 +367,11 @@
 }
 
 /*
+ * Wait until the specified zone can allocate an item.
+ */
+void uma_zwait(uma_zone_t zone);
+
+/*
  * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
  * If you think you need to use it for a normal zone you're probably incorrect.
  */
@@ -523,6 +528,19 @@
 void uma_zone_set_warning(uma_zone_t zone, const char *warning);
 
 /*
+ * Sets a function to run when limit is reached
+ *
+ * Arguments:
+ *	zone  The zone to which this applies
+ *	fx  The function to run
+ *
+ * Returns:
+ *	Nothing
+ */
+typedef void (*uma_maxaction_t)(uma_zone_t, int);
+void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t);
+
+/*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
@@ -612,21 +630,6 @@
 void uma_prealloc(uma_zone_t zone, int itemcnt);
 
 /*
- * Used to lookup the reference counter allocated for an item
- * from a UMA_ZONE_REFCNT zone.  For UMA_ZONE_REFCNT zones,
- * reference counters are allocated for items and stored in
- * the underlying slab header.
- *
- * Arguments:
- *	zone  The UMA_ZONE_REFCNT zone to which the item belongs.
- *	item  The address of the item for which we want a refcnt.
- *
- * Returns:
- *	A pointer to a uint32_t reference counter.
- */
-uint32_t *uma_find_refcnt(uma_zone_t zone, void *item);
-
-/*
  * Used to determine if a fixed-size zone is exhausted.
  *
  * Arguments:

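A brief, hedged sketch of how the two interfaces added above, uma_zwait() and uma_zone_set_maxaction(), might be used by a consumer.  The zone name, callback, item size, and limit are invented for the example:

    static uma_zone_t example_zone;

    static void
    example_maxaction(uma_zone_t zone, int unused)
    {

            /* Runs from taskqueue_thread when the zone hits its limit. */
            printf("example zone is full\n");
    }

    static void
    example_init(void)
    {

            example_zone = uma_zcreate("example", 128, NULL, NULL, NULL, NULL,
                UMA_ALIGN_PTR, 0);
            uma_zone_set_max(example_zone, 1024);
            uma_zone_set_maxaction(example_zone, example_maxaction);
    }

    static void *
    example_alloc(void)
    {
            void *item;

            while ((item = uma_zalloc(example_zone, M_NOWAIT)) == NULL)
                    uma_zwait(example_zone);        /* sleep, then retry */
            return (item);
    }

This mirrors how swp_pager_meta_build() above falls back to uma_zwait() when an M_NOWAIT allocation from swblk_zone or swpctrie_zone fails.
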
Modified: trunk/sys/vm/uma_core.c
===================================================================
--- trunk/sys/vm/uma_core.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_core.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,7 +32,7 @@
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
- * effecient.  A primary design goal is to return unused memory to the rest of
+ * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
@@ -49,7 +49,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_core.c 357046 2020-01-23 14:14:38Z markj $");
 
 /* I should really use ktr.. */
 /*
@@ -75,10 +75,12 @@
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
+#include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
@@ -112,7 +114,6 @@
 
 /* This is the zone from which all of uma_slab_t's are allocated. */
 static uma_zone_t slabzone;
-static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
@@ -138,7 +139,7 @@
     LIST_HEAD_INITIALIZER(uma_cachezones);
 
 /* This RW lock protects the keg list */
-static struct rwlock_padalign uma_rwlock;
+static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 
 /* Linked list of boot time pages */
 static LIST_HEAD(,uma_slab) uma_boot_pages =
@@ -153,14 +154,9 @@
 static int booted = 0;
 #define	UMA_STARTUP	1
 #define	UMA_STARTUP2	2
+#define	UMA_SHUTDOWN	3
 
 /*
- * Only mbuf clusters use ref zones.  Just provide enough references
- * to support the one user.  New code should not use the ref facility.
- */
-static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
-
-/*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
@@ -248,11 +244,12 @@
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
-static int hash_alloc(struct uma_hash *);
+static int hash_alloc(struct uma_hash *, u_int);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
+static void uma_shutdown(void);
 static void *zone_alloc_item(uma_zone_t, void *, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
@@ -276,6 +273,11 @@
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
+#ifdef INVARIANTS
+static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
+static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
+#endif
+
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
@@ -285,8 +287,7 @@
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 static int zone_warnings = 1;
-TUNABLE_INT("vm.zone_warnings", &zone_warnings);
-SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
+SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /*
@@ -433,6 +434,14 @@
 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
+static inline void
+zone_maxaction(uma_zone_t zone)
+{
+
+	if (zone->uz_maxaction.ta_func != NULL)
+		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
+}
+
 static void
 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 {
@@ -471,6 +480,7 @@
 static void
 keg_timeout(uma_keg_t keg)
 {
+	u_int slabs;
 
 	KEG_LOCK(keg);
 	/*
@@ -481,7 +491,8 @@
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
-	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
+	    (slabs = keg->uk_pages / keg->uk_ppera) >
+	     keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
@@ -492,9 +503,8 @@
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
-		newhash = keg->uk_hash;
 		KEG_UNLOCK(keg);
-		ret = hash_alloc(&newhash);
+		ret = hash_alloc(&newhash, 1 << fls(slabs));
 		KEG_LOCK(keg);
 		if (ret) {
 			if (hash_expand(&keg->uk_hash, &newhash)) {
@@ -526,19 +536,16 @@
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
- *	1 on sucess and 0 on failure.
+ *	1 on success and 0 on failure.
  */
 static int
-hash_alloc(struct uma_hash *hash)
+hash_alloc(struct uma_hash *hash, u_int size)
 {
-	int oldsize;
-	int alloc;
+	size_t alloc;
 
-	oldsize = hash->uh_hashsize;
-
-	/* We're just going to go to a power of two greater */
-	if (oldsize)  {
-		hash->uh_hashsize = oldsize * 2;
+	KASSERT(powerof2(size), ("hash size must be power of 2"));
+	if (size > UMA_HASH_SIZE_INIT)  {
+		hash->uh_hashsize = size;
 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 		    M_UMAHASH, M_NOWAIT);
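
The rewritten keg_timeout() computes the slab count itself and asks hash_alloc() for "1 << fls(slabs)" entries, i.e. the smallest power of two strictly greater than the slab count, instead of blindly doubling the old table.  A minimal userspace sketch of that sizing rule follows; fls_sketch() is an assumption standing in for the kernel's fls(9), which is not portable C.

#include <stdio.h>

/* Stand-in for fls(9): 1-based index of the highest set bit, 0 for 0. */
static int
fls_sketch(unsigned int x)
{
    int bit;

    for (bit = 0; x != 0; bit++)
        x >>= 1;
    return (bit);
}

int
main(void)
{
    unsigned int slabs;

    /* hash_alloc(&newhash, 1 << fls(slabs)) always gets a power of two. */
    for (slabs = 1; slabs <= 100; slabs *= 3)
        printf("slabs %3u -> hash size %3u\n", slabs, 1u << fls_sketch(slabs));
    return (0);
}
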
@@ -575,8 +582,8 @@
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_slab_t slab;
-	int hval;
-	int i;
+	u_int hval;
+	u_int idx;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
@@ -589,10 +596,10 @@
 	 * full rehash.
 	 */
 
-	for (i = 0; i < oldhash->uh_hashsize; i++)
-		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
-			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
-			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
+	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
+		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
+			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
+			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
 			hval = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, us_hlink);
@@ -840,8 +847,7 @@
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
-	uma_slab_t slab;
-	uma_slab_t n;
+	uma_slab_t slab, tmp;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -857,15 +863,10 @@
 	if (keg->uk_free == 0)
 		goto finished;
 
-	slab = LIST_FIRST(&keg->uk_free_slab);
-	while (slab) {
-		n = LIST_NEXT(slab, us_link);
-
-		/* We have no where to free these to */
-		if (slab->us_flags & UMA_SLAB_BOOT) {
-			slab = n;
+	LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
+		/* We have nowhere to free these to. */
+		if (slab->us_flags & UMA_SLAB_BOOT)
 			continue;
-		}
 
 		LIST_REMOVE(slab, us_link);
 		keg->uk_pages -= keg->uk_ppera;
@@ -875,8 +876,6 @@
 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 
 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
-
-		slab = n;
 	}
 finished:
 	KEG_UNLOCK(keg);
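
keg_drain() now walks the free-slab list with LIST_FOREACH_SAFE, which latches the successor before the loop body runs, so the current slab can be unlinked and freed (or skipped with continue) without the manual next-pointer juggling of the old loop.  A self-contained userspace sketch of the same idiom, assuming a BSD-style <sys/queue.h>; the _SAFE macro is supplied locally in case the host header lacks it.

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

#ifndef LIST_FOREACH_SAFE        /* older queue.h copies omit the _SAFE form */
#define LIST_FOREACH_SAFE(var, head, field, tvar)            \
    for ((var) = LIST_FIRST((head));                          \
        (var) && ((tvar) = LIST_NEXT((var), field), 1);       \
        (var) = (tvar))
#endif

struct item {
    int              value;
    LIST_ENTRY(item) link;
};

LIST_HEAD(itemhead, item);

int
main(void)
{
    struct itemhead head = LIST_HEAD_INITIALIZER(head);
    struct item *it, *tmp;
    int i;

    for (i = 0; i < 5; i++) {
        if ((it = malloc(sizeof(*it))) == NULL)
            break;
        it->value = i;
        LIST_INSERT_HEAD(&head, it, link);
    }

    /*
     * As in keg_drain(): "tmp" already holds the next element, so the
     * current one may be removed and freed mid-traversal, and a plain
     * continue (the UMA_SLAB_BOOT case) needs no extra bookkeeping.
     */
    LIST_FOREACH_SAFE(it, &head, link, tmp) {
        if (it->value == 2)
            continue;
        LIST_REMOVE(it, link);
        free(it);
    }
    return (0);
}
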
@@ -939,7 +938,6 @@
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 {
-	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
 	uma_slab_t slab;
 	uint8_t *mem;
@@ -1002,11 +1000,6 @@
 #ifdef INVARIANTS
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		for (i = 0; i < keg->uk_ipers; i++)
-			slabref->us_refcnt[i] = 0;
-	}
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
@@ -1135,7 +1128,9 @@
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
 		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
-		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+		    VM_ALLOC_NOWAIT));
 		if (p != NULL) {
 			/*
 			 * Since the page does not belong to an object, its
@@ -1145,17 +1140,12 @@
 			npages--;
 			continue;
 		}
-		if (wait & M_WAITOK) {
-			VM_WAIT;
-			continue;
-		}
-
 		/*
 		 * Page allocation failed, free intermediate pages and
 		 * exit.
 		 */
 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
-			vm_page_unwire(p, 0);
+			vm_page_unwire(p, PQ_NONE);
 			vm_page_free(p); 
 		}
 		return (NULL);
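
The no-object page allocator in this hunk no longer loops on VM_WAIT itself; the caller's malloc-style wait flag is folded straight into the vm_page_alloc() request, so the page allocator does the sleeping (or fails immediately) on the caller's behalf.  A tiny sketch of just that translation; the flag values are made-up placeholders, not the kernel's real bit assignments.

#include <stdio.h>

/* Hypothetical flag values, for illustration only. */
#define M_NOWAIT        0x0001
#define M_WAITOK        0x0002
#define VM_ALLOC_NOWAIT 0x0000
#define VM_ALLOC_WAITOK 0x0008

static int
page_alloc_wait_flags(int wait)
{
    /* Mirrors the new expression passed to vm_page_alloc(). */
    return ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : VM_ALLOC_NOWAIT);
}

int
main(void)
{
    printf("M_WAITOK -> %#x\n", page_alloc_wait_flags(M_WAITOK));
    printf("M_NOWAIT -> %#x\n", page_alloc_wait_flags(M_NOWAIT));
    return (0);
}
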
@@ -1229,7 +1219,7 @@
 	u_int slabsize;
 
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
-		u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
 		slabsize = sizeof(struct pcpu);
 		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
@@ -1255,15 +1245,20 @@
 	    keg->uk_rsize < sizeof(struct pcpu),
 	    ("%s: size %u too large", __func__, keg->uk_rsize));
 
-	if (keg->uk_flags & UMA_ZONE_REFCNT)
-		rsize += sizeof(uint32_t);
-
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		shsize = 0;
 	else 
 		shsize = sizeof(struct uma_slab);
 
-	keg->uk_ipers = (slabsize - shsize) / rsize;
+	if (rsize <= slabsize - shsize)
+		keg->uk_ipers = (slabsize - shsize) / rsize;
+	else {
+		/* Handle special case when we have 1 item per slab, so
+		 * alignment requirement can be relaxed. */
+		KASSERT(keg->uk_size <= slabsize - shsize,
+		    ("%s: size %u greater than slab", __func__, keg->uk_size));
+		keg->uk_ipers = 1;
+	}
 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 
@@ -1337,21 +1332,24 @@
 	keg->uk_ipers = 1;
 	keg->uk_rsize = keg->uk_size;
 
-	/* We can't do OFFPAGE if we're internal, bail out here. */
-	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
-		return;
-
 	/* Check whether we have enough space to not do OFFPAGE. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
 		shsize = sizeof(struct uma_slab);
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			shsize += keg->uk_ipers * sizeof(uint32_t);
 		if (shsize & UMA_ALIGN_PTR)
 			shsize = (shsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 
-		if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
-			keg->uk_flags |= UMA_ZONE_OFFPAGE;
+		if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
+			/*
+			 * We can't do OFFPAGE if we're internal, in which case
+			 * we need an extra page per allocation to contain the
+			 * slab header.
+			 */
+			if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
+				keg->uk_flags |= UMA_ZONE_OFFPAGE;
+			else
+				keg->uk_ppera++;
+		}
 	}
 
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
@@ -1433,7 +1431,7 @@
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
-	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
+	if (arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	if (arg->flags & UMA_ZONE_PCPU)
@@ -1445,13 +1443,6 @@
 
 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
 		keg_cachespread_init(keg);
-	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		if (keg->uk_size >
-		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
-		    sizeof(uint32_t)))
-			keg_large_init(keg);
-		else
-			keg_small_init(keg);
 	} else {
 		if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
 			keg_large_init(keg);
@@ -1459,15 +1450,8 @@
 			keg_small_init(keg);
 	}
 
-	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		if (keg->uk_flags & UMA_ZONE_REFCNT) {
-			if (keg->uk_ipers > uma_max_ipers_ref)
-				panic("Too many ref items per zone: %d > %d\n",
-				    keg->uk_ipers, uma_max_ipers_ref);
-			keg->uk_slabzone = slabrefzone;
-		} else
-			keg->uk_slabzone = slabzone;
-	}
+	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+		keg->uk_slabzone = slabzone;
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
@@ -1504,10 +1488,6 @@
 		/* Size of the slab struct and free list */
 		totsize = sizeof(struct uma_slab);
 
-		/* Size of the reference counts. */
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize += keg->uk_ipers * sizeof(uint32_t);
-
 		if (totsize & UMA_ALIGN_PTR)
 			totsize = (totsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
@@ -1521,8 +1501,6 @@
 		 * sure here anyway.
 		 */
 		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize += keg->uk_ipers * sizeof(uint32_t);
 		if (totsize > PAGE_SIZE * keg->uk_ppera) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
@@ -1532,7 +1510,7 @@
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
-		hash_alloc(&keg->uk_hash);
+		hash_alloc(&keg->uk_hash, 0);
 
 #ifdef UMA_DEBUG
 	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
@@ -1667,10 +1645,15 @@
 	}
 
 out:
-	if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
+	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
+	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
+	    ("Invalid zone flag combination"));
+	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
+		zone->uz_count = BUCKET_MAX;
+	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
+		zone->uz_count = 0;
+	else
 		zone->uz_count = bucket_select(zone->uz_size);
-	else
-		zone->uz_count = BUCKET_MAX;
 	zone->uz_count_min = zone->uz_count;
 
 	return (0);
@@ -1785,7 +1768,6 @@
 {
 	struct uma_zctor_args args;
 	uma_slab_t slab;
-	u_int slabsize;
 	int i;
 
 #ifdef UMA_DEBUG
@@ -1835,9 +1817,6 @@
 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
-	printf("Initializing pcpu cache locks.\n");
-#endif
-#ifdef UMA_DEBUG
 	printf("Creating slab and hash zones.\n");
 #endif
 
@@ -1847,18 +1826,6 @@
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
-	/*
-	 * We also create a zone for the bigger slabs with reference
-	 * counts in them, to accomodate UMA_ZONE_REFCNT zones.
-	 */
-	slabsize = sizeof(struct uma_slab_refcnt);
-	slabsize += uma_max_ipers_ref * sizeof(uint32_t);
-	slabrefzone = uma_zcreate("UMA RCntSlabs",
-				  slabsize,
-				  NULL, NULL, NULL, NULL,
-				  UMA_ALIGN_PTR,
-				  UMA_ZFLAG_INTERNAL);
-
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
@@ -1885,10 +1852,6 @@
 #endif
 }
 
-/*
- * Initialize our callout handle
- *
- */
 
 static void
 uma_startup3(void)
@@ -1901,8 +1864,18 @@
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
 #endif
+
+	EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
+	    EVENTHANDLER_PRI_FIRST);
 }
 
+static void
+uma_shutdown(void)
+{
+
+	booted = UMA_SHUTDOWN;
+}
+
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, uint32_t flags)
@@ -1948,6 +1921,20 @@
 	args.dtor = dtor;
 	args.uminit = uminit;
 	args.fini = fini;
+#ifdef  INVARIANTS
+	/*
+	 * If a zone is being created with an empty constructor and
+	 * destructor, pass UMA constructor/destructor which checks for
+	 * memory use after free.
+	 */
+	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
+	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
+		args.ctor = trash_ctor;
+		args.dtor = trash_dtor;
+		args.uminit = trash_init;
+		args.fini = trash_fini;
+	}
+#endif
 	args.align = align;
 	args.flags = flags;
 	args.keg = NULL;
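
With INVARIANTS, uma_zcreate() now silently substitutes the trash constructor/destructor for zones created with no callbacks at all (and without ZINIT/NOFREE), so ordinary zones get use-after-free checking with no caller changes.  A hedged sketch of that pick-debug-defaults pattern; the stub callbacks below are hypothetical stand-ins, not trash_ctor()/trash_dtor() themselves.

#include <stddef.h>
#include <stdio.h>

typedef int  (*ctor_fn)(void *, int, void *, int);
typedef void (*dtor_fn)(void *, int, void *);

static int
debug_ctor(void *mem, int size, void *arg, int flags)
{
    (void)mem; (void)arg; (void)flags;
    printf("verify %d bytes still carry the junk pattern\n", size);
    return (0);
}

static void
debug_dtor(void *mem, int size, void *arg)
{
    (void)mem; (void)arg;
    printf("repaint %d bytes with the junk pattern\n", size);
}

struct zone_args {
    ctor_fn ctor;
    dtor_fn dtor;
};

/* If the caller supplied nothing, wire in the checking callbacks. */
static void
apply_debug_defaults(struct zone_args *args)
{
    if (args->ctor == NULL && args->dtor == NULL) {
        args->ctor = debug_ctor;
        args->dtor = debug_dtor;
    }
}

int
main(void)
{
    struct zone_args args = { NULL, NULL };

    apply_debug_defaults(&args);
    args.ctor(NULL, 64, NULL, 0);
    args.dtor(NULL, 64, NULL);
    return (0);
}
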
@@ -2070,15 +2057,8 @@
 		error = EINVAL;
 		goto out;
 	}
+
 	/*
-	 * Both must either be refcnt, or not be refcnt.
-	 */
-	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
-	    (master->uz_flags & UMA_ZONE_REFCNT)) {
-		error = EINVAL;
-		goto out;
-	}
-	/*
 	 * The underlying object must be the same size.  rsize
 	 * may be different.
 	 */
@@ -2114,11 +2094,28 @@
 uma_zdestroy(uma_zone_t zone)
 {
 
+	/*
+	 * Large slabs are expensive to reclaim, so don't bother doing
+	 * unnecessary work if we're shutting down.
+	 */
+	if (booted == UMA_SHUTDOWN &&
+	    zone->uz_fini == NULL &&
+	    zone->uz_release == (uma_release)zone_release)
+		return;
 	sx_slock(&uma_drain_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
 	sx_sunlock(&uma_drain_lock);
 }
 
+void
+uma_zwait(uma_zone_t zone)
+{
+	void *item;
+
+	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
+	uma_zfree(zone, item);
+}
+
 /* See uma.h */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
@@ -2129,6 +2126,9 @@
 	int lockfail;
 	int cpu;
 
+	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
 	/* This is the fast path allocation */
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -2140,20 +2140,17 @@
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
+	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+	    ("uma_zalloc_arg: called with spinlock or critical section held"));
+
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_zone(zone)) {
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
-			/*
-			 * Avoid conflict with the use-after-free
-			 * protecting infrastructure from INVARIANTS.
-			 */
 			if (zone->uz_init != NULL &&
-			    zone->uz_init != mtrash_init &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0)
 				return (NULL);
 			if (zone->uz_ctor != NULL &&
-			    zone->uz_ctor != mtrash_ctor &&
 			    zone->uz_ctor(item, zone->uz_size, udata,
 			    flags) != 0) {
 			    	zone->uz_fini(item, zone->uz_size);
@@ -2289,7 +2286,7 @@
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
-	 * works we'll restart the allocation from the begining and it
+	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
 	bucket = zone_alloc_bucket(zone, udata, flags);
@@ -2370,6 +2367,7 @@
 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
 				zone->uz_flags |= UMA_ZFLAG_FULL;
 				zone_log_warning(zone);
+				zone_maxaction(zone);
 			}
 			if (flags & M_NOWAIT)
 				break;
@@ -2489,6 +2487,7 @@
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
 			zone_log_warning(zone);
+			zone_maxaction(zone);
 			msleep(zone, zone->uz_lockptr, PVM,
 			    "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
@@ -2668,6 +2667,9 @@
 	int lockfail;
 	int cpu;
 
+	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
 #endif
@@ -2674,14 +2676,17 @@
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
+	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+	    ("uma_zfree_arg: called with spinlock or critical section held"));
+
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(item)) {
-		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
+		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, zone->uz_size, udata);
-		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
+		if (zone->uz_fini != NULL)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return;
@@ -2988,6 +2993,16 @@
 }
 
 /* See uma.h */
+void
+uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
+{
+
+	ZONE_LOCK(zone);
+	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
+	ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
@@ -3176,26 +3191,6 @@
 }
 
 /* See uma.h */
-uint32_t *
-uma_find_refcnt(uma_zone_t zone, void *item)
-{
-	uma_slabrefcnt_t slabref;
-	uma_slab_t slab;
-	uma_keg_t keg;
-	uint32_t *refcnt;
-	int idx;
-
-	slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
-	slabref = (uma_slabrefcnt_t)slab;
-	keg = slab->us_keg;
-	KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
-	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
-	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-	refcnt = &slabref->us_refcnt[idx];
-	return refcnt;
-}
-
-/* See uma.h */
 static void
 uma_reclaim_locked(bool kmem_danger)
 {
@@ -3216,7 +3211,6 @@
 	 * zones are drained.  We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
-	zone_drain(slabrefzone);
 	bucket_zone_drain();
 }
 
@@ -3309,9 +3303,10 @@
 static void
 uma_zero_item(void *item, uma_zone_t zone)
 {
+	int i;
 
 	if (zone->uz_flags & UMA_ZONE_PCPU) {
-		for (int i = 0; i < mp_ncpus; i++)
+		CPU_FOREACH(i)
 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
 	} else
 		bzero(item, zone->uz_size);
@@ -3447,7 +3442,7 @@
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
-	struct uma_percpu_stat ups;
+	struct uma_percpu_stat *ups;
 	uma_bucket_t bucket;
 	struct sbuf sbuf;
 	uma_cache_t cache;
@@ -3461,6 +3456,8 @@
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
+	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 
 	count = 0;
 	rw_rlock(&uma_rwlock);
@@ -3509,7 +3506,6 @@
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
 			uth.uth_sleeps = z->uz_sleeps;
-			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			/*
 			 * While it is not normally safe to access the cache
 			 * bucket pointers while not on the CPU that owns the
@@ -3518,30 +3514,31 @@
 			 * accept the possible race associated with bucket
 			 * exchange during monitoring.
 			 */
-			for (i = 0; i < (mp_maxid + 1); i++) {
-				bzero(&ups, sizeof(ups));
-				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
-					goto skip;
-				if (CPU_ABSENT(i))
-					goto skip;
+			for (i = 0; i < mp_maxid + 1; i++) {
+				bzero(&ups[i], sizeof(*ups));
+				if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
+				    CPU_ABSENT(i))
+					continue;
 				cache = &z->uz_cpu[i];
 				if (cache->uc_allocbucket != NULL)
-					ups.ups_cache_free +=
+					ups[i].ups_cache_free +=
 					    cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
-					ups.ups_cache_free +=
+					ups[i].ups_cache_free +=
 					    cache->uc_freebucket->ub_cnt;
-				ups.ups_allocs = cache->uc_allocs;
-				ups.ups_frees = cache->uc_frees;
-skip:
-				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
+				ups[i].ups_allocs = cache->uc_allocs;
+				ups[i].ups_frees = cache->uc_frees;
 			}
 			ZONE_UNLOCK(z);
+			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
+			for (i = 0; i < mp_maxid + 1; i++)
+				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 		}
 	}
 	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
+	free(ups, M_TEMP);
 	return (error);
 }
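
sysctl_vm_zone_stats() now snapshots the per-CPU counters into a malloc'ed array while the zone lock is held and appends everything to the sbuf only after unlocking, so the potentially sleeping copy-out never runs under the lock (SBUF_INCLUDENUL is also cleared so no terminating NUL leaks into the binary stream).  A rough userspace sketch of the snapshot-then-emit pattern.

#include <stdio.h>
#include <stdlib.h>

struct pcpu_stat {
    unsigned long allocs;
    unsigned long frees;
};

int
main(void)
{
    unsigned long live_allocs[4] = { 10, 20, 30, 40 };    /* racy counters */
    struct pcpu_stat *ups;
    int i, ncpu = 4;

    if ((ups = calloc(ncpu, sizeof(*ups))) == NULL)
        return (1);

    /* "Locked" phase: copy the racy counters into a stable snapshot. */
    for (i = 0; i < ncpu; i++)
        ups[i].allocs = live_allocs[i];

    /* "Unlocked" phase: format and emit the snapshot at leisure. */
    for (i = 0; i < ncpu; i++)
        printf("cpu%d: %lu allocs\n", i, ups[i].allocs);

    free(ups);
    return (0);
}
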
 
@@ -3549,16 +3546,13 @@
 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
-	int error, max, old;
+	int error, max;
 
-	old = max = uma_zone_get_max(zone);
+	max = uma_zone_get_max(zone);
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
-	if (max < old)
-		return (EINVAL);
-
 	uma_zone_set_max(zone, max);
 
 	return (0);
@@ -3574,6 +3568,102 @@
 	return (sysctl_handle_int(oidp, &cur, 0, req));
 }
 
+#ifdef INVARIANTS
+static uma_slab_t
+uma_dbg_getslab(uma_zone_t zone, void *item)
+{
+	uma_slab_t slab;
+	uma_keg_t keg;
+	uint8_t *mem;
+
+	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
+	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
+		slab = vtoslab((vm_offset_t)mem);
+	} else {
+		/*
+		 * It is safe to return the slab here even though the
+		 * zone is unlocked because the item's allocation state
+		 * essentially holds a reference.
+		 */
+		ZONE_LOCK(zone);
+		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
+		if (keg->uk_flags & UMA_ZONE_HASH)
+			slab = hash_sfind(&keg->uk_hash, mem);
+		else
+			slab = (uma_slab_t)(mem + keg->uk_pgoff);
+		ZONE_UNLOCK(zone);
+	}
+
+	return (slab);
+}
+
+/*
+ * Set up the slab's freei data such that uma_dbg_free can function.
+ *
+ */
+static void
+uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+	uma_keg_t keg;
+	int freei;
+
+	if (zone_first_keg(zone) == NULL)
+		return;
+	if (slab == NULL) {
+		slab = uma_dbg_getslab(zone, item);
+		if (slab == NULL) 
+			panic("uma: item %p did not belong to zone %s\n",
+			    item, zone->uz_name);
+	}
+	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+
+	return;
+}
+
+/*
+ * Verifies freed addresses.  Checks for alignment, valid slab membership
+ * and duplicate frees.
+ *
+ */
+static void
+uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+	uma_keg_t keg;
+	int freei;
+
+	if (zone_first_keg(zone) == NULL)
+		return;
+	if (slab == NULL) {
+		slab = uma_dbg_getslab(zone, item);
+		if (slab == NULL) 
+			panic("uma: Freed item %p did not belong to zone %s\n",
+			    item, zone->uz_name);
+	}
+	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+	if (freei >= keg->uk_ipers)
+		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
+		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+}
+#endif /* INVARIANTS */
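
The uma_dbg_alloc()/uma_dbg_free() pair moves into uma_core.c and becomes static; per slab, one bit per item in us_debugfree is set on allocation and cleared on free, and the item index is just (item - us_data) / rsize, so duplicate allocations, unaligned frees, and double frees all trip a panic.  A standalone sketch of the same bookkeeping with a plain 64-bit mask in place of the kernel's BIT_SET_ATOMIC()/BIT_CLR_ATOMIC().

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t debugfree;        /* one bit per item index */

static void
dbg_alloc(int freei)
{
    /* Bit already set would mean the item was handed out twice. */
    assert((debugfree & (UINT64_C(1) << freei)) == 0 && "duplicate alloc");
    debugfree |= UINT64_C(1) << freei;
}

static void
dbg_free(int freei)
{
    /* Bit already clear would mean a double free or a bogus pointer. */
    assert((debugfree & (UINT64_C(1) << freei)) != 0 && "duplicate free");
    debugfree &= ~(UINT64_C(1) << freei);
}

int
main(void)
{
    dbg_alloc(3);
    dbg_free(3);
    dbg_alloc(3);        /* legal again after a genuine free */
    printf("mask is now %#llx\n", (unsigned long long)debugfree);
    return (0);
}
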
+
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
@@ -3631,4 +3721,4 @@
 			return;
 	}
 }
-#endif
+#endif	/* DDB */

Modified: trunk/sys/vm/uma_dbg.c
===================================================================
--- trunk/sys/vm/uma_dbg.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,8 +32,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_dbg.c 301176 2016-06-01 22:31:35Z markj $");
 
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitset.h>
@@ -50,6 +52,7 @@
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
+#include <vm/memguard.h>
 
 static const uint32_t uma_junk = 0xdeadc0de;
 
@@ -58,7 +61,6 @@
  * prior to subsequent reallocation.
  *
  * Complies with standard ctor arg/return
- *
  */
 int
 trash_ctor(void *mem, int size, void *arg, int flags)
@@ -66,12 +68,22 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	cnt = size / sizeof(uma_junk);
 
 	for (p = mem; cnt > 0; cnt--, p++)
 		if (*p != uma_junk) {
+#ifdef INVARIANTS
+			panic("Memory modified after free %p(%d) val=%x @ %p\n",
+			    mem, size, *p, p);
+#else
 			printf("Memory modified after free %p(%d) val=%x @ %p\n",
 			    mem, size, *p, p);
+#endif
 			return (0);
 		}
 	return (0);
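
trash_ctor() now panics under INVARIANTS instead of merely printing when it finds the 0xdeadc0de pattern disturbed, and both halves bail out early for MemGuard-owned addresses.  The underlying technique is unchanged: the destructor paints freed items with a junk word and the constructor verifies the paint on reallocation.  A userspace sketch of that fill-on-free / verify-on-alloc pattern, with the MemGuard short-circuit omitted.

#include <stdint.h>
#include <stdio.h>

static const uint32_t junk = 0xdeadc0de;    /* same pattern as uma_junk */

/* Destructor side: paint the item so any later write is detectable. */
static void
trash_fill(void *mem, size_t size)
{
    uint32_t *p = mem;
    size_t cnt;

    for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
        *p = junk;
}

/* Constructor side: verify nothing touched the item while it was free. */
static int
trash_check(const void *mem, size_t size)
{
    const uint32_t *p = mem;
    size_t cnt;

    for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
        if (*p != junk) {
            fprintf(stderr, "modified after free at %p\n", (const void *)p);
            return (-1);
        }
    return (0);
}

int
main(void)
{
    uint32_t buf[16];

    trash_fill(buf, sizeof(buf));
    buf[5] = 0;        /* simulate a use-after-free write */
    return (trash_check(buf, sizeof(buf)) == 0 ? 0 : 1);
}
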
@@ -89,6 +101,11 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return;
+#endif
+
 	cnt = size / sizeof(uma_junk);
 
 	for (p = mem; cnt > 0; cnt--, p++)
@@ -127,6 +144,11 @@
 	uint32_t *p = mem;
 	int cnt;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	size -= sizeof(struct malloc_type *);
 	ksp = (struct malloc_type **)mem;
 	ksp += size / sizeof(struct malloc_type *);
@@ -154,6 +176,11 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return;
+#endif
+
 	size -= sizeof(struct malloc_type *);
 	cnt = size / sizeof(uma_junk);
 
@@ -172,6 +199,11 @@
 {
 	struct malloc_type **ksp;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	mtrash_dtor(mem, size, NULL);
 
 	ksp = (struct malloc_type **)mem;
@@ -192,100 +224,3 @@
 {
 	(void)mtrash_ctor(mem, size, NULL, 0);
 }
-
-#ifdef INVARIANTS
-static uma_slab_t
-uma_dbg_getslab(uma_zone_t zone, void *item)
-{
-	uma_slab_t slab;
-	uma_keg_t keg;
-	uint8_t *mem;
-
-	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
-	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
-		slab = vtoslab((vm_offset_t)mem);
-	} else {
-		/*
-		 * It is safe to return the slab here even though the
-		 * zone is unlocked because the item's allocation state
-		 * essentially holds a reference.
-		 */
-		ZONE_LOCK(zone);
-		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
-		if (keg->uk_flags & UMA_ZONE_HASH)
-			slab = hash_sfind(&keg->uk_hash, mem);
-		else
-			slab = (uma_slab_t)(mem + keg->uk_pgoff);
-		ZONE_UNLOCK(zone);
-	}
-
-	return (slab);
-}
-
-/*
- * Set up the slab's freei data such that uma_dbg_free can function.
- *
- */
-void
-uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
-{
-	uma_keg_t keg;
-	int freei;
-
-	if (zone_first_keg(zone) == NULL)
-		return;
-	if (slab == NULL) {
-		slab = uma_dbg_getslab(zone, item);
-		if (slab == NULL) 
-			panic("uma: item %p did not belong to zone %s\n",
-			    item, zone->uz_name);
-	}
-	keg = slab->us_keg;
-	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
-	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
-		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-
-	return;
-}
-
-/*
- * Verifies freed addresses.  Checks for alignment, valid slab membership
- * and duplicate frees.
- *
- */
-void
-uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
-{
-	uma_keg_t keg;
-	int freei;
-
-	if (zone_first_keg(zone) == NULL)
-		return;
-	if (slab == NULL) {
-		slab = uma_dbg_getslab(zone, item);
-		if (slab == NULL) 
-			panic("uma: Freed item %p did not belong to zone %s\n",
-			    item, zone->uz_name);
-	}
-	keg = slab->us_keg;
-	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
-	if (freei >= keg->uk_ipers)
-		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
-		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
-		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-}
-
-#endif /* INVARIANTS */

Modified: trunk/sys/vm/uma_dbg.h
===================================================================
--- trunk/sys/vm/uma_dbg.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $
+ * $FreeBSD: stable/11/sys/vm/uma_dbg.h 295221 2016-02-03 22:02:36Z glebius $
  *
  */
 
@@ -50,7 +50,4 @@
 int mtrash_init(void *mem, int size, int flags);
 void mtrash_fini(void *mem, int size);
 
-void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
-void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
-
 #endif /* VM_UMA_DBG_H */

Modified: trunk/sys/vm/uma_int.h
===================================================================
--- trunk/sys/vm/uma_int.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_int.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,10 +25,13 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $
+ * $FreeBSD: stable/11/sys/vm/uma_int.h 344363 2019-02-20 14:12:25Z pfg $
  *
  */
 
+#include <sys/_bitset.h>
+#include <sys/_task.h>
+
 /* 
  * This file includes definitions, structures, prototypes, and inlines that
  * should not be used outside of the actual implementation of UMA.
@@ -109,6 +112,8 @@
 #define UMA_SLAB_SHIFT	PAGE_SHIFT	/* Number of bits PAGE_MASK */
 
 #define UMA_BOOT_PAGES		64	/* Pages allocated for startup */
+#define UMA_BOOT_PAGES_ZONES	32	/* Multiplier for pages to reserve */
+					/* if uma_zone > PAGE_SIZE */
 
 /* Max waste percentage before going to off page slab management */
 #define UMA_MAX_WASTE	10
@@ -140,8 +145,8 @@
 
 struct uma_hash {
 	struct slabhead	*uh_slab_hash;	/* Hash table for slabs */
-	int		uh_hashsize;	/* Current size of the hash table */
-	int		uh_hashmask;	/* Mask used during hashing */
+	u_int		uh_hashsize;	/* Current size of the hash table */
+	u_int		uh_hashmask;	/* Mask used during hashing */
 };
 
 /*
@@ -207,7 +212,7 @@
 	vm_offset_t	uk_kva;		/* Zone base KVA */
 	uma_zone_t	uk_slabzone;	/* Slab zone backing us, if OFFPAGE */
 
-	uint16_t	uk_pgoff;	/* Offset to uma_slab struct */
+	uint32_t	uk_pgoff;	/* Offset to uma_slab struct */
 	uint16_t	uk_ppera;	/* pages per allocation from backend */
 	uint16_t	uk_ipers;	/* Items per slab */
 	uint32_t	uk_flags;	/* Internal flags */
@@ -248,17 +253,7 @@
 #define	us_link	us_type._us_link
 #define	us_size	us_type._us_size
 
-/*
- * The slab structure for UMA_ZONE_REFCNT zones for whose items we
- * maintain reference counters in the slab for.
- */
-struct uma_slab_refcnt {
-	struct uma_slab		us_head;	/* slab header data */
-	uint32_t		us_refcnt[0];	/* Actually larger. */
-};
-
 typedef struct uma_slab * uma_slab_t;
-typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
 typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
 
 struct uma_klink {
@@ -303,10 +298,12 @@
 	uint16_t	uz_count;	/* Amount of items in full bucket */
 	uint16_t	uz_count_min;	/* Minimal amount of items there */
 
-	/* The next three fields are used to print a rate-limited warnings. */
+	/* The next two fields are used to print rate-limited warnings. */
 	const char	*uz_warning;	/* Warning to print on failure */
 	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
 
+	struct task	uz_maxaction;	/* Task to run when at limit */
+
 	/*
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
@@ -390,7 +387,7 @@
 hash_sfind(struct uma_hash *hash, uint8_t *data)
 {
         uma_slab_t slab;
-        int hval;
+        u_int hval;
 
         hval = UMA_HASH(hash, data);
 
@@ -421,7 +418,7 @@
 
 /*
  * The following two functions may be defined by architecture specific code
- * if they can provide more effecient allocation functions.  This is useful
+ * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
 void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,

Modified: trunk/sys/vm/vm.h
===================================================================
--- trunk/sys/vm/vm.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -56,7 +56,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm.h 331921 2018-04-03 09:38:53Z kib $
  */
 
 #ifndef VM_H
@@ -79,7 +79,9 @@
 #define	VM_PROT_WRITE		((vm_prot_t) 0x02)
 #define	VM_PROT_EXECUTE		((vm_prot_t) 0x04)
 #define	VM_PROT_COPY		((vm_prot_t) 0x08)	/* copy-on-read */
-#define	VM_PROT_FAULT_LOOKUP	((vm_prot_t) 0x010)
+#define	VM_PROT_PRIV_FLAG	((vm_prot_t) 0x10)
+#define	VM_PROT_FAULT_LOOKUP	VM_PROT_PRIV_FLAG
+#define	VM_PROT_QUICK_NOFAULT	VM_PROT_PRIV_FLAG	/* same to save bits */
 
 #define	VM_PROT_ALL		(VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
 #define VM_PROT_RW		(VM_PROT_READ|VM_PROT_WRITE)
@@ -112,8 +114,9 @@
 typedef int boolean_t;
 
 /*
- * The exact set of memory attributes is machine dependent.  However, every
- * machine is required to define VM_MEMATTR_DEFAULT.
+ * The exact set of memory attributes is machine dependent.  However,
+ * every machine is required to define VM_MEMATTR_DEFAULT and
+ * VM_MEMATTR_UNCACHEABLE.
  */
 typedef	char vm_memattr_t;	/* memory attribute codes */
 

Added: trunk/sys/vm/vm_domain.c
===================================================================
--- trunk/sys/vm/vm_domain.c	                        (rev 0)
+++ trunk/sys/vm/vm_domain.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,401 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *    redistribution must be conditioned upon including a substantially
+ *    similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_domain.c 312714 2017-01-24 19:39:24Z mjg $");
+
+#include "opt_vm.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#ifdef VM_NUMA_ALLOC
+#include <sys/proc.h>
+#endif
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/vmmeter.h>
+#include <sys/seq.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+#include <vm/vm_domain.h>
+
+#ifdef VM_NUMA_ALLOC
+static __inline int
+vm_domain_rr_selectdomain(int skip_domain)
+{
+	struct thread *td;
+
+	td = curthread;
+
+	td->td_dom_rr_idx++;
+	td->td_dom_rr_idx %= vm_ndomains;
+
+	/*
+	 * If skip_domain is provided then skip over that
+	 * domain.  This is intended for round robin variants
+	 * which first try a fixed domain.
+	 */
+	if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) {
+		td->td_dom_rr_idx++;
+		td->td_dom_rr_idx %= vm_ndomains;
+	}
+	return (td->td_dom_rr_idx);
+}
+#endif
+
+/*
+ * This implements a very simple set of VM domain memory allocation
+ * policies and iterators.
+ */
+
+/*
+ * A VM domain policy represents a desired VM domain policy.
+ * Iterators implement searching through VM domains in a specific
+ * order.
+ */
+
+/*
+ * When setting a policy, the caller must establish their own
+ * exclusive write protection for the contents of the domain
+ * policy.
+ */
+int
+vm_domain_policy_init(struct vm_domain_policy *vp)
+{
+
+	bzero(vp, sizeof(*vp));
+	vp->p.policy = VM_POLICY_NONE;
+	vp->p.domain = -1;
+	return (0);
+}
+
+int
+vm_domain_policy_set(struct vm_domain_policy *vp,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+	seq_write_begin(&vp->seq);
+	vp->p.policy = vt;
+	vp->p.domain = domain;
+	seq_write_end(&vp->seq);
+	return (0);
+}
+
+/*
+ * Take a local copy of a policy.
+ *
+ * The destination policy isn't write-barriered; this is used
+ * for doing local copies into something that isn't shared.
+ */
+void
+vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	seq_t seq;
+
+	for (;;) {
+		seq = seq_read(&src->seq);
+		*dst = *src;
+		if (seq_consistent(&src->seq, seq))
+			return;
+	}
+}
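
vm_domain_policy_localcopy() is a lockless reader built on seq(9): it snapshots the policy and retries until the sequence counter reads the same even value before and after the copy, meaning no writer was active.  A single-threaded C11 sketch of that retry loop; the kernel's seq(9) primitives also provide the memory barriers this toy omits.

#include <stdatomic.h>
#include <stdio.h>

struct policy {
    atomic_uint seq;
    int         kind;
    int         domain;
};

/* Writer: bump to odd before touching the fields, back to even after. */
static void
policy_write(struct policy *p, int kind, int domain)
{
    atomic_fetch_add(&p->seq, 1);
    p->kind = kind;
    p->domain = domain;
    atomic_fetch_add(&p->seq, 1);
}

/* Reader: no lock; retry if a writer was active or raced with the copy. */
static void
policy_localcopy(struct policy *dst, const struct policy *src)
{
    unsigned int before, after;

    do {
        before = atomic_load(&src->seq);
        dst->kind = src->kind;
        dst->domain = src->domain;
        after = atomic_load(&src->seq);
    } while ((before & 1) != 0 || before != after);
}

int
main(void)
{
    struct policy src = { 0, 0, -1 }, dst;

    policy_write(&src, 2, 1);
    policy_localcopy(&dst, &src);
    printf("kind=%d domain=%d\n", dst.kind, dst.domain);
    return (0);
}
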
+
+/*
+ * Take a write-barrier copy of a policy.
+ *
+ * The destination policy is write-barriered; this is used
+ * for doing copies into policies that may be read by other
+ * threads.
+ */
+void
+vm_domain_policy_copy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	seq_t seq;
+	struct vm_domain_policy d;
+
+	for (;;) {
+		seq = seq_read(&src->seq);
+		d = *src;
+		if (seq_consistent(&src->seq, seq)) {
+			seq_write_begin(&dst->seq);
+			dst->p.domain = d.p.domain;
+			dst->p.policy = d.p.policy;
+			seq_write_end(&dst->seq);
+			return;
+		}
+	}
+}
+
+int
+vm_domain_policy_validate(const struct vm_domain_policy *vp)
+{
+
+	switch (vp->p.policy) {
+	case VM_POLICY_NONE:
+	case VM_POLICY_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		if (vp->p.domain == -1)
+			return (0);
+		return (-1);
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+#ifdef VM_NUMA_ALLOC
+		if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains)
+			return (0);
+#else
+		if (vp->p.domain == 0)
+			return (0);
+#endif
+		return (-1);
+	default:
+		return (-1);
+	}
+	return (-1);
+}
+
+int
+vm_domain_policy_cleanup(struct vm_domain_policy *vp)
+{
+
+	/* For now, empty */
+	return (0);
+}
+
+int
+vm_domain_iterator_init(struct vm_domain_iterator *vi)
+{
+
+	/* Nothing to do for now */
+	return (0);
+}
+
+/*
+ * Manually setup an iterator with the given details.
+ */
+int
+vm_domain_iterator_set(struct vm_domain_iterator *vi,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+#ifdef VM_NUMA_ALLOC
+	switch (vt) {
+	case VM_POLICY_FIXED_DOMAIN:
+		vi->policy = VM_POLICY_FIXED_DOMAIN;
+		vi->domain = domain;
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN;
+		vi->domain = domain;
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		vi->policy = VM_POLICY_FIRST_TOUCH;
+		vi->domain = PCPU_GET(domain);
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN;
+		vi->domain = PCPU_GET(domain);
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		vi->policy = VM_POLICY_ROUND_ROBIN;
+		vi->domain = -1;
+		vi->n = vm_ndomains;
+		break;
+	}
+#else
+	vi->domain = 0;
+	vi->n = 1;
+#endif
+	return (0);
+}
+
+/*
+ * Setup an iterator based on the given policy.
+ */
+static inline void
+_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+
+#ifdef VM_NUMA_ALLOC
+	/*
+	 * Initialise the iterator.
+	 *
+	 * For first-touch, the initial domain is set
+	 * via the current thread CPU domain.
+	 *
+	 * For fixed-domain, it's assumed that the
+	 * caller has initialised the specific domain
+	 * it is after.
+	 */
+	switch (vt->p.policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+		vi->policy = vt->p.policy;
+		vi->domain = vt->p.domain;
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		vi->policy = vt->p.policy;
+		vi->domain = vt->p.domain;
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		vi->policy = vt->p.policy;
+		vi->domain = PCPU_GET(domain);
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		vi->policy = vt->p.policy;
+		vi->domain = PCPU_GET(domain);
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		/*
+		 * Default to round-robin policy.
+		 */
+		vi->policy = VM_POLICY_ROUND_ROBIN;
+		vi->domain = -1;
+		vi->n = vm_ndomains;
+		break;
+	}
+#else
+	vi->domain = 0;
+	vi->n = 1;
+#endif
+}
+
+void
+vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+	seq_t seq;
+	struct vm_domain_policy vt_lcl;
+
+	for (;;) {
+		seq = seq_read(&vt->seq);
+		vt_lcl = *vt;
+		if (seq_consistent(&vt->seq, seq)) {
+			_vm_domain_iterator_set_policy(vi, &vt_lcl);
+			return;
+		}
+	}
+}
+
+/*
+ * Return the next VM domain to use.
+ *
+ * Returns 0 w/ domain set to the next domain to use, or
+ * -1 to indicate no more domains are available.
+ */
+int
+vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
+{
+
+	/* General catch-all */
+	if (vi->n <= 0)
+		return (-1);
+
+#ifdef VM_NUMA_ALLOC
+	switch (vi->policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIRST_TOUCH:
+		*domain = vi->domain;
+		vi->n--;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		/*
+		 * XXX TODO: skip over the rr'ed domain
+		 * if it equals the one we started with.
+		 */
+		if (vi->n == vm_ndomains)
+			*domain = vi->domain;
+		else
+			*domain = vm_domain_rr_selectdomain(vi->domain);
+		vi->n--;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		*domain = vm_domain_rr_selectdomain(-1);
+		vi->n--;
+		break;
+	}
+#else
+	*domain = 0;
+	vi->n--;
+#endif
+
+	return (0);
+}
+
+/*
+ * Returns 1 if the iteration is done, or 0 if it has not.
+ *
+ * This can only be called after at least one loop through
+ * the iterator, i.e. it is designed to be used as the tail
+ * check of a loop, not the head check.
+ */
+int
+vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
+{
+
+	return (vi->n <= 0);
+}
+
+int
+vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
+{
+
+	return (0);
+}
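
The iterator is meant to be driven exactly the way the comment above vm_domain_iterator_isdone() describes: set it up from a policy, ask _run() for a candidate domain, attempt the allocation, and test _isdone() at the tail of the loop.  A toy userspace analogue of that loop shape; the toy_iter structure and helpers are hypothetical, not the kernel's types.

#include <stdio.h>

struct toy_iter {
    int next;    /* next domain to hand out */
    int n;       /* domains remaining */
};

static void
toy_iter_set(struct toy_iter *vi, int ndomains)
{
    vi->next = 0;
    vi->n = ndomains;
}

static int
toy_iter_run(struct toy_iter *vi, int *domain)
{
    if (vi->n <= 0)
        return (-1);
    *domain = vi->next++;
    vi->n--;
    return (0);
}

static int
toy_iter_isdone(struct toy_iter *vi)
{
    return (vi->n <= 0);
}

int
main(void)
{
    struct toy_iter vi;
    int domain;

    toy_iter_set(&vi, 3);
    do {
        if (toy_iter_run(&vi, &domain) != 0)
            break;
        printf("try allocating from domain %d\n", domain);
        /* a real consumer would break here on success */
    } while (!toy_iter_isdone(&vi));
    return (0);
}
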


Property changes on: trunk/sys/vm/vm_domain.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/vm/vm_domain.h
===================================================================
--- trunk/sys/vm/vm_domain.h	                        (rev 0)
+++ trunk/sys/vm/vm_domain.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,67 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *    redistribution must be conditioned upon including a substantially
+ *    similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD: stable/11/sys/vm/vm_domain.h 285387 2015-07-11 15:21:37Z adrian $
+ */
+#ifndef	__VM_DOMAIN_H__
+#define	__VM_DOMAIN_H__
+
+#include <sys/_vm_domain.h>
+
+struct vm_domain_iterator {
+	vm_domain_policy_type_t policy;
+	int domain;
+	int n;
+};
+
+/*
+ * TODO: check to see if these should just become inline functions
+ * at some point.
+ */
+extern	int vm_domain_policy_init(struct vm_domain_policy *vp);
+extern	int vm_domain_policy_set(struct vm_domain_policy *vp,
+	    vm_domain_policy_type_t vt, int domain);
+extern	int vm_domain_policy_cleanup(struct vm_domain_policy *vp);
+extern	void vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+	    const struct vm_domain_policy *src);
+extern	void vm_domain_policy_copy(struct vm_domain_policy *dst,
+	    const struct vm_domain_policy *src);
+extern	int vm_domain_policy_validate(const struct vm_domain_policy *vp);
+
+extern	int vm_domain_iterator_init(struct vm_domain_iterator *vi);
+extern	int vm_domain_iterator_set(struct vm_domain_iterator *vi,
+	    vm_domain_policy_type_t vt, int domain);
+extern	void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+	    const struct vm_domain_policy *vt);
+extern	int vm_domain_iterator_run(struct vm_domain_iterator *vi,
+	    int *domain);
+extern	int vm_domain_iterator_isdone(struct vm_domain_iterator *vi);
+extern	int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi);
+
+#endif	/* __VM_DOMAIN_H__ */


Property changes on: trunk/sys/vm/vm_domain.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/vm/vm_extern.h
===================================================================
--- trunk/sys/vm/vm_extern.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_extern.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_extern.h 337262 2018-08-03 15:42:39Z markj $
  */
 
 #ifndef _VM_EXTERN_H_
@@ -41,6 +41,8 @@
 struct vmem;
 
 #ifdef _KERNEL
+struct cdev;
+struct cdevsw;
 
 /* These operate on kernel virtual addresses only. */
 vm_offset_t kva_alloc(vm_size_t);
@@ -64,6 +66,7 @@
 void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
 
 /* Bootstrapping. */
+void kmem_bootstrap_free(vm_offset_t, vm_size_t);
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
 void kmem_init(vm_offset_t, vm_offset_t);
@@ -70,7 +73,6 @@
 void kmem_init_zero_region(void);
 void kmeminit(void);
 
-void swapout_procs(int);
 int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
@@ -82,10 +84,18 @@
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
-int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *,
+    struct vmspace *, int);
 void vm_waitproc(struct proc *);
-int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
+int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
+    objtype_t, void *, vm_ooffset_t);
+int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
+    vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
 int vm_mmap_to_errno(int rv);
+int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
+    int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
+int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
+    struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
@@ -97,6 +107,7 @@
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
+void vmspace_switch_aio(struct vmspace *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
@@ -104,6 +115,5 @@
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
-int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */

Modified: trunk/sys/vm/vm_fault.c
===================================================================
--- trunk/sys/vm/vm_fault.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_fault.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -73,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_fault.c 345572 2019-03-27 11:03:07Z kib $");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
@@ -82,7 +82,9 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/mman.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
@@ -107,14 +109,11 @@
 #define PFBAK 4
 #define PFFOR 4
 
-static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
-
-#define	VM_FAULT_READ_BEHIND	8
+#define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
-#define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
-#define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
-#define	VM_FAULT_CACHE_BEHIND	(VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
 
+#define	VM_FAULT_DONTNEED_MIN	1048576
+
 struct faultstate {
 	vm_page_t m;
 	vm_object_t object;
@@ -124,14 +123,15 @@
 	vm_pindex_t first_pindex;
 	vm_map_t map;
 	vm_map_entry_t entry;
-	int lookup_still_valid;
 	int map_generation;
+	bool lookup_still_valid;
 	struct vnode *vp;
 };
 
-static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
+	    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
-	    int faultcount, int reqpage);
+	    int backward, int forward, bool obj_locked);
 
 static inline void
 release_page(struct faultstate *fs)
@@ -150,7 +150,7 @@
 
 	if (fs->lookup_still_valid) {
 		vm_map_lookup_done(fs->map, fs->entry);
-		fs->lookup_still_valid = FALSE;
+		fs->lookup_still_valid = false;
 	}
 }
 
@@ -237,14 +237,15 @@
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
-	 * Also tell the backing pager, if any, that it should remove
-	 * any swap backing since the page is now dirty.
+	 * Also, since the page is now dirty, we can possibly tell
+	 * the pager to release any swap backing the page.  Calling
+	 * the pager requires a write lock on the object.
 	 */
 	if (need_dirty)
 		vm_page_dirty(m);
 	if (!set_wd)
 		vm_page_unlock(m);
-	if (need_dirty)
+	else if (need_dirty)
 		vm_pager_page_unswapped(m);
 }
 
@@ -267,8 +268,12 @@
 vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
     int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
 {
-	vm_page_t m;
-	int rv;
+	vm_page_t m, m_map;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+	vm_page_t m_super;
+	int flags;
+#endif
+	int psind, rv;
 
 	MPASS(fs->vp == NULL);
 	m = vm_page_lookup(fs->first_object, fs->first_pindex);
@@ -276,20 +281,204 @@
 	if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
 	    vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
 		return (KERN_FAILURE);
-	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
-	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+	m_map = m;
+	psind = 0;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+	if ((m->flags & PG_FICTITIOUS) == 0 &&
+	    (m_super = vm_reserv_to_superpage(m)) != NULL &&
+	    rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
+	    roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
+	    (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
+	    (pagesizes[m_super->psind] - 1)) &&
+	    pmap_ps_enabled(fs->map->pmap)) {
+		flags = PS_ALL_VALID;
+		if ((prot & VM_PROT_WRITE) != 0) {
+			/*
+			 * Create a superpage mapping allowing write access
+			 * only if none of the constituent pages are busy and
+			 * all of them are already dirty (except possibly for
+			 * the page that was faulted on).
+			 */
+			flags |= PS_NONE_BUSY;
+			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
+				flags |= PS_ALL_DIRTY;
+		}
+		if (vm_page_ps_test(m_super, flags, m)) {
+			m_map = m_super;
+			psind = m_super->psind;
+			vaddr = rounddown2(vaddr, pagesizes[psind]);
+			/* Preset the modified bit for dirty superpages. */
+			if ((flags & PS_ALL_DIRTY) != 0)
+				fault_type |= VM_PROT_WRITE;
+		}
+	}
+#endif
+	rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
+	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	vm_fault_fill_hold(m_hold, m);
 	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	if (psind == 0 && !wired)
+		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
-	if (!wired)
-		vm_fault_prefault(fs, vaddr, 0, 0);
 	vm_map_lookup_done(fs->map, fs->entry);
 	curthread->td_ru.ru_minflt++;
 	return (KERN_SUCCESS);
 }
 
+static void
+vm_fault_restore_map_lock(struct faultstate *fs)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	MPASS(fs->first_object->paging_in_progress > 0);
+
+	if (!vm_map_trylock_read(fs->map)) {
+		VM_OBJECT_WUNLOCK(fs->first_object);
+		vm_map_lock_read(fs->map);
+		VM_OBJECT_WLOCK(fs->first_object);
+	}
+	fs->lookup_still_valid = true;
+}
+
+static void
+vm_fault_populate_check_page(vm_page_t m)
+{
+
+	/*
+	 * Check each page to ensure that the pager is obeying the
+	 * interface: the page must be installed in the object, fully
+	 * valid, and exclusively busied.
+	 */
+	MPASS(m != NULL);
+	MPASS(m->valid == VM_PAGE_BITS_ALL);
+	MPASS(vm_page_xbusied(m));
+}
+
+static void
+vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
+    vm_pindex_t last)
+{
+	vm_page_t m;
+	vm_pindex_t pidx;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	MPASS(first <= last);
+	for (pidx = first, m = vm_page_lookup(object, pidx);
+	    pidx <= last; pidx++, m = vm_page_next(m)) {
+		vm_fault_populate_check_page(m);
+		vm_page_lock(m);
+		vm_page_deactivate(m);
+		vm_page_unlock(m);
+		vm_page_xunbusy(m);
+	}
+}
+
+static int
+vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
+    int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
+{
+	vm_page_t m;
+	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
+	int rv;
+
+	MPASS(fs->object == fs->first_object);
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	MPASS(fs->first_object->paging_in_progress > 0);
+	MPASS(fs->first_object->backing_object == NULL);
+	MPASS(fs->lookup_still_valid);
+
+	pager_first = OFF_TO_IDX(fs->entry->offset);
+	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
+	unlock_map(fs);
+	unlock_vp(fs);
+
+	/*
+	 * Call the pager (driver) populate() method.
+	 *
+	 * There is no guarantee that the method will be called again
+	 * if the current fault is for read, and a future fault is
+	 * for write.  Report the entry's maximum allowed protection
+	 * to the driver.
+	 */
+	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
+	    fault_type, fs->entry->max_protection, &pager_first, &pager_last);
+
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	if (rv == VM_PAGER_BAD) {
+		/*
+		 * VM_PAGER_BAD is the backdoor for a pager to request
+		 * normal fault handling.
+		 */
+		vm_fault_restore_map_lock(fs);
+		if (fs->map->timestamp != fs->map_generation)
+			return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+		return (KERN_NOT_RECEIVER);
+	}
+	if (rv != VM_PAGER_OK)
+		return (KERN_FAILURE); /* AKA SIGSEGV */
+
+	/* Ensure that the driver is obeying the interface. */
+	MPASS(pager_first <= pager_last);
+	MPASS(fs->first_pindex <= pager_last);
+	MPASS(fs->first_pindex >= pager_first);
+	MPASS(pager_last < fs->first_object->size);
+
+	vm_fault_restore_map_lock(fs);
+	if (fs->map->timestamp != fs->map_generation) {
+		vm_fault_populate_cleanup(fs->first_object, pager_first,
+		    pager_last);
+		return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+	}
+
+	/*
+	 * The map is unchanged after our last unlock.  Process the fault.
+	 *
+	 * The range [pager_first, pager_last] that is given to the
+	 * pager is only a hint.  The pager may populate any range
+	 * within the object that includes the requested page index.
+	 * In case the pager expanded the range, clip it to fit into
+	 * the map entry.
+	 */
+	map_first = OFF_TO_IDX(fs->entry->offset);
+	if (map_first > pager_first) {
+		vm_fault_populate_cleanup(fs->first_object, pager_first,
+		    map_first - 1);
+		pager_first = map_first;
+	}
+	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
+	if (map_last < pager_last) {
+		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
+		    pager_last);
+		pager_last = map_last;
+	}
+	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
+	    pidx <= pager_last; pidx++, m = vm_page_next(m)) {
+		vm_fault_populate_check_page(m);
+		vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags,
+		    true);
+		VM_OBJECT_WUNLOCK(fs->first_object);
+		pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) -
+		    fs->entry->offset, m, prot, fault_type | (wired ?
+		    PMAP_ENTER_WIRED : 0), 0);
+		VM_OBJECT_WLOCK(fs->first_object);
+		if (pidx == fs->first_pindex)
+			vm_fault_fill_hold(m_hold, m);
+		vm_page_lock(m);
+		if ((fault_flags & VM_FAULT_WIRE) != 0) {
+			KASSERT(wired, ("VM_FAULT_WIRE && !wired"));
+			vm_page_wire(m);
+		} else {
+			vm_page_activate(m);
+		}
+		vm_page_unlock(m);
+		vm_page_xunbusy(m);
+	}
+	curthread->td_ru.ru_majflt++;
+	return (KERN_SUCCESS);
+}
+
 /*
  *	vm_fault:
  *
@@ -334,21 +523,23 @@
 vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
-	vm_prot_t prot;
-	long ahead, behind;
-	int alloc_req, era, faultcount, nera, reqpage, result;
-	boolean_t dead, is_first_object_locked, wired;
-	vm_object_t next_object;
-	vm_page_t marray[VM_FAULT_READ_MAX];
-	int hardfault;
 	struct faultstate fs;
 	struct vnode *vp;
-	int locked, error;
+	vm_object_t next_object, retry_object;
+	vm_offset_t e_end, e_start;
+	vm_pindex_t retry_pindex;
+	vm_prot_t prot, retry_prot;
+	int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
+	int locked, nera, result, rv;
+	u_char behavior;
+	boolean_t wired;	/* Passed by reference. */
+	bool dead, hardfault, is_first_object_locked;
 
-	hardfault = 0;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
-	faultcount = reqpage = 0;
+	faultcount = 0;
+	nera = -1;
+	hardfault = false;
 
 RetryFault:;
 
@@ -415,10 +606,10 @@
 		    (fs.first_object->type != OBJT_VNODE &&
 		    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
 		    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
-			result = vm_fault_soft_fast(&fs, vaddr, prot,
-			    fault_type, fault_flags, wired, m_hold);
-			if (result == KERN_SUCCESS)
-				return (result);
+			rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type,
+			    fault_flags, wired, m_hold);
+			if (rv == KERN_SUCCESS)
+				return (rv);
 		}
 		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
 			VM_OBJECT_RUNLOCK(fs.first_object);
@@ -435,13 +626,12 @@
 	 * they will stay around as well.
 	 *
 	 * Bump the paging-in-progress count to prevent size changes (e.g. 
-	 * truncation operations) during I/O.  This must be done after
-	 * obtaining the vnode lock in order to avoid possible deadlocks.
+	 * truncation operations) during I/O.
 	 */
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
-	fs.lookup_still_valid = TRUE;
+	fs.lookup_still_valid = true;
 
 	fs.first_m = NULL;
 
@@ -534,11 +724,13 @@
 				goto readrest;
 			break;
 		}
+		KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m));
 
 		/*
-		 * Page is not resident.  If this is the search termination
-		 * or the pager might contain the page, allocate a new page.
-		 * Default objects are zero-fill, there is no real pager.
+		 * Page is not resident.  If the pager might contain the page
+		 * or this is the beginning of the search, allocate a new
+		 * page.  (Default objects are zero-fill, so there is no real
+		 * pager for them.)
 		 */
 		if (fs.object->type != OBJT_DEFAULT ||
 		    fs.object == fs.first_object) {
@@ -547,6 +739,30 @@
 				return (KERN_PROTECTION_FAILURE);
 			}
 
+			if (fs.object == fs.first_object &&
+			    (fs.first_object->flags & OBJ_POPULATE) != 0 &&
+			    fs.first_object->shadow_count == 0) {
+				rv = vm_fault_populate(&fs, vaddr, prot,
+				    fault_type, fault_flags, wired, m_hold);
+				switch (rv) {
+				case KERN_SUCCESS:
+				case KERN_FAILURE:
+					unlock_and_deallocate(&fs);
+					return (rv);
+				case KERN_RESOURCE_SHORTAGE:
+					unlock_and_deallocate(&fs);
+					goto RetryFault;
+				case KERN_NOT_RECEIVER:
+					/*
+					 * Pager's populate() method
+					 * returned VM_PAGER_BAD.
+					 */
+					break;
+				default:
+					panic("inconsistent return codes");
+				}
+			}
+
 			/*
 			 * Allocate a new page for this object/offset pair.
 			 *
@@ -555,14 +771,10 @@
 			 * there, and allocation can fail, causing
 			 * restart and new reading of the p_flag.
 			 */
-			fs.m = NULL;
 			if (!vm_page_count_severe() || P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
-				if ((fs.object->flags & OBJ_COLORED) == 0) {
-					fs.object->flags |= OBJ_COLORED;
-					fs.object->pg_color = atop(vaddr) -
-					    fs.pindex;
-				}
+				vm_object_color(fs.object, atop(vaddr) -
+				    fs.pindex);
 #endif
 				alloc_req = P_KILLED(curproc) ?
 				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
@@ -576,80 +788,113 @@
 				unlock_and_deallocate(&fs);
 				VM_WAITPFAULT;
 				goto RetryFault;
-			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
-				break;
+			}
 		}
 
 readrest:
 		/*
-		 * We have found a valid page or we have allocated a new page.
-		 * The page thus may not be valid or may not be entirely 
-		 * valid.
+		 * At this point, we have either allocated a new page or found
+		 * an existing page that is only partially valid.
 		 *
-		 * Attempt to fault-in the page if there is a chance that the
-		 * pager has it, and potentially fault in additional pages
-		 * at the same time.  For default objects simply provide
-		 * zero-filled pages.
+		 * We hold a reference on the current object and the page is
+		 * exclusive busied.
 		 */
-		if (fs.object->type != OBJT_DEFAULT) {
-			int rv;
-			u_char behavior = vm_map_entry_behavior(fs.entry);
 
-			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
-			    P_KILLED(curproc)) {
-				behind = 0;
-				ahead = 0;
+		/*
+		 * If the pager for the current object might have the page,
+		 * then determine the number of additional pages to read and
+		 * potentially reprioritize previously read pages for earlier
+		 * reclamation.  These operations should only be performed
+		 * once per page fault.  Even if the current pager doesn't
+		 * have the page, the number of additional pages to read will
+		 * apply to subsequent objects in the shadow chain.
+		 */
+		if (fs.object->type != OBJT_DEFAULT && nera == -1 &&
+		    !P_KILLED(curproc)) {
+			KASSERT(fs.lookup_still_valid, ("map unlocked"));
+			era = fs.entry->read_ahead;
+			behavior = vm_map_entry_behavior(fs.entry);
+			if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
+				nera = 0;
 			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
-				behind = 0;
-				ahead = atop(fs.entry->end - vaddr) - 1;
-				if (ahead > VM_FAULT_READ_AHEAD_MAX)
-					ahead = VM_FAULT_READ_AHEAD_MAX;
-				if (fs.pindex == fs.entry->next_read)
-					vm_fault_cache_behind(&fs,
-					    VM_FAULT_READ_MAX);
-			} else {
+				nera = VM_FAULT_READ_AHEAD_MAX;
+				if (vaddr == fs.entry->next_read)
+					vm_fault_dontneed(&fs, vaddr, nera);
+			} else if (vaddr == fs.entry->next_read) {
 				/*
-				 * If this is a sequential page fault, then
-				 * arithmetically increase the number of pages
-				 * in the read-ahead window.  Otherwise, reset
-				 * the read-ahead window to its smallest size.
+				 * This is a sequential fault.  Arithmetically
+				 * increase the requested number of pages in
+				 * the read-ahead window.  The requested
+				 * number of pages is "# of sequential faults
+				 * x (read ahead min + 1) + read ahead min"
 				 */
-				behind = atop(vaddr - fs.entry->start);
-				if (behind > VM_FAULT_READ_BEHIND)
-					behind = VM_FAULT_READ_BEHIND;
-				ahead = atop(fs.entry->end - vaddr) - 1;
-				era = fs.entry->read_ahead;
-				if (fs.pindex == fs.entry->next_read) {
-					nera = era + behind;
+				nera = VM_FAULT_READ_AHEAD_MIN;
+				if (era > 0) {
+					nera += era + 1;
 					if (nera > VM_FAULT_READ_AHEAD_MAX)
 						nera = VM_FAULT_READ_AHEAD_MAX;
-					behind = 0;
-					if (ahead > nera)
-						ahead = nera;
-					if (era == VM_FAULT_READ_AHEAD_MAX)
-						vm_fault_cache_behind(&fs,
-						    VM_FAULT_CACHE_BEHIND);
-				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
-					ahead = VM_FAULT_READ_AHEAD_MIN;
-				if (era != ahead)
-					fs.entry->read_ahead = ahead;
+				}
+				if (era == VM_FAULT_READ_AHEAD_MAX)
+					vm_fault_dontneed(&fs, vaddr, nera);
+			} else {
+				/*
+				 * This is a non-sequential fault.
+				 */
+				nera = 0;
 			}
+			if (era != nera) {
+				/*
+				 * A read lock on the map suffices to update
+				 * the read ahead count safely.
+				 */
+				fs.entry->read_ahead = nera;
+			}
 
 			/*
-			 * Call the pager to retrieve the data, if any, after
-			 * releasing the lock on the map.  We hold a ref on
-			 * fs.object and the pages are exclusive busied.
+			 * Prepare for unlocking the map.  Save the map
+			 * entry's start and end addresses, which are used to
+			 * optimize the size of the pager operation below.
+			 * Even if the map entry's addresses change after
+			 * unlocking the map, using the saved addresses is
+			 * safe.
 			 */
+			e_start = fs.entry->start;
+			e_end = fs.entry->end;
+		}
+
+		/*
+		 * Call the pager to retrieve the page if there is a chance
+		 * that the pager has it, and potentially retrieve additional
+		 * pages at the same time.
+		 */
+		if (fs.object->type != OBJT_DEFAULT) {
+			/*
+			 * Release the map lock before locking the vnode or
+			 * sleeping in the pager.  (If the current object has
+			 * a shadow, then an earlier iteration of this loop
+			 * may have already unlocked the map.)
+			 */
 			unlock_map(&fs);
 
 			if (fs.object->type == OBJT_VNODE &&
 			    (vp = fs.object->handle) != fs.vp) {
+				/*
+				 * Perform an unlock in case the desired vnode
+				 * changed while the map was unlocked during a
+				 * retry.
+				 */
 				unlock_vp(&fs);
+
 				locked = VOP_ISLOCKED(vp);
-
 				if (locked != LK_EXCLUSIVE)
 					locked = LK_SHARED;
-				/* Do not sleep for vnode lock while fs.m is busy */
+
+				/*
+				 * We must not sleep acquiring the vnode lock
+				 * while we have the page exclusive busied or
+				 * the object's paging-in-progress count
+				 * incremented.  Otherwise, we could deadlock.
+				 */
 				error = vget(vp, locked | LK_CANRECURSE |
 				    LK_NOWAIT, curthread);
 				if (error != 0) {
@@ -670,88 +915,85 @@
 			    ("vm_fault: vnode-backed object mapped by system map"));
 
 			/*
-			 * now we find out if any other pages should be paged
-			 * in at this time this routine checks to see if the
-			 * pages surrounding this fault reside in the same
-			 * object as the page for this fault.  If they do,
-			 * then they are faulted in also into the object.  The
-			 * array "marray" returned contains an array of
-			 * vm_page_t structs where one of them is the
-			 * vm_page_t passed to the routine.  The reqpage
-			 * return value is the index into the marray for the
-			 * vm_page_t passed to the routine.
-			 *
-			 * fs.m plus the additional pages are exclusive busied.
+			 * Page in the requested page and hint to the pager
+			 * that it may bring in surrounding pages.
 			 */
-			faultcount = vm_fault_additional_pages(
-			    fs.m, behind, ahead, marray, &reqpage);
-
-			rv = faultcount ?
-			    vm_pager_get_pages(fs.object, marray, faultcount,
-				reqpage) : VM_PAGER_FAIL;
-
+			if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
+			    P_KILLED(curproc)) {
+				behind = 0;
+				ahead = 0;
+			} else {
+				/* Is this a sequential fault? */
+				if (nera > 0) {
+					behind = 0;
+					ahead = nera;
+				} else {
+					/*
+					 * Request a cluster of pages that is
+					 * aligned to a VM_FAULT_READ_DEFAULT
+					 * page offset boundary within the
+					 * object.  Alignment to a page offset
+					 * boundary is more likely to coincide
+					 * with the underlying file system
+					 * block than alignment to a virtual
+					 * address boundary.
+					 */
+					cluster_offset = fs.pindex %
+					    VM_FAULT_READ_DEFAULT;
+					behind = ulmin(cluster_offset,
+					    atop(vaddr - e_start));
+					ahead = VM_FAULT_READ_DEFAULT - 1 -
+					    cluster_offset;
+				}
+				ahead = ulmin(ahead, atop(e_end - vaddr) - 1);
+			}
+			rv = vm_pager_get_pages(fs.object, &fs.m, 1,
+			    &behind, &ahead);
 			if (rv == VM_PAGER_OK) {
-				/*
-				 * Found the page. Leave it busy while we play
-				 * with it.
-				 */
-
-				/*
-				 * Relookup in case pager changed page. Pager
-				 * is responsible for disposition of old page
-				 * if moved.
-				 */
-				fs.m = vm_page_lookup(fs.object, fs.pindex);
-				if (!fs.m) {
-					unlock_and_deallocate(&fs);
-					goto RetryFault;
-				}
-
-				hardfault++;
+				faultcount = behind + 1 + ahead;
+				hardfault = true;
 				break; /* break to PAGE HAS BEEN FOUND */
 			}
-			/*
-			 * Remove the bogus page (which does not exist at this
-			 * object/offset); before doing so, we must get back
-			 * our object lock to preserve our invariant.
-			 *
-			 * Also wake up any other process that may want to bring
-			 * in this page.
-			 *
-			 * If this is the top-level object, we must leave the
-			 * busy page to prevent another process from rushing
-			 * past us, and inserting the page in that object at
-			 * the same time that we are.
-			 */
 			if (rv == VM_PAGER_ERROR)
 				printf("vm_fault: pager read error, pid %d (%s)\n",
 				    curproc->p_pid, curproc->p_comm);
+
 			/*
-			 * Data outside the range of the pager or an I/O error
+			 * If an I/O error occurred or the requested page was
+			 * outside the range of the pager, clean up and return
+			 * an error.
 			 */
-			/*
-			 * XXX - the check for kernel_map is a kludge to work
-			 * around having the machine panic on a kernel space
-			 * fault w/ I/O error.
-			 */
-			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
-				(rv == VM_PAGER_BAD)) {
+			if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
 				vm_page_lock(fs.m);
-				vm_page_free(fs.m);
+				if (fs.m->wire_count == 0)
+					vm_page_free(fs.m);
+				else
+					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 				unlock_and_deallocate(&fs);
-				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
+				return (rv == VM_PAGER_ERROR ? KERN_FAILURE :
+				    KERN_PROTECTION_FAILURE);
 			}
+
+			/*
+			 * The requested page does not exist at this object/
+			 * offset.  Remove the invalid page from the object,
+			 * waking up anyone waiting for it, and continue on to
+			 * the next object.  However, if this is the top-level
+			 * object, we must leave the busy page in place to
+			 * prevent another process from rushing past us, and
+			 * inserting the page in that object at the same time
+			 * that we are.
+			 */
 			if (fs.object != fs.first_object) {
 				vm_page_lock(fs.m);
-				vm_page_free(fs.m);
+				if (fs.m->wire_count == 0)
+					vm_page_free(fs.m);
+				else
+					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
-				/*
-				 * XXX - we cannot just fall out at this
-				 * point, m has been freed and is invalid!
-				 */
 			}
 		}
 
@@ -766,7 +1008,6 @@
 		 * Move on to the next object.  Lock the next object before
 		 * unlocking the current one.
 		 */
-		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
 		next_object = fs.object->backing_object;
 		if (next_object == NULL) {
 			/*
@@ -804,6 +1045,8 @@
 			vm_object_pip_add(next_object, 1);
 			if (fs.object != fs.first_object)
 				vm_object_pip_wakeup(fs.object);
+			fs.pindex +=
+			    OFF_TO_IDX(fs.object->backing_object_offset);
 			VM_OBJECT_WUNLOCK(fs.object);
 			fs.object = next_object;
 		}
@@ -836,7 +1079,7 @@
 			 * dirty in the first object so that it will go out 
 			 * to swap when needed.
 			 */
-			is_first_object_locked = FALSE;
+			is_first_object_locked = false;
 			if (
 				/*
 				 * Only one shadow object
@@ -860,22 +1103,15 @@
 				 * We don't chase down the shadow chain
 				 */
 			    fs.object == fs.first_object->backing_object) {
-				/*
-				 * get rid of the unnecessary page
-				 */
+				vm_page_lock(fs.m);
+				vm_page_remove(fs.m);
+				vm_page_unlock(fs.m);
 				vm_page_lock(fs.first_m);
+				vm_page_replace_checked(fs.m, fs.first_object,
+				    fs.first_pindex, fs.first_m);
 				vm_page_free(fs.first_m);
 				vm_page_unlock(fs.first_m);
-				/*
-				 * grab the page and put it into the 
-				 * process'es object.  The page is 
-				 * automatically made dirty.
-				 */
-				if (vm_page_rename(fs.m, fs.first_object,
-				    fs.first_pindex)) {
-					unlock_and_deallocate(&fs);
-					goto RetryFault;
-				}
+				vm_page_dirty(fs.m);
 #if VM_NRESERVLEVEL > 0
 				/*
 				 * Rename the reservation.
@@ -884,6 +1120,10 @@
 				    fs.object, OFF_TO_IDX(
 				    fs.first_object->backing_object_offset));
 #endif
+				/*
+				 * Removing the page from the backing object
+				 * unbusied it.
+				 */
 				vm_page_xbusy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
@@ -905,7 +1145,7 @@
 					vm_page_unlock(fs.first_m);
 					
 					vm_page_lock(fs.m);
-					vm_page_unwire(fs.m, FALSE);
+					vm_page_unwire(fs.m, PQ_INACTIVE);
 					vm_page_unlock(fs.m);
 				}
 				/*
@@ -939,16 +1179,12 @@
 	 * lookup.
 	 */
 	if (!fs.lookup_still_valid) {
-		vm_object_t retry_object;
-		vm_pindex_t retry_pindex;
-		vm_prot_t retry_prot;
-
 		if (!vm_map_trylock_read(fs.map)) {
 			release_page(&fs);
 			unlock_and_deallocate(&fs);
 			goto RetryFault;
 		}
-		fs.lookup_still_valid = TRUE;
+		fs.lookup_still_valid = true;
 		if (fs.map->timestamp != fs.map_generation) {
 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
@@ -986,20 +1222,23 @@
 			 * write-enabled after all.
 			 */
 			prot &= retry_prot;
+			fault_type &= retry_prot;
+			if (prot == 0) {
+				release_page(&fs);
+				unlock_and_deallocate(&fs);
+				goto RetryFault;
+			}
 		}
 	}
+
 	/*
-	 * If the page was filled by a pager, update the map entry's
-	 * last read offset.  Since the pager does not return the
-	 * actual set of pages that it read, this update is based on
-	 * the requested set.  Typically, the requested and actual
-	 * sets are the same.
-	 *
-	 * XXX The following assignment modifies the map
-	 * without holding a write lock on it.
+	 * If the page was filled by a pager, save the virtual address that
+	 * should be faulted on next under a sequential access pattern to the
+	 * map entry.  A read lock on the map suffices to update this address
+	 * safely.
 	 */
 	if (hardfault)
-		fs.entry->next_read = fs.pindex + faultcount - reqpage;
+		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
 	vm_page_assert_xbusied(fs.m);
@@ -1022,7 +1261,9 @@
 	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
 	if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
 	    wired == 0)
-		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
+		vm_fault_prefault(&fs, vaddr,
+		    faultcount > 0 ? behind : PFBAK,
+		    faultcount > 0 ? ahead : PFFOR, false);
 	VM_OBJECT_WLOCK(fs.object);
 	vm_page_lock(fs.m);
 
@@ -1049,6 +1290,21 @@
 	if (hardfault) {
 		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+		if (racct_enable && fs.object->type == OBJT_VNODE) {
+			PROC_LOCK(curproc);
+			if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    PAGE_SIZE + behind * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_READBPS,
+				    PAGE_SIZE + ahead * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif
 	} else 
 		curthread->td_ru.ru_minflt++;
 
@@ -1056,15 +1312,26 @@
 }
 
 /*
- * Speed up the reclamation of up to "distance" pages that precede the
- * faulting pindex within the first object of the shadow chain.
+ * Speed up the reclamation of pages that precede the faulting pindex within
+ * the first object of the shadow chain.  Essentially, perform the equivalent
+ * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
+ * the faulting pindex by the cluster size when the pages read by vm_fault()
+ * cross a cluster-size boundary.  The cluster size is the greater of the
+ * smallest superpage size and VM_FAULT_DONTNEED_MIN.
+ *
+ * When "fs->first_object" is a shadow object, the pages in the backing object
+ * that precede the faulting pindex are deactivated by vm_fault().  So, this
+ * function must only be concerned with pages in the first object.
  */
 static void
-vm_fault_cache_behind(const struct faultstate *fs, int distance)
+vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
 {
+	vm_map_entry_t entry;
 	vm_object_t first_object, object;
-	vm_page_t m, m_prev;
-	vm_pindex_t pindex;
+	vm_offset_t end, start;
+	vm_page_t m, m_next;
+	vm_pindex_t pend, pstart;
+	vm_size_t size;
 
 	object = fs->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1076,32 +1343,44 @@
 			VM_OBJECT_WLOCK(object);
 		}
 	}
-	/* Neither fictitious nor unmanaged pages can be cached. */
+	/* Neither fictitious nor unmanaged pages can be reclaimed. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
-		if (fs->first_pindex < distance)
-			pindex = 0;
-		else
-			pindex = fs->first_pindex - distance;
-		if (pindex < OFF_TO_IDX(fs->entry->offset))
-			pindex = OFF_TO_IDX(fs->entry->offset);
-		m = first_object != object ? fs->first_m : fs->m;
-		vm_page_assert_xbusied(m);
-		m_prev = vm_page_prev(m);
-		while ((m = m_prev) != NULL && m->pindex >= pindex &&
-		    m->valid == VM_PAGE_BITS_ALL) {
-			m_prev = vm_page_prev(m);
-			if (vm_page_busied(m))
-				continue;
-			vm_page_lock(m);
-			if (m->hold_count == 0 && m->wire_count == 0) {
-				pmap_remove_all(m);
-				vm_page_aflag_clear(m, PGA_REFERENCED);
-				if (m->dirty != 0)
-					vm_page_deactivate(m);
-				else
-					vm_page_cache(m);
+		size = VM_FAULT_DONTNEED_MIN;
+		if (MAXPAGESIZES > 1 && size < pagesizes[1])
+			size = pagesizes[1];
+		end = rounddown2(vaddr, size);
+		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
+		    (entry = fs->entry)->start < end) {
+			if (end - entry->start < size)
+				start = entry->start;
+			else
+				start = end - size;
+			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
+			pstart = OFF_TO_IDX(entry->offset) + atop(start -
+			    entry->start);
+			m_next = vm_page_find_least(first_object, pstart);
+			pend = OFF_TO_IDX(entry->offset) + atop(end -
+			    entry->start);
+			while ((m = m_next) != NULL && m->pindex < pend) {
+				m_next = TAILQ_NEXT(m, listq);
+				if (m->valid != VM_PAGE_BITS_ALL ||
+				    vm_page_busied(m))
+					continue;
+
+				/*
+				 * Don't clear PGA_REFERENCED, since it would
+				 * likely represent a reference by a different
+				 * process.
+				 *
+				 * Typically, at this point, prefetched pages
+				 * are still in the inactive queue.  Only
+				 * pages that triggered page faults are in the
+				 * active queue.
+				 */
+				vm_page_lock(m);
+				vm_page_deactivate(m);
+				vm_page_unlock(m);
 			}
-			vm_page_unlock(m);
 		}
 	}
 	if (first_object != object)
@@ -1116,7 +1395,7 @@
  */
 static void
 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
-    int faultcount, int reqpage)
+    int backward, int forward, bool obj_locked)
 {
 	pmap_t pmap;
 	vm_map_entry_t entry;
@@ -1124,19 +1403,12 @@
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
-	int backward, forward, i;
+	int i;
 
 	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
-	if (faultcount > 0) {
-		backward = reqpage;
-		forward = faultcount - reqpage - 1;
-	} else {
-		backward = PFBAK;
-		forward = PFFOR;
-	}
 	entry = fs->entry;
 
 	if (addra < backward * PAGE_SIZE) {
@@ -1169,7 +1441,8 @@
 
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 		lobject = entry->object.vm_object;
-		VM_OBJECT_RLOCK(lobject);
+		if (!obj_locked)
+			VM_OBJECT_RLOCK(lobject);
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
@@ -1177,17 +1450,20 @@
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 			VM_OBJECT_RLOCK(backing_object);
-			VM_OBJECT_RUNLOCK(lobject);
+			if (!obj_locked || lobject != entry->object.vm_object)
+				VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
 		if (m == NULL) {
-			VM_OBJECT_RUNLOCK(lobject);
+			if (!obj_locked || lobject != entry->object.vm_object)
+				VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		if (m->valid == VM_PAGE_BITS_ALL &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
-		VM_OBJECT_RUNLOCK(lobject);
+		if (!obj_locked || lobject != entry->object.vm_object)
+			VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
@@ -1252,7 +1528,18 @@
 		 * page was mapped at the specified virtual address or that
 		 * mapping had insufficient permissions.  Attempt to fault in
 		 * and hold these pages.
+		 *
+		 * If vm_fault_disable_pagefaults() was called,
+		 * i.e., TDP_NOFAULTING is set, we must not sleep nor
+		 * acquire MD VM locks, which means we must not call
+		 * vm_fault_hold().  Some (out of tree) callers mark
+		 * too wide a code area with vm_fault_disable_pagefaults()
+		 * already; use the VM_PROT_QUICK_NOFAULT flag to request
+		 * the proper behaviour explicitly.
 		 */
+		if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
+		    (curthread->td_pflags & TDP_NOFAULTING) != 0)
+			goto error;
 		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
 			if (*mp == NULL && vm_fault_hold(map, va, prot,
 			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
@@ -1315,11 +1602,12 @@
 		 * actually shadow anything - we copy the pages directly.)
 		 */
 		dst_object = vm_object_allocate(OBJT_DEFAULT,
-		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
+		    atop(dst_entry->end - dst_entry->start));
 #if VM_NRESERVLEVEL > 0
 		dst_object->flags |= OBJ_COLORED;
 		dst_object->pg_color = atop(dst_entry->start);
 #endif
+		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 
 	VM_OBJECT_WLOCK(dst_object);
@@ -1328,7 +1616,6 @@
 	if (src_object != dst_object) {
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
-		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 	if (fork_charge != NULL) {
 		KASSERT(dst_entry->cred == NULL,
@@ -1336,7 +1623,9 @@
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
-	} else if (dst_object->cred == NULL) {
+	} else if ((dst_object->type == OBJT_DEFAULT ||
+	    dst_object->type == OBJT_SWAP) &&
+	    dst_object->cred == NULL) {
 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
 		    dst_entry));
 		dst_object->cred = dst_entry->cred;
@@ -1361,7 +1650,7 @@
 	 * range, copying each page from the source object to the
 	 * destination object.  Since the source is wired, those pages
 	 * must exist.  In contrast, the destination is pageable.
-	 * Since the destination object does share any backing storage
+	 * Since the destination object doesn't share any backing storage
 	 * with the source object, all of its pages must be dirtied,
 	 * regardless of whether they can be written.
 	 */
@@ -1417,15 +1706,19 @@
 			}
 			pmap_copy_page(src_m, dst_m);
 			VM_OBJECT_RUNLOCK(object);
-			dst_m->valid = VM_PAGE_BITS_ALL;
-			dst_m->dirty = VM_PAGE_BITS_ALL;
+			dst_m->dirty = dst_m->valid = src_m->valid;
 		} else {
 			dst_m = src_m;
 			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
 				goto again;
+			if (dst_m->pindex >= dst_object->size)
+				/*
+				 * We are upgrading.  The index can fall
+				 * out of bounds if the object type is
+				 * vnode and the file was truncated.
+				 */
+				break;
 			vm_page_xbusy(dst_m);
-			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
-			    ("invalid dst page %p", dst_m));
 		}
 		VM_OBJECT_WUNLOCK(dst_object);
 
@@ -1433,9 +1726,18 @@
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
+		 *
+		 * The page can be invalid if the user called
+		 * msync(MS_INVALIDATE) or truncated the backing vnode
+		 * or shared memory object.  In this case, do not
+		 * insert it into pmap, but still do the copy so that
+		 * all copies of the wired map entry have similar
+		 * backing pages.
 		 */
-		pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
-		    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+		if (dst_m->valid == VM_PAGE_BITS_ALL) {
+			pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
+			    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+		}
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
@@ -1445,7 +1747,7 @@
 		if (upgrade) {
 			if (src_m != dst_m) {
 				vm_page_lock(src_m);
-				vm_page_unwire(src_m, 0);
+				vm_page_unwire(src_m, PQ_INACTIVE);
 				vm_page_unlock(src_m);
 				vm_page_lock(dst_m);
 				vm_page_wire(dst_m);
@@ -1468,134 +1770,7 @@
 	}
 }
 
-
 /*
- * This routine checks around the requested page for other pages that
- * might be able to be faulted in.  This routine brackets the viable
- * pages for the pages to be paged in.
- *
- * Inputs:
- *	m, rbehind, rahead
- *
- * Outputs:
- *  marray (array of vm_page_t), reqpage (index of requested page)
- *
- * Return value:
- *  number of pages in marray
- */
-static int
-vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
-	vm_page_t m;
-	int rbehind;
-	int rahead;
-	vm_page_t *marray;
-	int *reqpage;
-{
-	int i,j;
-	vm_object_t object;
-	vm_pindex_t pindex, startpindex, endpindex, tpindex;
-	vm_page_t rtm;
-	int cbehind, cahead;
-
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
-
-	object = m->object;
-	pindex = m->pindex;
-	cbehind = cahead = 0;
-
-	/*
-	 * if the requested page is not available, then give up now
-	 */
-	if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
-		return 0;
-	}
-
-	if ((cbehind == 0) && (cahead == 0)) {
-		*reqpage = 0;
-		marray[0] = m;
-		return 1;
-	}
-
-	if (rahead > cahead) {
-		rahead = cahead;
-	}
-
-	if (rbehind > cbehind) {
-		rbehind = cbehind;
-	}
-
-	/*
-	 * scan backward for the read behind pages -- in memory 
-	 */
-	if (pindex > 0) {
-		if (rbehind > pindex) {
-			rbehind = pindex;
-			startpindex = 0;
-		} else {
-			startpindex = pindex - rbehind;
-		}
-
-		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
-		    rtm->pindex >= startpindex)
-			startpindex = rtm->pindex + 1;
-
-		/* tpindex is unsigned; beware of numeric underflow. */
-		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
-		    tpindex < pindex; i++, tpindex--) {
-
-			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-			    VM_ALLOC_IFNOTCACHED);
-			if (rtm == NULL) {
-				/*
-				 * Shift the allocated pages to the
-				 * beginning of the array.
-				 */
-				for (j = 0; j < i; j++) {
-					marray[j] = marray[j + tpindex + 1 -
-					    startpindex];
-				}
-				break;
-			}
-
-			marray[tpindex - startpindex] = rtm;
-		}
-	} else {
-		startpindex = 0;
-		i = 0;
-	}
-
-	marray[i] = m;
-	/* page offset of the required page */
-	*reqpage = i;
-
-	tpindex = pindex + 1;
-	i++;
-
-	/*
-	 * scan forward for the read ahead pages
-	 */
-	endpindex = tpindex + rahead;
-	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
-		endpindex = rtm->pindex;
-	if (endpindex > object->size)
-		endpindex = object->size;
-
-	for (; tpindex < endpindex; i++, tpindex++) {
-
-		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-		    VM_ALLOC_IFNOTCACHED);
-		if (rtm == NULL) {
-			break;
-		}
-
-		marray[i] = rtm;
-	}
-
-	/* return number of pages */
-	return i;
-}
-
-/*
  * Block entry into the machine-independent layer's page fault handler by
  * the calling thread.  Subsequent calls to vm_fault() by that thread will
  * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of

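For reference, the rewritten read-ahead logic in vm_fault_hold() above keeps the window in the map entry (fs.entry->read_ahead) and grows it by VM_FAULT_READ_AHEAD_MIN + 1 on every consecutive sequential fault, clamping at VM_FAULT_READ_AHEAD_MAX.  A minimal user-space sketch of that growth follows; the MIN/MAX values are placeholders, not the kernel's actual constants.

#include <stdio.h>

#define	READ_AHEAD_MIN	7	/* placeholder for VM_FAULT_READ_AHEAD_MIN */
#define	READ_AHEAD_MAX	64	/* placeholder for VM_FAULT_READ_AHEAD_MAX */

/* Compute the next window size from the previous one ("era" above). */
static int
next_read_ahead(int era)
{
	int nera;

	nera = READ_AHEAD_MIN;
	if (era > 0) {
		nera += era + 1;
		if (nera > READ_AHEAD_MAX)
			nera = READ_AHEAD_MAX;
	}
	return (nera);
}

int
main(void)
{
	int era, i;

	era = 0;
	for (i = 1; i <= 6; i++) {
		era = next_read_ahead(era);
		printf("sequential fault %d: read-ahead window %d pages\n",
		    i, era);
	}
	return (0);
}
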
Modified: trunk/sys/vm/vm_glue.c
===================================================================
--- trunk/sys/vm/vm_glue.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_glue.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_glue.c 341467 2018-12-04 15:04:48Z emaste $");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
@@ -102,13 +102,6 @@
 
 #include <machine/cpu.h>
 
-#ifndef NO_SWAPPING
-static int swapout(struct proc *);
-static void swapclear(struct proc *);
-static void vm_thread_swapin(struct thread *td);
-static void vm_thread_swapout(struct thread *td);
-#endif
-
 /*
  * MPSAFE
  *
@@ -119,9 +112,7 @@
  * space.
  */
 int
-kernacc(addr, len, rw)
-	void *addr;
-	int len, rw;
+kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
@@ -130,7 +121,7 @@
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
-	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
+	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
@@ -150,12 +141,10 @@
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
- * used in conjuction with this call.
+ * used in conjunction with this call.
  */
 int
-useracc(addr, len, rw)
-	void *addr;
-	int len, rw;
+useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
@@ -201,16 +190,21 @@
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
-	if (npages + cnt.v_wire_count > vm_page_max_wired)
+	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+	if (error == KERN_SUCCESS) {
+		curthread->td_vslock_sz += len;
+		return (0);
+	}
+
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
-	return (error == KERN_SUCCESS ? 0 : EFAULT);
+	return (EFAULT);
 }
 
 void
@@ -218,6 +212,8 @@
 {
 
 	/* Rely on the parameter sanity checks performed by vslock(). */
+	MPASS(curthread->td_vslock_sz >= len);
+	curthread->td_vslock_sz -= len;
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
@@ -231,19 +227,16 @@
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
-	vm_page_t m, ma[1];
+	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_WLOCK(object);
 	pindex = OFF_TO_IDX(offset);
-	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
+	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
-		ma[0] = m;
-		rv = vm_pager_get_pages(object, ma, 1, 0);
-		m = vm_page_lookup(object, pindex);
-		if (m == NULL)
-			goto out;
+		vm_page_xbusy(m);
+		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			vm_page_lock(m);
 			vm_page_free(m);
@@ -251,8 +244,8 @@
 			m = NULL;
 			goto out;
 		}
+		vm_page_xunbusy(m);
 	}
-	vm_page_xunbusy(m);
 	vm_page_lock(m);
 	vm_page_hold(m);
 	vm_page_activate(m);
@@ -312,10 +305,6 @@
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
     "");
 
-#ifndef KSTACK_MAX_PAGES
-#define KSTACK_MAX_PAGES 32
-#endif
-
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
@@ -326,17 +315,17 @@
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
-	vm_page_t m, ma[KSTACK_MAX_PAGES];
+	vm_page_t ma[KSTACK_MAX_PAGES];
 	struct kstack_cache_entry *ks_ce;
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
-		pages = KSTACK_PAGES;
+		pages = kstack_pages;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
-	if (pages == KSTACK_PAGES) {
+	if (pages == kstack_pages) {
 		mtx_lock(&kstack_cache_mtx);
 		if (kstack_cache != NULL) {
 			ks_ce = kstack_cache;
@@ -345,7 +334,7 @@
 
 			td->td_kstack_obj = ks_ce->ksobj;
 			td->td_kstack = (vm_offset_t)ks_ce;
-			td->td_kstack_pages = KSTACK_PAGES;
+			td->td_kstack_pages = kstack_pages;
 			return (1);
 		}
 		mtx_unlock(&kstack_cache_mtx);
@@ -395,15 +384,10 @@
 	 * page of stack.
 	 */
 	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++) {
-		/*
-		 * Get a kernel stack page.
-		 */
-		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-		ma[i] = m;
-		m->valid = VM_PAGE_BITS_ALL;
-	}
+	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED, ma, pages);
+	for (i = 0; i < pages; i++)
+		ma[i]->valid = VM_PAGE_BITS_ALL;
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
 	return (1);
@@ -423,7 +407,7 @@
 		if (m == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 		vm_page_unlock(m);
 	}
@@ -449,7 +433,7 @@
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
-	if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) {
+	if (pages == kstack_pages && kstacks <= kstack_cache_size) {
 		ks_ce = (struct kstack_cache_entry *)ks;
 		ks_ce->ksobj = ksobj;
 		mtx_lock(&kstack_cache_mtx);
@@ -476,7 +460,7 @@
 		ks_ce = ks_ce->next_ks_entry;
 
 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
-		    KSTACK_PAGES);
+		    kstack_pages);
 	}
 }
 
@@ -536,78 +520,7 @@
 }
 #endif /* KSTACK_USAGE_PROF */
 
-#ifndef NO_SWAPPING
 /*
- * Allow a thread's kernel stack to be paged out.
- */
-static void
-vm_thread_swapout(struct thread *td)
-{
-	vm_object_t ksobj;
-	vm_page_t m;
-	int i, pages;
-
-	cpu_thread_swapout(td);
-	pages = td->td_kstack_pages;
-	ksobj = td->td_kstack_obj;
-	pmap_qremove(td->td_kstack, pages);
-	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++) {
-		m = vm_page_lookup(ksobj, i);
-		if (m == NULL)
-			panic("vm_thread_swapout: kstack already missing?");
-		vm_page_dirty(m);
-		vm_page_lock(m);
-		vm_page_unwire(m, 0);
-		vm_page_unlock(m);
-	}
-	VM_OBJECT_WUNLOCK(ksobj);
-}
-
-/*
- * Bring the kernel stack for a specified thread back in.
- */
-static void
-vm_thread_swapin(struct thread *td)
-{
-	vm_object_t ksobj;
-	vm_page_t ma[KSTACK_MAX_PAGES];
-	int i, j, k, pages, rv;
-
-	pages = td->td_kstack_pages;
-	ksobj = td->td_kstack_obj;
-	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++)
-		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
-		    VM_ALLOC_WIRED);
-	for (i = 0; i < pages; i++) {
-		if (ma[i]->valid != VM_PAGE_BITS_ALL) {
-			vm_page_assert_xbusied(ma[i]);
-			vm_object_pip_add(ksobj, 1);
-			for (j = i + 1; j < pages; j++) {
-				if (ma[j]->valid != VM_PAGE_BITS_ALL)
-					vm_page_assert_xbusied(ma[j]);
-				if (ma[j]->valid == VM_PAGE_BITS_ALL)
-					break;
-			}
-			rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0);
-			if (rv != VM_PAGER_OK)
-	panic("vm_thread_swapin: cannot get kstack for proc: %d",
-				    td->td_proc->p_pid);
-			vm_object_pip_wakeup(ksobj);
-			for (k = i; k < j; k++)
-				ma[k] = vm_page_lookup(ksobj, k);
-			vm_page_xunbusy(ma[i]);
-		} else if (vm_page_xbusied(ma[i]))
-			vm_page_xunbusy(ma[i]);
-	}
-	VM_OBJECT_WUNLOCK(ksobj);
-	pmap_qenter(td->td_kstack, ma, pages);
-	cpu_thread_swapin(td);
-}
-#endif /* !NO_SWAPPING */
-
-/*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
@@ -616,12 +529,8 @@
  * to user mode to avoid stack copying and relocation problems.
  */
 int
-vm_forkproc(td, p2, td2, vm2, flags)
-	struct thread *td;
-	struct proc *p2;
-	struct thread *td2;
-	struct vmspace *vm2;
-	int flags;
+vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
+    struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	int error;
@@ -667,7 +576,7 @@
 }
 
 /*
- * Called after process has been wait(2)'ed apon and is being reaped.
+ * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
@@ -680,414 +589,8 @@
 }
 
 void
-faultin(p)
-	struct proc *p;
-{
-#ifdef NO_SWAPPING
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	if ((p->p_flag & P_INMEM) == 0)
-		panic("faultin: proc swapped out with NO_SWAPPING!");
-#else /* !NO_SWAPPING */
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	/*
-	 * If another process is swapping in this process,
-	 * just wait until it finishes.
-	 */
-	if (p->p_flag & P_SWAPPINGIN) {
-		while (p->p_flag & P_SWAPPINGIN)
-			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
-		return;
-	}
-	if ((p->p_flag & P_INMEM) == 0) {
-		/*
-		 * Don't let another thread swap process p out while we are
-		 * busy swapping it in.
-		 */
-		++p->p_lock;
-		p->p_flag |= P_SWAPPINGIN;
-		PROC_UNLOCK(p);
-
-		/*
-		 * We hold no lock here because the list of threads
-		 * can not change while all threads in the process are
-		 * swapped out.
-		 */
-		FOREACH_THREAD_IN_PROC(p, td)
-			vm_thread_swapin(td);
-		PROC_LOCK(p);
-		swapclear(p);
-		p->p_swtick = ticks;
-
-		wakeup(&p->p_flag);
-
-		/* Allow other threads to swap p out now. */
-		--p->p_lock;
-	}
-#endif /* NO_SWAPPING */
-}
-
-/*
- * This swapin algorithm attempts to swap-in processes only if there
- * is enough space for them.  Of course, if a process waits for a long
- * time, it will be swapped in anyway.
- */
-void
-swapper(void)
-{
-	struct proc *p;
-	struct thread *td;
-	struct proc *pp;
-	int slptime;
-	int swtime;
-	int ppri;
-	int pri;
-
-loop:
-	if (vm_page_count_min()) {
-		VM_WAIT;
-		goto loop;
-	}
-
-	pp = NULL;
-	ppri = INT_MIN;
-	sx_slock(&allproc_lock);
-	FOREACH_PROC_IN_SYSTEM(p) {
-		PROC_LOCK(p);
-		if (p->p_state == PRS_NEW ||
-		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
-			PROC_UNLOCK(p);
-			continue;
-		}
-		swtime = (ticks - p->p_swtick) / hz;
-		FOREACH_THREAD_IN_PROC(p, td) {
-			/*
-			 * An otherwise runnable thread of a process
-			 * swapped out has only the TDI_SWAPPED bit set.
-			 * 
-			 */
-			thread_lock(td);
-			if (td->td_inhibitors == TDI_SWAPPED) {
-				slptime = (ticks - td->td_slptick) / hz;
-				pri = swtime + slptime;
-				if ((td->td_flags & TDF_SWAPINREQ) == 0)
-					pri -= p->p_nice * 8;
-				/*
-				 * if this thread is higher priority
-				 * and there is enough space, then select
-				 * this process instead of the previous
-				 * selection.
-				 */
-				if (pri > ppri) {
-					pp = p;
-					ppri = pri;
-				}
-			}
-			thread_unlock(td);
-		}
-		PROC_UNLOCK(p);
-	}
-	sx_sunlock(&allproc_lock);
-
-	/*
-	 * Nothing to do, back to sleep.
-	 */
-	if ((p = pp) == NULL) {
-		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
-		goto loop;
-	}
-	PROC_LOCK(p);
-
-	/*
-	 * Another process may be bringing or may have already
-	 * brought this process in while we traverse all threads.
-	 * Or, this process may even be being swapped out again.
-	 */
-	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
-		PROC_UNLOCK(p);
-		goto loop;
-	}
-
-	/*
-	 * We would like to bring someone in. (only if there is space).
-	 * [What checks the space? ]
-	 */
-	faultin(p);
-	PROC_UNLOCK(p);
-	goto loop;
-}
-
-void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
-
-#ifndef NO_SWAPPING
-
-/*
- * Swap_idle_threshold1 is the guaranteed swapped in time for a process
- */
-static int swap_idle_threshold1 = 2;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
-    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
-
-/*
- * Swap_idle_threshold2 is the time that a process can be idle before
- * it will be swapped out, if idle swapping is enabled.
- */
-static int swap_idle_threshold2 = 10;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
-    &swap_idle_threshold2, 0, "Time before a process will be swapped out");
-
-/*
- * First, if any processes have been sleeping or stopped for at least
- * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
- * no such processes exist, then the longest-sleeping or stopped
- * process is swapped out.  Finally, and only as a last resort, if
- * there are no sleeping or stopped processes, the longest-resident
- * process is swapped out.
- */
-void
-swapout_procs(action)
-int action;
-{
-	struct proc *p;
-	struct thread *td;
-	int didswap = 0;
-
-retry:
-	sx_slock(&allproc_lock);
-	FOREACH_PROC_IN_SYSTEM(p) {
-		struct vmspace *vm;
-		int minslptime = 100000;
-		int slptime;
-		
-		/*
-		 * Watch out for a process in
-		 * creation.  It may have no
-		 * address space or lock yet.
-		 */
-		if (p->p_state == PRS_NEW)
-			continue;
-		/*
-		 * An aio daemon switches its
-		 * address space while running.
-		 * Perform a quick check whether
-		 * a process has P_SYSTEM.
-		 */
-		if ((p->p_flag & P_SYSTEM) != 0)
-			continue;
-		/*
-		 * Do not swapout a process that
-		 * is waiting for VM data
-		 * structures as there is a possible
-		 * deadlock.  Test this first as
-		 * this may block.
-		 *
-		 * Lock the map until swapout
-		 * finishes, or a thread of this
-		 * process may attempt to alter
-		 * the map.
-		 */
-		vm = vmspace_acquire_ref(p);
-		if (vm == NULL)
-			continue;
-		if (!vm_map_trylock(&vm->vm_map))
-			goto nextproc1;
-
-		PROC_LOCK(p);
-		if (p->p_lock != 0 ||
-		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
-		    ) != 0) {
-			goto nextproc;
-		}
-		/*
-		 * only aiod changes vmspace, however it will be
-		 * skipped because of the if statement above checking 
-		 * for P_SYSTEM
-		 */
-		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
-			goto nextproc;
-
-		switch (p->p_state) {
-		default:
-			/* Don't swap out processes in any sort
-			 * of 'special' state. */
-			break;
-
-		case PRS_NORMAL:
-			/*
-			 * do not swapout a realtime process
-			 * Check all the thread groups..
-			 */
-			FOREACH_THREAD_IN_PROC(p, td) {
-				thread_lock(td);
-				if (PRI_IS_REALTIME(td->td_pri_class)) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-				slptime = (ticks - td->td_slptick) / hz;
-				/*
-				 * Guarantee swap_idle_threshold1
-				 * time in memory.
-				 */
-				if (slptime < swap_idle_threshold1) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-
-				/*
-				 * Do not swapout a process if it is
-				 * waiting on a critical event of some
-				 * kind or there is a thread whose
-				 * pageable memory may be accessed.
-				 *
-				 * This could be refined to support
-				 * swapping out a thread.
-				 */
-				if (!thread_safetoswapout(td)) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-				/*
-				 * If the system is under memory stress,
-				 * or if we are swapping
-				 * idle processes >= swap_idle_threshold2,
-				 * then swap the process out.
-				 */
-				if (((action & VM_SWAP_NORMAL) == 0) &&
-				    (((action & VM_SWAP_IDLE) == 0) ||
-				    (slptime < swap_idle_threshold2))) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-
-				if (minslptime > slptime)
-					minslptime = slptime;
-				thread_unlock(td);
-			}
-
-			/*
-			 * If the pageout daemon didn't free enough pages,
-			 * or if this process is idle and the system is
-			 * configured to swap proactively, swap it out.
-			 */
-			if ((action & VM_SWAP_NORMAL) ||
-				((action & VM_SWAP_IDLE) &&
-				 (minslptime > swap_idle_threshold2))) {
-				if (swapout(p) == 0)
-					didswap++;
-				PROC_UNLOCK(p);
-				vm_map_unlock(&vm->vm_map);
-				vmspace_free(vm);
-				sx_sunlock(&allproc_lock);
-				goto retry;
-			}
-		}
-nextproc:
-		PROC_UNLOCK(p);
-		vm_map_unlock(&vm->vm_map);
-nextproc1:
-		vmspace_free(vm);
-		continue;
-	}
-	sx_sunlock(&allproc_lock);
-	/*
-	 * If we swapped something out, and another process needed memory,
-	 * then wakeup the sched process.
-	 */
-	if (didswap)
-		wakeup(&proc0);
-}
-
-static void
-swapclear(p)
-	struct proc *p;
-{
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-		td->td_flags |= TDF_INMEM;
-		td->td_flags &= ~TDF_SWAPINREQ;
-		TD_CLR_SWAPPED(td);
-		if (TD_CAN_RUN(td))
-			if (setrunnable(td)) {
-#ifdef INVARIANTS
-				/*
-				 * XXX: We just cleared TDI_SWAPPED
-				 * above and set TDF_INMEM, so this
-				 * should never happen.
-				 */
-				panic("not waking up swapper");
-#endif
-			}
-		thread_unlock(td);
-	}
-	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
-	p->p_flag |= P_INMEM;
-}
-
-static int
-swapout(p)
-	struct proc *p;
-{
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-#if defined(SWAP_DEBUG)
-	printf("swapping out %d\n", p->p_pid);
-#endif
-
-	/*
-	 * The states of this process and its threads may have changed
-	 * by now.  Assuming that there is only one pageout daemon thread,
-	 * this process should still be in memory.
-	 */
-	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
-		("swapout: lost a swapout race?"));
-
-	/*
-	 * remember the process resident count
-	 */
-	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
-	/*
-	 * Check and mark all threads before we proceed.
-	 */
-	p->p_flag &= ~P_INMEM;
-	p->p_flag |= P_SWAPPINGOUT;
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-		if (!thread_safetoswapout(td)) {
-			thread_unlock(td);
-			swapclear(p);
-			return (EBUSY);
-		}
-		td->td_flags &= ~TDF_INMEM;
-		TD_SET_SWAPPED(td);
-		thread_unlock(td);
-	}
-	td = FIRST_THREAD_IN_PROC(p);
-	++td->td_ru.ru_nswap;
-	PROC_UNLOCK(p);
-
-	/*
-	 * This list is stable because all threads are now prevented from
-	 * running.  The list is only modified in the context of a running
-	 * thread in this process.
-	 */
-	FOREACH_THREAD_IN_PROC(p, td)
-		vm_thread_swapout(td);
-
-	PROC_LOCK(p);
-	p->p_flag &= ~P_SWAPPINGOUT;
-	p->p_swtick = ticks;
-	return (0);
-}
-#endif /* !NO_SWAPPING */

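The context above also shows vm_glue.c's kernel stack cache: when a stack with the default page count is torn down, its first bytes are reused as a struct kstack_cache_entry and pushed onto a mutex-protected singly linked list, so vm_thread_new() can pop a ready-made stack instead of going back to the VM allocator.  A small user-space sketch of this intrusive free-list pattern; the names and object size here are illustrative, not the kernel's.

#include <pthread.h>
#include <stdlib.h>

#define	OBJ_SIZE	4096		/* stand-in for a kernel stack */

/* Illustrative analogue of struct kstack_cache_entry. */
struct cache_entry {
	struct cache_entry *next;
};

static struct cache_entry *cache_head;
static pthread_mutex_t cache_mtx = PTHREAD_MUTEX_INITIALIZER;

static void *
obj_alloc(void)
{
	struct cache_entry *ce;

	pthread_mutex_lock(&cache_mtx);
	ce = cache_head;
	if (ce != NULL)
		cache_head = ce->next;
	pthread_mutex_unlock(&cache_mtx);
	if (ce != NULL)
		return (ce);		/* reuse a cached object */
	return (malloc(OBJ_SIZE));	/* slow path: allocate a new one */
}

static void
obj_free(void *p)
{
	/* The list linkage is stored inside the freed object itself. */
	struct cache_entry *ce = p;

	pthread_mutex_lock(&cache_mtx);
	ce->next = cache_head;
	cache_head = ce;
	pthread_mutex_unlock(&cache_mtx);
}

int
main(void)
{
	void *p;

	p = obj_alloc();	/* slow path */
	obj_free(p);		/* cached */
	p = obj_alloc();	/* fast path: comes from the cache */
	free(p);
	return (0);
}
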
Modified: trunk/sys/vm/vm_init.c
===================================================================
--- trunk/sys/vm/vm_init.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_init.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_init.c 338484 2018-09-05 21:28:33Z kib $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -75,6 +75,7 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/selinfo.h>
+#include <sys/smp.h>
 #include <sys/pipe.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
@@ -91,11 +92,6 @@
 
 long physmem;
 
-static int exec_map_entries = 16;
-TUNABLE_INT("vm.exec_map_entries", &exec_map_entries);
-SYSCTL_INT(_vm, OID_AUTO, exec_map_entries, CTLFLAG_RD, &exec_map_entries, 0,
-    "Maximum number of simultaneous execs");
-
 /*
  * System initialization
  */
@@ -197,8 +193,8 @@
 	 * Discount the physical memory larger than the size of kernel_map
 	 * to avoid eating up all of KVA space.
 	 */
-	physmem_est = lmin(physmem, btoc(kernel_map->max_offset -
-	    kernel_map->min_offset));
+	physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) -
+	    vm_map_min(kernel_map)));
 
 	v = kern_vfs_bio_buffer_alloc(v, physmem_est);
 
@@ -231,12 +227,15 @@
 
 	/*
 	 * Allocate the buffer arena.
+	 *
+	 * Enable the quantum cache if we have more than 4 cpus.  This
+	 * avoids lock contention at the expense of some fragmentation.
 	 */
 	size = (long)nbuf * BKVASIZE;
 	kmi->buffer_sva = firstaddr;
 	kmi->buffer_eva = kmi->buffer_sva + size;
 	vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
-	    PAGE_SIZE, 0, 0);
+	    PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0);
 	firstaddr += size;
 
 	/*
@@ -259,10 +258,19 @@
 		panic("Clean map calculation incorrect");
 
 	/*
- 	 * Allocate the pageable submaps.
+	 * Allocate the pageable submaps.  We may cache an exec map entry per
+	 * CPU, so we need to reserve space for at least ncpu+1
+	 * entries to avoid deadlock.  The exec map is also used by some image
+	 * activators, so we leave a fixed number of pages for their use.
 	 */
+#ifdef __LP64__
+	exec_map_entries = 8 * mp_ncpus;
+#else
+	exec_map_entries = 2 * mp_ncpus + 4;
+#endif
+	exec_map_entry_size = round_page(PATH_MAX + ARG_MAX);
 	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
-	    exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
+	    exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE);
 	pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
 	    FALSE);
 }

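The exec_map sizing added above scales the number of cached entries with the CPU count, sizes each entry as round_page(PATH_MAX + ARG_MAX), and keeps a fixed 64-page reserve for image activators.  A quick sketch of the arithmetic with illustrative inputs (the kernel uses mp_ncpus and its own PATH_MAX/ARG_MAX values):

#include <stdint.h>
#include <stdio.h>

/* round_page() analogue for a given page size (a power of two). */
static uint64_t
round_pg(uint64_t sz, uint64_t page_size)
{
	return ((sz + page_size - 1) & ~(page_size - 1));
}

int
main(void)
{
	/* Illustrative inputs only. */
	uint64_t ncpus = 8, page_size = 4096;
	uint64_t path_max = 1024, arg_max = 256 * 1024;
	uint64_t entries, entry_size, map_size;

	entries = 8 * ncpus;		/* the __LP64__ case above */
	entry_size = round_pg(path_max + arg_max, page_size);
	map_size = entries * entry_size + 64 * page_size;
	printf("exec_map: %ju entries of %ju bytes each, about %ju MB of KVA\n",
	    (uintmax_t)entries, (uintmax_t)entry_size,
	    (uintmax_t)(map_size >> 20));
	return (0);
}
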
Modified: trunk/sys/vm/vm_kern.c
===================================================================
--- trunk/sys/vm/vm_kern.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_kern.c 340660 2018-11-20 01:12:21Z markj $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -85,6 +85,8 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
@@ -98,6 +100,9 @@
 /* NB: Used by kernel debuggers. */
 const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
 
+u_int exec_map_entry_size;
+u_int exec_map_entries;
+
 SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
     SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
 
@@ -160,8 +165,7 @@
     vm_paddr_t high, vm_memattr_t memattr)
 {
 	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
-	vm_offset_t addr, i;
-	vm_ooffset_t offset;
+	vm_offset_t addr, i, offset;
 	vm_page_t m;
 	int pflags, tries;
 
@@ -170,16 +174,21 @@
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	pflags |= VM_ALLOC_NOWAIT;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		tries = 0;
 retry:
-		m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
+		m = vm_page_alloc_contig(object, atop(offset + i),
 		    pflags, 1, low, high, PAGE_SIZE, 0, memattr);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
 			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-				vm_pageout_grow_cache(tries, low, high);
+				if (!vm_page_reclaim_contig(pflags, 1,
+				    low, high, PAGE_SIZE, 0) &&
+				    (flags & M_WAITOK) != 0)
+					VM_WAIT;
 				VM_OBJECT_WLOCK(object);
 				tries++;
 				goto retry;
@@ -212,9 +221,9 @@
     vm_memattr_t memattr)
 {
 	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
-	vm_offset_t addr, tmp;
-	vm_ooffset_t offset;
+	vm_offset_t addr, offset, tmp;
 	vm_page_t end_m, m;
+	u_long npages;
 	int pflags, tries;
  
 	size = round_page(size);
@@ -222,15 +231,20 @@
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	pflags |= VM_ALLOC_NOWAIT;
+	npages = atop(size);
 	VM_OBJECT_WLOCK(object);
 	tries = 0;
 retry:
-	m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
-	    atop(size), low, high, alignment, boundary, memattr);
+	m = vm_page_alloc_contig(object, atop(offset), pflags,
+	    npages, low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-			vm_pageout_grow_cache(tries, low, high);
+			if (!vm_page_reclaim_contig(pflags, npages, low, high,
+			    alignment, boundary) && (flags & M_WAITOK) != 0)
+				VM_WAIT;
 			VM_OBJECT_WLOCK(object);
 			tries++;
 			goto retry;
@@ -238,7 +252,7 @@
 		vmem_free(vmem, addr, size);
 		return (0);
 	}
-	end_m = m + atop(size);
+	end_m = m + npages;
 	tmp = addr;
 	for (; m < end_m; m++) {
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
@@ -322,7 +336,7 @@
 kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
 	vm_offset_t offset, i;
-	vm_page_t m;
+	vm_page_t m, mpred;
 	int pflags;
 
 	KASSERT(object == kmem_object || object == kernel_object,
@@ -330,11 +344,17 @@
 
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	if (flags & M_WAITOK)
+		pflags |= VM_ALLOC_WAITFAIL;
 
+	i = 0;
 	VM_OBJECT_WLOCK(object);
-	for (i = 0; i < size; i += PAGE_SIZE) {
 retry:
-		m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags);
+	mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
+	for (; i < size; i += PAGE_SIZE, mpred = m) {
+		m = vm_page_alloc_after(object, atop(offset + i), pflags,
+		    mpred);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
@@ -342,12 +362,9 @@
 		 * aren't on any queues.
 		 */
 		if (m == NULL) {
+			if ((flags & M_NOWAIT) == 0)
+				goto retry;
 			VM_OBJECT_WUNLOCK(object);
-			if ((flags & M_NOWAIT) == 0) {
-				VM_WAIT;
-				VM_OBJECT_WLOCK(object);
-				goto retry;
-			}
 			kmem_unback(object, addr, i);
 			return (KERN_NO_SPACE);
 		}
@@ -376,8 +393,8 @@
 void
 kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
-	vm_page_t m;
-	vm_offset_t i, offset;
+	vm_page_t m, next;
+	vm_offset_t end, offset;
 
 	KASSERT(object == kmem_object || object == kernel_object,
 	    ("kmem_unback: only supports kernel objects."));
@@ -384,10 +401,12 @@
 
 	pmap_remove(kernel_pmap, addr, addr + size);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
+	end = offset + size;
 	VM_OBJECT_WLOCK(object);
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		m = vm_page_lookup(object, OFF_TO_IDX(offset + i));
-		vm_page_unwire(m, 0);
+	for (m = vm_page_lookup(object, atop(offset)); offset < end;
+	    offset += PAGE_SIZE, m = next) {
+		next = vm_page_next(m);
+		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
@@ -443,8 +462,8 @@
 		map->needs_wakeup = TRUE;
 		vm_map_unlock_and_wait(map, 0);
 	}
-	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
-	    VM_PROT_ALL, MAP_ACC_CHARGED);
+	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW,
+	    MAP_ACC_CHARGED);
 	vm_map_unlock(map);
 	return (addr);
 }
@@ -520,6 +539,43 @@
 	vm_map_unlock(m);
 }
 
+/*
+ *	kmem_bootstrap_free:
+ *
+ *	Free pages backing preloaded data (e.g., kernel modules) to the
+ *	system.  Currently only supported on platforms that create a
+ *	vm_phys segment for preloaded data.
+ */
+void
+kmem_bootstrap_free(vm_offset_t start, vm_size_t size)
+{
+#if defined(__i386__) || defined(__amd64__)
+	struct vm_domain *vmd;
+	vm_offset_t end, va;
+	vm_paddr_t pa;
+	vm_page_t m;
+
+	end = trunc_page(start + size);
+	start = round_page(start);
+
+	for (va = start; va < end; va += PAGE_SIZE) {
+		pa = pmap_kextract(va);
+		m = PHYS_TO_VM_PAGE(pa);
+
+		vmd = vm_phys_domain(m);
+		mtx_lock(&vm_page_queue_free_mtx);
+		vm_phys_free_pages(m, 0);
+		vmd->vmd_page_count++;
+		vm_phys_freecnt_adj(m, 1);
+		mtx_unlock(&vm_page_queue_free_mtx);
+
+		vm_cnt.v_page_count++;
+	}
+	pmap_remove(kernel_pmap, start, end);
+	(void)vmem_add(kernel_arena, start, end - start, M_WAITOK);
+#endif
+}
+
 #ifdef DIAGNOSTIC
 /*
  * Allow userspace to directly trigger the VM drain routine for testing

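The kmem_alloc_attr() and kmem_alloc_contig() hunks above replace the old vm_pageout_grow_cache() fallback with the newer discipline: allocate with VM_ALLOC_NOWAIT, and on failure attempt vm_page_reclaim_contig() before sleeping in VM_WAIT when the caller passed M_WAITOK. Below is a condensed, illustrative sketch of that retry loop, not code from this commit; the helper name alloc_contig_retry() is invented for the example, while the vm_page_* calls and flags are the ones visible in the diff. The caller is assumed to hold the object's write lock.

/*
 * Illustrative sketch of the allocation/reclaim retry used above.
 * alloc_contig_retry() is a hypothetical helper; the object write
 * lock is held on entry and on return.
 */
static vm_page_t
alloc_contig_retry(vm_object_t object, vm_pindex_t pindex, int pflags,
    u_long npages, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr,
    int malloc_flags)
{
	vm_page_t m;
	int tries;

	tries = 0;
retry:
	m = vm_page_alloc_contig(object, pindex, pflags, npages, low, high,
	    PAGE_SIZE, 0, memattr);
	if (m == NULL && tries < ((malloc_flags & M_NOWAIT) != 0 ? 1 : 3)) {
		VM_OBJECT_WUNLOCK(object);
		/*
		 * Try to reclaim a contiguous run of pages; only sleep
		 * for more memory if the caller allowed it.
		 */
		if (!vm_page_reclaim_contig(pflags, npages, low, high,
		    PAGE_SIZE, 0) && (malloc_flags & M_WAITOK) != 0)
			VM_WAIT;
		VM_OBJECT_WLOCK(object);
		tries++;
		goto retry;
	}
	return (m);
}

The bounded retry count (one pass for M_NOWAIT, three otherwise) mirrors the limits kept by the functions in the hunks above.
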
Modified: trunk/sys/vm/vm_kern.h
===================================================================
--- trunk/sys/vm/vm_kern.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,11 +58,11 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $
+ * $FreeBSD: stable/11/sys/vm/vm_kern.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _VM_VM_KERN_H_
-#define _VM_VM_KERN_H_ 1
+#define	_VM_VM_KERN_H_
 
 /* Kernel memory management definitions. */
 extern vm_map_t kernel_map;
@@ -75,5 +75,7 @@
 extern struct vmem *memguard_arena;
 extern vm_offset_t swapbkva;
 extern u_long vm_kmem_size;
+extern u_int exec_map_entries;
+extern u_int exec_map_entry_size;
 
-#endif				/* _VM_VM_KERN_H_ */
+#endif /* _VM_VM_KERN_H_ */

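vm_kern.h now exports the exec_map_entries and exec_map_entry_size counters that the vm_kern.c hunk defines, so code outside vm_kern.c can consult how the exec map is sized. Purely as an illustration of one possible consumer, the sketch below publishes them read-only through sysctl in the same SYSCTL_UINT style used elsewhere in this commit; the oid names and the include list are assumptions for the example, not part of the change.

/*
 * Hypothetical read-only sysctl export of the newly visible tunables.
 * The oid names here are illustrative only.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>

SYSCTL_UINT(_vm, OID_AUTO, exec_map_entries, CTLFLAG_RD,
    &exec_map_entries, 0, "Number of entries in the exec map");
SYSCTL_UINT(_vm, OID_AUTO, exec_map_entry_size, CTLFLAG_RD,
    &exec_map_entry_size, 0, "Size of an exec map entry");
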
Modified: trunk/sys/vm/vm_map.c
===================================================================
--- trunk/sys/vm/vm_map.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 355049 2019-11-24 06:54:17Z dougm $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -136,6 +136,8 @@
 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
     vm_map_entry_t gap_entry);
+static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
+    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
@@ -277,12 +279,7 @@
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
-
 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
-
-	if (pinit == NULL)
-		pinit = &pmap_pinit;
-
 	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
@@ -333,8 +330,8 @@
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
-	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
-	    vm->vm_map.max_offset);
+	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
+	    vm_map_max(&vm->vm_map));
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
@@ -346,7 +343,7 @@
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-	    "vmspace_free() called with non-sleepable lock held");
+	    "vmspace_free() called");
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
@@ -452,7 +449,48 @@
 	return (vm);
 }
 
+/*
+ * Switch between vmspaces in an AIO kernel process.
+ *
+ * The new vmspace is either the vmspace of a user process obtained
+ * from an active AIO request or the initial vmspace of the AIO kernel
+ * process (when it is idling).  Because user processes will block to
+ * drain any active AIO requests before proceeding in exit() or
+ * execve(), the reference count for vmspaces from AIO requests can
+ * never be 0.  Similarly, AIO kernel processes hold an extra
+ * reference on their initial vmspace for the life of the process.  As
+ * a result, the 'newvm' vmspace always has a non-zero reference
+ * count.  This permits an additional reference on 'newvm' to be
+ * acquired via a simple atomic increment rather than the loop in
+ * vmspace_acquire_ref() above.
+ */
 void
+vmspace_switch_aio(struct vmspace *newvm)
+{
+	struct vmspace *oldvm;
+
+	/* XXX: Need some way to assert that this is an aio daemon. */
+
+	KASSERT(newvm->vm_refcnt > 0,
+	    ("vmspace_switch_aio: newvm unreferenced"));
+
+	oldvm = curproc->p_vmspace;
+	if (oldvm == newvm)
+		return;
+
+	/*
+	 * Point to the new address space and refer to it.
+	 */
+	curproc->p_vmspace = newvm;
+	atomic_add_int(&newvm->vm_refcnt, 1);
+
+	/* Activate the new mapping. */
+	pmap_activate(curthread);
+
+	vmspace_free(oldvm);
+}
+
+void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
@@ -748,8 +786,8 @@
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->pmap = pmap;
-	map->min_offset = min;
-	map->max_offset = max;
+	map->header.end = min;
+	map->header.start = max;
 	map->flags = 0;
 	map->root = NULL;
 	map->timestamp = 0;
@@ -952,12 +990,10 @@
 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
 	    map->nentries, entry, after_where);
 	VM_MAP_ASSERT_LOCKED(map);
-	KASSERT(after_where == &map->header ||
-	    after_where->end <= entry->start,
+	KASSERT(after_where->end <= entry->start,
 	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
 	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
-	KASSERT(after_where->next == &map->header ||
-	    entry->end <= after_where->next->start,
+	KASSERT(entry->end <= after_where->next->start,
 	    ("vm_map_entry_link: new end %jx next start %jx overlap",
 	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
 
@@ -979,8 +1015,7 @@
 		entry->right = map->root;
 		entry->left = NULL;
 	}
-	entry->adj_free = (entry->next == &map->header ? map->max_offset :
-	    entry->next->start) - entry->end;
+	entry->adj_free = entry->next->start - entry->end;
 	vm_map_entry_set_max_free(entry);
 	map->root = entry;
 }
@@ -999,8 +1034,7 @@
 	else {
 		root = vm_map_entry_splay(entry->start, entry->left);
 		root->right = entry->right;
-		root->adj_free = (entry->next == &map->header ? map->max_offset :
-		    entry->next->start) - root->end;
+		root->adj_free = entry->next->start - root->end;
 		vm_map_entry_set_max_free(root);
 	}
 	map->root = root;
@@ -1036,8 +1070,7 @@
 	if (entry != map->root)
 		map->root = vm_map_entry_splay(entry->start, map->root);
 
-	entry->adj_free = (entry->next == &map->header ? map->max_offset :
-	    entry->next->start) - entry->end;
+	entry->adj_free = entry->next->start - entry->end;
 	vm_map_entry_set_max_free(entry);
 }
 
@@ -1152,7 +1185,8 @@
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
-	if (start < map->min_offset || end > map->max_offset || start >= end)
+	if (start < vm_map_min(map) || end > vm_map_max(map) ||
+	    start >= end)
 		return (KERN_INVALID_ADDRESS);
 
 	/*
@@ -1167,7 +1201,7 @@
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
-	if (prev_entry->next != &map->header && prev_entry->next->start < end)
+	if (prev_entry->next->start < end)
 		return (KERN_NO_SPACE);
 
 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
@@ -1295,7 +1329,7 @@
 	new_entry->wired_count = 0;
 	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
-	new_entry->next_read = OFF_TO_IDX(offset);
+	new_entry->next_read = start;
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
@@ -1352,9 +1386,8 @@
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
-	if (start < map->min_offset)
-		start = map->min_offset;
-	if (start + length > map->max_offset || start + length < start)
+	start = MAX(start, vm_map_min(map));
+	if (start + length > vm_map_max(map) || start + length < start)
 		return (1);
 
 	/* Empty tree means wide open address space. */
@@ -1456,6 +1489,8 @@
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_find: non-NULL backing object for stack"));
+	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
+	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
 	    (object->flags & OBJ_COLORED) == 0))
 		find_space = VMFS_ANY_SPACE;
@@ -1496,6 +1531,14 @@
 			}
 
 			start = *addr;
+		} else if ((cow & MAP_REMAP) != 0) {
+			if (start < vm_map_min(map) ||
+			    start + length > vm_map_max(map) ||
+			    start + length <= length) {
+				result = KERN_INVALID_ADDRESS;
+				break;
+			}
+			vm_map_delete(map, start, start + length);
 		}
 		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 			result = vm_map_stack_locked(map, start, length,
@@ -1549,7 +1592,7 @@
  *
  *	The map must be locked.
  *
- *	This routine guarentees that the passed entry remains valid (though
+ *	This routine guarantees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
  *	both neighbors.
  */
@@ -1655,6 +1698,8 @@
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(entry->end > start && entry->start < start,
+	    ("_vm_map_clip_start: invalid clip of entry %p", entry));
 
 	/*
 	 * Split off the front portion -- note that we must insert the new
@@ -1740,6 +1785,8 @@
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(entry->start < end && entry->end > end,
+	    ("_vm_map_clip_end: invalid clip of entry %p", entry));
 
 	/*
 	 * If there is no object backing this entry, we might as well create
@@ -1856,11 +1903,9 @@
  *	limited number of page mappings are created at the low-end of the
  *	specified address range.  (For this purpose, a superpage mapping
  *	counts as one page mapping.)  Otherwise, all resident pages within
- *	the specified address range are mapped.  Because these mappings are
- *	being created speculatively, cached pages are not reactivated and
- *	mapped.
+ *	the specified address range are mapped.
  */
-void
+static void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
@@ -1910,7 +1955,7 @@
 		 * free pages allocating pv entries.
 		 */
 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
-		    cnt.v_free_count < cnt.v_free_reserved) ||
+		    vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
 		    tmpidx >= threshold)) {
 			psize = tmpidx;
@@ -1926,7 +1971,7 @@
 			    (pagesizes[p->psind] - 1)) == 0) {
 				mask = atop(pagesizes[p->psind]) - 1;
 				if (tmpidx + mask < psize &&
-				    vm_page_ps_is_valid(p)) {
+				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
 					p += mask;
 					threshold += mask;
 				}
@@ -1955,7 +2000,7 @@
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
-	vm_map_entry_t current, entry;
+	vm_map_entry_t current, entry, in_tran;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
@@ -1963,8 +2008,18 @@
 	if (start == end)
 		return (KERN_SUCCESS);
 
+again:
+	in_tran = NULL;
 	vm_map_lock(map);
 
+	/*
+	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
+	 * need to fault pages into the map and will drop the map lock while
+	 * doing so, and the VM object may end up in an inconsistent state if we
+	 * update the protection on the map entry in between faults.
+	 */
+	vm_map_wait_busy(map);
+
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
@@ -1976,8 +2031,7 @@
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -1988,15 +2042,29 @@
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
+		if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
+			in_tran = current;
 	}
 
 	/*
+	 * Postpone the operation until all in-transition map entries have
+	 * stabilized.  An in-transition entry might already have its pages
+	 * wired and wired_count incremented, but not yet have its
+	 * MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
+	 * vm_fault_copy_entry() in the final loop below.
+	 */
+	if (in_tran != NULL) {
+		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+		vm_map_unlock_and_wait(map, 0);
+		goto again;
+	}
+
+	/*
 	 * Do an accounting pass for private read-only mappings that
 	 * now will do cow due to allowed write (e.g. debugger sets
 	 * breakpoint on text segment)
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 
 		vm_map_clip_end(map, current, end);
 
@@ -2050,8 +2118,7 @@
 	 * Go back and fix up protections. [Note that clipping is not
 	 * necessary the second time.]
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 
@@ -2160,10 +2227,8 @@
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
-		for (current = entry;
-		     (current != &map->header) && (current->start < end);
-		     current = current->next
-		) {
+		for (current = entry; current->start < end;
+		    current = current->next) {
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
@@ -2207,15 +2272,25 @@
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
-		for (current = entry;
-		     (current != &map->header) && (current->start < end);
-		     current = current->next
-		) {
+		for (current = entry; current->start < end;
+		    current = current->next) {
 			vm_offset_t useEnd, useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
+			/*
+			 * MADV_FREE would otherwise rewind time to
+			 * the creation of the shadow object.  Because
+			 * we hold the VM map read-locked, neither the
+			 * entry's object nor the presence of a
+			 * backing object can change.
+			 */
+			if (behav == MADV_FREE &&
+			    current->object.vm_object != NULL &&
+			    current->object.vm_object->backing_object != NULL)
+				continue;
+
 			pstart = OFF_TO_IDX(current->offset);
 			pend = pstart + atop(current->end - current->start);
 			useStart = current->start;
@@ -2306,7 +2381,7 @@
 		vm_map_clip_start(map, entry, start);
 	} else
 		entry = temp_entry->next;
-	while ((entry != &map->header) && (entry->start < end)) {
+	while (entry->start < end) {
 		vm_map_clip_end(map, entry, end);
 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
 		    new_inheritance != VM_INHERIT_ZERO)
@@ -2348,7 +2423,7 @@
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
@@ -2411,8 +2486,7 @@
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
-		    (entry->end < end && (entry->next == &map->header ||
-		    entry->next->start > entry->end))) {
+		    (entry->end < end && entry->next->start > entry->end)) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
@@ -2438,8 +2512,7 @@
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
-	for (entry = first_entry; entry != &map->header && entry->start < end;
-	    entry = entry->next) {
+	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
@@ -2553,7 +2626,7 @@
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
@@ -2690,8 +2763,7 @@
 		 */
 	next_entry:
 		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
-		    entry->end < end && (entry->next == &map->header ||
-		    entry->next->start > entry->end)) {
+		    entry->end < end && entry->next->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
@@ -2708,8 +2780,7 @@
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
-	for (entry = first_entry; entry != &map->header && entry->start < end;
-	    entry = entry->next) {
+	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
@@ -2813,15 +2884,13 @@
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if (end > current->end &&
-		    (current->next == &map->header ||
-			current->end != current->next->start)) {
+		    current->end != current->next->start) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
@@ -2835,7 +2904,7 @@
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
-	for (current = entry; current != &map->header && current->start < end;) {
+	for (current = entry; current->start < end;) {
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2912,7 +2981,7 @@
 {
 	vm_object_t object;
 	vm_pindex_t offidxstart, offidxend, count, size1;
-	vm_ooffset_t size;
+	vm_size_t size;
 
 	vm_map_entry_unlink(map, entry);
 	object = entry->object.vm_object;
@@ -2938,7 +3007,7 @@
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
-		count = OFF_TO_IDX(size);
+		count = atop(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + count;
 		VM_OBJECT_WLOCK(object);
@@ -3012,7 +3081,7 @@
 	/*
 	 * Step through all entries in this region
 	 */
-	while ((entry != &map->header) && (entry->start < end)) {
+	while (entry->start < end) {
 		vm_map_entry_t next;
 
 		/*
@@ -3058,11 +3127,17 @@
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
-		if (entry->wired_count != 0) {
+		if (entry->wired_count != 0)
 			vm_map_entry_unwire(map, entry);
-		}
 
-		pmap_remove(map->pmap, entry->start, entry->end);
+		/*
+		 * Remove mappings for the pages, but only if the
+		 * mappings could exist.  For instance, it does not
+		 * make sense to call pmap_remove() for guard entries.
+		 */
+		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
+		    entry->object.vm_object != NULL)
+			pmap_remove(map->pmap, entry->start, entry->end);
 
 		/*
 		 * Delete the entry only after removing all pmap
@@ -3120,8 +3195,6 @@
 	entry = tmp_entry;
 
 	while (start < end) {
-		if (entry == &map->header)
-			return (FALSE);
 		/*
 		 * No holes allowed!
 		 */
@@ -3325,7 +3398,8 @@
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
-	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
+	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
+	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
 	vm2->vm_taddr = vm1->vm_taddr;
@@ -3529,9 +3603,7 @@
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 	vm_map_lock(map);
-	PROC_LOCK(curproc);
-	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
-	PROC_UNLOCK(curproc);
+	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		rv = KERN_NO_SPACE;
@@ -3572,7 +3644,8 @@
 	    addrbos + max_ssize > vm_map_max(map) ||
 	    addrbos + max_ssize <= addrbos)
 		return (KERN_INVALID_ADDRESS);
-	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+	sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+	    (vm_size_t)stack_guard_page * PAGE_SIZE;
 	if (sgp >= max_ssize)
 		return (KERN_INVALID_ARGUMENT);
 
@@ -3585,10 +3658,9 @@
 		return (KERN_NO_SPACE);
 
 	/*
-	 * If we can't accomodate max_ssize in the current mapping, no go.
+	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 */
-	if ((prev_entry->next != &map->header) &&
-	    (prev_entry->next->start < addrbos + max_ssize))
+	if (prev_entry->next->start < addrbos + max_ssize)
 		return (KERN_NO_SPACE);
 
 	/*
@@ -3624,11 +3696,25 @@
 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
+	if (gap_bot == gap_top)
+		return (KERN_SUCCESS);
 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
-	if (rv != KERN_SUCCESS)
+	if (rv == KERN_SUCCESS) {
+		/*
+		 * Gap can never successfully handle a fault, so
+		 * read-ahead logic is never used for it.  Re-use
+		 * next_read of the gap entry to store
+		 * stack_guard_page for vm_map_growstack().
+		 */
+		if (orient == MAP_STACK_GROWS_DOWN)
+			new_entry->prev->next_read = sgp;
+		else
+			new_entry->next->next_read = sgp;
+	} else {
 		(void)vm_map_delete(map, bot, top);
+	}
 	return (rv);
 }
 
@@ -3663,17 +3749,15 @@
 	 * debugger or AIO daemon.  The reason is that the wrong
 	 * resource limits are applied.
 	 */
-	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
+	if (p != initproc && (map != &p->p_vmspace->vm_map ||
+	    p->p_textvp == NULL))
 		return (KERN_FAILURE);
 
 	MPASS(!map->system_map);
 
-	guard = stack_guard_page * PAGE_SIZE;
-	PROC_LOCK(p);
-	lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
-	stacklim = lim_cur(p, RLIMIT_STACK);
-	vmemlim = lim_cur(p, RLIMIT_VMEM);
-	PROC_UNLOCK(p);
+	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
+	stacklim = lim_cur(curthread, RLIMIT_STACK);
+	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 retry:
 	/* If addr is not in a hole for a stack grow area, no need to grow. */
 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
@@ -3697,6 +3781,8 @@
 	} else {
 		return (KERN_FAILURE);
 	}
+	guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+	    gap_entry->next_read;
 	max_grow = gap_entry->end - gap_entry->start;
 	if (guard > max_grow)
 		return (KERN_NO_SPACE);
@@ -3844,9 +3930,7 @@
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
 		vm_map_unlock(map);
 		vm_map_wire(map, grow_start, grow_start + grow_amount,
-		    (p->p_flag & P_SYSTEM)
-		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
-		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		vm_map_lock_read(map);
 	} else
 		vm_map_lock_downgrade(map);
@@ -3883,7 +3967,7 @@
 
 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("vmspace_exec recursed"));
-	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
+	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
@@ -4125,7 +4209,7 @@
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
-	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
@@ -4206,7 +4290,7 @@
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
-	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
@@ -4228,6 +4312,27 @@
 	vm_map_unlock_read(map);
 }
 
+vm_offset_t
+vm_map_max_KBI(const struct vm_map *map)
+{
+
+	return (vm_map_max(map));
+}
+
+vm_offset_t
+vm_map_min_KBI(const struct vm_map *map)
+{
+
+	return (vm_map_min(map));
+}
+
+pmap_t
+vm_map_pmap_KBI(vm_map_t map)
+{
+
+	return (map->pmap);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>

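Among the vm_map.c changes above, the vm_map_protect() hunks add a pass that detects MAP_ENTRY_IN_TRANSITION entries so a protection change cannot race a concurrent vm_map_wire(), which drops the map lock while faulting pages in. The control flow is easy to lose in the diff, so here is a condensed sketch of just that retry discipline, using only symbols that appear in the hunks; the real routine also performs clipping, accounting, and pmap updates, which are omitted.

/*
 * Condensed sketch of the in-transition retry added to vm_map_protect()
 * above.  Error handling and the actual protection work are omitted.
 */
static int
protect_retry_sketch(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t current, entry, in_tran;

again:
	in_tran = NULL;
	vm_map_lock(map);
	vm_map_wait_busy(map);		/* do not race a wiring in progress */

	if (!vm_map_lookup_entry(map, start, &entry))
		entry = entry->next;

	/* First pass: look for entries still being wired or unwired. */
	for (current = entry; current->start < end; current = current->next)
		if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
			in_tran = current;

	if (in_tran != NULL) {
		/* Sleep until the wiring thread is done, then start over. */
		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
		vm_map_unlock_and_wait(map, 0);
		goto again;
	}

	/* ... apply the new protection as in the hunks above ... */
	vm_map_unlock(map);
	return (KERN_SUCCESS);
}
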
Modified: trunk/sys/vm/vm_map.h
===================================================================
--- trunk/sys/vm/vm_map.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_map.h 343426 2019-01-25 11:46:07Z kib $
  */
 
 /*
@@ -105,6 +105,7 @@
 	vm_offset_t start;		/* start address */
 	vm_offset_t end;		/* end address */
 	vm_offset_t pad0;
+	vm_offset_t next_read;		/* vaddr of the next sequential read */
 	vm_size_t adj_free;		/* amount of adjacent free space */
 	vm_size_t max_free;		/* max free space in subtree */
 	union vm_map_object object;	/* object I point to */
@@ -115,7 +116,6 @@
 	vm_inherit_t inheritance;	/* inheritance */
 	uint8_t read_ahead;		/* pages in the read-ahead window */
 	int wired_count;		/* can be paged if = 0 */
-	vm_pindex_t next_read;		/* index of the next sequential read */
 	struct ucred *cred;		/* tmp storage for creator ref */
 	struct thread *wiring_thread;
 };
@@ -173,15 +173,26 @@
  *	A map is a set of map entries.  These map entries are
  *	organized both as a binary search tree and as a doubly-linked
  *	list.  Both structures are ordered based upon the start and
- *	end addresses contained within each map entry.  Sleator and
- *	Tarjan's top-down splay algorithm is employed to control
- *	height imbalance in the binary search tree.
+ *	end addresses contained within each map entry.
  *
- * List of locks
+ *	Counterintuitively, the map's min offset value is stored in
+ *	map->header.end, and its max offset value is stored in
+ *	map->header.start.
+ *
+ *	The list header has max start value and min end value to act
+ *	as sentinels for sequential search of the doubly-linked list.
+ *	Sleator and Tarjan's top-down splay algorithm is employed to
+ *	control height imbalance in the binary search tree.
+ *
+ *	List of locks
  *	(c)	const until freed
  */
 struct vm_map {
 	struct vm_map_entry header;	/* List of entries */
+/*
+	map min_offset	header.end	(c)
+	map max_offset	header.start	(c)
+*/
 	struct sx lock;			/* Lock for map data */
 	struct mtx system_mtx;
 	int nentries;			/* Number of entries */
@@ -192,8 +203,6 @@
 	vm_flags_t flags;		/* flags for this vm_map */
 	vm_map_entry_t root;		/* Root of a binary search tree */
 	pmap_t pmap;			/* (c) Physical map */
-#define	min_offset	header.start	/* (c) */
-#define	max_offset	header.end	/* (c) */
 	int busy;
 };
 
@@ -204,16 +213,23 @@
 #define	MAP_BUSY_WAKEUP		0x02
 
 #ifdef	_KERNEL
+#ifdef KLD_MODULE
+#define	vm_map_max(map)		vm_map_max_KBI((map))
+#define	vm_map_min(map)		vm_map_min_KBI((map))
+#define	vm_map_pmap(map)	vm_map_pmap_KBI((map))
+#else
 static __inline vm_offset_t
 vm_map_max(const struct vm_map *map)
 {
-	return (map->max_offset);
+
+	return (map->header.start);
 }
 
 static __inline vm_offset_t
 vm_map_min(const struct vm_map *map)
 {
-	return (map->min_offset);
+
+	return (map->header.end);
 }
 
 static __inline pmap_t
@@ -227,6 +243,7 @@
 {
 	map->flags = (map->flags | set) & ~clear;
 }
+#endif	/* KLD_MODULE */
 #endif	/* _KERNEL */
 
 /*
@@ -287,6 +304,9 @@
 void vm_map_busy(vm_map_t map);
 void vm_map_unbusy(vm_map_t map);
 void vm_map_wait_busy(vm_map_t map);
+vm_offset_t vm_map_max_KBI(const struct vm_map *map);
+vm_offset_t vm_map_min_KBI(const struct vm_map *map);
+pmap_t vm_map_pmap_KBI(vm_map_t map);
 
 #define	vm_map_lock(map)	_vm_map_lock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock(map)	_vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
@@ -306,9 +326,8 @@
 #endif	/* _KERNEL */
 
 
-/* XXX: number of kernel maps and entries to statically allocate */
+/* XXX: number of kernel maps to statically allocate */
 #define MAX_KMAP	10
-#define	MAX_KMAPENT	128
 
 /*
  * Copy-on-write flags for vm_map operations
@@ -324,6 +343,7 @@
 #define MAP_DISABLE_COREDUMP	0x0100
 #define MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
 #define	MAP_VN_WRITECOUNT	0x0400
+#define	MAP_REMAP		0x0800
 #define	MAP_STACK_GROWS_DOWN	0x1000
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000
@@ -389,15 +409,13 @@
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
 boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
-void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
-    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
 int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
+void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry);
 void vm_map_startup (void);
 int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
-void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);

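The struct vm_map hunk above retires the min_offset/max_offset macros: the map's minimum address now lives in header.end and its maximum in header.start, so the list header doubles as a sentinel. That is what lets the vm_map.c loops earlier in this commit drop their explicit "entry != &map->header" tests. The sketch below illustrates the resulting walking idiom; clamp_and_walk() is an invented name, everything else comes from the diff.

/*
 * Illustrative only.  Because header.start == vm_map_max(map) and
 * header.end == vm_map_min(map), the header entry terminates the walk
 * without an explicit "entry != &map->header" test.
 */
static void
clamp_and_walk(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t entry;

	start = MAX(start, vm_map_min(map));	/* i.e. map->header.end */
	if (end > vm_map_max(map))		/* i.e. map->header.start */
		end = vm_map_max(map);

	if (!vm_map_lookup_entry(map, start, &entry))
		entry = entry->next;
	for (; entry->start < end; entry = entry->next) {
		/* ... operate on one map entry ... */
	}
}

Out-of-tree modules get the same bounds through the new vm_map_max_KBI(), vm_map_min_KBI(), and vm_map_pmap_KBI() functions selected under KLD_MODULE, so the struct layout can keep changing without breaking the module KBI.
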
Modified: trunk/sys/vm/vm_meter.c
===================================================================
--- trunk/sys/vm/vm_meter.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_meter.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_meter.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -54,24 +54,20 @@
 #include <vm/vm_object.h>
 #include <sys/sysctl.h>
 
-struct vmmeter cnt;
+struct vmmeter vm_cnt;
 
 SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
-	CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold");
+	CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold");
 SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
-	CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages");
+	CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages");
 SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
-	CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock");
+	CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock");
 SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
-	CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive");
-SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min,
-	CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue");
-SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max,
-	CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue");
+	CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive");
 SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
-	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
+	CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
 SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
-	CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point");
+	CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point");
 
 static int
 sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
@@ -140,7 +136,7 @@
 						else
 							total.t_sl++;
 						if (td->td_wchan ==
-						    &cnt.v_free_count)
+						    &vm_cnt.v_free_count)
 							total.t_pw++;
 					}
 					break;
@@ -209,13 +205,13 @@
 		}
 	}
 	mtx_unlock(&vm_object_list_mtx);
-	total.t_free = cnt.v_free_count + cnt.v_cache_count;
+	total.t_free = vm_cnt.v_free_count;
 	return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
 }
 
 /*
- * vcnt() -	accumulate statistics from all cpus and the global cnt
- *		structure.
+ * vm_meter_cnt() -	accumulate statistics from all cpus and the global cnt
+ *			structure.
  *
  *	The vmmeter structure is now per-cpu as well as global.  Those
  *	statistics which can be kept on a per-cpu basis (to avoid cache
@@ -222,23 +218,31 @@
  *	stalls between cpus) can be moved to the per-cpu vmmeter.  Remaining
  *	statistics, such as v_free_reserved, are left in the global
  *	structure.
- *
- * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
  */
-static int
-vcnt(SYSCTL_HANDLER_ARGS)
+u_int
+vm_meter_cnt(size_t offset)
 {
-	int count = *(int *)arg1;
-	int offset = (char *)arg1 - (char *)&cnt;
+	struct pcpu *pcpu;
+	u_int count;
 	int i;
 
+	count = *(u_int *)((char *)&vm_cnt + offset);
 	CPU_FOREACH(i) {
-		struct pcpu *pcpu = pcpu_find(i);
-		count += *(int *)((char *)&pcpu->pc_cnt + offset);
+		pcpu = pcpu_find(i);
+		count += *(u_int *)((char *)&pcpu->pc_cnt + offset);
 	}
-	return (SYSCTL_OUT(req, &count, sizeof(int)));
+	return (count);
 }
 
+static int
+cnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	u_int count;
+
+	count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt);
+	return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
 SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", 
     "System virtual memory statistics");
@@ -251,8 +255,8 @@
 
 #define	VM_STATS(parent, var, descr) \
 	SYSCTL_PROC(parent, OID_AUTO, var, \
-	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \
-	    "IU", descr)
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0,	\
+	    cnt_sysctl, "IU", descr)
 #define	VM_STATS_VM(var, descr)		VM_STATS(_vm_stats_vm, var, descr)
 #define	VM_STATS_SYS(var, descr)	VM_STATS(_vm_stats_sys, var, descr)
 
@@ -276,9 +280,10 @@
 VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
 VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
 VM_STATS_VM(v_intrans, "In transit page faults");
-VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon");
 VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
 VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls");
 VM_STATS_VM(v_tcached, "Total pages cached");
 VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
 VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
@@ -293,9 +298,8 @@
 VM_STATS_VM(v_active_count, "Active pages");
 VM_STATS_VM(v_inactive_target, "Desired inactive pages");
 VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_laundry_count, "Pages eligible for laundering");
 VM_STATS_VM(v_cache_count, "Pages on cache queue");
-VM_STATS_VM(v_cache_min, "Min pages on cache queue");
-VM_STATS_VM(v_cache_max, "Max pages on cached queue");
 VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
 VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
 VM_STATS_VM(v_forks, "Number of fork() calls");

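The vm_meter.c hunk renames the global cnt to vm_cnt and splits the old vcnt() sysctl handler in two: vm_meter_cnt() sums one field of the global vmmeter with every CPU's pc_cnt copy, identified by its byte offset, and cnt_sysctl() is now only a thin wrapper for the VM_STATS oids. A minimal sketch of a caller of the new KPI follows; the wrapper name is invented, __offsetof() is the <sys/cdefs.h> macro, and v_free_count is just one example of a per-CPU-capable statistic.

/*
 * Hypothetical caller of vm_meter_cnt(): read the system-wide free
 * page count by passing the field's byte offset in struct vmmeter.
 */
static u_int
free_page_count(void)
{

	return (vm_meter_cnt(__offsetof(struct vmmeter, v_free_count)));
}

Note that totals which previously added v_cache_count (see the vmtotal() hunk above) now use v_free_count alone, consistent with this import dropping the separate page cache queue in favor of the laundry queue.
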
Modified: trunk/sys/vm/vm_mmap.c
===================================================================
--- trunk/sys/vm/vm_mmap.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_mmap.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -42,10 +42,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_mmap.c 356634 2020-01-11 15:06:06Z kevans $");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -74,6 +75,7 @@
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
+#include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
@@ -93,21 +95,16 @@
 #endif
 
 int old_mlock = 0;
-SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
+SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
-TUNABLE_INT("vm.old_mlock", &old_mlock);
+static int mincore_mapped = 1;
+SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
+    "mincore reports mappings, not residency");
 
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
-static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
-static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
-static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
-
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
@@ -177,34 +174,48 @@
 #endif
 
 int
-sys_mmap(td, uap)
-	struct thread *td;
-	struct mmap_args *uap;
+sys_mmap(struct thread *td, struct mmap_args *uap)
 {
-#ifdef HWPMC_HOOKS
-	struct pmckern_map_in pkm;
-#endif
+
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
+}
+
+int
+kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags,
+    int fd, off_t pos)
+{
+
+	return (kern_mmap_fpcheck(td, addr0, size, prot, flags, fd, pos, NULL));
+}
+
+/*
+ * When mmap'ing a file, check_fp_fn may be used for the caller to do any
+ * last-minute validation based on the referenced file in a non-racy way.
+ */
+int
+kern_mmap_fpcheck(struct thread *td, uintptr_t addr0, size_t size, int prot,
+    int flags, int fd, off_t pos, mmap_check_fp_fn check_fp_fn)
+{
+	struct vmspace *vms;
 	struct file *fp;
-	struct vnode *vp;
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	vm_prot_t cap_maxprot, prot, maxprot;
-	void *handle;
-	objtype_t handle_type;
-	int align, error, flags;
-	off_t pos;
-	struct vmspace *vms = td->td_proc->p_vmspace;
+	vm_size_t pageoff;
+	vm_prot_t cap_maxprot;
+	int align, error;
 	cap_rights_t rights;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot & VM_PROT_ALL;
-	flags = uap->flags;
-	pos = uap->pos;
-
+	vms = td->td_proc->p_vmspace;
 	fp = NULL;
+	AUDIT_ARG_FD(fd);
+	addr = addr0;
 
 	/*
+	 * Ignore old flags that used to be defined but did not do anything.
+	 */
+	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
+	
+	/*
 	 * Enforce the constraints.
 	 * Mapping of length 0 is only allowed for old binaries.
 	 * Anonymous mapping shall specify -1 as filedescriptor and
@@ -214,8 +225,8 @@
 	 * pos.
 	 */
 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
-		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
-		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
+		if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
+		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
 			return (EINVAL);
 	} else {
 		if ((flags & MAP_ANON) != 0)
@@ -223,15 +234,28 @@
 	}
 
 	if (flags & MAP_STACK) {
-		if ((uap->fd != -1) ||
+		if ((fd != -1) ||
 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
 			return (EINVAL);
 		flags |= MAP_ANON;
 		pos = 0;
 	}
+	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
+	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
+	    MAP_PREFAULT_READ | MAP_GUARD |
+#ifdef MAP_32BIT
+	    MAP_32BIT |
+#endif
+	    MAP_ALIGNMENT_MASK)) != 0)
+		return (EINVAL);
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
-	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
+	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
+		return (EINVAL);
+	if (prot != PROT_NONE &&
+	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
+		return (EINVAL);
+	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
 	    pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
 	    MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
 		return (EINVAL);
@@ -295,28 +319,32 @@
 		 * There should really be a pmap call to determine a reasonable
 		 * location.
 		 */
-		PROC_LOCK(td->td_proc);
 		if (addr == 0 ||
 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    addr < round_page((vm_offset_t)vms->vm_daddr +
-		    lim_max(td->td_proc, RLIMIT_DATA))))
+		    lim_max(td, RLIMIT_DATA))))
 			addr = round_page((vm_offset_t)vms->vm_daddr +
-			    lim_max(td->td_proc, RLIMIT_DATA));
-		PROC_UNLOCK(td->td_proc);
+			    lim_max(td, RLIMIT_DATA));
 	}
-	if ((flags & MAP_GUARD) != 0) {
-		handle = NULL;
-		handle_type = OBJT_DEFAULT;
-		maxprot = VM_PROT_NONE;
-		cap_maxprot = VM_PROT_NONE;
+	if (size == 0) {
+		/*
+		 * Return success without mapping anything for old
+		 * binaries that request a page-aligned mapping of
+		 * length 0.  For modern binaries, this function
+		 * returns an error earlier.
+		 */
+		error = 0;
+	} else if ((flags & MAP_GUARD) != 0) {
+		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
+		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
 	} else if ((flags & MAP_ANON) != 0) {
 		/*
 		 * Mapping blank space is trivial.
+		 *
+		 * This relies on VM_PROT_* matching PROT_*.
 		 */
-		handle = NULL;
-		handle_type = OBJT_DEFAULT;
-		maxprot = VM_PROT_ALL;
-		cap_maxprot = VM_PROT_ALL;
+		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
+		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
 	} else {
 		/*
 		 * Mapping file, get fp for validation and don't let the
@@ -333,94 +361,24 @@
 		}
 		if (prot & PROT_EXEC)
 			cap_rights_set(&rights, CAP_MMAP_X);
-		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
 		if (error != 0)
 			goto done;
-		if (fp->f_type == DTYPE_SHM) {
-			handle = fp->f_data;
-			handle_type = OBJT_SWAP;
-			maxprot = VM_PROT_NONE;
-
-			/* FREAD should always be set. */
-			if (fp->f_flag & FREAD)
-				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
-			if (fp->f_flag & FWRITE)
-				maxprot |= VM_PROT_WRITE;
-			goto map;
-		}
-		if (fp->f_type != DTYPE_VNODE) {
-			error = ENODEV;
+		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
+		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
+			error = EINVAL;
 			goto done;
 		}
-#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
-    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
-		/*
-		 * POSIX shared-memory objects are defined to have
-		 * kernel persistence, and are not defined to support
-		 * read(2)/write(2) -- or even open(2).  Thus, we can
-		 * use MAP_ASYNC to trade on-disk coherence for speed.
-		 * The shm_open(3) library routine turns on the FPOSIXSHM
-		 * flag to request this behavior.
-		 */
-		if (fp->f_flag & FPOSIXSHM)
-			flags |= MAP_NOSYNC;
-#endif
-		vp = fp->f_vnode;
-		/*
-		 * Ensure that file and memory protections are
-		 * compatible.  Note that we only worry about
-		 * writability if mapping is shared; in this case,
-		 * current and max prot are dictated by the open file.
-		 * XXX use the vnode instead?  Problem is: what
-		 * credentials do we use for determination? What if
-		 * proc does a setuid?
-		 */
-		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
-			maxprot = VM_PROT_NONE;
-		else
-			maxprot = VM_PROT_EXECUTE;
-		if (fp->f_flag & FREAD) {
-			maxprot |= VM_PROT_READ;
-		} else if (prot & PROT_READ) {
-			error = EACCES;
-			goto done;
-		}
-		/*
-		 * If we are sharing potential changes (either via
-		 * MAP_SHARED or via the implicit sharing of character
-		 * device mappings), and we are trying to get write
-		 * permission although we opened it without asking
-		 * for it, bail out.
-		 */
-		if ((flags & MAP_SHARED) != 0) {
-			if ((fp->f_flag & FWRITE) != 0) {
-				maxprot |= VM_PROT_WRITE;
-			} else if ((prot & PROT_WRITE) != 0) {
-				error = EACCES;
+		if (check_fp_fn != NULL) {
+			error = check_fp_fn(fp, prot, cap_maxprot, flags);
+			if (error != 0)
 				goto done;
-			}
-		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
-			maxprot |= VM_PROT_WRITE;
-			cap_maxprot |= VM_PROT_WRITE;
 		}
-		handle = (void *)vp;
-		handle_type = OBJT_VNODE;
+		/* This relies on VM_PROT_* matching PROT_*. */
+		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
+		    cap_maxprot, flags, pos, td);
 	}
-map:
-	td->td_fpop = fp;
-	maxprot &= cap_maxprot;
-	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
-	    flags, handle_type, handle, pos);
-	td->td_fpop = NULL;
-#ifdef HWPMC_HOOKS
-	/* inform hwpmc(4) if an executable is being mapped */
-	if (error == 0 && handle_type == OBJT_VNODE &&
-	    (prot & PROT_EXEC)) {
-		pkm.pm_file = handle;
-		pkm.pm_address = (uintptr_t) addr;
-		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
-	}
-#endif
+
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
@@ -430,19 +388,15 @@
 	return (error);
 }
 
+#if defined(COMPAT_FREEBSD6)
 int
 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
 {
-	struct mmap_args oargs;
 
-	oargs.addr = uap->addr;
-	oargs.len = uap->len;
-	oargs.prot = uap->prot;
-	oargs.flags = uap->flags;
-	oargs.fd = uap->fd;
-	oargs.pos = uap->pos;
-	return (sys_mmap(td, &oargs));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
 }
+#endif
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
@@ -456,11 +410,8 @@
 };
 #endif
 int
-ommap(td, uap)
-	struct thread *td;
-	struct ommap_args *uap;
+ommap(struct thread *td, struct ommap_args *uap)
 {
-	struct mmap_args nargs;
 	static const char cvtbsdprot[8] = {
 		0,
 		PROT_EXEC,
@@ -471,6 +422,7 @@
 		PROT_WRITE | PROT_READ,
 		PROT_EXEC | PROT_WRITE | PROT_READ,
 	};
+	int flags, prot;
 
 #define	OMAP_ANON	0x0002
 #define	OMAP_COPY	0x0020
@@ -477,30 +429,27 @@
 #define	OMAP_SHARED	0x0010
 #define	OMAP_FIXED	0x0100
 
-	nargs.addr = uap->addr;
-	nargs.len = uap->len;
-	nargs.prot = cvtbsdprot[uap->prot & 0x7];
+	prot = cvtbsdprot[uap->prot & 0x7];
 #ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
-	    nargs.prot != 0)
-		nargs.prot |= PROT_EXEC;
+	    prot != 0)
+		prot |= PROT_EXEC;
 #endif
 #endif
-	nargs.flags = 0;
+	flags = 0;
 	if (uap->flags & OMAP_ANON)
-		nargs.flags |= MAP_ANON;
+		flags |= MAP_ANON;
 	if (uap->flags & OMAP_COPY)
-		nargs.flags |= MAP_COPY;
+		flags |= MAP_COPY;
 	if (uap->flags & OMAP_SHARED)
-		nargs.flags |= MAP_SHARED;
+		flags |= MAP_SHARED;
 	else
-		nargs.flags |= MAP_PRIVATE;
+		flags |= MAP_PRIVATE;
 	if (uap->flags & OMAP_FIXED)
-		nargs.flags |= MAP_FIXED;
-	nargs.fd = uap->fd;
-	nargs.pos = uap->pos;
-	return (sys_mmap(td, &nargs));
+		flags |= MAP_FIXED;
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
+	    uap->fd, uap->pos));
 }
 #endif				/* COMPAT_43 */
 
@@ -513,20 +462,21 @@
 };
 #endif
 int
-sys_msync(td, uap)
-	struct thread *td;
-	struct msync_args *uap;
+sys_msync(struct thread *td, struct msync_args *uap)
 {
+
+	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
+}
+
+int
+kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
+{
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	int flags;
+	vm_size_t pageoff;
 	vm_map_t map;
 	int rv;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	flags = uap->flags;
-
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
@@ -565,23 +515,28 @@
 };
 #endif
 int
-sys_munmap(td, uap)
-	struct thread *td;
-	struct munmap_args *uap;
+sys_munmap(struct thread *td, struct munmap_args *uap)
 {
+
+	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
+{
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_out pkm;
 	vm_map_entry_t entry;
+	bool pmc_handled;
 #endif
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
+	vm_size_t pageoff;
 	vm_map_t map;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
 	if (size == 0)
 		return (EINVAL);
 
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
@@ -597,20 +552,23 @@
 		return (EINVAL);
 	vm_map_lock(map);
 #ifdef HWPMC_HOOKS
-	/*
-	 * Inform hwpmc if the address range being unmapped contains
-	 * an executable region.
-	 */
-	pkm.pm_address = (uintptr_t) NULL;
-	if (vm_map_lookup_entry(map, addr, &entry)) {
-		for (;
-		     entry != &map->header && entry->start < addr + size;
-		     entry = entry->next) {
-			if (vm_map_check_protection(map, entry->start,
-				entry->end, VM_PROT_EXECUTE) == TRUE) {
-				pkm.pm_address = (uintptr_t) addr;
-				pkm.pm_size = (size_t) size;
-				break;
+	pmc_handled = false;
+	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
+		pmc_handled = true;
+		/*
+		 * Inform hwpmc if the address range being unmapped contains
+		 * an executable region.
+		 */
+		pkm.pm_address = (uintptr_t) NULL;
+		if (vm_map_lookup_entry(map, addr, &entry)) {
+			for (; entry->start < addr + size;
+			    entry = entry->next) {
+				if (vm_map_check_protection(map, entry->start,
+					entry->end, VM_PROT_EXECUTE) == TRUE) {
+					pkm.pm_address = (uintptr_t) addr;
+					pkm.pm_size = (size_t) size;
+					break;
+				}
 			}
 		}
 	}
@@ -618,14 +576,16 @@
 	vm_map_delete(map, addr, addr + size);
 
 #ifdef HWPMC_HOOKS
-	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
-	vm_map_lock_downgrade(map);
-	if (pkm.pm_address != (uintptr_t) NULL)
-		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
-	vm_map_unlock_read(map);
-#else
-	vm_map_unlock(map);
+	if (__predict_false(pmc_handled)) {
+		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
+		vm_map_lock_downgrade(map);
+		if (pkm.pm_address != (uintptr_t) NULL)
+			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
+		vm_map_unlock_read(map);
+	} else
 #endif
+		vm_map_unlock(map);
+
 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
 	return (0);
 }
@@ -638,22 +598,30 @@
 };
 #endif
 int
-sys_mprotect(td, uap)
-	struct thread *td;
-	struct mprotect_args *uap;
+sys_mprotect(struct thread *td, struct mprotect_args *uap)
 {
+
+	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
+}
+
+int
+kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
+{
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	vm_prot_t prot;
+	vm_size_t pageoff;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot & VM_PROT_ALL;
-
+	addr = addr0;
+	prot = (prot & VM_PROT_ALL);
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
+#ifdef COMPAT_FREEBSD32
+	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+		if (((addr + size) & 0xffffffff) < addr)
+			return (EINVAL);
+	} else
+#endif
 	if (addr + size < addr)
 		return (EINVAL);
 
@@ -715,8 +683,15 @@
 int
 sys_madvise(struct thread *td, struct madvise_args *uap)
 {
-	vm_offset_t start, end;
+
+	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
+}
+
+int
+kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
+{
 	vm_map_t map;
+	vm_offset_t addr, end, start;
 	int flags;
 
 	/*
@@ -723,7 +698,7 @@
 	 * Check for our special case, advising the swap pager we are
 	 * "immortal."
 	 */
-	if (uap->behav == MADV_PROTECT) {
+	if (behav == MADV_PROTECT) {
 		flags = PPROT_SET;
 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
 		    PROC_SPROTECT, &flags));
@@ -732,7 +707,7 @@
 	/*
 	 * Check for illegal behavior
 	 */
-	if (uap->behav < 0 || uap->behav > MADV_CORE)
+	if (behav < 0 || behav > MADV_CORE)
 		return (EINVAL);
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
@@ -739,10 +714,10 @@
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
-	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
-	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
+	addr = addr0;
+	if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
 		return (EINVAL);
-	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
+	if ((addr + len) < addr)
 		return (EINVAL);
 
 	/*
@@ -749,10 +724,10 @@
 	 * Since this routine is only advisory, we default to conservative
 	 * behavior.
 	 */
-	start = trunc_page((vm_offset_t) uap->addr);
-	end = round_page((vm_offset_t) uap->addr + uap->len);
+	start = trunc_page(addr);
+	end = round_page(addr + len);
 
-	if (vm_map_madvise(map, start, end, uap->behav))
+	if (vm_map_madvise(map, start, end, behav))
 		return (EINVAL);
 	return (0);
 }
@@ -768,11 +743,17 @@
 int
 sys_mincore(struct thread *td, struct mincore_args *uap)
 {
+
+	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
+}
+
+int
+kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
+{
 	vm_offset_t addr, first_addr;
 	vm_offset_t end, cend;
 	pmap_t pmap;
 	vm_map_t map;
-	char *vec;
 	int error = 0;
 	int vecindex, lastvecindex;
 	vm_map_entry_t current;
@@ -789,17 +770,12 @@
 	 * Make sure that the addresses presented are valid for user
 	 * mode.
 	 */
-	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
-	end = addr + (vm_size_t)round_page(uap->len);
+	first_addr = addr = trunc_page(addr0);
+	end = addr + (vm_size_t)round_page(len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
 		return (ENOMEM);
 
-	/*
-	 * Address of byte vector
-	 */
-	vec = uap->vec;
-
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 
 	vm_map_lock_read(map);
@@ -817,16 +793,12 @@
 	 * up the pages elsewhere.
 	 */
 	lastvecindex = -1;
-	for (current = entry;
-	    (current != &map->header) && (current->start < end);
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 
 		/*
 		 * check for contiguity
 		 */
-		if (current->end < end &&
-		    (entry->next == &map->header ||
-		     current->next->start > current->end)) {
+		if (current->end < end && current->next->start > current->end) {
 			vm_map_unlock_read(map);
 			return (ENOMEM);
 		}
@@ -862,8 +834,17 @@
 		retry:
 			m = NULL;
 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
-			if (locked_pa != 0) {
+			if (mincore_mapped) {
 				/*
+				 * We only care about this pmap's
+				 * mapping of the page, if any.
+				 */
+				if (locked_pa != 0) {
+					vm_page_unlock(PHYS_TO_VM_PAGE(
+					    locked_pa));
+				}
+			} else if (locked_pa != 0) {
+				/*
 				 * The page is mapped by this process but not
 				 * both accessed and modified.  It is also
 				 * managed.  Acquire the object lock so that
@@ -905,9 +886,6 @@
 					pindex = OFF_TO_IDX(current->offset +
 					    (addr - current->start));
 					m = vm_page_lookup(object, pindex);
-					if (m == NULL &&
-					    vm_page_is_cached(object, pindex))
-						mincoreinfo = MINCORE_INCORE;
 					if (m != NULL && m->valid == 0)
 						m = NULL;
 					if (m != NULL)
@@ -945,7 +923,7 @@
 			/*
 			 * calculate index into user supplied byte vector
 			 */
-			vecindex = OFF_TO_IDX(addr - first_addr);
+			vecindex = atop(addr - first_addr);
 
 			/*
 			 * If we have skipped map entries, we need to make sure that
@@ -991,7 +969,7 @@
 	/*
 	 * Zero the last entries in the byte vector.
 	 */
-	vecindex = OFF_TO_IDX(end - first_addr);
+	vecindex = atop(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
 		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
@@ -1023,11 +1001,12 @@
 sys_mlock(struct thread *td, struct mlock_args *uap)
 {
 
-	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+	return (kern_mlock(td->td_proc, td->td_ucred,
+	    __DECONST(uintptr_t, uap->addr), uap->len));
 }
 
 int
-vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t npages, size;
@@ -1038,7 +1017,7 @@
 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)addr0;
+	addr = addr0;
 	size = len;
 	last = addr + size;
 	start = trunc_page(addr);
@@ -1051,12 +1030,12 @@
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
-	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
+	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 		PROC_UNLOCK(proc);
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(proc);
-	if (npages + cnt.v_wire_count > vm_page_max_wired)
+	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #ifdef RACCT
 	if (racct_enable) {
@@ -1106,7 +1085,7 @@
 	 */
 	if (!old_mlock && uap->how & MCL_CURRENT) {
 		PROC_LOCK(td->td_proc);
-		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
@@ -1195,12 +1174,16 @@
 };
 #endif
 int
-sys_munlock(td, uap)
-	struct thread *td;
-	struct munlock_args *uap;
+sys_munlock(struct thread *td, struct munlock_args *uap)
 {
+
+	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
+{
 	vm_offset_t addr, end, last, start;
-	vm_size_t size;
 #ifdef RACCT
 	vm_map_t map;
 #endif
@@ -1209,8 +1192,7 @@
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)uap->addr;
-	size = uap->len;
+	addr = addr0;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
@@ -1235,9 +1217,6 @@
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
- *
- * For VCHR vnodes, the vnode lock is held over the call to
- * vm_mmap_cdev() to keep vp->v_rdev valid.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
@@ -1247,7 +1226,7 @@
 {
 	struct vattr va;
 	vm_object_t obj;
-	vm_offset_t foff;
+	vm_ooffset_t foff;
 	struct ucred *cred;
 	int error, flags, locktype;
 
@@ -1258,6 +1237,7 @@
 		locktype = LK_SHARED;
 	if ((error = vget(vp, locktype, td)) != 0)
 		return (error);
+	AUDIT_ARG_VNODE1(vp);
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
@@ -1284,12 +1264,6 @@
 			*writecounted = TRUE;
 			vnode_pager_update_writecount(obj, 0, objsize);
 		}
-	} else if (vp->v_type == VCHR) {
-		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
-		    vp->v_rdev, foffp, objp);
-		if (error == 0)
-			goto mark_atime;
-		goto done;
 	} else {
 		error = EINVAL;
 		goto done;
@@ -1297,13 +1271,14 @@
 	if ((error = VOP_GETATTR(vp, &va, cred)))
 		goto done;
 #ifdef MAC
-	error = mac_vnode_check_mmap(cred, vp, prot, flags);
+	/* This relies on VM_PROT_* matching PROT_*. */
+	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 	if (error != 0)
 		goto done;
 #endif
 	if ((flags & MAP_SHARED) != 0) {
 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
-			if (prot & PROT_WRITE) {
+			if (prot & VM_PROT_WRITE) {
 				error = EPERM;
 				goto done;
 			}
@@ -1318,22 +1293,26 @@
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
-	if (obj->type == OBJT_VNODE)
+	if (obj->type == OBJT_VNODE) {
 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 		    cred);
-	else {
+		if (obj == NULL) {
+			error = ENOMEM;
+			goto done;
+		}
+	} else {
 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 		    ("wrong object type"));
-		vm_object_reference(obj);
+		VM_OBJECT_WLOCK(obj);
+		vm_object_reference_locked(obj);
+#if VM_NRESERVLEVEL > 0
+		vm_object_color(obj, 0);
+#endif
+		VM_OBJECT_WUNLOCK(obj);
 	}
-	if (obj == NULL) {
-		error = ENOMEM;
-		goto done;
-	}
 	*objp = obj;
 	*flagsp = flags;
 
-mark_atime:
 	vfs_mark_atime(vp, cred);
 
 done:
@@ -1352,21 +1331,18 @@
  * operations on cdevs.
  */
 int
-vm_mmap_cdev(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
+vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
+    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
+    vm_ooffset_t *foff, vm_object_t *objp)
 {
 	vm_object_t obj;
-	struct cdevsw *dsw;
-	int error, flags, ref;
+	int error, flags;
 
 	flags = *flagsp;
 
-	dsw = dev_refthread(cdev, &ref);
-	if (dsw == NULL)
-		return (ENXIO);
 	if (dsw->d_flags & D_MMAP_ANON) {
-		dev_relthread(cdev, ref);
+		*objp = NULL;
+		*foff = 0;
 		*maxprotp = VM_PROT_ALL;
 		*flagsp |= MAP_ANON;
 		return (0);
@@ -1375,24 +1351,18 @@
 	 * cdevs do not provide private mappings of any kind.
 	 */
 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0) {
-		dev_relthread(cdev, ref);
+	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
-	}
-	if (flags & (MAP_PRIVATE|MAP_COPY)) {
-		dev_relthread(cdev, ref);
+	if (flags & (MAP_PRIVATE|MAP_COPY))
 		return (EINVAL);
-	}
 	/*
 	 * Force device mappings to be shared.
 	 */
 	flags |= MAP_SHARED;
 #ifdef MAC_XXX
-	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
-	if (error != 0) {
-		dev_relthread(cdev, ref);
+	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
+	if (error != 0)
 		return (error);
-	}
 #endif
 	/*
 	 * First, try d_mmap_single().  If that is not implemented
@@ -1404,7 +1374,6 @@
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
-	dev_relthread(cdev, ref);
 	if (error != ENODEV)
 		return (error);
 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
@@ -1417,65 +1386,96 @@
 }
 
 /*
- * vm_mmap_shm()
+ * vm_mmap()
  *
- * MPSAFE
- *
- * Helper function for vm_mmap.  Perform sanity check specific for mmap
- * operations on shm file descriptors.
+ * Internal version of mmap used by exec, sys5 shared memory, and
+ * various device drivers.  Handle is either a vnode pointer, a
+ * character device, or NULL for MAP_ANON.
  */
 int
-vm_mmap_shm(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
+vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+	vm_prot_t maxprot, int flags,
+	objtype_t handle_type, void *handle,
+	vm_ooffset_t foff)
 {
+	vm_object_t object;
+	struct thread *td = curthread;
 	int error;
+	boolean_t writecounted;
 
-	if ((*flagsp & MAP_SHARED) != 0 &&
-	    (*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0)
-		return (EACCES);
-#ifdef MAC
-	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
-	if (error != 0)
-		return (error);
-#endif
-	error = shm_mmap(shmfd, objsize, foff, objp);
+	if (size == 0)
+		return (EINVAL);
+
+	size = round_page(size);
+	object = NULL;
+	writecounted = FALSE;
+
+	/*
+	 * Lookup/allocate object.
+	 */
+	switch (handle_type) {
+	case OBJT_DEVICE: {
+		struct cdevsw *dsw;
+		struct cdev *cdev;
+		int ref;
+
+		cdev = handle;
+		dsw = dev_refthread(cdev, &ref);
+		if (dsw == NULL)
+			return (ENXIO);
+		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
+		    dsw, &foff, &object);
+		dev_relthread(cdev, ref);
+		break;
+	}
+	case OBJT_VNODE:
+		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
+		    handle, &foff, &object, &writecounted);
+		break;
+	case OBJT_DEFAULT:
+		if (handle == NULL) {
+			error = 0;
+			break;
+		}
+		/* FALLTHROUGH */
+	default:
+		error = EINVAL;
+		break;
+	}
 	if (error)
 		return (error);
-	return (0);
+
+	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+	    foff, writecounted, td);
+	if (error != 0 && object != NULL) {
+		/*
+		 * If this mapping was accounted for in the vnode's
+		 * writecount, then undo that now.
+		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
+		vm_object_deallocate(object);
+	}
+	return (error);
 }
 
 /*
- * vm_mmap()
- *
- * MPSAFE
- *
- * Internal version of mmap.  Currently used by mmap, exec, and sys5
- * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
+ * Internal version of mmap that maps a specific VM object into a
+ * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
  */
 int
-vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
-	vm_prot_t maxprot, int flags,
-	objtype_t handle_type, void *handle,
-	vm_ooffset_t foff)
+vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
+    boolean_t writecounted, struct thread *td)
 {
 	boolean_t curmap, fitit;
 	vm_offset_t max_addr;
-	vm_object_t object = NULL;
-	struct thread *td = curthread;
 	int docow, error, findspace, rv;
-	boolean_t writecounted;
 
-	if (size == 0)
-		return (0);
-
-	size = round_page(size);
-
 	curmap = map == &td->td_proc->p_vmspace->vm_map;
 	if (curmap) {
 		PROC_LOCK(td->td_proc);
-		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
+		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
@@ -1485,7 +1485,7 @@
 		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
-			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
@@ -1505,11 +1505,11 @@
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
-	 * The check is here rather than in the syscall because the
-	 * kernel calls this function internally for other mmaping
-	 * operations (such as in exec) and non-aligned offsets will
-	 * cause pmap inconsistencies...so we want to be sure to
-	 * disallow this in all cases.
+	 * The mmap() system call already enforces this by subtracting
+	 * the page offset from the file offset, but checking here
+	 * catches errors in device drivers (e.g. d_mmap_single()
+	 * callbacks) and other internal mapping requests (such as in
+	 * exec).
 	 */
 	if (foff & PAGE_MASK)
 		return (EINVAL);
@@ -1522,44 +1522,11 @@
 			return (EINVAL);
 		fitit = FALSE;
 	}
-	writecounted = FALSE;
 
-	/*
-	 * Lookup/allocate object.
-	 */
-	switch (handle_type) {
-	case OBJT_DEVICE:
-		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object);
-		break;
-	case OBJT_VNODE:
-		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object, &writecounted);
-		break;
-	case OBJT_SWAP:
-		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
-		    handle, foff, &object);
-		break;
-	case OBJT_DEFAULT:
-		if (handle == NULL) {
-			error = 0;
-			break;
-		}
-		/* FALLTHROUGH */
-	default:
-		error = EINVAL;
-		break;
-	}
-	if (error)
-		return (error);
 	if (flags & MAP_ANON) {
-		object = NULL;
+		if (object != NULL || foff != 0)
+			return (EINVAL);
 		docow = 0;
-		/*
-		 * Unnamed anonymous regions always start at 0.
-		 */
-		if (handle == 0)
-			foff = 0;
 	} else if (flags & MAP_PREFAULT_READ)
 		docow = MAP_PREFAULT;
 	else
@@ -1600,15 +1567,9 @@
 			max_addr = MAP_32BIT_MAX_ADDR;
 #endif
 		if (curmap) {
-			vm_offset_t min_addr;
-
-			PROC_LOCK(td->td_proc);
-			min_addr = round_page((vm_offset_t)td->td_proc->
-			    p_vmspace->vm_daddr + lim_max(td->td_proc,
-			    RLIMIT_DATA));
-			PROC_UNLOCK(td->td_proc);
 			rv = vm_map_find_min(map, object, foff, addr, size,
-			    min_addr, max_addr,
+			    round_page((vm_offset_t)td->td_proc->p_vmspace->
+			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
 			    findspace, prot, maxprot, docow);
 		} else {
 			rv = vm_map_find(map, object, foff, addr, size,
@@ -1629,19 +1590,6 @@
 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 		}
-	} else {
-		/*
-		 * If this mapping was accounted for in the vnode's
-		 * writecount, then undo that now.
-		 */
-		if (writecounted)
-			vnode_pager_release_writecount(object, 0, size);
-		/*
-		 * Lose the object reference.  Will destroy the
-		 * object if it's an unnamed anonymous mapping
-		 * or named anonymous without other references.
-		 */
-		vm_object_deallocate(object);
 	}
 	return (vm_mmap_to_errno(rv));
 }

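The recurring change in this file is the split of each syscall handler
into a thin sys_*() wrapper plus a kern_*() helper that takes plain
scalar arguments (kern_mprotect(), kern_madvise(), kern_mincore(),
kern_mlock(), kern_munlock()), so in-kernel consumers such as
compatibility layers can reach the VM code without fabricating a
*_args structure.  The sketch below shows how a hypothetical compat
shim might use the new entry points; the foreign_madvise_args layout
and the shim are invented for illustration, the prototype location
(<sys/syscallsubr.h>) is an assumption, and only the kern_madvise()
signature itself comes from this change.

/*
 * Hypothetical compat-layer shim: translate a foreign madvise(2)
 * request into the native kernel entry point.  Only kern_madvise()
 * is real here; the argument structure is assumed for illustration.
 */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>	/* assumed home of the kern_*() prototypes */

struct foreign_madvise_args {		/* assumed layout */
	uintptr_t	addr;
	size_t		len;
	int		behav;
};

static int
compat_foreign_madvise(struct thread *td, struct foreign_madvise_args *uap)
{

	/* No struct madvise_args has to be faked up any more. */
	return (kern_madvise(td, uap->addr, uap->len, uap->behav));
}

The second half of the file applies the same idea to vm_mmap(): the
object lookup (vnode, cdev, or anonymous) now happens in vm_mmap()
itself, while vm_mmap_object() only maps an already-resolved VM
object, which is the path used for MAP_ANON as well as by shm_mmap()
and vn_mmap().
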
Modified: trunk/sys/vm/vm_object.c
===================================================================
--- trunk/sys/vm/vm_object.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_object.c 352331 2019-09-14 13:35:48Z kib $");
 
 #include "opt_vm.h"
 
@@ -74,6 +74,7 @@
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
+#include <sys/pctrie.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
@@ -179,9 +180,6 @@
 	    ("object %p has reservations",
 	    object));
 #endif
-	KASSERT(vm_object_cache_is_empty(object),
-	    ("object %p has cached pages",
-	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
@@ -203,19 +201,16 @@
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
-	bzero(&object->lock, sizeof(object->lock));
-	rw_init_flags(&object->lock, "vm object", RW_DUPOK);
+	rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW);
 
 	/* These are true for any object that has been freed */
 	object->type = OBJT_DEAD;
 	object->ref_count = 0;
-	object->rtree.rt_root = 0;
-	object->rtree.rt_flags = 0;
+	vm_radix_init(&object->rtree);
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
-	object->cache.rt_root = 0;
-	object->cache.rt_flags = 0;
+	object->flags = OBJ_DEAD;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -231,6 +226,16 @@
 	LIST_INIT(&object->shadow_head);
 
 	object->type = type;
+	if (type == OBJT_SWAP)
+		pctrie_init(&object->un_pager.swp.swp_blks);
+
+	/*
+	 * Ensure that swap_pager_swapoff() iteration over object_list
+	 * sees up to date type and pctrie head if it observed
+	 * non-dead object.
+	 */
+	atomic_thread_fence_rel();
+
 	switch (type) {
 	case OBJT_DEAD:
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
@@ -266,6 +271,7 @@
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
+	umtx_shm_object_init(object);
 }
 
 /*
@@ -280,8 +286,8 @@
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	rw_init(&kernel_object->lock, "kernel vm object");
-	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kernel_object);
+	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+	    VM_MIN_KERNEL_ADDRESS), kernel_object);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -288,8 +294,8 @@
 #endif
 
 	rw_init(&kmem_object->lock, "kmem vm object");
-	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kmem_object);
+	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+	    VM_MIN_KERNEL_ADDRESS), kmem_object);
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -308,7 +314,7 @@
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
-	vm_radix_init();
+	vm_radix_zinit();
 }
 
 void
@@ -472,11 +478,14 @@
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
-		vprint("vm_object_vndeallocate", vp);
+		vn_printf(vp, "vm_object_vndeallocate ");
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
+	if (!umtx_shm_vnobj_persistent && object->ref_count == 1)
+		umtx_shm_object_terminated(object);
+
 	/*
 	 * The test for text of vp vnode does not need a bypass to
 	 * reach right VV_TEXT there, since it is obtained from
@@ -649,6 +658,7 @@
 			return;
 		}
 doterm:
+		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
 			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
@@ -697,6 +707,89 @@
 }
 
 /*
+ *	vm_object_terminate_pages removes any remaining pageable pages
+ *	from the object and resets the object to an empty state.
+ */
+static void
+vm_object_terminate_pages(vm_object_t object)
+{
+	vm_page_t p, p_next;
+	struct mtx *mtx, *mtx1;
+	struct vm_pagequeue *pq, *pq1;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	mtx = NULL;
+	pq = NULL;
+
+	/*
+	 * Free any remaining pageable pages.  This also removes them from the
+	 * paging queues.  However, don't free wired pages, just remove them
+	 * from the object.  Rather than incrementally removing each page from
+	 * the object, the page and object are reset to an empty state.
+	 */
+	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
+		vm_page_assert_unbusied(p);
+		if ((object->flags & OBJ_UNMANAGED) == 0) {
+			/*
+			 * vm_page_free_prep() only needs the page
+			 * lock for managed pages.
+			 */
+			mtx1 = vm_page_lockptr(p);
+			if (mtx1 != mtx) {
+				if (mtx != NULL)
+					mtx_unlock(mtx);
+				if (pq != NULL) {
+					vm_pagequeue_unlock(pq);
+					pq = NULL;
+				}
+				mtx = mtx1;
+				mtx_lock(mtx);
+			}
+		}
+		p->object = NULL;
+		if (p->wire_count != 0)
+			goto unlist;
+		PCPU_INC(cnt.v_pfree);
+		p->flags &= ~PG_ZERO;
+		if (p->queue != PQ_NONE) {
+			KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
+			    "page %p is not queued", p));
+			pq1 = vm_page_pagequeue(p);
+			if (pq != pq1) {
+				if (pq != NULL)
+					vm_pagequeue_unlock(pq);
+				pq = pq1;
+				vm_pagequeue_lock(pq);
+			}
+		}
+		if (vm_page_free_prep(p, true))
+			continue;
+unlist:
+		TAILQ_REMOVE(&object->memq, p, listq);
+	}
+	if (pq != NULL)
+		vm_pagequeue_unlock(pq);
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+
+	vm_page_free_phys_pglist(&object->memq);
+
+	/*
+	 * If the object contained any pages, then reset it to an empty state.
+	 * None of the object's fields, including "resident_page_count", were
+	 * modified by the preceding loop.
+	 */
+	if (object->resident_page_count != 0) {
+		vm_radix_reclaim_allnodes(&object->rtree);
+		TAILQ_INIT(&object->memq);
+		object->resident_page_count = 0;
+		if (object->type == OBJT_VNODE)
+			vdrop(object->handle);
+	}
+}
+
+/*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
@@ -706,7 +799,6 @@
 void
 vm_object_terminate(vm_object_t object)
 {
-	vm_page_t p, p_next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
@@ -749,48 +841,13 @@
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
-	/*
-	 * Free any remaining pageable pages.  This also removes them from the
-	 * paging queues.  However, don't free wired pages, just remove them
-	 * from the object.  Rather than incrementally removing each page from
-	 * the object, the page and object are reset to any empty state. 
-	 */
-	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
-		vm_page_assert_unbusied(p);
-		vm_page_lock(p);
-		/*
-		 * Optimize the page's removal from the object by resetting
-		 * its "object" field.  Specifically, if the page is not
-		 * wired, then the effect of this assignment is that
-		 * vm_page_free()'s call to vm_page_remove() will return
-		 * immediately without modifying the page or the object.
-		 */ 
-		p->object = NULL;
-		if (p->wire_count == 0) {
-			vm_page_free(p);
-			PCPU_INC(cnt.v_pfree);
-		}
-		vm_page_unlock(p);
-	}
-	/*
-	 * If the object contained any pages, then reset it to an empty state.
-	 * None of the object's fields, including "resident_page_count", were
-	 * modified by the preceding loop.
-	 */
-	if (object->resident_page_count != 0) {
-		vm_radix_reclaim_allnodes(&object->rtree);
-		TAILQ_INIT(&object->memq);
-		object->resident_page_count = 0;
-		if (object->type == OBJT_VNODE)
-			vdrop(object->handle);
-	}
+	if ((object->flags & OBJ_PG_DTOR) == 0)
+		vm_object_terminate_pages(object);
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
-	if (__predict_false(!vm_object_cache_is_empty(object)))
-		vm_page_cache_free(object, 0, 0);
 
 	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
 	    object->type == OBJT_SWAP,
@@ -1027,13 +1084,13 @@
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
-	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
-		vp = object->handle;
+	    (object->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+	    ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
 		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
-		    OFF_TO_IDX(size) == object->size) {
+		    atop(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
@@ -1080,6 +1137,33 @@
 }
 
 /*
+ * Determine whether the given advice can be applied to the object.  Advice is
+ * not applied to unmanaged pages since they never belong to page queues, and
+ * since MADV_FREE is destructive, it can apply only to anonymous pages that
+ * have been mapped at most once.
+ */
+static bool
+vm_object_advice_applies(vm_object_t object, int advice)
+{
+
+	if ((object->flags & OBJ_UNMANAGED) != 0)
+		return (false);
+	if (advice != MADV_FREE)
+		return (true);
+	return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) &&
+	    (object->flags & OBJ_ONEMAPPING) != 0);
+}
+
+static void
+vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex,
+    vm_size_t size)
+{
+
+	if (advice == MADV_FREE && object->type == OBJT_SWAP)
+		swap_pager_freespace(object, pindex, size);
+}
+
+/*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
@@ -1102,103 +1186,109 @@
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
-    int advise)
+    int advice)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
-	vm_page_t m;
+	vm_page_t m, tm;
 
 	if (object == NULL)
 		return;
+
+relookup:
 	VM_OBJECT_WLOCK(object);
-	/*
-	 * Locate and adjust resident pages
-	 */
-	for (; pindex < end; pindex += 1) {
-relookup:
+	if (!vm_object_advice_applies(object, advice)) {
+		VM_OBJECT_WUNLOCK(object);
+		return;
+	}
+	for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) {
 		tobject = object;
-		tpindex = pindex;
-shadowlookup:
+
 		/*
-		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
-		 * and those pages must be OBJ_ONEMAPPING.
+		 * If the next page isn't resident in the top-level object, we
+		 * need to search the shadow chain.  When applying MADV_FREE, we
+		 * take care to release any swap space used to store
+		 * non-resident pages.
 		 */
-		if (advise == MADV_FREE) {
-			if ((tobject->type != OBJT_DEFAULT &&
-			     tobject->type != OBJT_SWAP) ||
-			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
-				goto unlock_tobject;
-			}
-		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
-			goto unlock_tobject;
-		m = vm_page_lookup(tobject, tpindex);
-		if (m == NULL && advise == MADV_WILLNEED) {
+		if (m == NULL || pindex < m->pindex) {
 			/*
-			 * If the page is cached, reactivate it.
+			 * Optimize a common case: if the top-level object has
+			 * no backing object, we can skip over the non-resident
+			 * range in constant time.
 			 */
-			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
-			    VM_ALLOC_NOBUSY);
+			if (object->backing_object == NULL) {
+				tpindex = (m != NULL && m->pindex < end) ?
+				    m->pindex : end;
+				vm_object_madvise_freespace(object, advice,
+				    pindex, tpindex - pindex);
+				if ((pindex = tpindex) == end)
+					break;
+				goto next_page;
+			}
+
+			tpindex = pindex;
+			do {
+				vm_object_madvise_freespace(tobject, advice,
+				    tpindex, 1);
+				/*
+				 * Prepare to search the next object in the
+				 * chain.
+				 */
+				backing_object = tobject->backing_object;
+				if (backing_object == NULL)
+					goto next_pindex;
+				VM_OBJECT_WLOCK(backing_object);
+				tpindex +=
+				    OFF_TO_IDX(tobject->backing_object_offset);
+				if (tobject != object)
+					VM_OBJECT_WUNLOCK(tobject);
+				tobject = backing_object;
+				if (!vm_object_advice_applies(tobject, advice))
+					goto next_pindex;
+			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
+			    NULL);
+		} else {
+next_page:
+			tm = m;
+			m = TAILQ_NEXT(m, listq);
 		}
-		if (m == NULL) {
-			/*
-			 * There may be swap even if there is no backing page
-			 */
-			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
-				swap_pager_freespace(tobject, tpindex, 1);
-			/*
-			 * next object
-			 */
-			backing_object = tobject->backing_object;
-			if (backing_object == NULL)
-				goto unlock_tobject;
-			VM_OBJECT_WLOCK(backing_object);
-			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
-			if (tobject != object)
-				VM_OBJECT_WUNLOCK(tobject);
-			tobject = backing_object;
-			goto shadowlookup;
-		} else if (m->valid != VM_PAGE_BITS_ALL)
-			goto unlock_tobject;
+
 		/*
 		 * If the page is not in a normal state, skip it.
 		 */
-		vm_page_lock(m);
-		if (m->hold_count != 0 || m->wire_count != 0) {
-			vm_page_unlock(m);
-			goto unlock_tobject;
+		if (tm->valid != VM_PAGE_BITS_ALL)
+			goto next_pindex;
+		vm_page_lock(tm);
+		if (tm->hold_count != 0 || tm->wire_count != 0) {
+			vm_page_unlock(tm);
+			goto next_pindex;
 		}
-		KASSERT((m->flags & PG_FICTITIOUS) == 0,
-		    ("vm_object_madvise: page %p is fictitious", m));
-		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("vm_object_madvise: page %p is not managed", m));
-		if (vm_page_busied(m)) {
-			if (advise == MADV_WILLNEED) {
+		KASSERT((tm->flags & PG_FICTITIOUS) == 0,
+		    ("vm_object_madvise: page %p is fictitious", tm));
+		KASSERT((tm->oflags & VPO_UNMANAGED) == 0,
+		    ("vm_object_madvise: page %p is not managed", tm));
+		if (vm_page_busied(tm)) {
+			if (object != tobject)
+				VM_OBJECT_WUNLOCK(tobject);
+			VM_OBJECT_WUNLOCK(object);
+			if (advice == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
-				 * likely to reclaim it. 
+				 * likely to reclaim it.
 				 */
-				vm_page_aflag_set(m, PGA_REFERENCED);
+				vm_page_aflag_set(tm, PGA_REFERENCED);
 			}
-			if (object != tobject)
-				VM_OBJECT_WUNLOCK(object);
-			VM_OBJECT_WUNLOCK(tobject);
-			vm_page_busy_sleep(m, "madvpo", false);
-			VM_OBJECT_WLOCK(object);
+			vm_page_busy_sleep(tm, "madvpo", false);
   			goto relookup;
 		}
-		if (advise == MADV_WILLNEED) {
-			vm_page_activate(m);
-		} else {
-			vm_page_advise(m, advise);
-		}
-		vm_page_unlock(m);
-		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
-			swap_pager_freespace(tobject, tpindex, 1);
-unlock_tobject:
+		vm_page_advise(tm, advice);
+		vm_page_unlock(tm);
+		vm_object_madvise_freespace(tobject, advice, tm->pindex, 1);
+next_pindex:
 		if (tobject != object)
 			VM_OBJECT_WUNLOCK(tobject);
-	}	
+	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
@@ -1368,11 +1458,11 @@
 			goto retry;
 		}
 
-		/* vm_page_rename() will handle dirty and cache. */
+		/* vm_page_rename() will dirty the page. */
 		if (vm_page_rename(m, new_object, idx)) {
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
-			VM_WAIT;
+			vm_radix_wait();
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
@@ -1403,19 +1493,6 @@
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
-
-		/*
-		 * Transfer any cached pages from orig_object to new_object.
-		 * If swap_pager_copy() found swapped out pages within the
-		 * specified range of orig_object, then it changed
-		 * new_object's type to OBJT_SWAP when it transferred those
-		 * pages to new_object.  Otherwise, new_object's type
-		 * should still be OBJT_DEFAULT and orig_object should not
-		 * contain any cached pages within the specified range.
-		 */
-		if (__predict_false(!vm_object_cache_is_empty(orig_object)))
-			vm_page_cache_transfer(orig_object, offidxstart,
-			    new_object);
 	}
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
@@ -1425,12 +1502,11 @@
 	VM_OBJECT_WLOCK(new_object);
 }
 
-#define	OBSC_TEST_ALL_SHADOWED	0x0001
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 static vm_page_t
-vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
     int op)
 {
 	vm_object_t backing_object;
@@ -1448,8 +1524,9 @@
 		vm_page_lock(p);
 	VM_OBJECT_WUNLOCK(object);
 	VM_OBJECT_WUNLOCK(backing_object);
+	/* The page is only NULL when rename fails. */
 	if (p == NULL)
-		VM_WAIT;
+		vm_radix_wait();
 	else
 		vm_page_busy_sleep(p, "vmocol", false);
 	VM_OBJECT_WLOCK(object);
@@ -1458,192 +1535,195 @@
 }
 
 static bool
-vm_object_backing_scan(vm_object_t object, int op)
+vm_object_scan_all_shadowed(vm_object_t object)
 {
 	vm_object_t backing_object;
-	vm_page_t next, p, pp;
-	vm_pindex_t backing_offset_index, new_pindex;
+	vm_page_t p, pp;
+	vm_pindex_t backing_offset_index, new_pindex, pi, ps;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
-	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
-	 * Initial conditions
+	 * Initial conditions:
+	 *
+	 * We do not want to have to test for the existence of swap
+	 * pages in the backing object.  XXX but with the new swapper this
+	 * would be pretty easy to do.
 	 */
-	if (op & OBSC_TEST_ALL_SHADOWED) {
+	if (backing_object->type != OBJT_DEFAULT &&
+	    backing_object->type != OBJT_SWAP)
+		return (false);
+
+	pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+	p = vm_page_find_least(backing_object, pi);
+	ps = swap_pager_find_least(backing_object, pi);
+
+	/*
+	 * Only check pages inside the parent object's range and
+	 * inside the parent object's mapping of the backing object.
+	 */
+	for (;; pi++) {
+		if (p != NULL && p->pindex < pi)
+			p = TAILQ_NEXT(p, listq);
+		if (ps < pi)
+			ps = swap_pager_find_least(backing_object, pi);
+		if (p == NULL && ps >= backing_object->size)
+			break;
+		else if (p == NULL)
+			pi = ps;
+		else
+			pi = MIN(p->pindex, ps);
+
+		new_pindex = pi - backing_offset_index;
+		if (new_pindex >= object->size)
+			break;
+
 		/*
-		 * We do not want to have to test for the existence of cache
-		 * or swap pages in the backing object.  XXX but with the
-		 * new swapper this would be pretty easy to do.
+		 * See if the parent has the page or if the parent's object
+		 * pager has the page.  If the parent has the page but the page
+		 * is not valid, the parent's object pager must have the page.
 		 *
-		 * XXX what about anonymous MAP_SHARED memory that hasn't
-		 * been ZFOD faulted yet?  If we do not test for this, the
-		 * shadow test may succeed! XXX
+		 * If this fails, the parent does not completely shadow the
+		 * object and we might as well give up now.
 		 */
-		if (backing_object->type != OBJT_DEFAULT) {
+		pp = vm_page_lookup(object, new_pindex);
+		if ((pp == NULL || pp->valid == 0) &&
+		    !vm_pager_has_page(object, new_pindex, NULL, NULL))
 			return (false);
-		}
 	}
-	if (op & OBSC_COLLAPSE_WAIT) {
+	return (true);
+}
+
+static bool
+vm_object_collapse_scan(vm_object_t object, int op)
+{
+	vm_object_t backing_object;
+	vm_page_t next, p, pp;
+	vm_pindex_t backing_offset_index, new_pindex;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
+
+	backing_object = object->backing_object;
+	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+
+	/*
+	 * Initial conditions
+	 */
+	if ((op & OBSC_COLLAPSE_WAIT) != 0)
 		vm_object_set_flag(backing_object, OBJ_DEAD);
-	}
 
 	/*
 	 * Our scan
 	 */
-	p = TAILQ_FIRST(&backing_object->memq);
-	while (p) {
+	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
-		if (op & OBSC_TEST_ALL_SHADOWED) {
-			/*
-			 * Ignore pages outside the parent object's range
-			 * and outside the parent object's mapping of the 
-			 * backing object.
-			 *
-			 * Note that we do not busy the backing object's
-			 * page.
-			 */
-			if (p->pindex < backing_offset_index ||
-			    new_pindex >= object->size) {
-				p = next;
-				continue;
-			}
 
-			/*
-			 * See if the parent has the page or if the parent's
-			 * object pager has the page.  If the parent has the
-			 * page but the page is not valid, the parent's
-			 * object pager must have the page.
-			 *
-			 * If this fails, the parent does not completely shadow
-			 * the object and we might as well give up now.
-			 */
-
-			pp = vm_page_lookup(object, new_pindex);
-			if ((pp == NULL || pp->valid == 0) &&
-			    !vm_pager_has_page(object, new_pindex, NULL, NULL))
-				return (false);
-		}
-
 		/*
 		 * Check for busy page
 		 */
-		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
-			if (vm_page_busied(p)) {
-				p = vm_object_backing_scan_wait(object, p,
-				    next, op);
-				continue;
-			}
+		if (vm_page_busied(p)) {
+			next = vm_object_collapse_scan_wait(object, p, next, op);
+			continue;
+		}
 
-			KASSERT(p->object == backing_object,
-			    ("vm_object_backing_scan: object mismatch"));
+		KASSERT(p->object == backing_object,
+		    ("vm_object_collapse_scan: object mismatch"));
 
-			if (p->pindex < backing_offset_index ||
-			    new_pindex >= object->size) {
-				if (backing_object->type == OBJT_SWAP)
-					swap_pager_freespace(backing_object, 
-					    p->pindex, 1);
+		if (p->pindex < backing_offset_index ||
+		    new_pindex >= object->size) {
+			if (backing_object->type == OBJT_SWAP)
+				swap_pager_freespace(backing_object, p->pindex,
+				    1);
 
-				/*
-				 * Page is out of the parent object's range, we 
-				 * can simply destroy it. 
-				 */
-				vm_page_lock(p);
-				KASSERT(!pmap_page_is_mapped(p),
-				    ("freeing mapped page %p", p));
-				if (p->wire_count == 0)
-					vm_page_free(p);
-				else
-					vm_page_remove(p);
-				vm_page_unlock(p);
-				p = next;
-				continue;
-			}
+			/*
+			 * Page is out of the parent object's range, we can
+			 * simply destroy it.
+			 */
+			vm_page_lock(p);
+			KASSERT(!pmap_page_is_mapped(p),
+			    ("freeing mapped page %p", p));
+			if (p->wire_count == 0)
+				vm_page_free(p);
+			else
+				vm_page_remove(p);
+			vm_page_unlock(p);
+			continue;
+		}
 
-			pp = vm_page_lookup(object, new_pindex);
-			if (pp != NULL && vm_page_busied(pp)) {
-				/*
-				 * The page in the parent is busy and
-				 * possibly not (yet) valid.  Until
-				 * its state is finalized by the busy
-				 * bit owner, we can't tell whether it
-				 * shadows the original page.
-				 * Therefore, we must either skip it
-				 * and the original (backing_object)
-				 * page or wait for its state to be
-				 * finalized.
-				 *
-				 * This is due to a race with vm_fault()
-				 * where we must unbusy the original
-				 * (backing_obj) page before we can
-				 * (re)lock the parent.  Hence we can
-				 * get here.
-				 */
-				p = vm_object_backing_scan_wait(object, pp,
-				    next, op);
-				continue;
-			}
+		pp = vm_page_lookup(object, new_pindex);
+		if (pp != NULL && vm_page_busied(pp)) {
+			/*
+			 * The page in the parent is busy and possibly not
+			 * (yet) valid.  Until its state is finalized by the
+			 * busy bit owner, we can't tell whether it shadows the
+			 * original page.  Therefore, we must either skip it
+			 * and the original (backing_object) page or wait for
+			 * its state to be finalized.
+			 *
+			 * This is due to a race with vm_fault() where we must
+			 * unbusy the original (backing_obj) page before we can
+			 * (re)lock the parent.  Hence we can get here.
+			 */
+			next = vm_object_collapse_scan_wait(object, pp, next,
+			    op);
+			continue;
+		}
 
-			KASSERT(pp == NULL || pp->valid != 0,
-			    ("unbusy invalid page %p", pp));
+		KASSERT(pp == NULL || pp->valid != 0,
+		    ("unbusy invalid page %p", pp));
 
-			if (pp != NULL || vm_pager_has_page(object,
-			    new_pindex, NULL, NULL)) {
-				/*
-				 * The page already exists in the
-				 * parent OR swap exists for this
-				 * location in the parent.  Leave the
-				 * parent's page alone.  Destroy the
-				 * original page from the backing
-				 * object.
-				 */
-				if (backing_object->type == OBJT_SWAP)
-					swap_pager_freespace(backing_object,
-					    p->pindex, 1);
-				vm_page_lock(p);
-				KASSERT(!pmap_page_is_mapped(p),
-				    ("freeing mapped page %p", p));
-				if (p->wire_count == 0)
-					vm_page_free(p);
-				else
-					vm_page_remove(p);
-				vm_page_unlock(p);
-				p = next;
-				continue;
-			}
-
+		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
+			NULL)) {
 			/*
-			 * Page does not exist in parent, rename the
-			 * page from the backing object to the main object. 
-			 *
-			 * If the page was mapped to a process, it can remain 
-			 * mapped through the rename.
-			 * vm_page_rename() will handle dirty and cache.
+			 * The page already exists in the parent OR swap exists
+			 * for this location in the parent.  Leave the parent's
+			 * page alone.  Destroy the original page from the
+			 * backing object.
 			 */
-			if (vm_page_rename(p, object, new_pindex)) {
-				p = vm_object_backing_scan_wait(object, NULL,
-				    next, op);
-				continue;
-			}
-
-			/* Use the old pindex to free the right page. */
 			if (backing_object->type == OBJT_SWAP)
-				swap_pager_freespace(backing_object,
-				    new_pindex + backing_offset_index, 1);
+				swap_pager_freespace(backing_object, p->pindex,
+				    1);
+			vm_page_lock(p);
+			KASSERT(!pmap_page_is_mapped(p),
+			    ("freeing mapped page %p", p));
+			if (p->wire_count == 0)
+				vm_page_free(p);
+			else
+				vm_page_remove(p);
+			vm_page_unlock(p);
+			continue;
+		}
 
+		/*
+		 * Page does not exist in parent, rename the page from the
+		 * backing object to the main object.
+		 *
+		 * If the page was mapped to a process, it can remain mapped
+		 * through the rename.  vm_page_rename() will dirty the page.
+		 */
+		if (vm_page_rename(p, object, new_pindex)) {
+			next = vm_object_collapse_scan_wait(object, NULL, next,
+			    op);
+			continue;
+		}
+
+		/* Use the old pindex to free the right page. */
+		if (backing_object->type == OBJT_SWAP)
+			swap_pager_freespace(backing_object,
+			    new_pindex + backing_offset_index, 1);
+
 #if VM_NRESERVLEVEL > 0
-			/*
-			 * Rename the reservation.
-			 */
-			vm_reserv_rename(p, object, backing_object,
-			    backing_offset_index);
+		/*
+		 * Rename the reservation.
+		 */
+		vm_reserv_rename(p, object, backing_object,
+		    backing_offset_index);
 #endif
-		}
-		p = next;
 	}
 	return (true);
 }
@@ -1665,7 +1745,7 @@
 	if (backing_object->ref_count != 1)
 		return;
 
-	vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
+	vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
@@ -1698,8 +1778,8 @@
 		VM_OBJECT_WLOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
-		     backing_object->type != OBJT_SWAP) ||
-		    (backing_object->flags & OBJ_DEAD) ||
+		    backing_object->type != OBJT_SWAP) ||
+		    (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
@@ -1722,7 +1802,7 @@
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
-		 * vm_object_backing_scan fails the shadowing test in this
+		 * vm_object_collapse_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
@@ -1731,9 +1811,9 @@
 
 			/*
 			 * If there is exactly one reference to the backing
-			 * object, we can collapse it into the parent.  
+			 * object, we can collapse it into the parent.
 			 */
-			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
+			vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
@@ -1759,13 +1839,6 @@
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
-
-				/*
-				 * Free any cached pages from backing_object.
-				 */
-				if (__predict_false(
-				    !vm_object_cache_is_empty(backing_object)))
-					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
@@ -1814,8 +1887,7 @@
 			 * there is nothing we can do so we give up.
 			 */
 			if (object->resident_page_count != object->size &&
-			    !vm_object_backing_scan(object,
-			    OBSC_TEST_ALL_SHADOWED)) {
+			    !vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
@@ -1889,6 +1961,8 @@
     int options)
 {
 	vm_page_t p, next;
+	struct mtx *mtx;
+	struct pglist pgl;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1895,10 +1969,12 @@
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	if (object->resident_page_count == 0)
-		goto skipmemq;
+		return;
 	vm_object_pip_add(object, 1);
+	TAILQ_INIT(&pgl);
 again:
 	p = vm_page_find_least(object, start);
+	mtx = NULL;
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
@@ -1915,7 +1991,7 @@
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 */
-		vm_page_lock(p);
+		vm_page_change_lock(p, &mtx);
 		if (vm_page_xbusied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopax", true);
@@ -1923,13 +1999,14 @@
 			goto again;
 		}
 		if (p->wire_count != 0) {
-			if ((options & OBJPR_NOTMAPPED) == 0)
+			if ((options & OBJPR_NOTMAPPED) == 0 &&
+			    object->ref_count != 0)
 				pmap_remove_all(p);
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
-			goto next;
+			continue;
 		}
 		if (vm_page_busied(p)) {
 			VM_OBJECT_WUNLOCK(object);
@@ -1940,33 +2017,34 @@
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
-			if ((options & OBJPR_NOTMAPPED) == 0)
+			if ((options & OBJPR_NOTMAPPED) == 0 &&
+			    object->ref_count != 0)
 				pmap_remove_write(p);
-			if (p->dirty)
-				goto next;
+			if (p->dirty != 0)
+				continue;
 		}
-		if ((options & OBJPR_NOTMAPPED) == 0)
+		if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
 			pmap_remove_all(p);
-		vm_page_free(p);
-next:
-		vm_page_unlock(p);
+		p->flags &= ~PG_ZERO;
+		if (vm_page_free_prep(p, false))
+			TAILQ_INSERT_TAIL(&pgl, p, listq);
 	}
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+	vm_page_free_phys_pglist(&pgl);
 	vm_object_pip_wakeup(object);
-skipmemq:
-	if (__predict_false(!vm_object_cache_is_empty(object)))
-		vm_page_cache_free(object, start, end);
 }
 
 /*
- *	vm_object_page_cache:
+ *	vm_object_page_noreuse:
  *
- *	For the given object, attempt to move the specified clean
- *	pages to the cache queue.  If a page is wired for any reason,
- *	then it will not be changed.  Pages are specified by the given
- *	range ["start", "end").  As a special case, if "end" is zero,
- *	then the range extends from "start" to the end of the object.
- *	Any mappings to the specified pages are removed before the
- *	pages are moved to the cache queue.
+ *	For the given object, attempt to move the specified pages to
+ *	the head of the inactive queue.  This bypasses regular LRU
+ *	operation and allows the pages to be reused quickly under memory
+ *	pressure.  If a page is wired for any reason, then it will not
+ *	be queued.  Pages are specified by the range ["start", "end").
+ *	As a special case, if "end" is zero, then the range extends from
+ *	"start" to the end of the object.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
@@ -1974,14 +2052,14 @@
  *	The object must be locked.
  */
 void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	struct mtx *mtx, *new_mtx;
+	struct mtx *mtx;
 	vm_page_t p, next;
 
-	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
-	    ("vm_object_page_cache: illegal object %p", object));
+	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
@@ -1993,18 +2071,8 @@
 	mtx = NULL;
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
-
-		/*
-		 * Avoid releasing and reacquiring the same page lock.
-		 */
-		new_mtx = vm_page_lockptr(p);
-		if (mtx != new_mtx) {
-			if (mtx != NULL)
-				mtx_unlock(mtx);
-			mtx = new_mtx;
-			mtx_lock(mtx);
-		}
-		vm_page_try_to_cache(p);
+		vm_page_change_lock(p, &mtx);
+		vm_page_deactivate_noreuse(p);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
@@ -2023,7 +2091,7 @@
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	vm_page_t m, ma[1];
+	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
@@ -2031,11 +2099,7 @@
 	for (pindex = start; pindex < end; pindex++) {
 		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 		if (m->valid != VM_PAGE_BITS_ALL) {
-			ma[0] = m;
-			rv = vm_pager_get_pages(object, ma, 1, 0);
-			m = vm_page_lookup(object, pindex);
-			if (m == NULL)
-				break;
+			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
@@ -2090,7 +2154,7 @@
 	VM_OBJECT_WLOCK(prev_object);
 	if ((prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP) ||
-	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
+	    (prev_object->flags & OBJ_NOSPLIT) != 0) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
@@ -2127,7 +2191,7 @@
 
 		/*
 		 * If prev_object was charged, then this mapping,
-		 * althought not charged now, may become writable
+		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
@@ -2205,7 +2269,7 @@
 vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
     uint8_t queue)
 {
-	vm_object_t tobject;
+	vm_object_t tobject, t1object;
 	vm_page_t m, tm;
 	vm_pindex_t end_pindex, pindex, tpindex;
 	int depth, locked_depth;
@@ -2219,6 +2283,7 @@
 		return;
 	pindex = OFF_TO_IDX(offset);
 	end_pindex = pindex + atop(length);
+again:
 	locked_depth = 1;
 	VM_OBJECT_RLOCK(object);
 	m = vm_page_find_least(object, pindex);
@@ -2252,6 +2317,16 @@
 			m = TAILQ_NEXT(m, listq);
 		}
 		vm_page_lock(tm);
+		if (vm_page_xbusied(tm)) {
+			for (tobject = object; locked_depth >= 1;
+			    locked_depth--) {
+				t1object = tobject->backing_object;
+				VM_OBJECT_RUNLOCK(tobject);
+				tobject = t1object;
+			}
+			vm_page_busy_sleep(tm, "unwbo", true);
+			goto again;
+		}
 		vm_page_unwire(tm, queue);
 		vm_page_unlock(tm);
 next_page:
@@ -2258,10 +2333,10 @@
 		pindex++;
 	}
 	/* Release the accumulated object locks. */
-	for (depth = 0; depth < locked_depth; depth++) {
-		tobject = object->backing_object;
-		VM_OBJECT_RUNLOCK(object);
-		object = tobject;
+	for (tobject = object; locked_depth >= 1; locked_depth--) {
+		t1object = tobject->backing_object;
+		VM_OBJECT_RUNLOCK(tobject);
+		tobject = t1object;
 	}
 }
 
@@ -2340,9 +2415,9 @@
 			 * sysctl is only meant to give an
 			 * approximation of the system anyway.
 			 */
-			if (m->queue == PQ_ACTIVE)
+			if (vm_page_active(m))
 				kvo->kvo_active++;
-			else if (m->queue == PQ_INACTIVE)
+			else if (vm_page_inactive(m))
 				kvo->kvo_inactive++;
 		}
 

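Several of the rewritten loops above, vm_object_terminate_pages(),
vm_object_page_remove() and vm_object_page_noreuse(), share one
locking idiom: instead of taking and dropping a page lock for every
page, the loop remembers the mutex it currently holds and only
switches when the next page hashes to a different lock, either
open-coded via vm_page_lockptr() or through vm_page_change_lock().
A stripped-down sketch of that idiom follows; batch_process_pages()
and the elided per-page work are placeholders, only the lock-caching
walk mirrors the code above.

/*
 * Minimal sketch of the "cache the current page lock" idiom.  The
 * function name and the omitted per-page work are placeholders.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

static void
batch_process_pages(vm_object_t object)
{
	vm_page_t p, p_next;
	struct mtx *mtx, *mtx1;

	VM_OBJECT_ASSERT_WLOCKED(object);
	mtx = NULL;
	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
		mtx1 = vm_page_lockptr(p);
		if (mtx1 != mtx) {
			/* Cycle locks only when the hash bucket changes. */
			if (mtx != NULL)
				mtx_unlock(mtx);
			mtx = mtx1;
			mtx_lock(mtx);
		}
		/* ... per-page work under the page lock ... */
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
}

Collecting freed pages on a local pglist and handing them to
vm_page_free_phys_pglist() at the end serves the same purpose:
amortizing lock acquisitions over the whole batch instead of paying
for them once per page.
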
Modified: trunk/sys/vm/vm_object.h
===================================================================
--- trunk/sys/vm/vm_object.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_object.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -71,6 +71,7 @@
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_pctrie.h>
 #include <sys/_rwlock.h>
 
 #include <vm/_vm_radix.h>
@@ -80,17 +81,6 @@
  *
  *	vm_object_t		Virtual memory object.
  *
- *	The root of cached pages pool is protected by both the per-object lock
- *	and the free pages queue mutex.
- *	On insert in the cache radix trie, the per-object lock is expected
- *	to be already held and the free pages queue mutex will be
- *	acquired during the operation too.
- *	On remove and lookup from the cache radix trie, only the free
- *	pages queue mutex is expected to be locked.
- *	These rules allow for reliably checking for the presence of cached
- *	pages with only the per-object lock held, thereby reducing contention
- *	for the free pages queue mutex.
- *
  * List of locks
  *	(c)	const until freed
  *	(o)	per-object lock 
@@ -98,12 +88,17 @@
  *
  */
 
+#ifndef VM_PAGE_HAVE_PGLIST
+TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
+
 struct vm_object {
 	struct rwlock lock;
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
-	TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+	struct pglist memq;		/* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
@@ -119,7 +114,6 @@
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
-	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;
 	union {
 		/*
@@ -164,17 +158,17 @@
 		 *		     the handle changed and hash-chain
 		 *		     invalid.
 		 *
-		 *	swp_bcount - number of swap 'swblock' metablocks, each
-		 *		     contains up to 16 swapblk assignments.
-		 *		     see vm/swap_pager.h
+		 *	swp_blks -   pc-trie of the allocated swap blocks.
+		 *
 		 */
 		struct {
 			void *swp_tmpfs;
-			int swp_bcount;
+			struct pctrie swp_blks;
 		} swp;
 	} un_pager;
 	struct ucred *cred;
 	vm_ooffset_t charge;
+	void *umtx_data;
 };
 
 /*
@@ -182,10 +176,13 @@
  */
 #define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
 #define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
-#define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
+#define	OBJ_POPULATE	0x0004		/* pager implements populate() */
+#define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
-#define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
-#define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
+#define	OBJ_UMTXDEAD	0x0020		/* umtx pshared was terminated */
+#define	OBJ_PIPWNT	0x0040		/* paging in progress wanted */
+#define	OBJ_PG_DTOR	0x0080		/* dont reset object, leave that for dtor */
+#define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_TMPFS_DIRTY	0x0400		/* dirty tmpfs obj */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
@@ -193,14 +190,29 @@
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
 #define	OBJ_TMPFS	0x8000		/* has tmpfs vnode allocated */
 
+/*
+ * Helpers to perform conversion between vm_object page indexes and offsets.
+ * IDX_TO_OFF() converts an index into an offset.
+ * OFF_TO_IDX() converts an offset into an index.  Since offsets are signed
+ *   by default, the sign propagation in OFF_TO_IDX(), when applied to
+ *   negative offsets, is intentional and returns a vm_object page index
+ *   that cannot be created by a userspace mapping.
+ * UOFF_TO_IDX() treats the offset as an unsigned value and converts it
+ *   into an index accordingly.  Use it only when the full range of offset
+ *   values are allowed.  Currently, this only applies to device mappings.
+ * OBJ_MAX_SIZE specifies the maximum page index corresponding to the
+ *   maximum unsigned offset.
+ */
 #define	IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define	OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
+#define	UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT)
+#define	OBJ_MAX_SIZE	(UOFF_TO_IDX(UINT64_MAX) + 1)
 
 #ifdef	_KERNEL
 
 #define OBJPC_SYNC	0x1			/* sync I/O */
 #define OBJPC_INVAL	0x2			/* invalidate */
-#define OBJPC_NOSYNC	0x4			/* skip if PG_NOSYNC */
+#define OBJPC_NOSYNC	0x4			/* skip if VPO_NOSYNC */
 
 /*
  * The following options are supported by vm_object_page_remove().
@@ -243,6 +255,8 @@
 	rw_try_upgrade(&(object)->lock)
 #define	VM_OBJECT_WLOCK(object)						\
 	rw_wlock(&(object)->lock)
+#define	VM_OBJECT_WOWNED(object)					\
+	rw_wowned(&(object)->lock)
 #define	VM_OBJECT_WUNLOCK(object)					\
 	rw_wunlock(&(object)->lock)
 
@@ -256,6 +270,30 @@
 	object->flags |= bits;
 }
 
+/*
+ *	Conditionally set the object's color, which (1) enables the allocation
+ *	of physical memory reservations for anonymous objects and larger-than-
+ *	superpage-sized named objects and (2) determines the first page offset
+ *	within the object at which a reservation may be allocated.  In other
+ *	words, the color determines the alignment of the object with respect
+ *	to the largest superpage boundary.  When mapping named objects, like
+ *	files or POSIX shared memory objects, the color should be set to zero
+ *	before a virtual address is selected for the mapping.  In contrast,
+ *	for anonymous objects, the color may be set after the virtual address
+ *	is selected.
+ *
+ *	The object must be locked.
+ */
+static __inline void
+vm_object_color(vm_object_t object, u_short color)
+{
+
+	if ((object->flags & OBJ_COLORED) == 0) {
+		object->pg_color = color;
+		object->flags |= OBJ_COLORED;
+	}
+}
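+
+(Editorial note, not part of the patch: a minimal usage sketch of the new
+vm_object_color() helper; the function below is hypothetical.)
+
+/*
+ * Illustrative sketch only.  A hypothetical setup path for a named object
+ * pins the color to zero before a mapping address is chosen, so that
+ * reservations align with the start of the object rather than the mapping.
+ */
+static void
+example_named_object_setup(vm_object_t obj)
+{
+
+	VM_OBJECT_WLOCK(obj);
+	vm_object_color(obj, 0);	/* the object must be locked */
+	VM_OBJECT_WUNLOCK(obj);
+}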
+
 void vm_object_clear_flag(vm_object_t object, u_short bits);
 void vm_object_pip_add(vm_object_t object, short i);
 void vm_object_pip_subtract(vm_object_t object, short i);
@@ -263,13 +301,10 @@
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
-static __inline boolean_t
-vm_object_cache_is_empty(vm_object_t object)
-{
+void umtx_shm_object_init(vm_object_t object);
+void umtx_shm_object_terminated(vm_object_t object);
+extern int umtx_shm_vnobj_persistent;
 
-	return (vm_radix_is_empty(&object->cache));
-}
-
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
@@ -280,10 +315,10 @@
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
-    vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
+void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+    vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);

Modified: trunk/sys/vm/vm_page.c
===================================================================
--- trunk/sys/vm/vm_page.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -83,7 +83,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 342797 2019-01-06 00:38:28Z kib $");
 
 #include "opt_vm.h"
 
@@ -92,6 +92,7 @@
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
+#include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
@@ -98,6 +99,8 @@
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
@@ -125,9 +128,9 @@
  */
 
 struct vm_domain vm_dom[MAXMEMDOM];
-struct mtx_padalign vm_page_queue_free_mtx;
+struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
 
-struct mtx_padalign pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
@@ -135,25 +138,37 @@
 int vm_page_zero_count;
 
 static int boot_pages = UMA_BOOT_PAGES;
-TUNABLE_INT("vm.boot_pages", &boot_pages);
-SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
-	"number of pages allocated for bootstrapping the VM system");
+SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &boot_pages, 0,
+    "number of pages allocated for bootstrapping the VM system");
 
 static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
+static TAILQ_HEAD(, vm_page) blacklist_head;
+static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
+
+/* Is the page daemon waiting for free pages? */
+static int vm_pageout_pages_needed;
+
 static uma_zone_t fakepg_zone;
 
-static struct vnode *vm_page_alloc_init(vm_page_t m);
-static void vm_page_cache_turn_free(vm_page_t m);
+static void vm_page_alloc_check(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(int queue, vm_page_t m);
+static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_free_phys(vm_page_t m);
+static void vm_page_free_wakeup(void);
 static void vm_page_init_fakepg(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
+static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+    vm_paddr_t high);
+static int vm_page_alloc_fail(vm_object_t object, int req);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
@@ -162,7 +177,7 @@
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 }
 
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
@@ -210,43 +225,171 @@
 void
 vm_set_page_size(void)
 {
-	if (cnt.v_page_size == 0)
-		cnt.v_page_size = PAGE_SIZE;
-	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
+	if (vm_cnt.v_page_size == 0)
+		vm_cnt.v_page_size = PAGE_SIZE;
+	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
 		panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
- *	vm_page_blacklist_lookup:
+ *	vm_page_blacklist_next:
  *
- *	See if a physical address in this page has been listed
- *	in the blacklist tunable.  Entries in the tunable are
- *	separated by spaces or commas.  If an invalid integer is
- *	encountered then the rest of the string is skipped.
+ *	Find the next entry in the provided string of blacklist
+ *	addresses.  Entries are separated by space, comma, or newline.
+ *	If an invalid integer is encountered then the rest of the
+ *	string is skipped.  Updates the list pointer to the next
+ *	character, or NULL if the string is exhausted or invalid.
  */
-static int
-vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
+static vm_paddr_t
+vm_page_blacklist_next(char **list, char *end)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
-	for (pos = list; *pos != '\0'; pos = cp) {
+	if (list == NULL || *list == NULL)
+		return (0);
+	if (**list == '\0') {
+		*list = NULL;
+		return (0);
+	}
+
+	/*
+	 * If there's no end pointer then the buffer is coming from
+	 * the kenv and we know it's null-terminated.
+	 */
+	if (end == NULL)
+		end = *list + strlen(*list);
+
+	/* Ensure that strtoq() won't walk off the end */
+	if (*end != '\0') {
+		if (*end == '\n' || *end == ' ' || *end == ',')
+			*end = '\0';
+		else {
+			printf("Blacklist not terminated, skipping\n");
+			*list = NULL;
+			return (0);
+		}
+	}
+
+	for (pos = *list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
-		if (*cp != '\0') {
-			if (*cp == ' ' || *cp == ',') {
-				cp++;
-				if (cp == pos)
+		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
+			if (bad == 0) {
+				if (++cp < end)
 					continue;
-			} else
-				break;
-		}
-		if (pa == trunc_page(bad))
-			return (1);
+				else
+					break;
+			}
+		} else
+			break;
+		if (*cp == '\0' || ++cp >= end)
+			*list = NULL;
+		else
+			*list = cp;
+		return (trunc_page(bad));
 	}
+	printf("Garbage in RAM blacklist, skipping\n");
+	*list = NULL;
 	return (0);
 }
 
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+	vm_page_t m;
+	int ret;
+
+	m = vm_phys_paddr_to_vm_page(pa);
+	if (m == NULL)
+		return (true); /* page does not exist, no failure */
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	ret = vm_phys_unfree_page(m);
+	if (ret != 0)
+		vm_phys_freecnt_adj(m, -1);
+	mtx_unlock(&vm_page_queue_free_mtx);
+	if (ret != 0) {
+		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+		if (verbose)
+			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+	}
+	return (ret);
+}
+
+/*
+ *	vm_page_blacklist_check:
+ *
+ *	Iterate through the provided string of blacklist addresses, pulling
+ *	each entry out of the physical allocator free list and putting it
+ *	onto a list for reporting via the vm.page_blacklist sysctl.
+ */
 static void
+vm_page_blacklist_check(char *list, char *end)
+{
+	vm_paddr_t pa;
+	char *next;
+
+	next = list;
+	while (next != NULL) {
+		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
+			continue;
+		vm_page_blacklist_add(pa, bootverbose);
+	}
+}
+
+/*
+ *	vm_page_blacklist_load:
+ *
+ *	Search for a special module named "ram_blacklist".  It'll be a
+ *	plain text file provided by the user via the loader directive
+ *	of the same name.
+ */
+static void
+vm_page_blacklist_load(char **list, char **end)
+{
+	void *mod;
+	u_char *ptr;
+	u_int len;
+
+	mod = NULL;
+	ptr = NULL;
+
+	mod = preload_search_by_type("ram_blacklist");
+	if (mod != NULL) {
+		ptr = preload_fetch_addr(mod);
+		len = preload_fetch_size(mod);
+	}
+	*list = ptr;
+	if (ptr != NULL)
+		*end = ptr + len;
+	else
+		*end = NULL;
+	return;
+}
+
+static int
+sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
+{
+	vm_page_t m;
+	struct sbuf sbuf;
+	int error, first;
+
+	first = 1;
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	TAILQ_FOREACH(m, &blacklist_head, listq) {
+		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
+		    (uintmax_t)m->phys_addr);
+		first = 0;
+	}
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+
+static void
 vm_page_domain_init(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
@@ -255,16 +398,19 @@
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
-	    &cnt.v_inactive_count;
+	    &vm_cnt.v_inactive_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
-	    &cnt.v_active_count;
+	    &vm_cnt.v_active_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
+	    "vm laundry pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
+	    &vm_cnt.v_laundry_count;
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
 	vmd->vmd_oom = FALSE;
-	vmd->vmd_pass = 0;
 	for (i = 0; i < PQ_COUNT; i++) {
 		pq = &vmd->vmd_pagequeues[i];
 		TAILQ_INIT(&pq->pq_pl);
@@ -274,6 +420,29 @@
 }
 
 /*
+ * Initialize a physical page in preparation for adding it to the free
+ * lists.
+ */
+static void
+vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
+{
+
+	m->object = NULL;
+	m->wire_count = 0;
+	m->busy_lock = VPB_UNBUSIED;
+	m->hold_count = 0;
+	m->flags = 0;
+	m->phys_addr = pa;
+	m->queue = PQ_NONE;
+	m->psind = 0;
+	m->segind = segind;
+	m->order = VM_NFREEORDER;
+	m->pool = VM_FREEPOOL_DEFAULT;
+	m->valid = m->dirty = 0;
+	pmap_page_init(m);
+}
+
+/*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.  Allocates physical memory for
@@ -284,19 +453,16 @@
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
+	struct vm_domain *vmd;
+	struct vm_phys_seg *seg;
+	vm_page_t m;
+	char *list, *listend;
 	vm_offset_t mapped;
-	vm_paddr_t high_avail, low_avail, page_range, size;
-	vm_paddr_t new_end;
-	int i;
-	vm_paddr_t pa;
-	vm_paddr_t last_pa;
-	char *list;
+	vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
+	vm_paddr_t biggestsize, last_pa, pa;
+	u_long pagecount;
+	int biggestone, i, pages_per_zone, segind;
 
-	/* the biggest memory array is the second group of pages */
-	vm_paddr_t end;
-	vm_paddr_t biggestsize;
-	int biggestone;
-
 	biggestsize = 0;
 	biggestone = 0;
 	vaddr = round_page(vaddr);
@@ -305,15 +471,6 @@
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
-
-#ifdef XEN
-	/*
-	 * There is no obvious reason why i386 PV Xen needs vm_page structs
-	 * created for these pseudo-physical addresses.  XXX
-	 */
-	vm_phys_add_seg(0, phys_avail[0]);
-#endif
-
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		size = phys_avail[i + 1] - phys_avail[i];
 		if (size > biggestsize) {
@@ -334,9 +491,27 @@
 		vm_page_domain_init(&vm_dom[i]);
 
 	/*
+	 * Almost all of the pages needed for bootstrapping UMA are used
+	 * for zone structures, so if the number of CPUs results in those
+	 * structures taking more than one page each, we set aside more pages
+	 * in proportion to the zone structure size.
+	 */
+	pages_per_zone = howmany(sizeof(struct uma_zone) +
+	    sizeof(struct uma_cache) * (mp_maxid + 1) +
+	    roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
+	if (pages_per_zone > 1) {
+		/* Reserve more pages so that we don't run out. */
+		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
+	}
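+
+(Editorial note, not part of the patch: a worked example of the sizing above.
+Every number is an assumption chosen for illustration, not measured from this
+tree: 64 CPUs, sizeof(struct uma_zone) == 4608, sizeof(struct uma_cache) ==
+128, the rounded slab header at 96 bytes, and UMA_SLAB_SIZE == 4096.  Then
+pages_per_zone = howmany(4608 + 128 * 64 + 96, 4096) = howmany(12896, 4096)
+= 4, so boot_pages is raised to UMA_BOOT_PAGES_ZONES * 4.)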
+
+	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
+	 *
+	 * CTLFLAG_RDTUN doesn't work during the early boot process, so we must
+	 * manually fetch the value.
 	 */
+	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
@@ -344,8 +519,8 @@
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
-#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
-    defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+    defined(__i386__) || defined(__mips__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
@@ -367,8 +542,10 @@
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
+#else
+	(void)last_pa;
 #endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
 	 * When pmap_map() uses the direct map, they are not automatically 
@@ -471,7 +648,9 @@
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
-	vm_page_array = (vm_page_t) mapped;
+	vm_page_array = (vm_page_t)mapped;
+	vm_page_array_size = page_range;
+
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate physical memory for the reservation management system's
@@ -481,13 +660,13 @@
 		high_avail = new_end;
 	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
 #endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include vm_page_array and vm_reserv_array in a crash dump.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
-#endif	
+#endif
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
@@ -498,38 +677,60 @@
 		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
 
 	/*
-	 * Clear all of the page structures
-	 */
-	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
-	for (i = 0; i < page_range; i++)
-		vm_page_array[i].order = VM_NFREEORDER;
-	vm_page_array_size = page_range;
-
-	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
-	 * Add every available physical page that is not blacklisted to
-	 * the free lists.
+	 * Initialize the page structures and add every available page to the
+	 * physical memory allocator's free lists.
 	 */
-	cnt.v_page_count = 0;
-	cnt.v_free_count = 0;
-	list = getenv("vm.blacklist");
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
-		pa = phys_avail[i];
-		last_pa = phys_avail[i + 1];
-		while (pa < last_pa) {
-			if (list != NULL &&
-			    vm_page_blacklist_lookup(list, pa))
-				printf("Skipping page with pa 0x%jx\n",
-				    (uintmax_t)pa);
-			else
-				vm_phys_add_page(pa);
-			pa += PAGE_SIZE;
+	vm_cnt.v_page_count = 0;
+	vm_cnt.v_free_count = 0;
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		for (m = seg->first_page, pa = seg->start; pa < seg->end;
+		    m++, pa += PAGE_SIZE)
+			vm_page_init_page(m, pa, segind);
+
+		/*
+		 * Add the segment to the free lists only if it is covered by
+		 * one of the ranges in phys_avail.  Because we've added the
+		 * ranges to the vm_phys_segs array, we can assume that each
+		 * segment is either entirely contained in one of the ranges,
+		 * or doesn't overlap any of them.
+		 */
+		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+			if (seg->start < phys_avail[i] ||
+			    seg->end > phys_avail[i + 1])
+				continue;
+
+			m = seg->first_page;
+			pagecount = (u_long)atop(seg->end - seg->start);
+
+			mtx_lock(&vm_page_queue_free_mtx);
+			vm_phys_free_contig(m, pagecount);
+			vm_phys_freecnt_adj(m, (int)pagecount);
+			mtx_unlock(&vm_page_queue_free_mtx);
+			vm_cnt.v_page_count += (u_int)pagecount;
+
+			vmd = &vm_dom[seg->domain];
+			vmd->vmd_page_count += (u_int)pagecount;
+			vmd->vmd_segs |= 1UL << m->segind;
+			break;
 		}
 	}
+
+	/*
+	 * Remove blacklisted pages from the physical memory allocator.
+	 */
+	TAILQ_INIT(&blacklist_head);
+	vm_page_blacklist_load(&list, &listend);
+	vm_page_blacklist_check(list, listend);
+
+	list = kern_getenv("vm.blacklist");
+	vm_page_blacklist_check(list, NULL);
+
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
@@ -603,6 +804,7 @@
 {
 	u_int x;
 
+	vm_page_lock_assert(m, MA_NOTOWNED);
 	vm_page_assert_sbusied(m);
 
 	for (;;) {
@@ -683,6 +885,41 @@
 	}
 }
 
+static void
+vm_page_xunbusy_locked(vm_page_t m)
+{
+
+	vm_page_assert_xbusied(m);
+	vm_page_assert_locked(m);
+
+	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
+	wakeup(m);
+}
+
+void
+vm_page_xunbusy_maybelocked(vm_page_t m)
+{
+	bool lockacq;
+
+	vm_page_assert_xbusied(m);
+
+	/*
+	 * Fast path for unbusy.  If it succeeds, we know that there
+	 * are no waiters, so we do not need a wakeup.
+	 */
+	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
+	    VPB_UNBUSIED))
+		return;
+
+	lockacq = !mtx_owned(vm_page_lockptr(m));
+	if (lockacq)
+		vm_page_lock(m);
+	vm_page_xunbusy_locked(m);
+	if (lockacq)
+		vm_page_unlock(m);
+}
+
 /*
  *	vm_page_xunbusy_hard:
  *
@@ -696,8 +933,7 @@
 	vm_page_assert_xbusied(m);
 
 	vm_page_lock(m);
-	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
-	wakeup(m);
+	vm_page_xunbusy_locked(m);
 	vm_page_unlock(m);
 }
 
@@ -728,6 +964,23 @@
 }
 
 /*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+void
+vm_page_change_lock(vm_page_t m, struct mtx **mtx)
+{
+	struct mtx *mtx1;
+
+	mtx1 = vm_page_lockptr(m);
+	if (*mtx == mtx1)
+		return;
+	if (*mtx != NULL)
+		mtx_unlock(*mtx);
+	*mtx = mtx1;
+	mtx_lock(mtx1);
+}
+
+/*
  * Keep page from being freed by the page daemon
  * much of the same effect as wiring, except much lower
  * overhead and should be used only for *very* temporary
@@ -756,24 +1009,15 @@
  *	vm_page_unhold_pages:
  *
  *	Unhold each of the pages that is referenced by the given array.
- */ 
+ */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
-	struct mtx *mtx, *new_mtx;
+	struct mtx *mtx;
 
 	mtx = NULL;
 	for (; count != 0; count--) {
-		/*
-		 * Avoid releasing and reacquiring the same page lock.
-		 */
-		new_mtx = vm_page_lockptr(*ma);
-		if (mtx != new_mtx) {
-			if (mtx != NULL)
-				mtx_unlock(mtx);
-			mtx = new_mtx;
-			mtx_lock(mtx);
-		}
+		vm_page_change_lock(*ma, &mtx);
 		vm_page_unhold(*ma);
 		ma++;
 	}
@@ -905,39 +1149,29 @@
 }
 
 /*
- * Unbusy and handle the page queueing for a page from the VOP_GETPAGES()
- * array which is not the request page.
+ * Unbusy and handle the page queueing for a page from a getpages request that
+ * was optionally read ahead or behind.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
-	if (m->valid != 0) {
-		/*
-		 * Since the page is not the requested page, whether
-		 * it should be activated or deactivated is not
-		 * obvious.  Empirical results have shown that
-		 * deactivating the page is usually the best choice,
-		 * unless the page is wanted by another thread.
-		 */
-		vm_page_lock(m);
-		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
-			vm_page_activate(m);
-		else
-			vm_page_deactivate(m);
-		vm_page_unlock(m);
-		vm_page_xunbusy(m);
-	} else {
-		/*
-		 * Free the completely invalid page.  Such page state
-		 * occurs due to the short read operation which did
-		 * not covered our page at all, or in case when a read
-		 * error happens.
-		 */
-		vm_page_lock(m);
-		vm_page_free(m);
-		vm_page_unlock(m);
-	}
+	/* We shouldn't put invalid pages on queues. */
+	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
+
+	/*
+	 * Since the page is not the one actually requested, whether it should
+	 * be activated or deactivated is not obvious.  Empirical results
+	 * have shown that deactivating the page is usually the best choice,
+	 * unless the page is wanted by another thread.
+	 */
+	vm_page_lock(m);
+	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+		vm_page_activate(m);
+	else
+		vm_page_deactivate(m);
+	vm_page_unlock(m);
+	vm_page_xunbusy(m);
 }
 
 /*
@@ -991,11 +1225,7 @@
 vm_page_dirty_KBI(vm_page_t m)
 {
 
-	/* These assertions refer to this operation by its public name. */
-	KASSERT((m->flags & PG_CACHED) == 0,
-	    ("vm_page_dirty: page in cache!"));
-	KASSERT(!VM_PAGE_IS_FREE(m),
-	    ("vm_page_dirty: page is free!"));
+	/* Refer to this operation by its public name. */
 	KASSERT(m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
@@ -1119,9 +1349,8 @@
 /*
  *	vm_page_remove:
  *
- *	Removes the given mem entry from the object/offset-page
- *	table and the object page list, but do not invalidate/terminate
- *	the backing store.
+ *	Removes the specified page from its containing object, but does not
+ *	invalidate any backing storage.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
@@ -1129,30 +1358,21 @@
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
-	boolean_t lockacq;
+	vm_page_t mrem;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
-		vm_page_lock_assert(m, MA_OWNED);
+		vm_page_assert_locked(m);
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (vm_page_xbusied(m)) {
-		lockacq = FALSE;
-		if ((m->oflags & VPO_UNMANAGED) != 0 &&
-		    !mtx_owned(vm_page_lockptr(m))) {
-			lockacq = TRUE;
-			vm_page_lock(m);
-		}
-		vm_page_flash(m);
-		atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
-		if (lockacq)
-			vm_page_unlock(m);
-	}
+	if (vm_page_xbusied(m))
+		vm_page_xunbusy_maybelocked(m);
+	mrem = vm_radix_remove(&object->rtree, m->pindex);
+	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
-	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
@@ -1215,7 +1435,7 @@
 {
 	vm_page_t next;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
 		MPASS(next->object == m->object);
 		if (next->pindex != m->pindex + 1)
@@ -1235,7 +1455,7 @@
 {
 	vm_page_t prev;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
 		MPASS(prev->object == m->object);
 		if (prev->pindex != m->pindex - 1)
@@ -1253,9 +1473,13 @@
 vm_page_t
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t mold, mpred;
+	vm_page_t mold;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(mnew->object == NULL,
+	    ("vm_page_replace: page %p already in object", mnew));
+	KASSERT(mnew->queue == PQ_NONE,
+	    ("vm_page_replace: new page %p is on a paging queue", mnew));
 
 	/*
 	 * This function mostly follows vm_page_insert() and
@@ -1262,31 +1486,24 @@
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
-	mpred = vm_radix_lookup(&object->rtree, pindex);
-	KASSERT(mpred != NULL,
-	    ("vm_page_replace: replacing page not present with pindex"));
-	mpred = TAILQ_PREV(mpred, respgs, listq);
-	if (mpred != NULL)
-		KASSERT(mpred->pindex < pindex,
-		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 
 	mnew->object = object;
 	mnew->pindex = pindex;
 	mold = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mold->queue == PQ_NONE,
-	    ("vm_page_replace: mold is on a paging queue"));
+	    ("vm_page_replace: old page %p is on a paging queue", mold));
 
-	/* Detach the old page from the resident tailq. */
+	/* Keep the resident page list in sorted order. */
+	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 	TAILQ_REMOVE(&object->memq, mold, listq);
 
 	mold->object = NULL;
-	vm_page_xunbusy(mold);
+	vm_page_xunbusy_maybelocked(mold);
 
-	/* Insert the new page in the resident tailq. */
-	if (mpred != NULL)
-		TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
-	else
-		TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+	/*
+	 * The object's resident_page_count does not change because we have
+	 * swapped one page for another, but OBJ_MIGHTBEDIRTY may need updating.
+	 */
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	return (mold);
@@ -1306,9 +1523,7 @@
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
- *	      swap.  If the page is on the cache, we have to deactivate it
- *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
- *	      on the cache.
+ *	      swap.
  *
  *	The objects must be locked.
  */
@@ -1354,142 +1569,6 @@
 }
 
 /*
- *	Convert all of the given object's cached pages that have a
- *	pindex within the given range into free pages.  If the value
- *	zero is given for "end", then the range's upper bound is
- *	infinity.  If the given object is backed by a vnode and it
- *	transitions from having one or more cached pages to none, the
- *	vnode's hold count is reduced. 
- */
-void
-vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
-{
-	vm_page_t m;
-	boolean_t empty;
-
-	mtx_lock(&vm_page_queue_free_mtx);
-	if (__predict_false(vm_radix_is_empty(&object->cache))) {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		return;
-	}
-	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
-		if (end != 0 && m->pindex >= end)
-			break;
-		vm_radix_remove(&object->cache, m->pindex);
-		vm_page_cache_turn_free(m);
-	}
-	empty = vm_radix_is_empty(&object->cache);
-	mtx_unlock(&vm_page_queue_free_mtx);
-	if (object->type == OBJT_VNODE && empty)
-		vdrop(object->handle);
-}
-
-/*
- *	Returns the cached page that is associated with the given
- *	object and offset.  If, however, none exists, returns NULL.
- *
- *	The free page queue must be locked.
- */
-static inline vm_page_t
-vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	return (vm_radix_lookup(&object->cache, pindex));
-}
-
-/*
- *	Remove the given cached page from its containing object's
- *	collection of cached pages.
- *
- *	The free page queue must be locked.
- */
-static void
-vm_page_cache_remove(vm_page_t m)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	KASSERT((m->flags & PG_CACHED) != 0,
-	    ("vm_page_cache_remove: page %p is not cached", m));
-	vm_radix_remove(&m->object->cache, m->pindex);
-	m->object = NULL;
-	cnt.v_cache_count--;
-}
-
-/*
- *	Transfer all of the cached pages with offset greater than or
- *	equal to 'offidxstart' from the original object's cache to the
- *	new object's cache.  However, any cached pages with offset
- *	greater than or equal to the new object's size are kept in the
- *	original object.  Initially, the new object's cache must be
- *	empty.  Offset 'offidxstart' in the original object must
- *	correspond to offset zero in the new object.
- *
- *	The new object must be locked.
- */
-void
-vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
-    vm_object_t new_object)
-{
-	vm_page_t m;
-
-	/*
-	 * Insertion into an object's collection of cached pages
-	 * requires the object to be locked.  In contrast, removal does
-	 * not.
-	 */
-	VM_OBJECT_ASSERT_WLOCKED(new_object);
-	KASSERT(vm_radix_is_empty(&new_object->cache),
-	    ("vm_page_cache_transfer: object %p has cached pages",
-	    new_object));
-	mtx_lock(&vm_page_queue_free_mtx);
-	while ((m = vm_radix_lookup_ge(&orig_object->cache,
-	    offidxstart)) != NULL) {
-		/*
-		 * Transfer all of the pages with offset greater than or
-		 * equal to 'offidxstart' from the original object's
-		 * cache to the new object's cache.
-		 */
-		if ((m->pindex - offidxstart) >= new_object->size)
-			break;
-		vm_radix_remove(&orig_object->cache, m->pindex);
-		/* Update the page's object and offset. */
-		m->object = new_object;
-		m->pindex -= offidxstart;
-		if (vm_radix_insert(&new_object->cache, m))
-			vm_page_cache_turn_free(m);
-	}
-	mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
- *	Returns TRUE if a cached page is associated with the given object and
- *	offset, and FALSE otherwise.
- *
- *	The object must be locked.
- */
-boolean_t
-vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
-{
-	vm_page_t m;
-
-	/*
-	 * Insertion into an object's collection of cached pages requires the
-	 * object to be locked.  Therefore, if the object is locked and the
-	 * object's collection is empty, there is no need to acquire the free
-	 * page queues lock in order to prove that the specified page doesn't
-	 * exist.
-	 */
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (__predict_true(vm_object_cache_is_empty(object)))
-		return (FALSE);
-	mtx_lock(&vm_page_queue_free_mtx);
-	m = vm_page_cache_lookup(object, pindex);
-	mtx_unlock(&vm_page_queue_free_mtx);
-	return (m != NULL);
-}
-
-/*
  *	vm_page_alloc:
  *
  *	Allocate and return a page that is associated with the specified
@@ -1505,13 +1584,10 @@
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
- *	VM_ALLOC_IFCACHED	return page only if it is cached
- *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
- *				is cached
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
- *				should not be exclusive busy 
+ *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
@@ -1521,21 +1597,41 @@
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
-	struct vnode *vp = NULL;
-	vm_object_t m_object;
-	vm_page_t m, mpred;
+
+	return (vm_page_alloc_after(object, pindex, req, object != NULL ?
+	    vm_radix_lookup_le(&object->rtree, pindex) : NULL));
+}
+
+/*
+ * Allocate a page in the specified object with the given page index.  To
+ * optimize insertion of the page into the object, the caller must also specify
+ * the resident page in the object with the largest index smaller than the given
+ * page index, or NULL if no such page exists.
+ */
+vm_page_t
+vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req,
+    vm_page_t mpred)
+{
+	vm_page_t m;
 	int flags, req_class;
+	u_int free_count;
 
-	mpred = 0;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
-	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
-	    req));
+	    ("inconsistent object(%p)/req(%x)", object, req));
+	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+	    ("Can't sleep and retry object insertion."));
+	KASSERT(mpred == NULL || mpred->pindex < pindex,
+	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
+	    (uintmax_t)pindex));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
+	if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
+		return (NULL);
+
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
@@ -1544,52 +1640,29 @@
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
-	if (object != NULL) {
-		mpred = vm_radix_lookup_le(&object->rtree, pindex);
-		KASSERT(mpred == NULL || mpred->pindex != pindex,
-		   ("vm_page_alloc: pindex already allocated"));
-	}
-
 	/*
-	 * The page allocation request can came from consumers which already
-	 * hold the free page queue mutex, like vm_page_insert() in
-	 * vm_page_cache().
+	 * Allocate a page if the number of free pages exceeds the minimum
+	 * for the request class.
 	 */
-	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
-	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count > 0)) {
+	    vm_cnt.v_free_count > 0)) {
 		/*
-		 * Allocate from the free queue if the number of free pages
-		 * exceeds the minimum for the request class.
+		 * Can we allocate the page from a reservation?
 		 */
-		if (object != NULL &&
-		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
-			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
-				mtx_unlock(&vm_page_queue_free_mtx);
-				return (NULL);
-			}
-			if (vm_phys_unfree_page(m))
-				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
 #if VM_NRESERVLEVEL > 0
-			else if (!vm_reserv_reactivate_page(m))
-#else
-			else
-#endif
-				panic("vm_page_alloc: cache page %p is missing"
-				    " from the free queue", m);
-		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
-			mtx_unlock(&vm_page_queue_free_mtx);
-			return (NULL);
-#if VM_NRESERVLEVEL > 0
-		} else if (object == NULL || (object->flags & (OBJ_COLORED |
+		if (object == NULL || (object->flags & (OBJ_COLORED |
 		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
-		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
-#else
-		} else {
+		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
 #endif
+		{
+			/*
+			 * If not, allocate it from the free page queues.
+			 */
 			m = vm_phys_alloc_pages(object != NULL ?
 			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 #if VM_NRESERVLEVEL > 0
@@ -1604,10 +1677,8 @@
 		/*
 		 * Not allocatable, give up.
 		 */
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit,
-		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
-		pagedaemon_wakeup();
+		if (vm_page_alloc_fail(object, req))
+			goto again;
 		return (NULL);
 	}
 
@@ -1614,52 +1685,23 @@
 	/*
 	 *  At this point we had better have found a good page.
 	 */
-	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
-	KASSERT(m->queue == PQ_NONE,
-	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
-	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
-	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
-	KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m));
-	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
-	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
-	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
-	    pmap_page_get_memattr(m)));
-	if ((m->flags & PG_CACHED) != 0) {
-		KASSERT((m->flags & PG_ZERO) == 0,
-		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
-		KASSERT(m->valid != 0,
-		    ("vm_page_alloc: cached page %p is invalid", m));
-		if (m->object == object && m->pindex == pindex)
-	  		cnt.v_reactivated++;
-		else
-			m->valid = 0;
-		m_object = m->object;
-		vm_page_cache_remove(m);
-		if (m_object->type == OBJT_VNODE &&
-		    vm_object_cache_is_empty(m_object))
-			vp = m_object->handle;
-	} else {
-		KASSERT(VM_PAGE_IS_FREE(m),
-		    ("vm_page_alloc: page %p is not free", m));
-		KASSERT(m->valid == 0,
-		    ("vm_page_alloc: free page %p is valid", m));
-		vm_phys_freecnt_adj(m, -1);
-	}
+	KASSERT(m != NULL, ("missing page"));
+	free_count = vm_phys_freecnt_adj(m, -1);
+	if ((m->flags & PG_ZERO) != 0)
+		vm_page_zero_count--;
+	mtx_unlock(&vm_page_queue_free_mtx);
+	vm_page_alloc_check(m);
 
 	/*
-	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
-	 * must be cleared before the free page queues lock is released.
+	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
-	if (m->flags & PG_ZERO) {
-		vm_page_zero_count--;
-		if (req & VM_ALLOC_ZERO)
-			flags = PG_ZERO;
-	}
-	if (req & VM_ALLOC_NODUMP)
+	if ((req & VM_ALLOC_ZERO) != 0)
+		flags = PG_ZERO;
+	flags &= m->flags;
+	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
-	mtx_unlock(&vm_page_queue_free_mtx);
 	m->aflags = 0;
 	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
@@ -1673,7 +1715,7 @@
 		 * The page lock is not required for wiring a page until that
 		 * page is inserted into the object.
 		 */
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	m->act_count = 0;
@@ -1680,18 +1722,21 @@
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
-			/* See the comment below about hold count. */
-			if (vp != NULL)
-				vdrop(vp);
 			pagedaemon_wakeup();
 			if (req & VM_ALLOC_WIRED) {
-				atomic_subtract_int(&cnt.v_wire_count, 1);
+				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 				m->wire_count = 0;
 			}
-			m->object = NULL;
+			KASSERT(m->object == NULL, ("page %p has object", m));
 			m->oflags = VPO_UNMANAGED;
 			m->busy_lock = VPB_UNBUSIED;
-			vm_page_free(m);
+			/* Don't change PG_ZERO. */
+			vm_page_free_toq(m);
+			if (req & VM_ALLOC_WAITFAIL) {
+				VM_OBJECT_WUNLOCK(object);
+				vm_radix_wait();
+				VM_OBJECT_WLOCK(object);
+			}
 			return (NULL);
 		}
 
@@ -1703,34 +1748,15 @@
 		m->pindex = pindex;
 
 	/*
-	 * The following call to vdrop() must come after the above call
-	 * to vm_page_insert() in case both affect the same object and
-	 * vnode.  Otherwise, the affected vnode's hold count could
-	 * temporarily become zero.
-	 */
-	if (vp != NULL)
-		vdrop(vp);
-
-	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
-	if (vm_paging_needed())
+	if (vm_paging_needed(free_count))
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
-static void
-vm_page_alloc_contig_vdrop(struct spglist *lst)
-{
-
-	while (!SLIST_EMPTY(lst)) {
-		vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv);
-		SLIST_REMOVE_HEAD(lst, plinks.s.ss);
-	}
-}
-
 /*
  *	vm_page_alloc_contig:
  *
@@ -1752,6 +1778,8 @@
  *	memory attribute setting for the physical pages cannot be configured
  *	to VM_MEMATTR_DEFAULT.
  *
+ *	The specified object may not contain fictitious pages.
+ *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
@@ -1763,7 +1791,7 @@
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
- *				should not be exclusive busy 
+ *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
@@ -1775,22 +1803,23 @@
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
-	struct vnode *drop;
-	struct spglist deferred_vdrop_list;
-	vm_page_t m, m_tmp, m_ret;
-	u_int flags, oflags;
+	vm_page_t m, m_ret, mpred;
+	u_int busy_lock, flags, oflags;
 	int req_class;
 
+	mpred = NULL;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
-	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 	    req));
+	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+	    ("Can't sleep and retry object insertion."));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
-		KASSERT(object->type == OBJT_PHYS,
-		    ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
+		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
+		    ("vm_page_alloc_contig: object %p has fictitious pages",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
@@ -1802,40 +1831,48 @@
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
-	SLIST_INIT(&deferred_vdrop_list);
+	if (object != NULL) {
+		mpred = vm_radix_lookup_le(&object->rtree, pindex);
+		KASSERT(mpred == NULL || mpred->pindex != pindex,
+		    ("vm_page_alloc_contig: pindex already allocated"));
+	}
+
+	/*
+	 * Can we allocate the pages without the number of free pages falling
+	 * below the lower bound for the allocation class?
+	 */
+again:
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (cnt.v_free_count + cnt.v_cache_count >= npages +
-	    cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count >= npages +
-	    cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count >= npages)) {
+	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
+	    (req_class == VM_ALLOC_SYSTEM &&
+	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
+	    (req_class == VM_ALLOC_INTERRUPT &&
+	    vm_cnt.v_free_count >= npages)) {
+		/*
+		 * Can we allocate the pages from a reservation?
+		 */
 #if VM_NRESERVLEVEL > 0
 retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
-		    low, high, alignment, boundary)) == NULL)
+		    low, high, alignment, boundary, mpred)) == NULL)
 #endif
+			/*
+			 * If not, allocate them from the free page queues.
+			 */
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
 	} else {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit, npages);
-		pagedaemon_wakeup();
+		if (vm_page_alloc_fail(object, req))
+			goto again;
 		return (NULL);
 	}
-	if (m_ret != NULL)
-		for (m = m_ret; m < &m_ret[npages]; m++) {
-			drop = vm_page_alloc_init(m);
-			if (drop != NULL) {
-				/*
-				 * Enqueue the vnode for deferred vdrop().
-				 */
-				m->plinks.s.pv = drop;
-				SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
-				    plinks.s.ss);
-			}
-		}
-	else {
+	if (m_ret != NULL) {
+		vm_phys_freecnt_adj(m_ret, -npages);
+		for (m = m_ret; m < &m_ret[npages]; m++)
+			if ((m->flags & PG_ZERO) != 0)
+				vm_page_zero_count--;
+	} else {
 #if VM_NRESERVLEVEL > 0
 		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
 		    boundary))
@@ -1845,6 +1882,8 @@
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
 		return (NULL);
+	for (m = m_ret; m < &m_ret[npages]; m++)
+		vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
@@ -1854,9 +1893,15 @@
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
+	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
+	    VPO_UNMANAGED : 0;
+	busy_lock = VPB_UNBUSIED;
+	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
+		busy_lock = VPB_SINGLE_EXCLUSIVER;
+	if ((req & VM_ALLOC_SBUSY) != 0)
+		busy_lock = VPB_SHARERS_WORD(1);
 	if ((req & VM_ALLOC_WIRED) != 0)
-		atomic_add_int(&cnt.v_wire_count, npages);
-	oflags = VPO_UNMANAGED;
+		atomic_add_int(&vm_cnt.v_wire_count, npages);
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
@@ -1865,39 +1910,37 @@
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->aflags = 0;
 		m->flags = (m->flags | PG_NODUMP) & flags;
-		m->busy_lock = VPB_UNBUSIED;
-		if (object != NULL) {
-			if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
-				m->busy_lock = VPB_SINGLE_EXCLUSIVER;
-			if ((req & VM_ALLOC_SBUSY) != 0)
-				m->busy_lock = VPB_SHARERS_WORD(1);
-		}
+		m->busy_lock = busy_lock;
 		if ((req & VM_ALLOC_WIRED) != 0)
 			m->wire_count = 1;
-		/* Unmanaged pages don't use "act_count". */
+		m->act_count = 0;
 		m->oflags = oflags;
 		if (object != NULL) {
-			if (vm_page_insert(m, object, pindex)) {
-				vm_page_alloc_contig_vdrop(
-				    &deferred_vdrop_list);
-				if (vm_paging_needed())
-					pagedaemon_wakeup();
+			if (vm_page_insert_after(m, object, pindex, mpred)) {
+				pagedaemon_wakeup();
 				if ((req & VM_ALLOC_WIRED) != 0)
-					atomic_subtract_int(&cnt.v_wire_count,
-					    npages);
-				for (m_tmp = m, m = m_ret;
-				    m < &m_ret[npages]; m++) {
-					if ((req & VM_ALLOC_WIRED) != 0)
+					atomic_subtract_int(
+					    &vm_cnt.v_wire_count, npages);
+				KASSERT(m->object == NULL,
+				    ("page %p has object", m));
+				mpred = m;
+				for (m = m_ret; m < &m_ret[npages]; m++) {
+					if (m <= mpred &&
+					    (req & VM_ALLOC_WIRED) != 0)
 						m->wire_count = 0;
-					if (m >= m_tmp) {
-						m->object = NULL;
-						m->oflags |= VPO_UNMANAGED;
-					}
+					m->oflags = VPO_UNMANAGED;
 					m->busy_lock = VPB_UNBUSIED;
-					vm_page_free(m);
+					/* Don't change PG_ZERO. */
+					vm_page_free_toq(m);
 				}
+				if (req & VM_ALLOC_WAITFAIL) {
+					VM_OBJECT_WUNLOCK(object);
+					vm_radix_wait();
+					VM_OBJECT_WLOCK(object);
+				}
 				return (NULL);
 			}
+			mpred = m;
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
@@ -1904,63 +1947,29 @@
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
-	vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
-	if (vm_paging_needed())
+	if (vm_paging_needed(vm_cnt.v_free_count))
 		pagedaemon_wakeup();
 	return (m_ret);
 }
 
 /*
- * Initialize a page that has been freshly dequeued from a freelist.
- * The caller has to drop the vnode returned, if it is not NULL.
- *
- * This function may only be used to initialize unmanaged pages.
- *
- * To be called with vm_page_queue_free_mtx held.
+ * Check a page that has been freshly dequeued from a freelist.
  */
-static struct vnode *
-vm_page_alloc_init(vm_page_t m)
+static void
+vm_page_alloc_check(vm_page_t m)
 {
-	struct vnode *drop;
-	vm_object_t m_object;
 
+	KASSERT(m->object == NULL, ("page %p has object", m));
 	KASSERT(m->queue == PQ_NONE,
-	    ("vm_page_alloc_init: page %p has unexpected queue %d",
-	    m, m->queue));
-	KASSERT(m->wire_count == 0,
-	    ("vm_page_alloc_init: page %p is wired", m));
-	KASSERT(m->hold_count == 0,
-	    ("vm_page_alloc_init: page %p is held", m));
-	KASSERT(!vm_page_busied(m),
-	    ("vm_page_alloc_init: page %p is busy", m));
-	KASSERT(m->dirty == 0,
-	    ("vm_page_alloc_init: page %p is dirty", m));
+	    ("page %p has unexpected queue %d", m, m->queue));
+	KASSERT(m->wire_count == 0, ("page %p is wired", m));
+	KASSERT(m->hold_count == 0, ("page %p is held", m));
+	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
+	KASSERT(m->dirty == 0, ("page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
-	    ("vm_page_alloc_init: page %p has unexpected memattr %d",
+	    ("page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	drop = NULL;
-	if ((m->flags & PG_CACHED) != 0) {
-		KASSERT((m->flags & PG_ZERO) == 0,
-		    ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
-		m->valid = 0;
-		m_object = m->object;
-		vm_page_cache_remove(m);
-		if (m_object->type == OBJT_VNODE &&
-		    vm_object_cache_is_empty(m_object))
-			drop = m_object->handle;
-	} else {
-		KASSERT(VM_PAGE_IS_FREE(m),
-		    ("vm_page_alloc_init: page %p is not free", m));
-		KASSERT(m->valid == 0,
-		    ("vm_page_alloc_init: free page %p is valid", m));
-		vm_phys_freecnt_adj(m, -1);
-		if ((m->flags & PG_ZERO) != 0)
-			vm_page_zero_count--;
-	}
-	/* Don't clear the PG_ZERO flag; we'll need it later. */
-	m->flags &= PG_ZERO;
-	return (drop);
+	KASSERT(m->valid == 0, ("free page %p is valid", m));
 }
 
 /*
@@ -1986,9 +1995,8 @@
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
-	struct vnode *drop;
 	vm_page_t m;
-	u_int flags;
+	u_int flags, free_count;
 	int req_class;
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
@@ -2002,18 +2010,17 @@
 	/*
 	 * Do not allocate reserved pages unless the req has asked for it.
 	 */
-	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
-	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count > 0))
+	    vm_cnt.v_free_count > 0)) {
 		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
-	else {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit,
-		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
-		pagedaemon_wakeup();
+	} else {
+		if (vm_page_alloc_fail(NULL, req))
+			goto again;
 		return (NULL);
 	}
 	if (m == NULL) {
@@ -2020,8 +2027,11 @@
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return (NULL);
 	}
-	drop = vm_page_alloc_init(m);
+	free_count = vm_phys_freecnt_adj(m, -1);
+	if ((m->flags & PG_ZERO) != 0)
+		vm_page_zero_count--;
 	mtx_unlock(&vm_page_queue_free_mtx);
+	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
@@ -2036,44 +2046,602 @@
 		 * The page lock is not required for wiring a page that does
 		 * not belong to an object.
 		 */
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	/* Unmanaged pages don't use "act_count". */
 	m->oflags = VPO_UNMANAGED;
-	if (drop != NULL)
-		vdrop(drop);
-	if (vm_paging_needed())
+	if (vm_paging_needed(free_count))
 		pagedaemon_wakeup();
 	return (m);
 }
 
+#define	VPSC_ANY	0	/* No restrictions. */
+#define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
+#define	VPSC_NOSUPER	2	/* Skip superpages. */
+
 /*
+ *	vm_page_scan_contig:
+ *
+ *	Scan vm_page_array[] between the specified entries "m_start" and
+ *	"m_end" for a run of contiguous physical pages that satisfy the
+ *	specified conditions, and return the lowest page in the run.  The
+ *	specified "alignment" determines the alignment of the lowest physical
+ *	page in the run.  If the specified "boundary" is non-zero, then the
+ *	run of physical pages cannot span a physical address that is a
+ *	multiple of "boundary".
+ *
+ *	"m_end" is never dereferenced, so it need not point to a vm_page
+ *	structure within vm_page_array[].
+ *
+ *	"npages" must be greater than zero.  "m_start" and "m_end" must not
+ *	span a hole (or discontiguity) in the physical address space.  Both
+ *	"alignment" and "boundary" must be a power of two.
+ */
+vm_page_t
+vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
+    u_long alignment, vm_paddr_t boundary, int options)
+{
+	struct mtx *m_mtx;
+	vm_object_t object;
+	vm_paddr_t pa;
+	vm_page_t m, m_run;
+#if VM_NRESERVLEVEL > 0
+	int level;
+#endif
+	int m_inc, order, run_ext, run_len;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	m_run = NULL;
+	run_len = 0;
+	m_mtx = NULL;
+	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
+		KASSERT((m->flags & PG_MARKER) == 0,
+		    ("page %p is PG_MARKER", m));
+		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
+		    ("fictitious page %p has invalid wire count", m));
+
+		/*
+		 * If the current page would be the start of a run, check its
+		 * physical address against the end, alignment, and boundary
+		 * conditions.  If it doesn't satisfy these conditions, either
+		 * terminate the scan or advance to the next page that
+		 * satisfies the failed condition.
+		 */
+		if (run_len == 0) {
+			KASSERT(m_run == NULL, ("m_run != NULL"));
+			if (m + npages > m_end)
+				break;
+			pa = VM_PAGE_TO_PHYS(m);
+			if ((pa & (alignment - 1)) != 0) {
+				m_inc = atop(roundup2(pa, alignment) - pa);
+				continue;
+			}
+			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
+			    boundary) != 0) {
+				m_inc = atop(roundup2(pa, boundary) - pa);
+				continue;
+			}
+		} else
+			KASSERT(m_run != NULL, ("m_run == NULL"));
+
+		vm_page_change_lock(m, &m_mtx);
+		m_inc = 1;
+retry:
+		if (m->wire_count != 0 || m->hold_count != 0)
+			run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+		else if ((level = vm_reserv_level(m)) >= 0 &&
+		    (options & VPSC_NORESERV) != 0) {
+			run_ext = 0;
+			/* Advance to the end of the reservation. */
+			pa = VM_PAGE_TO_PHYS(m);
+			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
+			    pa);
+		}
+#endif
+		else if ((object = m->object) != NULL) {
+			/*
+			 * The page is considered eligible for relocation if
+			 * and only if it could be laundered or reclaimed by
+			 * the page daemon.
+			 */
+			if (!VM_OBJECT_TRYRLOCK(object)) {
+				mtx_unlock(m_mtx);
+				VM_OBJECT_RLOCK(object);
+				mtx_lock(m_mtx);
+				if (m->object != object) {
+					/*
+					 * The page may have been freed.
+					 */
+					VM_OBJECT_RUNLOCK(object);
+					goto retry;
+				} else if (m->wire_count != 0 ||
+				    m->hold_count != 0) {
+					run_ext = 0;
+					goto unlock;
+				}
+			}
+			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+			    ("page %p is PG_UNHOLDFREE", m));
+			/* Don't care: PG_NODUMP, PG_ZERO. */
+			if (object->type != OBJT_DEFAULT &&
+			    object->type != OBJT_SWAP &&
+			    object->type != OBJT_VNODE) {
+				run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+			} else if ((options & VPSC_NOSUPER) != 0 &&
+			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
+				run_ext = 0;
+				/* Advance to the end of the superpage. */
+				pa = VM_PAGE_TO_PHYS(m);
+				m_inc = atop(roundup2(pa + 1,
+				    vm_reserv_size(level)) - pa);
+#endif
+			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
+			    m->queue != PQ_NONE && !vm_page_busied(m)) {
+				/*
+				 * The page is allocated but eligible for
+				 * relocation.  Extend the current run by one
+				 * page.
+				 */
+				KASSERT(pmap_page_get_memattr(m) ==
+				    VM_MEMATTR_DEFAULT,
+				    ("page %p has an unexpected memattr", m));
+				KASSERT((m->oflags & (VPO_SWAPINPROG |
+				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+				    ("page %p has unexpected oflags", m));
+				/* Don't care: VPO_NOSYNC. */
+				run_ext = 1;
+			} else
+				run_ext = 0;
+unlock:
+			VM_OBJECT_RUNLOCK(object);
+#if VM_NRESERVLEVEL > 0
+		} else if (level >= 0) {
+			/*
+			 * The page is reserved but not yet allocated.  In
+			 * other words, it is still free.  Extend the current
+			 * run by one page.
+			 */
+			run_ext = 1;
+#endif
+		} else if ((order = m->order) < VM_NFREEORDER) {
+			/*
+			 * The page is enqueued in the physical memory
+			 * allocator's free page queues.  Moreover, it is the
+			 * first page in a power-of-two-sized run of
+			 * contiguous free pages.  Add these pages to the end
+			 * of the current run, and jump ahead.
+			 */
+			run_ext = 1 << order;
+			m_inc = 1 << order;
+		} else {
+			/*
+			 * Skip the page for one of the following reasons: (1)
+			 * It is enqueued in the physical memory allocator's
+			 * free page queues.  However, it is not the first
+			 * page in a run of contiguous free pages.  (This case
+			 * rarely occurs because the scan is performed in
+			 * ascending order.) (2) It is not reserved, and it is
+			 * transitioning from free to allocated.  (Conversely,
+			 * the transition from allocated to free for managed
+			 * pages is blocked by the page lock.) (3) It is
+			 * allocated but not contained by an object and not
+			 * wired, e.g., allocated by Xen's balloon driver.
+			 */
+			run_ext = 0;
+		}
+
+		/*
+		 * Extend or reset the current run of pages.
+		 */
+		if (run_ext > 0) {
+			if (run_len == 0)
+				m_run = m;
+			run_len += run_ext;
+		} else {
+			if (run_len > 0) {
+				m_run = NULL;
+				run_len = 0;
+			}
+		}
+	}
+	if (m_mtx != NULL)
+		mtx_unlock(m_mtx);
+	if (run_len >= npages)
+		return (m_run);
+	return (NULL);
+}
+
+/*
+ *	vm_page_reclaim_run:
+ *
+ *	Try to relocate each of the allocated virtual pages within the
+ *	specified run of physical pages to a new physical address.  Free the
+ *	physical pages underlying the relocated virtual pages.  A virtual page
+ *	is relocatable if and only if it could be laundered or reclaimed by
+ *	the page daemon.  Whenever possible, a virtual page is relocated to a
+ *	physical address above "high".
+ *
+ *	Returns 0 if every physical page within the run was already free or
+ *	just freed by a successful relocation.  Otherwise, returns a non-zero
+ *	value indicating why the last attempt to relocate a virtual page was
+ *	unsuccessful.
+ *
+ *	"req_class" must be an allocation class.
+ */
+static int
+vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+    vm_paddr_t high)
+{
+	struct mtx *m_mtx;
+	struct spglist free;
+	vm_object_t object;
+	vm_paddr_t pa;
+	vm_page_t m, m_end, m_new;
+	int error, order, req;
+
+	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
+	    ("req_class is not an allocation class"));
+	SLIST_INIT(&free);
+	error = 0;
+	m = m_run;
+	m_end = m_run + npages;
+	m_mtx = NULL;
+	for (; error == 0 && m < m_end; m++) {
+		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
+		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
+
+		/*
+		 * Avoid releasing and reacquiring the same page lock.
+		 */
+		vm_page_change_lock(m, &m_mtx);
+retry:
+		if (m->wire_count != 0 || m->hold_count != 0)
+			error = EBUSY;
+		else if ((object = m->object) != NULL) {
+			/*
+			 * The page is relocated if and only if it could be
+			 * laundered or reclaimed by the page daemon.
+			 */
+			if (!VM_OBJECT_TRYWLOCK(object)) {
+				mtx_unlock(m_mtx);
+				VM_OBJECT_WLOCK(object);
+				mtx_lock(m_mtx);
+				if (m->object != object) {
+					/*
+					 * The page may have been freed.
+					 */
+					VM_OBJECT_WUNLOCK(object);
+					goto retry;
+				} else if (m->wire_count != 0 ||
+				    m->hold_count != 0) {
+					error = EBUSY;
+					goto unlock;
+				}
+			}
+			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+			    ("page %p is PG_UNHOLDFREE", m));
+			/* Don't care: PG_NODUMP, PG_ZERO. */
+			if (object->type != OBJT_DEFAULT &&
+			    object->type != OBJT_SWAP &&
+			    object->type != OBJT_VNODE)
+				error = EINVAL;
+			else if (object->memattr != VM_MEMATTR_DEFAULT)
+				error = EINVAL;
+			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
+				KASSERT(pmap_page_get_memattr(m) ==
+				    VM_MEMATTR_DEFAULT,
+				    ("page %p has an unexpected memattr", m));
+				KASSERT((m->oflags & (VPO_SWAPINPROG |
+				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+				    ("page %p has unexpected oflags", m));
+				/* Don't care: VPO_NOSYNC. */
+				if (m->valid != 0) {
+					/*
+					 * First, try to allocate a new page
+					 * that is above "high".  Failing
+					 * that, try to allocate a new page
+					 * that is below "m_run".  Allocate
+					 * the new page between the end of
+					 * "m_run" and "high" only as a last
+					 * resort.
+					 */
+					req = req_class | VM_ALLOC_NOOBJ;
+					if ((m->flags & PG_NODUMP) != 0)
+						req |= VM_ALLOC_NODUMP;
+					if (trunc_page(high) !=
+					    ~(vm_paddr_t)PAGE_MASK) {
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    round_page(high),
+						    ~(vm_paddr_t)0,
+						    PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					} else
+						m_new = NULL;
+					if (m_new == NULL) {
+						pa = VM_PAGE_TO_PHYS(m_run);
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    0, pa - 1, PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					}
+					if (m_new == NULL) {
+						pa += ptoa(npages);
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    pa, high, PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					}
+					if (m_new == NULL) {
+						error = ENOMEM;
+						goto unlock;
+					}
+					KASSERT(m_new->wire_count == 0,
+					    ("page %p is wired", m_new));
+
+					/*
+					 * Replace "m" with the new page.  For
+					 * vm_page_replace(), "m" must be busy
+					 * and dequeued.  Finally, change "m"
+					 * as if vm_page_free() was called.
+					 */
+					if (object->ref_count != 0)
+						pmap_remove_all(m);
+					m_new->aflags = m->aflags;
+					KASSERT(m_new->oflags == VPO_UNMANAGED,
+					    ("page %p is managed", m_new));
+					m_new->oflags = m->oflags & VPO_NOSYNC;
+					pmap_copy_page(m, m_new);
+					m_new->valid = m->valid;
+					m_new->dirty = m->dirty;
+					m->flags &= ~PG_ZERO;
+					vm_page_xbusy(m);
+					vm_page_remque(m);
+					vm_page_replace_checked(m_new, object,
+					    m->pindex, m);
+					m->valid = 0;
+					vm_page_undirty(m);
+
+					/*
+					 * The new page must be deactivated
+					 * before the object is unlocked.
+					 */
+					vm_page_change_lock(m_new, &m_mtx);
+					vm_page_deactivate(m_new);
+				} else {
+					m->flags &= ~PG_ZERO;
+					vm_page_remque(m);
+					vm_page_remove(m);
+					KASSERT(m->dirty == 0,
+					    ("page %p is dirty", m));
+				}
+				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
+			} else
+				error = EBUSY;
+unlock:
+			VM_OBJECT_WUNLOCK(object);
+		} else {
+			mtx_lock(&vm_page_queue_free_mtx);
+			order = m->order;
+			if (order < VM_NFREEORDER) {
+				/*
+				 * The page is enqueued in the physical memory
+				 * allocator's free page queues.  Moreover, it
+				 * is the first page in a power-of-two-sized
+				 * run of contiguous free pages.  Jump ahead
+				 * to the last page within that run, and
+				 * continue from there.
+				 */
+				m += (1 << order) - 1;
+			}
+#if VM_NRESERVLEVEL > 0
+			else if (vm_reserv_is_page_free(m))
+				order = 0;
+#endif
+			mtx_unlock(&vm_page_queue_free_mtx);
+			if (order == VM_NFREEORDER)
+				error = EINVAL;
+		}
+	}
+	if (m_mtx != NULL)
+		mtx_unlock(m_mtx);
+	if ((m = SLIST_FIRST(&free)) != NULL) {
+		mtx_lock(&vm_page_queue_free_mtx);
+		do {
+			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
+			vm_page_free_phys(m);
+		} while ((m = SLIST_FIRST(&free)) != NULL);
+		vm_page_zero_idle_wakeup();
+		vm_page_free_wakeup();
+		mtx_unlock(&vm_page_queue_free_mtx);
+	}
+	return (error);
+}
+
+#define	NRUNS	16
+
+CTASSERT(powerof2(NRUNS));
+
+#define	RUN_INDEX(count)	((count) & (NRUNS - 1))
+
+#define	MIN_RECLAIM	8
+
+/*
+ *	vm_page_reclaim_contig:
+ *
+ *	Reclaim allocated, contiguous physical memory satisfying the specified
+ *	conditions by relocating the virtual pages using that physical memory.
+ *	Returns true if reclamation is successful and false otherwise.  Since
+ *	relocation requires the allocation of physical pages, reclamation may
+ *	fail due to a shortage of free pages.  When reclamation fails, callers
+ *	are expected to perform VM_WAIT before retrying a failed allocation
+ *	operation, e.g., vm_page_alloc_contig().
+ *
+ *	The caller must always specify an allocation class through "req".
+ *
+ *	allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs a page
+ *	VM_ALLOC_INTERRUPT	interrupt time request
+ *
+ *	The optional allocation flags are ignored.
+ *
+ *	"npages" must be greater than zero.  Both "alignment" and "boundary"
+ *	must be a power of two.
+ */
+bool
+vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary)
+{
+	vm_paddr_t curr_low;
+	vm_page_t m_run, m_runs[NRUNS];
+	u_long count, reclaimed;
+	int error, i, options, req_class;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	req_class = req & VM_ALLOC_CLASS_MASK;
+
+	/*
+	 * The page daemon is allowed to dig deeper into the free page list.
+	 */
+	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+		req_class = VM_ALLOC_SYSTEM;
+
+	/*
+	 * Return if the number of free pages cannot satisfy the requested
+	 * allocation.
+	 */
+	count = vm_cnt.v_free_count;
+	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
+	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
+	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
+		return (false);
+
+	/*
+	 * Scan up to three times, relaxing the restrictions ("options") on
+	 * the reclamation of reservations and superpages each time.
+	 */
+	for (options = VPSC_NORESERV;;) {
+		/*
+		 * Find the highest runs that satisfy the given constraints
+		 * and restrictions, and record them in "m_runs".
+		 */
+		curr_low = low;
+		count = 0;
+		for (;;) {
+			m_run = vm_phys_scan_contig(npages, curr_low, high,
+			    alignment, boundary, options);
+			if (m_run == NULL)
+				break;
+			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
+			m_runs[RUN_INDEX(count)] = m_run;
+			count++;
+		}
+
+		/*
+		 * Reclaim the highest runs in LIFO (descending) order until
+		 * the number of reclaimed pages, "reclaimed", is at least
+		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
+		 * reclamation is idempotent, and runs will (likely) recur
+		 * from one scan to the next as restrictions are relaxed.
+		 */
+		reclaimed = 0;
+		for (i = 0; count > 0 && i < NRUNS; i++) {
+			count--;
+			m_run = m_runs[RUN_INDEX(count)];
+			error = vm_page_reclaim_run(req_class, npages, m_run,
+			    high);
+			if (error == 0) {
+				reclaimed += npages;
+				if (reclaimed >= MIN_RECLAIM)
+					return (true);
+			}
+		}
+
+		/*
+		 * Either relax the restrictions on the next scan or return if
+		 * the last scan had no restrictions.
+		 */
+		if (options == VPSC_NORESERV)
+			options = VPSC_NOSUPER;
+		else if (options == VPSC_NOSUPER)
+			options = VPSC_ANY;
+		else if (options == VPSC_ANY)
+			return (reclaimed != 0);
+	}
+}
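As a rough illustration of the retry protocol described in the block comment above, the sketch below shows how a contiguous allocation might be wrapped.  The wrapper name alloc_contig_retry() is hypothetical; vm_page_alloc_contig(), vm_page_reclaim_contig(), and VM_WAIT are the interfaces used by this patch.

/*
 * Hypothetical wrapper (sketch only, not part of this change): allocate
 * "npages" physically contiguous pages, reclaiming and waiting on failure
 * as the vm_page_reclaim_contig() comment recommends.
 */
static vm_page_t
alloc_contig_retry(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_page_t m;

	for (;;) {
		m = vm_page_alloc_contig(NULL, 0, req | VM_ALLOC_NOOBJ,
		    npages, low, high, alignment, boundary,
		    VM_MEMATTR_DEFAULT);
		if (m != NULL)
			return (m);
		if ((req & VM_ALLOC_NOWAIT) != 0)
			return (NULL);
		/* Try to free up a suitable run; otherwise wait for pages. */
		if (!vm_page_reclaim_contig(req, npages, low, high,
		    alignment, boundary))
			VM_WAIT;
	}
}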
+
+/*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called in various places before memory allocations.
  */
-void
-vm_wait(void)
+static void
+_vm_wait(void)
 {
 
-	mtx_lock(&vm_page_queue_free_mtx);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (curproc == pageproc) {
 		vm_pageout_pages_needed = 1;
 		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 	} else {
-		if (!vm_pages_needed) {
-			vm_pages_needed = 1;
-			wakeup(&vm_pages_needed);
-		}
-		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
-		    "vmwait", 0);
+		if (pageproc == NULL)
+			panic("vm_wait in early boot");
+		pagedaemon_wait(PVM, "vmwait");
 	}
 }
 
+void
+vm_wait(void)
+{
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	_vm_wait();
+}
+
 /*
+ *	vm_page_alloc_fail:
+ *
+ *	Called when a page allocation function fails.  Informs the
+ *	pagedaemon and performs the requested wait.  Requires the
+ *	page_queue_free and object lock on entry.  Returns with the
+ *	object lock held and free lock released.  Returns an error when
+ *	retry is necessary.
+ */
+static int
+vm_page_alloc_fail(vm_object_t object, int req)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	atomic_add_int(&vm_pageout_deficit,
+	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
+		if (object != NULL) 
+			VM_OBJECT_WUNLOCK(object);
+		_vm_wait();
+		if (object != NULL) 
+			VM_OBJECT_WLOCK(object);
+		if (req & VM_ALLOC_WAITOK)
+			return (EAGAIN);
+	} else {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		pagedaemon_wakeup();
+	}
+	return (0);
+}
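For context, a page allocation routine in this file is expected to invoke vm_page_alloc_fail() roughly as sketched below; the shortage test is simplified and the rest of the allocator body is omitted.

again:
	mtx_lock(&vm_page_queue_free_mtx);
	if (vm_cnt.v_free_count <= vm_cnt.v_free_reserved) {
		/*
		 * Simplified shortage check; "object" is assumed to be
		 * write-locked by the caller, as vm_page_alloc_fail()
		 * requires.  A non-zero return means VM_ALLOC_WAITOK
		 * slept and the allocation should be retried.
		 */
		if (vm_page_alloc_fail(object, req) != 0)
			goto again;
		return (NULL);	/* VM_ALLOC_NOWAIT or VM_ALLOC_WAITFAIL */
	}
	/* ... otherwise proceed with the allocation ... */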
+
+/*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Sleep until free pages are available for allocation.
@@ -2088,12 +2656,7 @@
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (!vm_pages_needed) {
-		vm_pages_needed = 1;
-		wakeup(&vm_pages_needed);
-	}
-	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
-	    "pfault", 0);
+	pagedaemon_wait(PUSER, "pfault");
 }
 
 struct vm_pagequeue *
@@ -2100,7 +2663,10 @@
 vm_page_pagequeue(vm_page_t m)
 {
 
-	return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+	if (vm_page_in_laundry(m))
+		return (&vm_dom[0].vmd_pagequeues[m->queue]);
+	else
+		return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
 }
 
 /*
@@ -2115,9 +2681,9 @@
 {
 	struct vm_pagequeue *pq;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	KASSERT(m->queue != PQ_NONE,
-	    ("vm_page_dequeue: page %p is not queued", m));
+	vm_page_assert_locked(m);
+	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
+	    m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	m->queue = PQ_NONE;
@@ -2154,12 +2720,18 @@
  *	The page must be locked.
  */
 static void
-vm_page_enqueue(int queue, vm_page_t m)
+vm_page_enqueue(uint8_t queue, vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+	KASSERT(queue < PQ_COUNT,
+	    ("vm_page_enqueue: invalid queue %u request for page %p",
+	    queue, m));
+	if (queue == PQ_LAUNDRY)
+		pq = &vm_dom[0].vmd_pagequeues[queue];
+	else
+		pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
@@ -2243,13 +2815,12 @@
 /*
  *	vm_page_free_wakeup:
  *
- *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
- *	routine is called when a page has been added to the cache or free
- *	queues.
+ *	Helper routine for vm_page_free_toq().  This routine is called
+ *	when a page is added to the free queues.
  *
  *	The page queues must be locked.
  */
-static inline void
+static void
 vm_page_free_wakeup(void)
 {
 
@@ -2259,7 +2830,7 @@
 	 * some free.
 	 */
 	if (vm_pageout_pages_needed &&
-	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
+	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
@@ -2269,45 +2840,36 @@
 	 * lots of memory. this process will swapin processes.
 	 */
 	if (vm_pages_needed && !vm_page_count_min()) {
-		vm_pages_needed = 0;
-		wakeup(&cnt.v_free_count);
+		vm_pages_needed = false;
+		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
 /*
- *	Turn a cached page into a free page, by changing its attributes.
- *	Keep the statistics up-to-date.
+ *	vm_page_free_prep:
  *
- *	The free page queue must be locked.
- */
-static void
-vm_page_cache_turn_free(vm_page_t m)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-
-	m->object = NULL;
-	m->valid = 0;
-	/* Clear PG_CACHED and set PG_FREE. */
-	m->flags ^= PG_CACHED | PG_FREE;
-	KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
-	    ("vm_page_cache_free: page %p has inconsistent flags", m));
-	cnt.v_cache_count--;
-	vm_phys_freecnt_adj(m, 1);
-}
-
-/*
- *	vm_page_free_toq:
+ *	Prepares the given page to be put on the free list,
+ *	disassociating it from any VM object. The caller may return
+ *	the page to the free list only if this function returns true.
  *
- *	Returns the given page to the free list,
- *	disassociating it with any VM object.
- *
- *	The object must be locked.  The page must be locked if it is managed.
+ *	The object must be locked.  The page must be locked if it is
+ *	managed.  For a queued managed page, the pagequeue_locked
+ *	argument specifies whether the page queue is already locked.
  */
-void
-vm_page_free_toq(vm_page_t m)
+bool
+vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
 {
 
+#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
+	if ((m->flags & PG_ZERO) != 0) {
+		uint64_t *p;
+		int i;
+		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
+			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
+			    m, i, (uintmax_t)*p));
+	}
+#endif
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
@@ -2317,9 +2879,7 @@
 		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 	PCPU_INC(cnt.v_tfree);
 
-	if (VM_PAGE_IS_FREE(m))
-		panic("vm_page_free: freeing free page %p", m);
-	else if (vm_page_sbusied(m))
+	if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
@@ -2328,7 +2888,12 @@
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
-	vm_page_remque(m);
+	if (m->queue != PQ_NONE) {
+		if (pagequeue_locked)
+			vm_page_dequeue_locked(m);
+		else
+			vm_page_dequeue(m);
+	}
 	vm_page_remove(m);
 
 	/*
@@ -2335,9 +2900,8 @@
 	 * If fictitious remove object association and
 	 * return, otherwise delay object association removal.
 	 */
-	if ((m->flags & PG_FICTITIOUS) != 0) {
-		return;
-	}
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		return (false);
 
 	m->valid = 0;
 	vm_page_undirty(m);
@@ -2349,36 +2913,75 @@
 		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
 		m->flags |= PG_UNHOLDFREE;
-	} else {
-		/*
-		 * Restore the default memory attribute to the page.
-		 */
-		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
-			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+		return (false);
+	}
 
-		/*
-		 * Insert the page into the physical memory allocator's
-		 * cache/free page queues.
-		 */
-		mtx_lock(&vm_page_queue_free_mtx);
-		m->flags |= PG_FREE;
-		vm_phys_freecnt_adj(m, 1);
+	/*
+	 * Restore the default memory attribute to the page.
+	 */
+	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
+		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+
+	return (true);
+}
+
+/*
+ * Insert the page into the physical memory allocator's free page
+ * queues.  This is the last step to free a page.
+ */
+static void
+vm_page_free_phys(vm_page_t m)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
-		if (!vm_reserv_free_page(m))
-#else
-		if (TRUE)
+	if (!vm_reserv_free_page(m))
 #endif
 			vm_phys_free_pages(m, 0);
-		if ((m->flags & PG_ZERO) != 0)
-			++vm_page_zero_count;
-		else
-			vm_page_zero_idle_wakeup();
-		vm_page_free_wakeup();
-		mtx_unlock(&vm_page_queue_free_mtx);
-	}
+	if ((m->flags & PG_ZERO) != 0)
+		++vm_page_zero_count;
+	else
+		vm_page_zero_idle_wakeup();
 }
 
+void
+vm_page_free_phys_pglist(struct pglist *tq)
+{
+	vm_page_t m;
+
+	if (TAILQ_EMPTY(tq))
+		return;
+	mtx_lock(&vm_page_queue_free_mtx);
+	TAILQ_FOREACH(m, tq, listq)
+		vm_page_free_phys(m);
+	vm_page_free_wakeup();
+	mtx_unlock(&vm_page_queue_free_mtx);
+}
+
 /*
+ *	vm_page_free_toq:
+ *
+ *	Returns the given page to the free list, disassociating it
+ *	from any VM object.
+ *
+ *	The object must be locked.  The page must be locked if it is
+ *	managed.
+ */
+void
+vm_page_free_toq(vm_page_t m)
+{
+
+	if (!vm_page_free_prep(m, false))
+		return;
+	mtx_lock(&vm_page_queue_free_mtx);
+	vm_page_free_phys(m);
+	vm_page_free_wakeup();
+	mtx_unlock(&vm_page_queue_free_mtx);
+}
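The split between vm_page_free_prep(), vm_page_free_phys(), and the pglist variant above allows batched frees.  A minimal sketch, assuming the caller already holds the owning object's lock and each page's lock; the helper name free_page_batch() is hypothetical.

static void
free_page_batch(vm_page_t *pages, int count)
{
	struct pglist pgl;
	int i;

	TAILQ_INIT(&pgl);
	for (i = 0; i < count; i++) {
		/* Pages that cannot be freed yet are simply skipped. */
		if (vm_page_free_prep(pages[i], false))
			TAILQ_INSERT_TAIL(&pgl, pages[i], listq);
	}
	/* One acquisition of the free-queue mutex for the whole batch. */
	vm_page_free_phys_pglist(&pgl);
}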
+
+/*
  *	vm_page_wire:
  *
  *	Mark this page as wired down by yet
@@ -2410,7 +3013,7 @@
 		    m->queue == PQ_NONE,
 		    ("vm_page_wire: unmanaged page %p is queued", m));
 		vm_page_remque(m);
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 	}
 	m->wire_count++;
 	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
@@ -2419,41 +3022,43 @@
 /*
  * vm_page_unwire:
  *
- * Release one wiring of the specified page, potentially enabling it to be
- * paged again.  If paging is enabled, then the value of the parameter
- * "activate" determines to which queue the page is added.  If "activate" is
- * non-zero, then the page is added to the active queue.  Otherwise, it is
- * added to the inactive queue.
+ * Release one wiring of the specified page, potentially allowing it to be
+ * paged out.  Returns TRUE if the number of wirings transitions to zero and
+ * FALSE otherwise.
  *
- * However, unless the page belongs to an object, it is not enqueued because
- * it cannot be paged out.
+ * Only managed pages belonging to an object can be paged out.  If the number
+ * of wirings transitions to zero and the page is eligible for page out, then
+ * the page is added to the specified paging queue (unless PQ_NONE is
+ * specified).
  *
  * If a page is fictitious, then its wire count must always be one.
  *
  * A managed page must be locked.
  */
-void
-vm_page_unwire(vm_page_t m, int activate)
+boolean_t
+vm_page_unwire(vm_page_t m, uint8_t queue)
 {
 
+	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
+	    ("vm_page_unwire: invalid queue %u request for page %p",
+	    queue, m));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
-		vm_page_lock_assert(m, MA_OWNED);
+		vm_page_assert_locked(m);
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
-		return;
+		return (FALSE);
 	}
 	if (m->wire_count > 0) {
 		m->wire_count--;
 		if (m->wire_count == 0) {
-			atomic_subtract_int(&cnt.v_wire_count, 1);
-			if ((m->oflags & VPO_UNMANAGED) != 0 ||
-			    m->object == NULL)
-				return;
-			if (!activate)
-				m->flags &= ~PG_WINATCFLS;
-			vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
-		}
+			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+			if ((m->oflags & VPO_UNMANAGED) == 0 &&
+			    m->object != NULL && queue != PQ_NONE)
+				vm_page_enqueue(queue, m);
+			return (TRUE);
+		} else
+			return (FALSE);
 	} else
 		panic("vm_page_unwire: page %p's wire count is zero", m);
 }
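With the new boolean return and queue argument, a caller that drops the last wiring of a page it allocated with VM_ALLOC_NOOBJ | VM_ALLOC_WIRED can free it directly; the snippet below is only a sketch of that pattern.

	vm_page_lock(m);
	/* Drop our wiring; if it was the last one, free the page now. */
	if (vm_page_unwire(m, PQ_NONE))
		vm_page_free(m);
	vm_page_unlock(m);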
@@ -2461,25 +3066,16 @@
 /*
  * Move the specified page to the inactive queue.
  *
- * Many pages placed on the inactive queue should actually go
- * into the cache, but it is difficult to figure out which.  What
- * we do instead, if the inactive target is well met, is to put
- * clean pages at the head of the inactive queue instead of the tail.
- * This will cause them to be moved to the cache more quickly and
- * if not actively re-referenced, reclaimed more quickly.  If we just
- * stick these pages at the end of the inactive queue, heavy filesystem
- * meta-data accesses can cause an unnecessary paging load on memory bound 
- * processes.  This optimization causes one-time-use metadata to be
- * reused more quickly.
+ * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
+ * queue.  However, setting "noreuse" to TRUE will accelerate the specified
+ * page's reclamation, but it will not unmap the page from any address space.
+ * This is implemented by inserting the page near the head of the inactive
+ * queue, using a marker page to guide FIFO insertion ordering.
  *
- * Normally athead is 0 resulting in LRU operation.  athead is set
- * to 1 if we want this page to be 'as if it were placed in the cache',
- * except without unmapping it from the process address space.
- *
  * The page must be locked.
  */
 static inline void
-_vm_page_deactivate(vm_page_t m, int athead)
+_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
 {
 	struct vm_pagequeue *pq;
 	int queue;
@@ -2490,7 +3086,7 @@
 	 * Ignore if the page is already inactive, unless it is unlikely to be
 	 * reactivated.
 	 */
-	if ((queue = m->queue) == PQ_INACTIVE && !athead)
+	if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
 		return;
 	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
@@ -2501,12 +3097,12 @@
 		} else {
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
-			m->flags &= ~PG_WINATCFLS;
 			vm_pagequeue_lock(pq);
 		}
 		m->queue = PQ_INACTIVE;
-		if (athead)
-			TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
+		if (noreuse)
+			TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
+			    m, plinks.q);
 		else
 			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 		vm_pagequeue_cnt_inc(pq);
@@ -2523,165 +3119,73 @@
 vm_page_deactivate(vm_page_t m)
 {
 
-	_vm_page_deactivate(m, 0);
+	_vm_page_deactivate(m, FALSE);
 }
 
 /*
- * vm_page_try_to_cache:
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
  *
- * Returns 0 on failure, 1 on success
+ * The page must be locked.
  */
-int
-vm_page_try_to_cache(vm_page_t m)
+void
+vm_page_deactivate_noreuse(vm_page_t m)
 {
 
-	vm_page_lock_assert(m, MA_OWNED);
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
-	if (m->dirty || m->hold_count || m->wire_count ||
-	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
-		return (0);
-	pmap_remove_all(m);
-	if (m->dirty)
-		return (0);
-	vm_page_cache(m);
-	return (1);
+	_vm_page_deactivate(m, TRUE);
 }
 
 /*
- * vm_page_try_to_free()
+ * vm_page_launder
  *
- *	Attempt to free the page.  If we cannot free it, we do nothing.
- *	1 is returned on success, 0 on failure.
+ * 	Put a page in the laundry.
  */
-int
-vm_page_try_to_free(vm_page_t m)
+void
+vm_page_launder(vm_page_t m)
 {
+	int queue;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	if (m->object != NULL)
-		VM_OBJECT_ASSERT_WLOCKED(m->object);
-	if (m->dirty || m->hold_count || m->wire_count ||
-	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
-		return (0);
-	pmap_remove_all(m);
-	if (m->dirty)
-		return (0);
-	vm_page_free(m);
-	return (1);
+	vm_page_assert_locked(m);
+	if ((queue = m->queue) != PQ_LAUNDRY) {
+		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
+			if (queue != PQ_NONE)
+				vm_page_dequeue(m);
+			vm_page_enqueue(PQ_LAUNDRY, m);
+		} else
+			KASSERT(queue == PQ_NONE,
+			    ("wired page %p is queued", m));
+	}
 }
 
 /*
- * vm_page_cache
+ * vm_page_try_to_free()
  *
- * Put the specified page onto the page cache queue (if appropriate).
- *
- * The object and page must be locked.
+ *	Attempt to free the page.  If we cannot free it, we do nothing.
+ *	true is returned on success, false on failure.
  */
-void
-vm_page_cache(vm_page_t m)
+bool
+vm_page_try_to_free(vm_page_t m)
 {
-	vm_object_t object;
-	boolean_t cache_was_empty;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	object = m->object;
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
-	    m->hold_count || m->wire_count)
-		panic("vm_page_cache: attempting to cache busy page");
-	KASSERT(!pmap_page_is_mapped(m),
-	    ("vm_page_cache: page %p is mapped", m));
-	KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
-	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
-	    (object->type == OBJT_SWAP &&
-	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
-		/*
-		 * Hypothesis: A cache-elgible page belonging to a
-		 * default object or swap object but without a backing
-		 * store must be zero filled.
-		 */
-		vm_page_free(m);
-		return;
+	vm_page_assert_locked(m);
+	if (m->object != NULL)
+		VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
+	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
+		return (false);
+	if (m->object != NULL && m->object->ref_count != 0) {
+		pmap_remove_all(m);
+		if (m->dirty != 0)
+			return (false);
 	}
-	KASSERT((m->flags & PG_CACHED) == 0,
-	    ("vm_page_cache: page %p is already cached", m));
-
-	/*
-	 * Remove the page from the paging queues.
-	 */
-	vm_page_remque(m);
-
-	/*
-	 * Remove the page from the object's collection of resident
-	 * pages. 
-	 */
-	vm_radix_remove(&object->rtree, m->pindex);
-	TAILQ_REMOVE(&object->memq, m, listq);
-	object->resident_page_count--;
-
-	/*
-	 * Restore the default memory attribute to the page.
-	 */
-	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
-		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
-
-	/*
-	 * Insert the page into the object's collection of cached pages
-	 * and the physical memory allocator's cache/free page queues.
-	 */
-	m->flags &= ~PG_ZERO;
-	mtx_lock(&vm_page_queue_free_mtx);
-	cache_was_empty = vm_radix_is_empty(&object->cache);
-	if (vm_radix_insert(&object->cache, m)) {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		if (object->type == OBJT_VNODE &&
-		    object->resident_page_count == 0)
-			vdrop(object->handle);
-		m->object = NULL;
-		vm_page_free(m);
-		return;
-	}
-
-	/*
-	 * The above call to vm_radix_insert() could reclaim the one pre-
-	 * existing cached page from this object, resulting in a call to
-	 * vdrop().
-	 */
-	if (!cache_was_empty)
-		cache_was_empty = vm_radix_is_singleton(&object->cache);
-
-	m->flags |= PG_CACHED;
-	cnt.v_cache_count++;
-	PCPU_INC(cnt.v_tcached);
-#if VM_NRESERVLEVEL > 0
-	if (!vm_reserv_free_page(m)) {
-#else
-	if (TRUE) {
-#endif
-		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
-		vm_phys_free_pages(m, 0);
-	}
-	vm_page_free_wakeup();
-	mtx_unlock(&vm_page_queue_free_mtx);
-
-	/*
-	 * Increment the vnode's hold count if this is the object's only
-	 * cached page.  Decrement the vnode's hold count if this was
-	 * the object's only resident page.
-	 */
-	if (object->type == OBJT_VNODE) {
-		if (cache_was_empty && object->resident_page_count != 0)
-			vhold(object->handle);
-		else if (!cache_was_empty && object->resident_page_count == 0)
-			vdrop(object->handle);
-	}
+	vm_page_free(m);
+	return (true);
 }
 
 /*
  * vm_page_advise
  *
- * 	Deactivate or do nothing, as appropriate.  This routine is used
- * 	by madvise() and vop_stdadvise().
+ * 	Apply the specified advice to the given page.
  *
  *	The object and page must be locked.
  */
@@ -2694,20 +3198,16 @@
 	if (advice == MADV_FREE)
 		/*
 		 * Mark the page clean.  This will allow the page to be freed
-		 * up by the system.  However, such pages are often reused
-		 * quickly by malloc() so we do not do anything that would
-		 * cause a page fault if we can help it.
-		 *
-		 * Specifically, we do not try to actually free the page now
-		 * nor do we try to put it in the cache (which would cause a
-		 * page fault on reuse).
-		 *
-		 * But we do make the page as freeable as we can without
-		 * actually taking the step of unmapping it.
+		 * without first paging it out.  MADV_FREE pages are often
+		 * quickly reused by malloc(3), so we do not do anything that
+		 * would result in a page fault on a later access.
 		 */
 		vm_page_undirty(m);
-	else if (advice != MADV_DONTNEED)
+	else if (advice != MADV_DONTNEED) {
+		if (advice == MADV_WILLNEED)
+			vm_page_activate(m);
 		return;
+	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
@@ -2719,11 +3219,15 @@
 		vm_page_dirty(m);
 
 	/*
-	 * Place clean pages at the head of the inactive queue rather than the
-	 * tail, thus defeating the queue's LRU operation and ensuring that the
-	 * page will be reused quickly.
+	 * Place clean pages near the head of the inactive queue rather than
+	 * the tail, thus defeating the queue's LRU operation and ensuring that
+	 * the page will be reused quickly.  Dirty pages not already in the
+	 * laundry are moved there.
 	 */
-	_vm_page_deactivate(m, m->dirty == 0);
+	if (m->dirty == 0)
+		vm_page_deactivate_noreuse(m);
+	else
+		vm_page_launder(m);
 }
 
 /*
@@ -2742,16 +3246,23 @@
 {
 	vm_page_t m;
 	int sleep;
+	int pflags;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
+	pflags = allocflags &
+	    ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+		pflags |= VM_ALLOC_WAITFAIL;
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 		    vm_page_xbusied(m) : vm_page_busied(m);
 		if (sleep) {
+			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+				return (NULL);
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
@@ -2778,14 +3289,12 @@
 			return (m);
 		}
 	}
-	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
+	m = vm_page_alloc(object, pindex, pflags);
 	if (m == NULL) {
-		VM_OBJECT_WUNLOCK(object);
-		VM_WAIT;
-		VM_OBJECT_WLOCK(object);
+		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+			return (NULL);
 		goto retrylookup;
-	} else if (m->valid != 0)
-		return (m);
+	}
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
@@ -2792,6 +3301,114 @@
 }
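A sketch of the non-blocking behavior enabled by the VM_ALLOC_NOWAIT handling above; "obj" and "pidx" are placeholders, and the object is assumed to be write-locked.

	m = vm_page_grab(obj, pidx, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT);
	if (m == NULL) {
		/* Neither found nor allocatable without sleeping. */
	} else {
		/* The page is returned exclusive-busied. */
		vm_page_xunbusy(m);
	}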
 
 /*
+ * Return the specified range of pages from the given object.  For each
+ * page offset within the range, if a page already exists within the object
+ * at that offset and it is busy, then wait for it to change state.  If,
+ * instead, the page doesn't exist, then allocate it.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs the pages
+ *
+ * The caller must always specify that the pages are to be busied and/or
+ * wired.
+ *
+ * optional allocation flags:
+ *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
+ *	VM_ALLOC_NOBUSY		do not exclusive busy the page
+ *	VM_ALLOC_NOWAIT		do not sleep
+ *	VM_ALLOC_SBUSY		set page to sbusy state
+ *	VM_ALLOC_WIRED		wire the pages
+ *	VM_ALLOC_ZERO		zero and validate any invalid pages
+ *
+ * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
+ * may return a partial prefix of the requested range.
+ */
+int
+vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count)
+{
+	vm_page_t m, mpred;
+	int pflags;
+	int i;
+	bool sleep;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
+	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
+	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_WIRED) != 0,
+	    ("vm_page_grab_pages: the pages must be busied or wired"));
+	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+	    ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
+	if (count == 0)
+		return (0);
+	pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
+	    VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY);
+	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+		pflags |= VM_ALLOC_WAITFAIL;
+	i = 0;
+retrylookup:
+	m = vm_radix_lookup_le(&object->rtree, pindex + i);
+	if (m == NULL || m->pindex != pindex + i) {
+		mpred = m;
+		m = NULL;
+	} else
+		mpred = TAILQ_PREV(m, pglist, listq);
+	for (; i < count; i++) {
+		if (m != NULL) {
+			sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+			    vm_page_xbusied(m) : vm_page_busied(m);
+			if (sleep) {
+				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+					break;
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_aflag_set(m, PGA_REFERENCED);
+				vm_page_lock(m);
+				VM_OBJECT_WUNLOCK(object);
+				vm_page_busy_sleep(m, "grbmaw", (allocflags &
+				    VM_ALLOC_IGN_SBUSY) != 0);
+				VM_OBJECT_WLOCK(object);
+				goto retrylookup;
+			}
+			if ((allocflags & VM_ALLOC_WIRED) != 0) {
+				vm_page_lock(m);
+				vm_page_wire(m);
+				vm_page_unlock(m);
+			}
+			if ((allocflags & (VM_ALLOC_NOBUSY |
+			    VM_ALLOC_SBUSY)) == 0)
+				vm_page_xbusy(m);
+			if ((allocflags & VM_ALLOC_SBUSY) != 0)
+				vm_page_sbusy(m);
+		} else {
+			m = vm_page_alloc_after(object, pindex + i,
+			    pflags | VM_ALLOC_COUNT(count - i), mpred);
+			if (m == NULL) {
+				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+					break;
+				goto retrylookup;
+			}
+		}
+		if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
+			if ((m->flags & PG_ZERO) == 0)
+				pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+		}
+		ma[i] = mpred = m;
+		m = vm_page_next(m);
+	}
+	return (i);
+}
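A sketch of a caller of vm_page_grab_pages(); the object "obj", starting index "pidx", and array size are placeholders.

	vm_page_t ma[8];
	int i, got;

	VM_OBJECT_WLOCK(obj);
	/* Pages come back wired, zeroed if invalid, and exclusive-busied. */
	got = vm_page_grab_pages(obj, pidx,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO, ma, nitems(ma));
	for (i = 0; i < got; i++)
		vm_page_xunbusy(ma[i]);
	VM_OBJECT_WUNLOCK(obj);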
+
+/*
  * Mapping function for valid or dirty bits in a page.
  *
  * Inputs are required to range within a page.
@@ -2841,17 +3458,17 @@
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
-	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
-	 * If the ending offset is not DEV_BSIZE aligned and the 
+	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
-	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
@@ -2858,7 +3475,7 @@
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
-	 * is already dirty. 
+	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
@@ -2948,17 +3565,17 @@
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
-	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
-	 * If the ending offset is not DEV_BSIZE aligned and the 
+	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
-	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
@@ -3050,12 +3667,12 @@
 /*
  * vm_page_zero_invalid()
  *
- *	The kernel assumes that the invalid portions of a page contain 
+ *	The kernel assumes that the invalid portions of a page contain
  *	garbage, but such pages can be mapped into memory by user code.
  *	When this occurs, we must zero out the non-valid portions of the
  *	page so user code sees what it expects.
  *
- *	Pages are most often semi-valid when the end of a file is mapped 
+ *	Pages are most often semi-valid when the end of a file is mapped
  *	into memory and the file's size is not page aligned.
  */
 void
@@ -3072,10 +3689,10 @@
 	 * vm_page_set_validclean().
 	 */
 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
-		if (i == (PAGE_SIZE / DEV_BSIZE) || 
+		if (i == (PAGE_SIZE / DEV_BSIZE) ||
 		    (m->valid & ((vm_page_bits_t)1 << i))) {
 			if (i > b) {
-				pmap_zero_page_area(m, 
+				pmap_zero_page_area(m,
 				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 			}
 			b = i + 1;
@@ -3109,16 +3726,19 @@
 }
 
 /*
- *	vm_page_ps_is_valid:
- *
- *	Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ * Returns true if all of the specified predicates are true for the entire
+ * (super)page and false otherwise.
  */
-boolean_t
-vm_page_ps_is_valid(vm_page_t m)
+bool
+vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
 {
+	vm_object_t object;
 	int i, npages;
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
+	object = m->object;
+	if (skip_m != NULL && skip_m->object != object)
+		return (false);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	npages = atop(pagesizes[m->psind]);
 
 	/*
@@ -3127,10 +3747,28 @@
 	 * occupy adjacent entries in vm_page_array[].
 	 */
 	for (i = 0; i < npages; i++) {
-		if (m[i].valid != VM_PAGE_BITS_ALL)
-			return (FALSE);
+		/* Always test object consistency, including "skip_m". */
+		if (m[i].object != object)
+			return (false);
+		if (&m[i] == skip_m)
+			continue;
+		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
+			return (false);
+		if ((flags & PS_ALL_DIRTY) != 0) {
+			/*
+			 * Calling vm_page_test_dirty() or pmap_is_modified()
+			 * might stop this case from spuriously returning
+			 * "false".  However, that would require a write lock
+			 * on the object containing "m[i]".
+			 */
+			if (m[i].dirty != VM_PAGE_BITS_ALL)
+				return (false);
+		}
+		if ((flags & PS_ALL_VALID) != 0 &&
+		    m[i].valid != VM_PAGE_BITS_ALL)
+			return (false);
 	}
-	return (TRUE);
+	return (true);
 }
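A sketch of how the generalized predicate test might be used; it assumes the object containing "m" is locked, as the function asserts.

	/*
	 * Before skipping per-page work on a superpage mapping, verify
	 * that every base page is fully valid and none is busied.
	 */
	if (vm_page_ps_test(m, PS_ALL_VALID | PS_NONE_BUSY, NULL)) {
		/* All constituent pages are valid and unbusied. */
	}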
 
 /*
@@ -3224,16 +3862,16 @@
 
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
-	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
-	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
-	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
-	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
-	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
-	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
-	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
-	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
-	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
-	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
+
+	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
+	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
+	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
+	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
+	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
+	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
+	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
+	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
+	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
@@ -3240,17 +3878,16 @@
 {
 	int dom;
 
-	db_printf("pq_free %d pq_cache %d\n",
-	    cnt.v_free_count, cnt.v_cache_count);
+	db_printf("pq_free %d\n", vm_cnt.v_free_count);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
-	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+	    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
-		    vm_dom[dom].vmd_pass);
+		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
 	}
 }
 
@@ -3257,7 +3894,7 @@
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
-	boolean_t phys;
+	boolean_t phys, virt;
 
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
@@ -3265,7 +3902,10 @@
 	}
 
 	phys = strchr(modif, 'p') != NULL;
-	if (phys)
+	virt = strchr(modif, 'v') != NULL;
+	if (virt)
+		m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
+	else if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;

Modified: trunk/sys/vm/vm_page.h
===================================================================
--- trunk/sys/vm/vm_page.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_page.h 332505 2018-04-14 17:41:54Z kib $
  */
 
 /*
@@ -142,7 +142,7 @@
 	vm_object_t object;		/* which object am I in (O,P) */
 	vm_pindex_t pindex;		/* offset into object (O,P) */
 	vm_paddr_t phys_addr;		/* physical address of page */
-	struct md_page md;		/* machine dependant stuff */
+	struct md_page md;		/* machine dependent stuff */
 	u_int wire_count;		/* wired down maps refs (P) */
 	volatile u_int busy_lock;	/* busy owners lock */
 	uint16_t hold_count;		/* page hold count (P) */
@@ -150,6 +150,7 @@
 	uint8_t aflags;			/* access is atomic */
 	uint8_t oflags;			/* page VPO_* flags (O) */
 	uint8_t	queue;			/* page queue index (P,Q) */
+	int8_t psind;			/* pagesizes[] index (O) */
 	int8_t segind;
 	uint8_t	order;			/* index of the buddy queue */
 	uint8_t pool;
@@ -158,7 +159,6 @@
 	/* so, on normal X86 kernels, they must be at least 8 bits wide */
 	vm_page_bits_t valid;		/* map of valid DEV_BSIZE chunks (O) */
 	vm_page_bits_t dirty;		/* map of dirty DEV_BSIZE chunks (M) */
-	int8_t psind;			/* pagesizes[] index (O) */
 };
 
 /*
@@ -207,9 +207,13 @@
 #define	PQ_NONE		255
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
-#define	PQ_COUNT	2
+#define	PQ_LAUNDRY	2
+#define	PQ_COUNT	3
 
+#ifndef VM_PAGE_HAVE_PGLIST
 TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
 SLIST_HEAD(spglist, vm_page);
 
 struct vm_pagequeue {
@@ -227,10 +231,11 @@
 	u_int vmd_free_count;
 	long vmd_segs;	/* bitmask of the segments */
 	boolean_t vmd_oom;
-	int vmd_pass;	/* local pagedaemon pass */
 	int vmd_oom_seq;
 	int vmd_last_active_scan;
+	struct vm_page vmd_laundry_marker;
 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
+	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
 };
 
 extern struct vm_domain vm_dom[MAXMEMDOM];
@@ -237,6 +242,7 @@
 
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
+#define	vm_pagequeue_lockptr(pq)	(&(pq)->pq_mutex)
 #define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
 #ifdef _KERNEL
@@ -324,12 +330,9 @@
  * Page flags.  If changed at any other time than page allocation or
  * freeing, the modification must be protected by the vm_page lock.
  */
-#define	PG_CACHED	0x0001		/* page is cached */
-#define	PG_FREE		0x0002		/* page is free */
 #define	PG_FICTITIOUS	0x0004		/* physical page doesn't exist */
 #define	PG_ZERO		0x0008		/* page is zeroed */
 #define	PG_MARKER	0x0010		/* special queue marker page */
-#define	PG_WINATCFLS	0x0040		/* flush dirty page on inactive q */
 #define	PG_NODUMP	0x0080		/* don't include this page in a dump */
 #define	PG_UNHOLDFREE	0x0100		/* delayed free of a held page */
 
@@ -353,19 +356,16 @@
  *	free
  *		Available for allocation now.
  *
- *	cache
- *		Almost available for allocation. Still associated with
- *		an object, but clean and immediately freeable.
- *
- * The following lists are LRU sorted:
- *
  *	inactive
  *		Low activity, candidates for reclamation.
+ *		This list is approximately LRU ordered.
+ *
+ *	laundry
  *		This is the list of pages that should be
  *		paged out next.
  *
  *	active
- *		Pages that are "active" i.e. they have been
+ *		Pages that are "active", i.e., they have been
  *		recently referenced.
  *
  */
@@ -376,28 +376,51 @@
 extern long vm_page_array_size;		/* number of vm_page_t's */
 extern long first_page;			/* first physical page number */
 
-#define	VM_PAGE_IS_FREE(m)	(((m)->flags & PG_FREE) != 0)
-
 #define VM_PAGE_TO_PHYS(entry)	((entry)->phys_addr)
 
+/*
+ * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory
+ * page to which the given physical address belongs. The correct vm_page_t
+ * object is returned for addresses that are not page-aligned.
+ */
 vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
 
-/* page allocation classes: */
+/*
+ * Page allocation parameters for vm_page for the functions
+ * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and
+ * vm_page_alloc_freelist().  Some functions support only a subset
+ * of the flags, and ignore others; see the flags legend.
+ *
+ * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*()
+ * and the vm_page_grab*() functions.  See these functions for details.
+ *
+ * Bits 0 - 1 define class.
+ * Bits 2 - 15 dedicated for flags.
+ * Legend:
+ * (a) - vm_page_alloc() supports the flag.
+ * (c) - vm_page_alloc_contig() supports the flag.
+ * (f) - vm_page_alloc_freelist() supports the flag.
+ * (g) - vm_page_grab() supports the flag.
+ * (p) - vm_page_grab_pages() supports the flag.
+ * Bits above 15 define the count of additional pages that the caller
+ * intends to allocate.
+ */
 #define VM_ALLOC_NORMAL		0
 #define VM_ALLOC_INTERRUPT	1
 #define VM_ALLOC_SYSTEM		2
 #define	VM_ALLOC_CLASS_MASK	3
-/* page allocation flags: */
-#define	VM_ALLOC_WIRED		0x0020	/* non pageable */
-#define	VM_ALLOC_ZERO		0x0040	/* Try to obtain a zeroed page */
-#define	VM_ALLOC_NOOBJ		0x0100	/* No associated object */
-#define	VM_ALLOC_NOBUSY		0x0200	/* Do not busy the page */
-#define	VM_ALLOC_IFCACHED	0x0400	/* Fail if the page is not cached */
-#define	VM_ALLOC_IFNOTCACHED	0x0800	/* Fail if the page is cached */
-#define	VM_ALLOC_IGN_SBUSY	0x1000	/* vm_page_grab() only */
-#define	VM_ALLOC_NODUMP		0x2000	/* don't include in dump */
-#define	VM_ALLOC_SBUSY		0x4000	/* Shared busy the page */
-
+#define	VM_ALLOC_WAITOK		0x0008	/* (acf) Sleep and retry */
+#define	VM_ALLOC_WAITFAIL	0x0010	/* (acf) Sleep and return error */
+#define	VM_ALLOC_WIRED		0x0020	/* (acfgp) Allocate a wired page */
+#define	VM_ALLOC_ZERO		0x0040	/* (acfgp) Allocate a prezeroed page */
+#define	VM_ALLOC_NOOBJ		0x0100	/* (acg) No associated object */
+#define	VM_ALLOC_NOBUSY		0x0200	/* (acgp) Do not excl busy the page */
+#define	VM_ALLOC_IFCACHED	0x0400
+#define	VM_ALLOC_IFNOTCACHED	0x0800
+#define	VM_ALLOC_IGN_SBUSY	0x1000	/* (gp) Ignore shared busy flag */
+#define	VM_ALLOC_NODUMP		0x2000	/* (ag) don't include in dump */
+#define	VM_ALLOC_SBUSY		0x4000	/* (acgp) Shared busy the page */
+#define	VM_ALLOC_NOWAIT		0x8000	/* (acfgp) Do not sleep */
 #define	VM_ALLOC_COUNT_SHIFT	16
 #define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)
 
@@ -416,10 +439,26 @@
 		pflags |= VM_ALLOC_ZERO;
 	if ((malloc_flags & M_NODUMP) != 0)
 		pflags |= VM_ALLOC_NODUMP;
+	if ((malloc_flags & M_NOWAIT))
+		pflags |= VM_ALLOC_NOWAIT;
+	if ((malloc_flags & M_WAITOK))
+		pflags |= VM_ALLOC_WAITOK;
 	return (pflags);
 }
 #endif
 
+/*
+ * Predicates supported by vm_page_ps_test():
+ *
+ *	PS_ALL_DIRTY is true only if the entire (super)page is dirty.
+ *	However, it can be spuriously false when the (super)page has become
+ *	dirty in the pmap but that information has not been propagated to the
+ *	machine-independent layer.
+ */
+#define	PS_ALL_DIRTY	0x1
+#define	PS_ALL_VALID	0x2
+#define	PS_NONE_BUSY	0x4
+
 void vm_page_busy_downgrade(vm_page_t m);
 void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
 void vm_page_flash(vm_page_t m);
@@ -430,33 +469,38 @@
 
 void vm_page_activate (vm_page_t);
 void vm_page_advise(vm_page_t m, int advice);
-vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t);
 vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr);
 vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
+void vm_page_change_lock(vm_page_t m, struct mtx **mtx);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
-void vm_page_cache(vm_page_t);
-void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
-int vm_page_try_to_cache (vm_page_t);
-int vm_page_try_to_free (vm_page_t);
+int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count);
 void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
+void vm_page_free_phys_pglist(struct pglist *tq);
+bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
-boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
+void vm_page_launder(vm_page_t m);
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
 int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
 struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
 vm_page_t vm_page_prev(vm_page_t m);
-boolean_t vm_page_ps_is_valid(vm_page_t m);
+bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
+bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
+    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
 int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
@@ -465,16 +509,20 @@
 void vm_page_requeue(vm_page_t m);
 void vm_page_requeue_locked(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
+vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
+    vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+bool vm_page_try_to_free(vm_page_t m);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
-void vm_page_unwire (vm_page_t, int);
+boolean_t vm_page_unwire(vm_page_t m, uint8_t queue);
 void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_wire (vm_page_t);
 void vm_page_xunbusy_hard(vm_page_t m);
+void vm_page_xunbusy_maybelocked(vm_page_t m);
 void vm_page_set_validclean (vm_page_t, int, int);
 void vm_page_clear_dirty (vm_page_t, int, int);
 void vm_page_set_invalid (vm_page_t, int, int);
@@ -497,17 +545,17 @@
 #define	vm_page_assert_sbusied(m)					\
 	KASSERT(vm_page_sbusied(m),					\
 	    ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_assert_unbusied(m)					\
 	KASSERT(!vm_page_busied(m),					\
 	    ("vm_page_assert_unbusied: page %p busy @ %s:%d",		\
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_assert_xbusied(m)					\
 	KASSERT(vm_page_xbusied(m),					\
 	    ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_busied(m)						\
 	((m)->busy_lock != VPB_UNBUSIED)
@@ -514,22 +562,24 @@
 
 #define	vm_page_sbusy(m) do {						\
 	if (!vm_page_trysbusy(m))					\
-		panic("%s: page %p failed shared busing", __func__, m);	\
+		panic("%s: page %p failed shared busying", __func__,	\
+		    (m));						\
 } while (0)
 
 #define	vm_page_tryxbusy(m)						\
-	(atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,		\
+	(atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED,		\
 	    VPB_SINGLE_EXCLUSIVER))
 
 #define	vm_page_xbusied(m)						\
-	((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
+	(((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
 
 #define	vm_page_xbusy(m) do {						\
 	if (!vm_page_tryxbusy(m))					\
-		panic("%s: page %p failed exclusive busing", __func__,	\
-		    m);							\
+		panic("%s: page %p failed exclusive busying", __func__,	\
+		    (m));						\
 } while (0)
 
+/* Note: page m's lock must not be owned by the caller. */
 #define	vm_page_xunbusy(m) do {						\
 	if (!atomic_cmpset_rel_int(&(m)->busy_lock,			\
 	    VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED))			\
@@ -660,5 +710,41 @@
 	m->dirty = 0;
 }
 
+static inline void
+vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
+    vm_page_t mold)
+{
+	vm_page_t mret;
+
+	mret = vm_page_replace(mnew, object, pindex);
+	KASSERT(mret == mold,
+	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));
+
+	/* Unused if !INVARIANTS. */
+	(void)mold;
+	(void)mret;
+}
+
+static inline bool
+vm_page_active(vm_page_t m)
+{
+
+	return (m->queue == PQ_ACTIVE);
+}
+
+static inline bool
+vm_page_inactive(vm_page_t m)
+{
+
+	return (m->queue == PQ_INACTIVE);
+}
+
+static inline bool
+vm_page_in_laundry(vm_page_t m)
+{
+
+	return (m->queue == PQ_LAUNDRY);
+}
+
 #endif				/* _KERNEL */
 #endif				/* !_VM_PAGE_ */

Modified: trunk/sys/vm/vm_pageout.c
===================================================================
--- trunk/sys/vm/vm_pageout.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -74,10 +74,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pageout.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include "opt_vm.h"
-#include "opt_kdtrace.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -120,8 +120,9 @@
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static void vm_pageout_init(void);
-static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static int vm_pageout_clean(vm_page_t m, int *numpagedout);
+static int vm_pageout_cluster(vm_page_t m);
+static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
@@ -139,82 +140,49 @@
     &page_kp);
 
 SDT_PROVIDER_DEFINE(vm);
-SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
-#if !defined(NO_SWAPPING)
-/* the kernel process "vm_daemon"*/
-static void vm_daemon(void);
-static struct	proc *vmproc;
+/* Pagedaemon activity rates, in subdivisions of one second. */
+#define	VM_LAUNDER_RATE		10
+#define	VM_INACT_SCAN_RATE	2
 
-static struct kproc_desc vm_kp = {
-	"vmdaemon",
-	vm_daemon,
-	&vmproc
-};
-SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
-#endif
-
-
-int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
-int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
-int vm_pageout_wakeup_thresh;
+u_int vm_pageout_wakeup_thresh;
 static int vm_pageout_oom_seq = 12;
+bool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
+bool vm_pages_needed;		/* Are threads waiting for free pages? */
 
-#if !defined(NO_SWAPPING)
-static int vm_pageout_req_swapout;	/* XXX */
-static int vm_daemon_needed;
-static struct mtx vm_daemon_mtx;
-/* Allow for use by vm_pageout before vm_daemon is initialized. */
-MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
-#endif
-static int vm_max_launder = 32;
+/* Pending request for dirty page laundering. */
+static enum {
+	VM_LAUNDRY_IDLE,
+	VM_LAUNDRY_BACKGROUND,
+	VM_LAUNDRY_SHORTFALL
+} vm_laundry_request = VM_LAUNDRY_IDLE;
+
 static int vm_pageout_update_period;
-static int defer_swap_pageouts;
 static int disable_swap_pageouts;
 static int lowmem_period = 10;
 static time_t lowmem_uptime;
 
-#if defined(NO_SWAPPING)
-static int vm_swap_enabled = 0;
-static int vm_swap_idle_enabled = 0;
-#else
-static int vm_swap_enabled = 1;
-static int vm_swap_idle_enabled = 0;
-#endif
+static int vm_panic_on_oom = 0;
 
+SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
+	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
+	"panic on out of memory instead of killing the largest process");
+
 SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
-	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+	CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
 	"free page threshold for waking up the pageout daemon");
 
-SYSCTL_INT(_vm, OID_AUTO, max_launder,
-	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
-
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
-	CTLFLAG_RW, &vm_pageout_update_period, 0,
+	CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
 	"Maximum active LRU update period");
   
-SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
 	"Low memory callback period");
 
-#if defined(NO_SWAPPING)
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
-	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
-	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#else
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
-	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
-	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#endif
-
-SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
-	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
-
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
-	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
+	CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
@@ -221,24 +189,39 @@
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
-	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+	CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
 	"back-to-back calls to oom detector to start OOM");
 
-#define VM_PAGEOUT_PAGE_COUNT 16
-int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
+static int act_scan_laundry_weight = 3;
+SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
+    &act_scan_laundry_weight, 0,
+    "weight given to clean vs. dirty pages in active queue scans");
 
+static u_int vm_background_launder_target;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
+    &vm_background_launder_target, 0,
+    "background laundering target, in pages");
+
+static u_int vm_background_launder_rate = 4096;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
+    &vm_background_launder_rate, 0,
+    "background laundering rate, in kilobytes per second");
+
+static u_int vm_background_launder_max = 20 * 1024;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
+    &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
+
+int vm_pageout_page_count = 32;
+
 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
 SYSCTL_INT(_vm, OID_AUTO, max_wired,
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
+static u_int isqrt(u_int num);
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
-    vm_paddr_t);
-#if !defined(NO_SWAPPING)
-static void vm_pageout_map_deactivate_pages(vm_map_t, long);
-static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(int req);
-#endif
+static int vm_pageout_launder(struct vm_domain *vmd, int launder,
+    bool in_shortfall);
+static void vm_pageout_laundry_worker(void *arg);
 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
 
 /*
@@ -352,41 +335,30 @@
 }
 
 /*
- * vm_pageout_clean:
- *
- * Clean the page and remove it from the laundry.
- * 
- * We set the busy bit to cause potential page faults on this page to
- * block.  Note the careful timing, however, the busy bit isn't set till
- * late and we cannot do anything that will mess with the page.
+ * Scan for pages at adjacent offsets within the given page's object that are
+ * eligible for laundering, form a cluster of these pages and the given page,
+ * and launder that cluster.
  */
 static int
-vm_pageout_clean(vm_page_t m)
+vm_pageout_cluster(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
-	int pageout_count;
-	int ib, is, page_base;
-	vm_pindex_t pindex = m->pindex;
+	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
+	vm_pindex_t pindex;
+	int ib, is, page_base, pageout_count;
 
-	vm_page_lock_assert(m, MA_OWNED);
+	vm_page_assert_locked(m);
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	pindex = m->pindex;
 
 	/*
-	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
-	 * with the new swapper, but we could have serious problems paging
-	 * out other object types if there is insufficient memory.  
-	 *
-	 * Unfortunately, checking free memory here is far too late, so the
-	 * check has been moved up a procedural level.
+	 * We can't clean the page if it is busy or held.
 	 */
+	vm_page_assert_unbusied(m);
+	KASSERT(m->hold_count == 0, ("page %p is held", m));
 
-	/*
-	 * Can't clean the page if it's busy or held.
-	 */
-	vm_page_assert_unbusied(m);
-	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
+	pmap_remove_write(m);
 	vm_page_unlock(m);
 
 	mc[vm_pageout_page_count] = pb = ps = m;
@@ -396,33 +368,23 @@
 	is = 1;
 
 	/*
-	 * Scan object for clusterable pages.
+	 * We can cluster only if the page is not clean, busy, or held, and
+	 * the page is in the laundry queue.
 	 *
-	 * We can cluster ONLY if: ->> the page is NOT
-	 * clean, wired, busy, held, or mapped into a
-	 * buffer, and one of the following:
-	 * 1) The page is inactive, or a seldom used
-	 *    active page.
-	 * -or-
-	 * 2) we force the issue.
-	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
-	 * due to flushing pages out of order and not trying
-	 * align the clusters (which leave sporatic out-of-order
+	 * due to flushing pages out of order and not trying to
+	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a 
 	 * forward scan if room remains.
 	 */
 more:
-	while (ib && pageout_count < vm_pageout_page_count) {
-		vm_page_t p;
-
+	while (ib != 0 && pageout_count < vm_pageout_page_count) {
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
-
 		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
 			ib = 0;
 			break;
@@ -433,28 +395,27 @@
 			break;
 		}
 		vm_page_lock(p);
-		if (p->queue != PQ_INACTIVE ||
+		if (!vm_page_in_laundry(p) ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			ib = 0;
 			break;
 		}
+		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[--page_base] = pb = p;
 		++pageout_count;
 		++ib;
+
 		/*
-		 * alignment boundry, stop here and switch directions.  Do
-		 * not clear ib.
+		 * We are at an alignment boundary.  Stop here, and switch
+		 * directions.  Do not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
-
 	while (pageout_count < vm_pageout_page_count && 
 	    pindex + is < object->size) {
-		vm_page_t p;
-
 		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
 			break;
 		vm_page_test_dirty(p);
@@ -461,11 +422,12 @@
 		if (p->dirty == 0)
 			break;
 		vm_page_lock(p);
-		if (p->queue != PQ_INACTIVE ||
+		if (!vm_page_in_laundry(p) ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			break;
 		}
+		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[page_base + pageout_count] = ps = p;
 		++pageout_count;
@@ -474,17 +436,14 @@
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
-	 * when possible, even past a page boundry.  This catches boundry
-	 * conditions.
+	 * when possible, even past an alignment boundary.  This catches
+	 * boundary conditions.
 	 */
-	if (ib && pageout_count < vm_pageout_page_count)
+	if (ib != 0 && pageout_count < vm_pageout_page_count)
 		goto more;
 
-	/*
-	 * we allow reads during pageouts...
-	 */
-	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
-	    NULL));
+	return (vm_pageout_flush(&mc[page_base], pageout_count,
+	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
 }
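As an illustration of the clustering above (numbers assumed, not taken from
the patch): with vm_pageout_page_count = 32 and a dirty page at pindex 70, the
reverse scan adds eligible laundry pages at pindexes 69 down to 64 and then
stops at the 32-page alignment boundary, where (pindex - (ib - 1)) % 32 == 0.
The forward scan then extends the cluster through pindexes 71, 72, ... until
it reaches 32 pages, the end of the object, or a page that is clean, busy,
held, or not in the laundry queue.  Aligning the start of the cluster keeps
the resulting writes close to sequential in the backing file.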
 
 /*
@@ -513,8 +472,8 @@
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
-	 * Initiate I/O.  Bump the vm_page_t->busy counter and
-	 * mark the pages read-only.
+	 * Initiate I/O.  Mark the pages busy and verify that they're valid
+	 * and read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
@@ -526,8 +485,9 @@
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
+		KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
+		    ("vm_pageout_flush: writeable page %p", mc[i]));
 		vm_page_sbusy(mc[i]);
-		pmap_remove_write(mc[i]);
 	}
 	vm_object_pip_add(object, count);
 
@@ -544,23 +504,33 @@
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
+			vm_page_lock(mt);
+			if (vm_page_in_laundry(mt))
+				vm_page_deactivate_noreuse(mt);
+			vm_page_unlock(mt);
+			/* FALLTHROUGH */
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
-			 * Page outside of range of object. Right now we
-			 * essentially lose the changes by pretending it
-			 * worked.
+			 * The page is outside the object's range.  We pretend
+			 * that the page out worked and clean the page, so the
+			 * changes will be lost if the page is reclaimed by
+			 * the page daemon.
 			 */
 			vm_page_undirty(mt);
+			vm_page_lock(mt);
+			if (vm_page_in_laundry(mt))
+				vm_page_deactivate_noreuse(mt);
+			vm_page_unlock(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
-			 * If page couldn't be paged out, then reactivate the
-			 * page so it doesn't clog the inactive list.  (We
-			 * will try paging out it again later).
+			 * If the page couldn't be paged out, then reactivate
+			 * it so that it doesn't clog the laundry and inactive
+			 * queues.  (We will try paging it out again later).
 			 */
 			vm_page_lock(mt);
 			vm_page_activate(mt);
@@ -583,11 +553,6 @@
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_sunbusy(mt);
-			if (vm_page_count_severe()) {
-				vm_page_lock(mt);
-				vm_page_try_to_cache(mt);
-				vm_page_unlock(mt);
-			}
 		}
 	}
 	if (prunlen != NULL)
@@ -595,24 +560,172 @@
 	return (numpagedout);
 }
 
-static boolean_t
-vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
-    vm_paddr_t high)
+/*
+ * Attempt to acquire all of the necessary locks to launder a page and
+ * then call through the clustering layer to PUTPAGES.  Wait a short
+ * time for a vnode lock.
+ *
+ * Requires the page and object lock on entry, releases both before return.
+ * Returns 0 on success and an errno otherwise.
+ */
+static int
+vm_pageout_clean(vm_page_t m, int *numpagedout)
 {
+	struct vnode *vp;
 	struct mount *mp;
-	struct vnode *vp;
 	vm_object_t object;
-	vm_paddr_t pa;
-	vm_page_t m, m_tmp, next;
-	int lockmode;
+	vm_pindex_t pindex;
+	int error, lockmode;
 
+	vm_page_assert_locked(m);
+	object = m->object;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	error = 0;
+	vp = NULL;
+	mp = NULL;
+
+	/*
+	 * The object is already known NOT to be dead.   It
+	 * is possible for the vget() to block the whole
+	 * pageout daemon, but the new low-memory handling
+	 * code should prevent it.
+	 *
+	 * We can't wait forever for the vnode lock, we might
+	 * deadlock due to a vn_read() getting stuck in
+	 * vm_wait while holding this vnode.  We skip the 
+	 * vnode if we can't get it in a reasonable amount
+	 * of time.
+	 */
+	if (object->type == OBJT_VNODE) {
+		vm_page_unlock(m);
+		vp = object->handle;
+		if (vp->v_type == VREG &&
+		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+			mp = NULL;
+			error = EDEADLK;
+			goto unlock_all;
+		}
+		KASSERT(mp != NULL,
+		    ("vp %p with NULL v_mount", vp));
+		vm_object_reference_locked(object);
+		pindex = m->pindex;
+		VM_OBJECT_WUNLOCK(object);
+		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+		    LK_SHARED : LK_EXCLUSIVE;
+		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
+			vp = NULL;
+			error = EDEADLK;
+			goto unlock_mp;
+		}
+		VM_OBJECT_WLOCK(object);
+
+		/*
+		 * Ensure that the object and vnode were not disassociated
+		 * while locks were dropped.
+		 */
+		if (vp->v_object != object) {
+			error = ENOENT;
+			goto unlock_all;
+		}
+		vm_page_lock(m);
+
+		/*
+		 * While the object and page were unlocked, the page
+		 * may have been:
+		 * (1) moved to a different queue,
+		 * (2) reallocated to a different object,
+		 * (3) reallocated to a different offset, or
+		 * (4) cleaned.
+		 */
+		if (!vm_page_in_laundry(m) || m->object != object ||
+		    m->pindex != pindex || m->dirty == 0) {
+			vm_page_unlock(m);
+			error = ENXIO;
+			goto unlock_all;
+		}
+
+		/*
+		 * The page may have been busied or held while the object
+		 * and page locks were released.
+		 */
+		if (vm_page_busied(m) || m->hold_count != 0) {
+			vm_page_unlock(m);
+			error = EBUSY;
+			goto unlock_all;
+		}
+	}
+
+	/*
+	 * If a page is dirty, then it is either being washed
+	 * (but not yet cleaned) or it is still in the
+	 * laundry.  If it is still in the laundry, then we
+	 * start the cleaning operation. 
+	 */
+	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
+		error = EIO;
+
+unlock_all:
+	VM_OBJECT_WUNLOCK(object);
+
+unlock_mp:
+	vm_page_lock_assert(m, MA_NOTOWNED);
+	if (mp != NULL) {
+		if (vp != NULL)
+			vput(vp);
+		vm_object_deallocate(object);
+		vn_finished_write(mp);
+	}
+
+	return (error);
+}
+
+/*
+ * Attempt to launder the specified number of pages.
+ *
+ * Returns the number of pages successfully laundered.
+ */
+static int
+vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
+{
+	struct vm_pagequeue *pq;
+	vm_object_t object;
+	vm_page_t m, next;
+	int act_delta, error, maxscan, numpagedout, starting_target;
+	int vnodes_skipped;
+	bool pageout_ok, queue_locked;
+
+	starting_target = launder;
+	vnodes_skipped = 0;
+
+	/*
+	 * Scan the laundry queue for pages eligible to be laundered.  We stop
+	 * once the target number of dirty pages have been laundered, or once
+	 * we've reached the end of the queue.  A single iteration of this loop
+	 * may cause more than one page to be laundered because of clustering.
+	 *
+	 * maxscan ensures that we don't re-examine requeued pages.  Any
+	 * additional pages written as part of a cluster are subtracted from
+	 * maxscan since they must be taken from the laundry queue.
+	 */
+	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+	maxscan = pq->pq_cnt;
+
 	vm_pagequeue_lock(pq);
-	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
+	queue_locked = true;
+	for (m = TAILQ_FIRST(&pq->pq_pl);
+	    m != NULL && maxscan-- > 0 && launder > 0;
+	    m = next) {
+		vm_pagequeue_assert_locked(pq);
+		KASSERT(queue_locked, ("unlocked laundry queue"));
+		KASSERT(vm_page_in_laundry(m),
+		    ("page %p has an inconsistent queue", m));
+		next = TAILQ_NEXT(m, plinks.q);
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
-		pa = VM_PAGE_TO_PHYS(m);
-		if (pa < low || pa + PAGE_SIZE > high)
-			continue;
+		KASSERT((m->flags & PG_FICTITIOUS) == 0,
+		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
 			vm_page_unlock(m);
 			continue;
@@ -621,326 +734,341 @@
 		if ((!VM_OBJECT_TRYWLOCK(object) &&
 		    (!vm_pageout_fallback_object_lock(m, &next) ||
 		    m->hold_count != 0)) || vm_page_busied(m)) {
+			VM_OBJECT_WUNLOCK(object);
 			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
 			continue;
 		}
-		vm_page_test_dirty(m);
-		if (m->dirty == 0 && object->ref_count != 0)
-			pmap_remove_all(m);
-		if (m->dirty != 0) {
-			vm_page_unlock(m);
-			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
-				VM_OBJECT_WUNLOCK(object);
-				continue;
-			}
-			if (object->type == OBJT_VNODE) {
-				vm_pagequeue_unlock(pq);
-				vp = object->handle;
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				(void)vn_start_write(vp, &mp, V_WAIT);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				vn_lock(vp, lockmode | LK_RETRY);
-				VM_OBJECT_WLOCK(object);
-				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-				VM_OBJECT_WUNLOCK(object);
-				VOP_UNLOCK(vp, 0);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
-				return (TRUE);
-			} else if (object->type == OBJT_SWAP ||
-			    object->type == OBJT_DEFAULT) {
-				vm_pagequeue_unlock(pq);
-				m_tmp = m;
-				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
-				    0, NULL, NULL);
-				VM_OBJECT_WUNLOCK(object);
-				return (TRUE);
-			}
-		} else {
-			/*
-			 * Dequeue here to prevent lock recursion in
-			 * vm_page_cache().
-			 */
-			vm_page_dequeue_locked(m);
-			vm_page_cache(m);
-			vm_page_unlock(m);
+
+		/*
+		 * Unlock the laundry queue, invalidating the 'next' pointer.
+		 * Use a marker to remember our place in the laundry queue.
+		 */
+		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
+		    plinks.q);
+		vm_pagequeue_unlock(pq);
+		queue_locked = false;
+
+		/*
+		 * Invalid pages can be easily freed.  They cannot be
+		 * mapped; vm_page_free() asserts this.
+		 */
+		if (m->valid == 0)
+			goto free_page;
+
+		/*
+		 * If the page has been referenced and the object is not dead,
+		 * reactivate or requeue the page depending on whether the
+		 * object is mapped.
+		 */
+		if ((m->aflags & PGA_REFERENCED) != 0) {
+			vm_page_aflag_clear(m, PGA_REFERENCED);
+			act_delta = 1;
+		} else
+			act_delta = 0;
+		if (object->ref_count != 0)
+			act_delta += pmap_ts_referenced(m);
+		else {
+			KASSERT(!pmap_page_is_mapped(m),
+			    ("page %p is mapped", m));
 		}
-		VM_OBJECT_WUNLOCK(object);
-	}
-	vm_pagequeue_unlock(pq);
-	return (FALSE);
-}
+		if (act_delta != 0) {
+			if (object->ref_count != 0) {
+				PCPU_INC(cnt.v_reactivated);
+				vm_page_activate(m);
 
-/*
- * Increase the number of cached pages.  The specified value, "tries",
- * determines which categories of pages are cached:
- *
- *  0: All clean, inactive pages within the specified physical address range
- *     are cached.  Will not sleep.
- *  1: The vm_lowmem handlers are called.  All inactive pages within
- *     the specified physical address range are cached.  May sleep.
- *  2: The vm_lowmem handlers are called.  All inactive and active pages
- *     within the specified physical address range are cached.  May sleep.
- */
-void
-vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
-{
-	int actl, actmax, inactl, inactmax, dom, initial_dom;
-	static int start_dom = 0;
+				/*
+				 * Increase the activation count if the page
+				 * was referenced while in the laundry queue.
+				 * This makes it less likely that the page will
+				 * be returned prematurely to the inactive
+				 * queue.
+ 				 */
+				m->act_count += act_delta + ACT_ADVANCE;
 
-	if (tries > 0) {
+				/*
+				 * If this was a background laundering, count
+				 * activated pages towards our target.  The
+				 * purpose of background laundering is to ensure
+				 * that pages are eventually cycled through the
+				 * laundry queue, and an activation is a valid
+				 * way out.
+				 */
+				if (!in_shortfall)
+					launder--;
+				goto drop_page;
+			} else if ((object->flags & OBJ_DEAD) == 0)
+				goto requeue_page;
+		}
+
 		/*
-		 * Decrease registered cache sizes.  The vm_lowmem handlers
-		 * may acquire locks and/or sleep, so they can only be invoked
-		 * when "tries" is greater than zero.
+		 * If the page appears to be clean at the machine-independent
+		 * layer, then remove all of its mappings from the pmap in
+		 * anticipation of freeing it.  If, however, any of the page's
+		 * mappings allow write access, then the page may still be
+		 * modified until the last of those mappings are removed.
 		 */
-		SDT_PROBE0(vm, , , vm__lowmem_cache);
-		EVENTHANDLER_INVOKE(vm_lowmem, 0);
+		if (object->ref_count != 0) {
+			vm_page_test_dirty(m);
+			if (m->dirty == 0)
+				pmap_remove_all(m);
+		}
 
 		/*
-		 * We do this explicitly after the caches have been drained
-		 * above.
+		 * Clean pages are freed, and dirty pages are paged out unless
+		 * they belong to a dead object.  Requeueing dirty pages from
+		 * dead objects is pointless, as they are being paged out and
+		 * freed by the thread that destroyed the object.
 		 */
-		uma_reclaim();
+		if (m->dirty == 0) {
+free_page:
+			vm_page_free(m);
+			PCPU_INC(cnt.v_dfree);
+		} else if ((object->flags & OBJ_DEAD) == 0) {
+			if (object->type != OBJT_SWAP &&
+			    object->type != OBJT_DEFAULT)
+				pageout_ok = true;
+			else if (disable_swap_pageouts)
+				pageout_ok = false;
+			else
+				pageout_ok = true;
+			if (!pageout_ok) {
+requeue_page:
+				vm_pagequeue_lock(pq);
+				queue_locked = true;
+				vm_page_requeue_locked(m);
+				goto drop_page;
+			}
+
+			/*
+			 * Form a cluster with adjacent, dirty pages from the
+			 * same object, and page out that entire cluster.
+			 *
+			 * The adjacent, dirty pages must also be in the
+			 * laundry.  However, their mappings are not checked
+			 * for new references.  Consequently, a recently
+			 * referenced page may be paged out.  However, that
+			 * page will not be prematurely reclaimed.  After page
+			 * out, the page will be placed in the inactive queue,
+			 * where any new references will be detected and the
+			 * page reactivated.
+			 */
+			error = vm_pageout_clean(m, &numpagedout);
+			if (error == 0) {
+				launder -= numpagedout;
+				maxscan -= numpagedout - 1;
+			} else if (error == EDEADLK) {
+				pageout_lock_miss++;
+				vnodes_skipped++;
+			}
+			goto relock_queue;
+		}
+drop_page:
+		vm_page_unlock(m);
+		VM_OBJECT_WUNLOCK(object);
+relock_queue:
+		if (!queue_locked) {
+			vm_pagequeue_lock(pq);
+			queue_locked = true;
+		}
+		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
+		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
 	}
+	vm_pagequeue_unlock(pq);
 
 	/*
-	 * Make the next scan start on the next domain.
+	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
+	 * and we didn't launder enough pages.
 	 */
-	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+	if (vnodes_skipped > 0 && launder > 0)
+		(void)speedup_syncer();
 
-	inactl = 0;
-	inactmax = cnt.v_inactive_count;
-	actl = 0;
-	actmax = tries < 2 ? 0 : cnt.v_active_count;
-	dom = initial_dom;
-
-	/*
-	 * Scan domains in round-robin order, first inactive queues,
-	 * then active.  Since domain usually owns large physically
-	 * contiguous chunk of memory, it makes sense to completely
-	 * exhaust one domain before switching to next, while growing
-	 * the pool of contiguous physical pages.
-	 *
-	 * Do not even start launder a domain which cannot contain
-	 * the specified address range, as indicated by segments
-	 * constituting the domain.
-	 */
-again_inact:
-	if (inactl < inactmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
-		    tries, low, high)) {
-			inactl++;
-			goto again_inact;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again_inact;
-	}
-again_act:
-	if (actl < actmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
-		      tries, low, high)) {
-			actl++;
-			goto again_act;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again_act;
-	}
+	return (starting_target - launder);
 }
 
-#if !defined(NO_SWAPPING)
 /*
- *	vm_pageout_object_deactivate_pages
- *
- *	Deactivate enough pages to satisfy the inactive target
- *	requirements.
- *
- *	The object and map must be locked.
+ * Compute the integer square root.
  */
-static void
-vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
-    long desired)
+static u_int
+isqrt(u_int num)
 {
-	vm_object_t backing_object, object;
-	vm_page_t p;
-	int act_delta, remove_mode;
+	u_int bit, root, tmp;
 
-	VM_OBJECT_ASSERT_LOCKED(first_object);
-	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
-		return;
-	for (object = first_object;; object = backing_object) {
-		if (pmap_resident_count(pmap) <= desired)
-			goto unlock_return;
-		VM_OBJECT_ASSERT_LOCKED(object);
-		if ((object->flags & OBJ_UNMANAGED) != 0 ||
-		    object->paging_in_progress != 0)
-			goto unlock_return;
-
-		remove_mode = 0;
-		if (object->shadow_count > 1)
-			remove_mode = 1;
-		/*
-		 * Scan the object's entire memory queue.
-		 */
-		TAILQ_FOREACH(p, &object->memq, listq) {
-			if (pmap_resident_count(pmap) <= desired)
-				goto unlock_return;
-			if (vm_page_busied(p))
-				continue;
-			PCPU_INC(cnt.v_pdpages);
-			vm_page_lock(p);
-			if (p->wire_count != 0 || p->hold_count != 0 ||
-			    !pmap_page_exists_quick(pmap, p)) {
-				vm_page_unlock(p);
-				continue;
-			}
-			act_delta = pmap_ts_referenced(p);
-			if ((p->aflags & PGA_REFERENCED) != 0) {
-				if (act_delta == 0)
-					act_delta = 1;
-				vm_page_aflag_clear(p, PGA_REFERENCED);
-			}
-			if (p->queue != PQ_ACTIVE && act_delta != 0) {
-				vm_page_activate(p);
-				p->act_count += act_delta;
-			} else if (p->queue == PQ_ACTIVE) {
-				if (act_delta == 0) {
-					p->act_count -= min(p->act_count,
-					    ACT_DECLINE);
-					if (!remove_mode && p->act_count == 0) {
-						pmap_remove_all(p);
-						vm_page_deactivate(p);
-					} else
-						vm_page_requeue(p);
-				} else {
-					vm_page_activate(p);
-					if (p->act_count < ACT_MAX -
-					    ACT_ADVANCE)
-						p->act_count += ACT_ADVANCE;
-					vm_page_requeue(p);
-				}
-			} else if (p->queue == PQ_INACTIVE)
-				pmap_remove_all(p);
-			vm_page_unlock(p);
+	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
+	while (bit > num)
+		bit >>= 2;
+	root = 0;
+	while (bit != 0) {
+		tmp = root + bit;
+		root >>= 1;
+		if (num >= tmp) {
+			num -= tmp;
+			root += bit;
 		}
-		if ((backing_object = object->backing_object) == NULL)
-			goto unlock_return;
-		VM_OBJECT_RLOCK(backing_object);
-		if (object != first_object)
-			VM_OBJECT_RUNLOCK(object);
+		bit >>= 2;
 	}
-unlock_return:
-	if (object != first_object)
-		VM_OBJECT_RUNLOCK(object);
+	return (root);
 }
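isqrt() above returns the integer (floor) square root, consuming two bits of
the argument per loop iteration.  A few spot checks of the expected results:

	isqrt(0)   == 0
	isqrt(10)  == 3		(3*3 = 9 <= 10 < 16 = 4*4)
	isqrt(100) == 10
	isqrt(101) == 10	(the result is rounded down)

The laundry worker below feeds it the number of page daemon wakeups since the
last laundering run when deciding whether to start background laundering.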
 
 /*
- * deactivate some number of pages in a map, try to do it fairly, but
- * that is really hard to do.
+ * Perform the work of the laundry thread: periodically wake up and determine
+ * whether any pages need to be laundered.  If so, determine the number of pages
+ * that need to be laundered, and launder them.
  */
 static void
-vm_pageout_map_deactivate_pages(map, desired)
-	vm_map_t map;
-	long desired;
+vm_pageout_laundry_worker(void *arg)
 {
-	vm_map_entry_t tmpe;
-	vm_object_t obj, bigobj;
-	int nothingwired;
+	struct vm_domain *domain;
+	struct vm_pagequeue *pq;
+	uint64_t nclean, ndirty;
+	u_int last_launder, wakeups;
+	int domidx, last_target, launder, shortfall, shortfall_cycle, target;
+	bool in_shortfall;
 
-	if (!vm_map_trylock(map))
-		return;
+	domidx = (uintptr_t)arg;
+	domain = &vm_dom[domidx];
+	pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
+	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
 
-	bigobj = NULL;
-	nothingwired = TRUE;
+	shortfall = 0;
+	in_shortfall = false;
+	shortfall_cycle = 0;
+	target = 0;
+	last_launder = 0;
 
 	/*
-	 * first, search out the biggest object, and try to free pages from
-	 * that.
+	 * The pageout laundry worker is never done, so loop forever.
 	 */
-	tmpe = map->header.next;
-	while (tmpe != &map->header) {
-		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
-			obj = tmpe->object.vm_object;
-			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
-				if (obj->shadow_count <= 1 &&
-				    (bigobj == NULL ||
-				     bigobj->resident_page_count < obj->resident_page_count)) {
-					if (bigobj != NULL)
-						VM_OBJECT_RUNLOCK(bigobj);
-					bigobj = obj;
-				} else
-					VM_OBJECT_RUNLOCK(obj);
-			}
+	for (;;) {
+		KASSERT(target >= 0, ("negative target %d", target));
+		KASSERT(shortfall_cycle >= 0,
+		    ("negative cycle %d", shortfall_cycle));
+		launder = 0;
+		wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
+
+		/*
+		 * First determine whether we need to launder pages to meet a
+		 * shortage of free pages.
+		 */
+		if (shortfall > 0) {
+			in_shortfall = true;
+			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
+			target = shortfall;
+		} else if (!in_shortfall)
+			goto trybackground;
+		else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
+			/*
+			 * We recently entered shortfall and began laundering
+			 * pages.  If we have completed that laundering run
+			 * (and we are no longer in shortfall) or we have met
+			 * our laundry target through other activity, then we
+			 * can stop laundering pages.
+			 */
+			in_shortfall = false;
+			target = 0;
+			goto trybackground;
 		}
-		if (tmpe->wired_count > 0)
-			nothingwired = FALSE;
-		tmpe = tmpe->next;
-	}
+		last_launder = wakeups;
+		launder = target / shortfall_cycle--;
+		goto dolaundry;
 
-	if (bigobj != NULL) {
-		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
-		VM_OBJECT_RUNLOCK(bigobj);
-	}
-	/*
-	 * Next, hunt around for other pages to deactivate.  We actually
-	 * do this search sort of wrong -- .text first is not the best idea.
-	 */
-	tmpe = map->header.next;
-	while (tmpe != &map->header) {
-		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
-			break;
-		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
-			obj = tmpe->object.vm_object;
-			if (obj != NULL) {
-				VM_OBJECT_RLOCK(obj);
-				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
-				VM_OBJECT_RUNLOCK(obj);
+		/*
+		 * There's no immediate need to launder any pages; see if we
+		 * meet the conditions to perform background laundering:
+		 *
+		 * 1. The ratio of dirty to clean inactive pages exceeds the
+		 *    background laundering threshold and the pagedaemon has
+		 *    been woken up to reclaim pages since our last
+		 *    laundering, or
+		 * 2. we haven't yet reached the target of the current
+		 *    background laundering run.
+		 *
+		 * The background laundering threshold is not a constant.
+		 * Instead, it is a slowly growing function of the number of
+		 * page daemon wakeups since the last laundering.  Thus, as the
+		 * ratio of dirty to clean inactive pages grows, the amount of
+		 * memory pressure required to trigger laundering decreases.
+		 */
+trybackground:
+		nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
+		ndirty = vm_cnt.v_laundry_count;
+		if (target == 0 && wakeups != last_launder &&
+		    ndirty * isqrt(wakeups - last_launder) >= nclean) {
+			target = vm_background_launder_target;
+		}
+
+		/*
+		 * We have a non-zero background laundering target.  If we've
+		 * laundered up to our maximum without observing a page daemon
+		 * wakeup, just stop.  This is a safety belt that ensures we
+		 * don't launder an excessive amount if memory pressure is low
+		 * and the ratio of dirty to clean pages is large.  Otherwise,
+		 * proceed at the background laundering rate.
+		 */
+		if (target > 0) {
+			if (wakeups != last_launder) {
+				last_launder = wakeups;
+				last_target = target;
+			} else if (last_target - target >=
+			    vm_background_launder_max * PAGE_SIZE / 1024) {
+				target = 0;
 			}
+			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
+			launder /= VM_LAUNDER_RATE;
+			if (launder > target)
+				launder = target;
 		}
-		tmpe = tmpe->next;
-	}
 
-#ifdef __ia64__
-	/*
-	 * Remove all non-wired, managed mappings if a process is swapped out.
-	 * This will free page table pages.
-	 */
-	if (desired == 0)
-		pmap_remove_pages(map->pmap);
-#else
-	/*
-	 * Remove all mappings if a process is swapped out, this will free page
-	 * table pages.
-	 */
-	if (desired == 0 && nothingwired) {
-		pmap_remove(vm_map_pmap(map), vm_map_min(map),
-		    vm_map_max(map));
+dolaundry:
+		if (launder > 0) {
+			/*
+			 * Because of I/O clustering, the number of laundered
+			 * pages could exceed "target" by the maximum size of
+			 * a cluster minus one. 
+			 */
+			target -= min(vm_pageout_launder(domain, launder,
+			    in_shortfall), target);
+			pause("laundp", hz / VM_LAUNDER_RATE);
+		}
+
+		/*
+		 * If we're not currently laundering pages and the page daemon
+		 * hasn't posted a new request, sleep until the page daemon
+		 * kicks us.
+		 */
+		vm_pagequeue_lock(pq);
+		if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
+			(void)mtx_sleep(&vm_laundry_request,
+			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
+
+		/*
+		 * If the pagedaemon has indicated that it's in shortfall, start
+		 * a shortfall laundering unless we're already in the middle of
+		 * one.  This may preempt a background laundering.
+		 */
+		if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
+		    (!in_shortfall || shortfall_cycle == 0)) {
+			shortfall = vm_laundry_target() + vm_pageout_deficit;
+			target = 0;
+		} else
+			shortfall = 0;
+
+		if (target == 0)
+			vm_laundry_request = VM_LAUNDRY_IDLE;
+		vm_pagequeue_unlock(pq);
 	}
-#endif
-
-	vm_map_unlock(map);
 }
-#endif		/* !defined(NO_SWAPPING) */
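To make the trybackground test above concrete, a worked example with assumed,
purely illustrative queue sizes:

	nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count = 100000
	ndirty = vm_cnt.v_laundry_count                        = 20000

	background laundering starts once
	    ndirty * isqrt(wakeups - last_launder) >= nclean
	i.e. once isqrt(wakeups - last_launder) >= 5, which first holds 25
	page daemon wakeups after the last laundering run.

If the dirty and clean counts were roughly equal, a single page daemon wakeup
after the last laundering would already satisfy the test.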
 
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  *
- *	pass 0 - Update active LRU/deactivate pages
- *	pass 1 - Move inactive to cache or free
- *	pass 2 - Launder dirty pages
+ *	pass == 0: Update active LRU/deactivate pages
+ *	pass >= 1: Free inactive pages
+ *
+ * Returns true if pass was zero or enough pages were freed by the inactive
+ * queue scan to meet the target.
  */
-static void
+static bool
 vm_pageout_scan(struct vm_domain *vmd, int pass)
 {
 	vm_page_t m, next;
@@ -947,10 +1075,8 @@
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	long min_scan;
-	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
-	int vnodes_skipped = 0;
-	int maxlaunder, scan_tick, scanned, starting_page_shortage;
-	int lockmode;
+	int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
+	int page_shortage, scan_tick, scanned, starting_page_shortage;
 	boolean_t queue_locked;
 
 	/*
@@ -981,8 +1107,9 @@
 	addl_page_shortage = 0;
 
 	/*
-	 * Calculate the number of pages we want to either free or move
-	 * to the cache.
+	 * Calculate the number of pages that we want to free.  This number
+	 * can be negative if many pages are freed between the wakeup call to
+	 * the page daemon and this calculation.
 	 */
 	if (pass > 0) {
 		deficit = atomic_readandclear_int(&vm_pageout_deficit);
@@ -992,27 +1119,11 @@
 	starting_page_shortage = page_shortage;
 
 	/*
-	 * maxlaunder limits the number of dirty pages we flush per scan.
-	 * For most systems a smaller value (16 or 32) is more robust under
-	 * extreme memory and disk pressure because any unnecessary writes
-	 * to disk can result in extreme performance degredation.  However,
-	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
-	 * used) will die horribly with limited laundering.  If the pageout
-	 * daemon cannot clean enough pages in the first pass, we let it go
-	 * all out in succeeding passes.
+	 * Start scanning the inactive queue for pages that we can free.  The
+	 * scan will stop when we reach the target or we have scanned the
+	 * entire queue.  (Note that m->act_count is not used to make
+	 * decisions for the inactive queue, only for the active queue.)
 	 */
-	if ((maxlaunder = vm_max_launder) <= 1)
-		maxlaunder = 1;
-	if (pass > 1)
-		maxlaunder = 10000;
-
-	/*
-	 * Start scanning the inactive queue for pages we can move to the
-	 * cache or free.  The scan will stop when the target is reached or
-	 * we have scanned the entire inactive queue.  Note that m->act_count
-	 * is not used to form decisions for the inactive queue, only for the
-	 * active queue.
-	 */
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	maxscan = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
@@ -1022,7 +1133,7 @@
 	     m = next) {
 		vm_pagequeue_assert_locked(pq);
 		KASSERT(queue_locked, ("unlocked inactive queue"));
-		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
+		KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
 
 		PCPU_INC(cnt.v_pdpages);
 		next = TAILQ_NEXT(m, plinks.q);
@@ -1044,55 +1155,76 @@
 		 * different position within the queue.  In either
 		 * case, addl_page_shortage should not be incremented.
 		 */
-		if (!vm_pageout_page_lock(m, &next)) {
-			vm_page_unlock(m);
-			continue;
+		if (!vm_pageout_page_lock(m, &next))
+			goto unlock_page;
+		else if (m->hold_count != 0) {
+			/*
+			 * Held pages are essentially stuck in the
+			 * queue.  So, they ought to be discounted
+			 * from the inactive count.  See the
+			 * calculation of inactq_shortage before the
+			 * loop over the active queue below.
+			 */
+			addl_page_shortage++;
+			goto unlock_page;
 		}
 		object = m->object;
-		if (!VM_OBJECT_TRYWLOCK(object) &&
-		    !vm_pageout_fallback_object_lock(m, &next)) {
-			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-			continue;
+		if (!VM_OBJECT_TRYWLOCK(object)) {
+			if (!vm_pageout_fallback_object_lock(m, &next))
+				goto unlock_object;
+			else if (m->hold_count != 0) {
+				addl_page_shortage++;
+				goto unlock_object;
+			}
 		}
-
-		/*
-		 * Don't mess with busy pages, keep them at at the
-		 * front of the queue, most likely they are being
-		 * paged out.  Increment addl_page_shortage for busy
-		 * pages, because they may leave the inactive queue
-		 * shortly after page scan is finished.
-		 */
 		if (vm_page_busied(m)) {
+			/*
+			 * Don't mess with busy pages.  Leave them at
+			 * the front of the queue.  Most likely, they
+			 * are being paged out and will leave the
+			 * queue shortly after the scan finishes.  So,
+			 * they ought to be discounted from the
+			 * inactive count.
+			 */
+			addl_page_shortage++;
+unlock_object:
+			VM_OBJECT_WUNLOCK(object);
+unlock_page:
 			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-			addl_page_shortage++;
 			continue;
 		}
+		KASSERT(m->hold_count == 0, ("Held page %p", m));
 
 		/*
-		 * We unlock the inactive page queue, invalidating the
-		 * 'next' pointer.  Use our marker to remember our
-		 * place.
+		 * Dequeue the inactive page and unlock the inactive page
+		 * queue, invalidating the 'next' pointer.  Dequeueing the
+		 * page here avoids a later reacquisition (and release) of
+		 * the inactive page queue lock when vm_page_activate(),
+		 * vm_page_free(), or vm_page_launder() is called.  Use a
+		 * marker to remember our place in the inactive queue.
 		 */
 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+		vm_page_dequeue_locked(m);
 		vm_pagequeue_unlock(pq);
 		queue_locked = FALSE;
 
 		/*
-		 * We bump the activation count if the page has been
-		 * referenced while in the inactive queue.  This makes
-		 * it less likely that the page will be added back to the
-		 * inactive queue prematurely again.  Here we check the 
-		 * page tables (or emulated bits, if any), given the upper 
-		 * level VM system not knowing anything about existing 
-		 * references.
+		 * Invalid pages can be easily freed. They cannot be
+		 * mapped, vm_page_free() asserts this.
 		 */
-		act_delta = 0;
+		if (m->valid == 0)
+			goto free_page;
+
+		/*
+		 * If the page has been referenced and the object is not dead,
+		 * reactivate or requeue the page depending on whether the
+		 * object is mapped.
+		 */
 		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
 			act_delta = 1;
-		}
+		} else
+			act_delta = 0;
 		if (object->ref_count != 0) {
 			act_delta += pmap_ts_referenced(m);
 		} else {
@@ -1099,47 +1231,36 @@
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("vm_pageout_scan: page %p is mapped", m));
 		}
-
-		/*
-		 * If the upper level VM system knows about any page 
-		 * references, we reactivate the page or requeue it.
-		 */
 		if (act_delta != 0) {
-			if (object->ref_count) {
+			if (object->ref_count != 0) {
+				PCPU_INC(cnt.v_reactivated);
 				vm_page_activate(m);
+
+				/*
+				 * Increase the activation count if the page
+				 * was referenced while in the inactive queue.
+				 * This makes it less likely that the page will
+				 * be returned prematurely to the inactive
+				 * queue.
+ 				 */
 				m->act_count += act_delta + ACT_ADVANCE;
-			} else {
+				goto drop_page;
+			} else if ((object->flags & OBJ_DEAD) == 0) {
 				vm_pagequeue_lock(pq);
 				queue_locked = TRUE;
-				vm_page_requeue_locked(m);
+				m->queue = PQ_INACTIVE;
+				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+				vm_pagequeue_cnt_inc(pq);
+				goto drop_page;
 			}
-			VM_OBJECT_WUNLOCK(object);
-			vm_page_unlock(m);
-			goto relock_queue;
 		}
 
-		if (m->hold_count != 0) {
-			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-
-			/*
-			 * Held pages are essentially stuck in the
-			 * queue.  So, they ought to be discounted
-			 * from the inactive count.  See the
-			 * calculation of the page_shortage for the
-			 * loop over the active queue below.
-			 */
-			addl_page_shortage++;
-			goto relock_queue;
-		}
-
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
-		 * anticipation of placing it onto the cache queue.  If,
-		 * however, any of the page's mappings allow write access,
-		 * then the page may still be modified until the last of those
-		 * mappings are removed.
+		 * anticipation of freeing it.  If, however, any of the page's
+		 * mappings allow write access, then the page may still be
+		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
@@ -1147,199 +1268,23 @@
 				pmap_remove_all(m);
 		}
 
-		if (m->valid == 0) {
-			/*
-			 * Invalid pages can be easily freed
-			 */
+		/*
+		 * Clean pages can be freed, but dirty pages must be sent back
+		 * to the laundry, unless they belong to a dead object.
+		 * Requeueing dirty pages from dead objects is pointless, as
+		 * they are being paged out and freed by the thread that
+		 * destroyed the object.
+		 */
+		if (m->dirty == 0) {
+free_page:
 			vm_page_free(m);
 			PCPU_INC(cnt.v_dfree);
 			--page_shortage;
-		} else if (m->dirty == 0) {
-			/*
-			 * Clean pages can be placed onto the cache queue.
-			 * This effectively frees them.
-			 */
-			vm_page_cache(m);
-			--page_shortage;
-		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
-			/*
-			 * Dirty pages need to be paged out, but flushing
-			 * a page is extremely expensive verses freeing
-			 * a clean page.  Rather then artificially limiting
-			 * the number of pages we can flush, we instead give
-			 * dirty pages extra priority on the inactive queue
-			 * by forcing them to be cycled through the queue
-			 * twice before being flushed, after which the
-			 * (now clean) page will cycle through once more
-			 * before being freed.  This significantly extends
-			 * the thrash point for a heavily loaded machine.
-			 */
-			m->flags |= PG_WINATCFLS;
-			vm_pagequeue_lock(pq);
-			queue_locked = TRUE;
-			vm_page_requeue_locked(m);
-		} else if (maxlaunder > 0) {
-			/*
-			 * We always want to try to flush some dirty pages if
-			 * we encounter them, to keep the system stable.
-			 * Normally this number is small, but under extreme
-			 * pressure where there are insufficient clean pages
-			 * on the inactive queue, we may have to go all out.
-			 */
-			int swap_pageouts_ok;
-			struct vnode *vp = NULL;
-			struct mount *mp = NULL;
-
-			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
-				swap_pageouts_ok = 1;
-			} else {
-				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
-				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
-				vm_page_count_min());
-										
-			}
-
-			/*
-			 * We don't bother paging objects that are "dead".  
-			 * Those objects are in a "rundown" state.
-			 */
-			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
-				vm_pagequeue_lock(pq);
-				vm_page_unlock(m);
-				VM_OBJECT_WUNLOCK(object);
-				queue_locked = TRUE;
-				vm_page_requeue_locked(m);
-				goto relock_queue;
-			}
-
-			/*
-			 * The object is already known NOT to be dead.   It
-			 * is possible for the vget() to block the whole
-			 * pageout daemon, but the new low-memory handling
-			 * code should prevent it.
-			 *
-			 * The previous code skipped locked vnodes and, worse,
-			 * reordered pages in the queue.  This results in
-			 * completely non-deterministic operation and, on a
-			 * busy system, can lead to extremely non-optimal
-			 * pageouts.  For example, it can cause clean pages
-			 * to be freed and dirty pages to be moved to the end
-			 * of the queue.  Since dirty pages are also moved to
-			 * the end of the queue once-cleaned, this gives
-			 * way too large a weighting to defering the freeing
-			 * of dirty pages.
-			 *
-			 * We can't wait forever for the vnode lock, we might
-			 * deadlock due to a vn_read() getting stuck in
-			 * vm_wait while holding this vnode.  We skip the 
-			 * vnode if we can't get it in a reasonable amount
-			 * of time.
-			 */
-			if (object->type == OBJT_VNODE) {
-				vm_page_unlock(m);
-				vp = object->handle;
-				if (vp->v_type == VREG &&
-				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
-					mp = NULL;
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				KASSERT(mp != NULL,
-				    ("vp %p with NULL v_mount", vp));
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				if (vget(vp, lockmode | LK_TIMELOCK,
-				    curthread)) {
-					VM_OBJECT_WLOCK(object);
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					vp = NULL;
-					goto unlock_and_continue;
-				}
-				VM_OBJECT_WLOCK(object);
-				vm_page_lock(m);
-				vm_pagequeue_lock(pq);
-				queue_locked = TRUE;
-				/*
-				 * The page might have been moved to another
-				 * queue during potential blocking in vget()
-				 * above.  The page might have been freed and
-				 * reused for another vnode.
-				 */
-				if (m->queue != PQ_INACTIVE ||
-				    m->object != object ||
-				    TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
-					vm_page_unlock(m);
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-	
-				/*
-				 * The page may have been busied during the
-				 * blocking in vget().  We don't move the
-				 * page back onto the end of the queue so that
-				 * statistics are more correct if we don't.
-				 */
-				if (vm_page_busied(m)) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					goto unlock_and_continue;
-				}
-
-				/*
-				 * If the page has become held it might
-				 * be undergoing I/O, so skip it
-				 */
-				if (m->hold_count != 0) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				vm_pagequeue_unlock(pq);
-				queue_locked = FALSE;
-			}
-
-			/*
-			 * If a page is dirty, then it is either being washed
-			 * (but not yet cleaned) or it is still in the
-			 * laundry.  If it is still in the laundry, then we
-			 * start the cleaning operation. 
-			 *
-			 * decrement page_shortage on success to account for
-			 * the (future) cleaned page.  Otherwise we could wind
-			 * up laundering or cleaning too many pages.
-			 */
-			if (vm_pageout_clean(m) != 0) {
-				--page_shortage;
-				--maxlaunder;
-			}
-unlock_and_continue:
-			vm_page_lock_assert(m, MA_NOTOWNED);
-			VM_OBJECT_WUNLOCK(object);
-			if (mp != NULL) {
-				if (queue_locked) {
-					vm_pagequeue_unlock(pq);
-					queue_locked = FALSE;
-				}
-				if (vp != NULL)
-					vput(vp);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
-			}
-			vm_page_lock_assert(m, MA_NOTOWNED);
-			goto relock_queue;
-		}
+		} else if ((object->flags & OBJ_DEAD) == 0)
+			vm_page_launder(m);
+drop_page:
 		vm_page_unlock(m);
 		VM_OBJECT_WUNLOCK(object);
-relock_queue:
 		if (!queue_locked) {
 			vm_pagequeue_lock(pq);
 			queue_locked = TRUE;
@@ -1349,22 +1294,30 @@
 	}
 	vm_pagequeue_unlock(pq);
 
-#if !defined(NO_SWAPPING)
 	/*
-	 * Wakeup the swapout daemon if we didn't cache or free the targeted
-	 * number of pages. 
+	 * Wake up the laundry thread so that it can perform any needed
+	 * laundering.  If we didn't meet our target, we're in shortfall and
+	 * need to launder more aggressively.
 	 */
-	if (vm_swap_enabled && page_shortage > 0)
-		vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
+	if (vm_laundry_request == VM_LAUNDRY_IDLE &&
+	    starting_page_shortage > 0) {
+		pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
+		vm_pagequeue_lock(pq);
+		if (page_shortage > 0) {
+			vm_laundry_request = VM_LAUNDRY_SHORTFALL;
+			PCPU_INC(cnt.v_pdshortfalls);
+		} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
+			vm_laundry_request = VM_LAUNDRY_BACKGROUND;
+		wakeup(&vm_laundry_request);
+		vm_pagequeue_unlock(pq);
+	}
 
 	/*
-	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
-	 * and we didn't cache or free enough pages.
+	 * Wakeup the swapout daemon if we didn't free the targeted number of
+	 * pages.
 	 */
-	if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
-	    cnt.v_free_min)
-		(void)speedup_syncer();
+	if (page_shortage > 0)
+		vm_swapout_run();
 
 	/*
 	 * If the inactive queue scan fails repeatedly to meet its
@@ -1374,10 +1327,20 @@
 
 	/*
 	 * Compute the number of pages we want to try to move from the
-	 * active queue to the inactive queue.
+	 * active queue to either the inactive or laundry queue.
+	 *
+	 * When scanning active pages, we make clean pages count more heavily
+	 * towards the page shortage than dirty pages.  This is because dirty
+	 * pages must be laundered before they can be reused and thus have less
+	 * utility when attempting to quickly alleviate a shortage.  However,
+	 * this weighting also causes the scan to deactivate dirty pages more
+	 * aggressively, improving the effectiveness of clustering and
+	 * ensuring that they can eventually be reused.
 	 */
-	page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
+	inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
+	    vm_cnt.v_laundry_count / act_scan_laundry_weight) +
 	    vm_paging_target() + deficit + addl_page_shortage;
+	inactq_shortage *= act_scan_laundry_weight;
 
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
@@ -1394,7 +1357,7 @@
 		min_scan /= hz * vm_pageout_update_period;
 	} else
 		min_scan = 0;
-	if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
+	if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
 		vmd->vmd_last_active_scan = scan_tick;
 
 	/*
@@ -1403,7 +1366,7 @@
 	 * candidates.  Held pages may be deactivated.
 	 */
 	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
-	    min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
+	    min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
 	    scanned++) {
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_scan: page %p isn't active", m));
@@ -1428,11 +1391,12 @@
 		/*
 		 * Check to see "how much" the page has been used.
 		 */
-		act_delta = 0;
-		if (m->aflags & PGA_REFERENCED) {
+		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
-			act_delta += 1;
-		}
+			act_delta = 1;
+		} else
+			act_delta = 0;
+
 		/*
 		 * Perform an unsynchronized object ref count check.  While
 		 * the page lock ensures that the page is not reallocated to
@@ -1452,41 +1416,60 @@
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 */
-		if (act_delta) {
+		if (act_delta != 0) {
 			m->act_count += ACT_ADVANCE + act_delta;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
-		} else {
+		} else
 			m->act_count -= min(m->act_count, ACT_DECLINE);
-			act_delta = m->act_count;
-		}
 
 		/*
-		 * Move this page to the tail of the active or inactive
+		 * Move this page to the tail of the active, inactive or laundry
 		 * queue depending on usage.
 		 */
-		if (act_delta == 0) {
+		if (m->act_count == 0) {
 			/* Dequeue to avoid later lock recursion. */
 			vm_page_dequeue_locked(m);
-			vm_page_deactivate(m);
-			page_shortage--;
+
+			/*
+			 * When not short for inactive pages, let dirty pages go
+			 * through the inactive queue before moving to the
+			 * laundry queues.  This gives them some extra time to
+			 * be reactivated, potentially avoiding an expensive
+			 * pageout.  During a page shortage, the inactive queue
+			 * is necessarily small, so we may move dirty pages
+			 * directly to the laundry queue.
+			 */
+			if (inactq_shortage <= 0)
+				vm_page_deactivate(m);
+			else {
+				/*
+				 * Calling vm_page_test_dirty() here would
+				 * require acquisition of the object's write
+				 * lock.  However, during a page shortage,
+				 * directing dirty pages into the laundry
+				 * queue is only an optimization and not a
+				 * requirement.  Therefore, we simply rely on
+				 * the opportunistic updates to the page's
+				 * dirty field by the pmap.
+				 */
+				if (m->dirty == 0) {
+					vm_page_deactivate(m);
+					inactq_shortage -=
+					    act_scan_laundry_weight;
+				} else {
+					vm_page_launder(m);
+					inactq_shortage--;
+				}
+			}
 		} else
 			vm_page_requeue_locked(m);
 		vm_page_unlock(m);
 	}
 	vm_pagequeue_unlock(pq);
-#if !defined(NO_SWAPPING)
-	/*
-	 * Idle process swapout -- run once per second.
-	 */
-	if (vm_swap_idle_enabled) {
-		static long lsec;
-		if (time_second != lsec) {
-			vm_req_vmdaemon(VM_SWAP_IDLE);
-			lsec = time_second;
-		}
-	}
-#endif
+	if (pass > 0)
+		vm_swapout_run_idle();
+	return (page_shortage <= 0);
 }
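The act_scan_laundry_weight scaling in the active queue scan above can be read
as follows (numbers assumed for illustration): with the default weight of 3, a
raw shortage of 1000 inactive pages becomes inactq_shortage = 3000, and each
clean page that is deactivated subtracts 3 while each dirty page sent to the
laundry subtracts only 1.  The target is therefore met by, for example, 1000
clean deactivations, or by 600 clean deactivations plus 1200 launderings
(600*3 + 1200*1 = 3000), reflecting that clean pages can be reused immediately
while dirty pages must first be laundered.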
 
 static int vm_pageout_oom_vote;
@@ -1668,19 +1651,21 @@
 			PROC_UNLOCK(p);
 			continue;
 		}
-		_PHOLD(p);
+		_PHOLD_LITE(p);
+		PROC_UNLOCK(p);
+		sx_sunlock(&allproc_lock);
 		if (!vm_map_trylock_read(&vm->vm_map)) {
-			_PRELE(p);
-			PROC_UNLOCK(p);
 			vmspace_free(vm);
+			sx_slock(&allproc_lock);
+			PRELE(p);
 			continue;
 		}
-		PROC_UNLOCK(p);
 		size = vmspace_swap_count(vm);
 		if (shortage == VM_OOM_MEM)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
+		sx_slock(&allproc_lock);
 
 		/*
 		 * If this process is bigger than the biggest one,
@@ -1697,12 +1682,14 @@
 	}
 	sx_sunlock(&allproc_lock);
 	if (bigproc != NULL) {
+		if (vm_panic_on_oom != 0)
+			panic("out of swap space");
 		PROC_LOCK(bigproc);
 		killproc(bigproc, "out of swap space");
 		sched_nice(bigproc, PRIO_MIN);
 		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
-		wakeup(&cnt.v_free_count);
+		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
@@ -1710,10 +1697,13 @@
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *domain;
-	int domidx;
+	int domidx, pass;
+	bool target_met;
 
 	domidx = (uintptr_t)arg;
 	domain = &vm_dom[domidx];
+	pass = 0;
+	target_met = true;
 
 	/*
 	 * XXXKIB It could be useful to bind pageout daemon threads to
@@ -1724,54 +1714,80 @@
 	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
 	domain->vmd_last_active_scan = ticks;
 	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
+	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
+	    &domain->vmd_inacthead, plinks.q);
 
 	/*
 	 * The pageout daemon worker is never done, so loop forever.
 	 */
 	while (TRUE) {
+		mtx_lock(&vm_page_queue_free_mtx);
+
 		/*
-		 * If we have enough free memory, wakeup waiters.  Do
-		 * not clear vm_pages_needed until we reach our target,
-		 * otherwise we may be woken up over and over again and
-		 * waste a lot of cpu.
+		 * Generally, after a level >= 1 scan, if there are enough
+		 * free pages to wakeup the waiters, then they are already
+		 * awake.  A call to vm_page_free() during the scan awakened
+		 * them.  However, in the following case, this wakeup serves
+		 * to bound the amount of time that a thread might wait.
+		 * Suppose a thread's call to vm_page_alloc() fails, but
+		 * before that thread calls VM_WAIT, enough pages are freed by
+		 * other threads to alleviate the free page shortage.  The
+		 * thread will, nonetheless, wait until another page is freed
+		 * or this wakeup is performed.
 		 */
-		mtx_lock(&vm_page_queue_free_mtx);
 		if (vm_pages_needed && !vm_page_count_min()) {
-			if (!vm_paging_needed())
-				vm_pages_needed = 0;
-			wakeup(&cnt.v_free_count);
+			vm_pages_needed = false;
+			wakeup(&vm_cnt.v_free_count);
 		}
-		if (vm_pages_needed) {
+
+		/*
+		 * Do not clear vm_pageout_wanted until we reach our free page
+		 * target.  Otherwise, we may be awakened over and over again,
+		 * wasting CPU time.
+		 */
+		if (vm_pageout_wanted && target_met)
+			vm_pageout_wanted = false;
+
+		/*
+		 * Might the page daemon receive a wakeup call?
+		 */
+		if (vm_pageout_wanted) {
 			/*
-			 * We're still not done.  Either vm_pages_needed was
-			 * set by another thread during the previous scan
-			 * (typically, this happens during a level 0 scan) or
-			 * vm_pages_needed was already set and the scan failed
-			 * to free enough pages.  If we haven't yet performed
-			 * a level >= 2 scan (unlimited dirty cleaning), then
-			 * upgrade the level and scan again now.  Otherwise,
-			 * sleep a bit and try again later.  While sleeping,
-			 * vm_pages_needed can be cleared.
+			 * No.  Either vm_pageout_wanted was set by another
+			 * thread during the previous scan, which must have
+			 * been a level 0 scan, or vm_pageout_wanted was
+			 * already set and the scan failed to free enough
+			 * pages.  If we haven't yet performed a level >= 1
+			 * (page reclamation) scan, then increase the level
+			 * and scan again now.  Otherwise, sleep a bit and
+			 * try again later.
 			 */
-			if (domain->vmd_pass > 1)
-				msleep(&vm_pages_needed,
-				    &vm_page_queue_free_mtx, PVM, "psleep",
-				    hz / 2);
+			mtx_unlock(&vm_page_queue_free_mtx);
+			if (pass >= 1)
+				pause("pwait", hz / VM_INACT_SCAN_RATE);
+			pass++;
 		} else {
 			/*
-			 * Good enough, sleep until required to refresh
-			 * stats.
+			 * Yes.  If threads are still sleeping in VM_WAIT
+			 * then we immediately start a new scan.  Otherwise,
+			 * sleep until the next wakeup or until pages need to
+			 * have their reference stats updated.
 			 */
-			msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
-			    PVM, "psleep", hz);
+			if (vm_pages_needed) {
+				mtx_unlock(&vm_page_queue_free_mtx);
+				if (pass == 0)
+					pass++;
+			} else if (mtx_sleep(&vm_pageout_wanted,
+			    &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
+			    hz) == 0) {
+				PCPU_INC(cnt.v_pdwakeups);
+				pass = 1;
+			} else
+				pass = 0;
 		}
-		if (vm_pages_needed) {
-			cnt.v_pdwakeups++;
-			domain->vmd_pass++;
-		} else
-			domain->vmd_pass = 0;
-		mtx_unlock(&vm_page_queue_free_mtx);
-		vm_pageout_scan(domain, domain->vmd_pass);
+
+		target_met = vm_pageout_scan(domain, pass);
 	}
 }
 
@@ -1784,8 +1800,8 @@
 	/*
 	 * Initialize some paging parameters.
 	 */
-	cnt.v_interrupt_free_min = 2;
-	if (cnt.v_page_count < 2000)
+	vm_cnt.v_interrupt_free_min = 2;
+	if (vm_cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	/*
@@ -1793,27 +1809,27 @@
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging. 
 	 */
-	if (cnt.v_page_count > 1024)
-		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
+	if (vm_cnt.v_page_count > 1024)
+		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
 	else
-		cnt.v_free_min = 4;
-	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
-	    cnt.v_interrupt_free_min;
-	cnt.v_free_reserved = vm_pageout_page_count +
-	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
-	cnt.v_free_severe = cnt.v_free_min / 2;
-	cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
-	cnt.v_free_min += cnt.v_free_reserved;
-	cnt.v_free_severe += cnt.v_free_reserved;
-	cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
-	if (cnt.v_inactive_target > cnt.v_free_count / 3)
-		cnt.v_inactive_target = cnt.v_free_count / 3;
+		vm_cnt.v_free_min = 4;
+	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
+	    vm_cnt.v_interrupt_free_min;
+	vm_cnt.v_free_reserved = vm_pageout_page_count +
+	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
+	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
+	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
+	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
+	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
+	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
+	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
+		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
 
 	/*
 	 * Set the default wakeup threshold to be 10% above the minimum
 	 * page limit.  This keeps the steady state out of shortfall.
 	 */
-	vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11;
+	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
 
 	/*
 	 * Set interval in seconds for active scan.  We want to visit each
@@ -1825,7 +1841,15 @@
 
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
-		vm_page_max_wired = cnt.v_free_count / 3;
+		vm_page_max_wired = vm_cnt.v_free_count / 3;
+
+	/*
+	 * Target amount of memory to move out of the laundry queue during a
+	 * background laundering.  This is proportional to the amount of system
+	 * memory.
+	 */
+	vm_background_launder_target = (vm_cnt.v_free_target -
+	    vm_cnt.v_free_min) / 10;
 }
 
 /*
@@ -1835,12 +1859,17 @@
 vm_pageout(void)
 {
 	int error;
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	int i;
 #endif
 
 	swap_pager_swap_init();
-#if MAXMEMDOM > 1
+	snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0");
+	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
+	    0, 0, "laundry: dom0");
+	if (error != 0)
+		panic("starting laundry for domain 0, error %d", error);
+#ifdef VM_NUMA_ALLOC
 	for (i = 1; i < vm_ndomains; i++) {
 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
 		    curproc, NULL, 0, 0, "dom%d", i);
@@ -1858,175 +1887,42 @@
 }
 
 /*
- * Unless the free page queue lock is held by the caller, this function
- * should be regarded as advisory.  Specifically, the caller should
- * not msleep() on &cnt.v_free_count following this function unless
- * the free page queue lock is held until the msleep() is performed.
+ * Perform an advisory wakeup of the page daemon.
  */
 void
 pagedaemon_wakeup(void)
 {
 
-	if (!vm_pages_needed && curthread->td_proc != pageproc) {
-		vm_pages_needed = 1;
-		wakeup(&vm_pages_needed);
-	}
-}
+	mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED);
 
-#if !defined(NO_SWAPPING)
-static void
-vm_req_vmdaemon(int req)
-{
-	static int lastrun = 0;
-
-	mtx_lock(&vm_daemon_mtx);
-	vm_pageout_req_swapout |= req;
-	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
-		wakeup(&vm_daemon_needed);
-		lastrun = ticks;
+	if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
+		vm_pageout_wanted = true;
+		wakeup(&vm_pageout_wanted);
 	}
-	mtx_unlock(&vm_daemon_mtx);
 }
 
-static void
-vm_daemon(void)
+/*
+ * Wake up the page daemon and wait for it to reclaim free pages.
+ *
+ * This function returns with the free queues mutex unlocked.
+ */
+void
+pagedaemon_wait(int pri, const char *wmesg)
 {
-	struct rlimit rsslim;
-	struct proc *p;
-	struct thread *td;
-	struct vmspace *vm;
-	int breakout, swapout_flags, tryagain, attempts;
-#ifdef RACCT
-	uint64_t rsize, ravailable;
-#endif
 
-	while (TRUE) {
-		mtx_lock(&vm_daemon_mtx);
-		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
-#ifdef RACCT
-		    racct_enable ? hz : 0
-#else
-		    0
-#endif
-		);
-		swapout_flags = vm_pageout_req_swapout;
-		vm_pageout_req_swapout = 0;
-		mtx_unlock(&vm_daemon_mtx);
-		if (swapout_flags)
-			swapout_procs(swapout_flags);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
-		/*
-		 * scan the processes for exceeding their rlimits or if
-		 * process is swapped out -- deactivate pages
-		 */
-		tryagain = 0;
-		attempts = 0;
-again:
-		attempts++;
-		sx_slock(&allproc_lock);
-		FOREACH_PROC_IN_SYSTEM(p) {
-			vm_pindex_t limit, size;
-
-			/*
-			 * if this is a system process or if we have already
-			 * looked at this process, skip it.
-			 */
-			PROC_LOCK(p);
-			if (p->p_state != PRS_NORMAL ||
-			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
-				PROC_UNLOCK(p);
-				continue;
-			}
-			/*
-			 * if the process is in a non-running type state,
-			 * don't touch it.
-			 */
-			breakout = 0;
-			FOREACH_THREAD_IN_PROC(p, td) {
-				thread_lock(td);
-				if (!TD_ON_RUNQ(td) &&
-				    !TD_IS_RUNNING(td) &&
-				    !TD_IS_SLEEPING(td) &&
-				    !TD_IS_SUSPENDED(td)) {
-					thread_unlock(td);
-					breakout = 1;
-					break;
-				}
-				thread_unlock(td);
-			}
-			if (breakout) {
-				PROC_UNLOCK(p);
-				continue;
-			}
-			/*
-			 * get a limit
-			 */
-			lim_rlimit(p, RLIMIT_RSS, &rsslim);
-			limit = OFF_TO_IDX(
-			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
-
-			/*
-			 * let processes that are swapped out really be
-			 * swapped out set the limit to nothing (will force a
-			 * swap-out.)
-			 */
-			if ((p->p_flag & P_INMEM) == 0)
-				limit = 0;	/* XXX */
-			vm = vmspace_acquire_ref(p);
-			PROC_UNLOCK(p);
-			if (vm == NULL)
-				continue;
-
-			size = vmspace_resident_count(vm);
-			if (size >= limit) {
-				vm_pageout_map_deactivate_pages(
-				    &vm->vm_map, limit);
-				size = vmspace_resident_count(vm);
-			}
-#ifdef RACCT
-			if (racct_enable) {
-				rsize = IDX_TO_OFF(size);
-				PROC_LOCK(p);
-				if (p->p_state == PRS_NORMAL)
-					racct_set(p, RACCT_RSS, rsize);
-				ravailable = racct_get_available(p, RACCT_RSS);
-				PROC_UNLOCK(p);
-				if (rsize > ravailable) {
-					/*
-					 * Don't be overly aggressive; this
-					 * might be an innocent process,
-					 * and the limit could've been exceeded
-					 * by some memory hog.  Don't try
-					 * to deactivate more than 1/4th
-					 * of process' resident set size.
-					 */
-					if (attempts <= 8) {
-						if (ravailable < rsize -
-						    (rsize / 4)) {
-							ravailable = rsize -
-							    (rsize / 4);
-						}
-					}
-					vm_pageout_map_deactivate_pages(
-					    &vm->vm_map,
-					    OFF_TO_IDX(ravailable));
-					/* Update RSS usage after paging out. */
-					size = vmspace_resident_count(vm);
-					rsize = IDX_TO_OFF(size);
-					PROC_LOCK(p);
-					if (p->p_state == PRS_NORMAL)
-						racct_set(p, RACCT_RSS, rsize);
-					PROC_UNLOCK(p);
-					if (rsize > ravailable)
-						tryagain = 1;
-				}
-			}
-#endif
-			vmspace_free(vm);
-		}
-		sx_sunlock(&allproc_lock);
-		if (tryagain != 0 && attempts <= 10)
-			goto again;
+	/*
+	 * vm_pageout_wanted may have been set by an advisory wakeup, but if the
+	 * page daemon is running on a CPU, the wakeup will have been lost.
+	 * Thus, deliver a potentially spurious wakeup to ensure that the page
+	 * daemon has been notified of the shortage.
+	 */
+	if (!vm_pageout_wanted || !vm_pages_needed) {
+		vm_pageout_wanted = true;
+		wakeup(&vm_pageout_wanted);
 	}
+	vm_pages_needed = true;
+	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
+	    wmesg, 0);
 }
-#endif			/* !defined(NO_SWAPPING) */
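
A minimal stand-alone sketch of the act_count bookkeeping shown in the
active-queue scan above: a referenced page advances toward ACT_MAX, an
unreferenced one decays by ACT_DECLINE, and a page whose count reaches
zero becomes a candidate for the inactive or laundry queue.  The
constant values and the helper name are illustrative placeholders, not
the kernel's definitions.

#define ACT_ADVANCE	3	/* illustrative values only */
#define ACT_DECLINE	1
#define ACT_MAX		64

static int
update_act_count(int act_count, int referenced)
{
	int act_delta;

	act_delta = referenced ? 1 : 0;
	if (act_delta != 0) {
		act_count += ACT_ADVANCE + act_delta;
		if (act_count > ACT_MAX)
			act_count = ACT_MAX;
	} else
		act_count -= (act_count < ACT_DECLINE ?
		    act_count : ACT_DECLINE);
	return (act_count);
}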

Modified: trunk/sys/vm/vm_pageout.h
===================================================================
--- trunk/sys/vm/vm_pageout.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,12 +58,14 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $
+ * $FreeBSD: stable/11/sys/vm/vm_pageout.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _VM_VM_PAGEOUT_H_
 #define _VM_VM_PAGEOUT_H_
 
+#ifdef _KERNEL
+
 /*
  *	Header file for pageout daemon.
  */
@@ -73,17 +75,11 @@
  */
 
 extern int vm_page_max_wired;
-extern int vm_pages_needed;	/* should be some "event" structure */
-extern int vm_pageout_pages_needed;
 extern int vm_pageout_deficit;
 extern int vm_pageout_page_count;
+extern bool vm_pageout_wanted;
+extern bool vm_pages_needed;
 
-/*
- * Swap out requests
- */
-#define VM_SWAP_NORMAL 1
-#define VM_SWAP_IDLE 2
-
 #define	VM_OOM_MEM	1
 #define	VM_OOM_SWAPZ	2
 
@@ -101,15 +97,17 @@
  *	Signal pageout-daemon and wait for it.
  */
 
-extern void pagedaemon_wakeup(void);
+void pagedaemon_wait(int pri, const char *wmesg);
+void pagedaemon_wakeup(void);
 #define VM_WAIT vm_wait()
 #define VM_WAITPFAULT vm_waitpfault()
-extern void vm_wait(void);
-extern void vm_waitpfault(void);
+void vm_wait(void);
+void vm_waitpfault(void);
 
-#ifdef _KERNEL
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
-void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
 void vm_pageout_oom(int shortage);
-#endif
+
+void vm_swapout_run(void);
+void vm_swapout_run_idle(void);
+#endif /* _KERNEL */
 #endif	/* _VM_VM_PAGEOUT_H_ */

Modified: trunk/sys/vm/vm_pager.c
===================================================================
--- trunk/sys/vm/vm_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -65,7 +65,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -87,7 +87,9 @@
 
 int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */
 
-static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int);
+struct buf *swbuf;
+
+static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
@@ -95,13 +97,11 @@
 static void dead_pager_dealloc(vm_object_t);
 
 static int
-dead_pager_getpages(obj, ma, count, req)
-	vm_object_t obj;
-	vm_page_t *ma;
-	int count;
-	int req;
+dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
-	return VM_PAGER_FAIL;
+
+	return (VM_PAGER_FAIL);
 }
 
 static vm_object_t
@@ -158,8 +158,6 @@
 	&mgtdevicepagerops,	/* OBJT_MGTDEVICE */
 };
 
-static const int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
-
 /*
  * Kernel address space for mapping pages.
  * Used by pagers where KVAs are needed for IO.
@@ -168,7 +166,7 @@
  * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
  * (MAXPHYS == 64k) if you want to get the most efficiency.
  */
-struct mtx_padalign pbuf_mtx;
+struct mtx_padalign __exclusive_cache_line pbuf_mtx;
 static TAILQ_HEAD(swqueue, buf) bswlist;
 static int bswneeded;
 vm_offset_t swapbkva;		/* swap buffers kva */
@@ -182,7 +180,7 @@
 	/*
 	 * Initialize known pagers
 	 */
-	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
+	for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++)
 		if ((*pgops)->pgo_init != NULL)
 			(*(*pgops)->pgo_init)();
 }
@@ -208,6 +206,7 @@
 
 	cluster_pbuf_freecnt = nswbuf / 2;
 	vnode_pbuf_freecnt = nswbuf / 2 + 1;
+	vnode_async_pbuf_freecnt = nswbuf / 2;
 }
 
 /*
@@ -241,8 +240,80 @@
 	(*pagertab[object->type]->pgo_dealloc) (object);
 }
 
+static void
+vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count)
+{
+#ifdef INVARIANTS
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(count > 0, ("%s: 0 count", __func__));
+	/*
+	 * All pages must be busied, not mapped, not fully valid,
+	 * not dirty and belong to the proper object.
+	 */
+	for (int i = 0 ; i < count; i++) {
+		vm_page_assert_xbusied(m[i]);
+		KASSERT(!pmap_page_is_mapped(m[i]),
+		    ("%s: page %p is mapped", __func__, m[i]));
+		KASSERT(m[i]->valid != VM_PAGE_BITS_ALL,
+		    ("%s: request for a valid page %p", __func__, m[i]));
+		KASSERT(m[i]->dirty == 0,
+		    ("%s: page %p is dirty", __func__, m[i]));
+		KASSERT(m[i]->object == object,
+		    ("%s: wrong object %p/%p", __func__, object, m[i]->object));
+	}
+#endif
+}
+
 /*
- * vm_pager_get_pages() - inline, see vm/vm_pager.h
+ * Page in the pages for the object using its associated pager.
+ * The requested page must be fully valid on successful return.
+ */
+int
+vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
+{
+#ifdef INVARIANTS
+	vm_pindex_t pindex = m[0]->pindex;
+#endif
+	int r;
+
+	vm_pager_assert_in(object, m, count);
+
+	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind,
+	    rahead);
+	if (r != VM_PAGER_OK)
+		return (r);
+
+	for (int i = 0; i < count; i++) {
+		/*
+		 * If pager has replaced a page, assert that it had
+		 * updated the array.
+		 */
+		KASSERT(m[i] == vm_page_lookup(object, pindex++),
+		    ("%s: mismatch page %p pindex %ju", __func__,
+		    m[i], (uintmax_t )pindex - 1));
+		/*
+		 * Zero out partially filled data.
+		 */
+		if (m[i]->valid != VM_PAGE_BITS_ALL)
+			vm_page_zero_invalid(m[i], TRUE);
+	}
+	return (VM_PAGER_OK);
+}
+
+int
+vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+
+	vm_pager_assert_in(object, m, count);
+
+	return ((*pagertab[object->type]->pgo_getpages_async)(object, m,
+	    count, rbehind, rahead, iodone, arg));
+}
+
+/*
  * vm_pager_put_pages() - inline, see vm/vm_pager.h
  * vm_pager_has_page() - inline, see vm/vm_pager.h
  */
@@ -289,12 +360,11 @@
 	bp->b_rcred = NOCRED;
 	bp->b_wcred = NOCRED;
 	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
-	bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
-	bp->b_data = bp->b_saveaddr;
-	bp->b_kvabase = bp->b_saveaddr;
+	bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
+	bp->b_data = bp->b_kvabase;
 	bp->b_kvasize = MAXPHYS;
+	bp->b_flags = 0;
 	bp->b_xflags = 0;
-	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;
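
The hand-maintained "npagers" count above is replaced by iterating to
nitems(pagertab).  A small stand-alone sketch of that idiom follows;
the NITEMS macro, struct ops, and table names are local stand-ins, not
the kernel's symbols.

#include <stddef.h>

#define NITEMS(x)	(sizeof(x) / sizeof((x)[0]))

struct ops {
	void (*init)(void);		/* optional init hook */
};

static void noop_init(void) { }

static const struct ops table[] = {
	{ noop_init },
	{ NULL },			/* entries without a hook are skipped */
};

static void
init_all(void)
{
	/* As in vm_pager_init(): call every non-NULL init hook. */
	for (size_t i = 0; i < NITEMS(table); i++)
		if (table[i].init != NULL)
			table[i].init();
}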

Modified: trunk/sys/vm/vm_pager.h
===================================================================
--- trunk/sys/vm/vm_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_pager.h	8.4 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -51,19 +51,26 @@
 typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t,
     struct ucred *);
 typedef void pgo_dealloc_t(vm_object_t);
-typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int);
+typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *);
+typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int);
+typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *,
+    pgo_getpages_iodone_t, void *);
 typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *);
 typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *);
+typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t,
+    vm_pindex_t *, vm_pindex_t *);
 typedef void pgo_pageunswapped_t(vm_page_t);
 
 struct pagerops {
-	pgo_init_t	*pgo_init;		/* Initialize pager. */
-	pgo_alloc_t	*pgo_alloc;		/* Allocate pager. */
-	pgo_dealloc_t	*pgo_dealloc;		/* Disassociate. */
-	pgo_getpages_t	*pgo_getpages;		/* Get (read) page. */
-	pgo_putpages_t	*pgo_putpages;		/* Put (write) page. */
-	pgo_haspage_t	*pgo_haspage;		/* Does pager have page? */
-	pgo_pageunswapped_t *pgo_pageunswapped;
+	pgo_init_t		*pgo_init;		/* Initialize pager. */
+	pgo_alloc_t		*pgo_alloc;		/* Allocate pager. */
+	pgo_dealloc_t		*pgo_dealloc;		/* Disassociate. */
+	pgo_getpages_t		*pgo_getpages;		/* Get (read) page. */
+	pgo_getpages_async_t	*pgo_getpages_async;	/* Get page asynchronously. */
+	pgo_putpages_t		*pgo_putpages;		/* Put (write) page. */
+	pgo_haspage_t		*pgo_haspage;		/* Query page. */
+	pgo_populate_t		*pgo_populate;		/* Bulk spec pagein. */
+	pgo_pageunswapped_t	*pgo_pageunswapped;
 };
 
 extern struct pagerops defaultpagerops;
@@ -92,6 +99,7 @@
 
 #define	VM_PAGER_PUT_SYNC		0x0001
 #define	VM_PAGER_PUT_INVAL		0x0002
+#define	VM_PAGER_PUT_NOREUSE		0x0004
 #define VM_PAGER_CLUSTER_OK		0x0008
 
 #ifdef _KERNEL
@@ -103,34 +111,12 @@
     vm_ooffset_t, struct ucred *);
 void vm_pager_bufferinit(void);
 void vm_pager_deallocate(vm_object_t);
-static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
+int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *);
+int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *,
+    pgo_getpages_iodone_t, void *);
 void vm_pager_init(void);
 vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
 
-/*
- *	vm_page_get_pages:
- *
- *	Retrieve pages from the VM system in order to map them into an object
- *	( or into VM space somewhere ).  If the pagein was successful, we
- *	must fully validate it.
- */
-static __inline int
-vm_pager_get_pages(
-	vm_object_t object,
-	vm_page_t *m,
-	int count,
-	int reqpage
-) {
-	int r;
-
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
-	if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
-		vm_page_zero_invalid(m[reqpage], TRUE);
-	}
-	return (r);
-}
-
 static __inline void
 vm_pager_put_pages(
 	vm_object_t object,
@@ -170,6 +156,19 @@
 	return (ret);
 } 
 
+static __inline int
+vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+	MPASS((object->flags & OBJ_POPULATE) != 0);
+	MPASS(pidx < object->size);
+	MPASS(object->paging_in_progress > 0);
+	return ((*pagertab[object->type]->pgo_populate)(object, pidx,
+	    fault_type, max_prot, first, last));
+}
+
+
 /* 
  *      vm_pager_page_unswapped
  * 
@@ -195,6 +194,9 @@
 struct cdev_pager_ops {
 	int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset,
 	    int prot, vm_page_t *mres);
+	int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+	    int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+	    vm_pindex_t *last);
 	int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot,
 	    vm_ooffset_t foff, struct ucred *cred, u_short *color);
 	void (*cdev_pg_dtor)(void *handle);
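
The reworked getpages interface above replaces the old "reqpage" index
with optional read-behind/read-ahead counts passed by pointer, which a
pager may trim.  A minimal stand-alone sketch of that calling
convention; the names here (fake_pager_ops, fake_getpages, get_pages)
are illustrations, not the kernel's symbols.

#include <stddef.h>

struct fake_pager_ops {
	int (*getpages)(void *obj, int count, int *rbehind, int *rahead);
};

static int
fake_getpages(void *obj, int count, int *rbehind, int *rahead)
{
	(void)obj;
	/* A pager may clamp the caller's optional clustering hints. */
	if (rbehind != NULL && *rbehind > 4)
		*rbehind = 4;
	if (rahead != NULL && *rahead > 4)
		*rahead = 4;
	return (count > 0 ? 0 : -1);
}

static const struct fake_pager_ops fake_ops = { fake_getpages };

static int
get_pages(void *obj, int count, int *rbehind, int *rahead)
{
	/* Dispatch through the ops table, as vm_pager_get_pages() does. */
	return (fake_ops.getpages(obj, count, rbehind, rahead));
}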

Modified: trunk/sys/vm/vm_param.h
===================================================================
--- trunk/sys/vm/vm_param.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_param.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $
+ * $FreeBSD: stable/11/sys/vm/vm_param.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -76,16 +76,17 @@
 #define	VM_TOTAL		1	/* struct vmtotal */
 #define	VM_METER                VM_TOTAL/* deprecated, use VM_TOTAL */
 #define	VM_LOADAVG	 	2	/* struct loadavg */
-#define VM_V_FREE_MIN		3	/* cnt.v_free_min */
-#define VM_V_FREE_TARGET	4	/* cnt.v_free_target */
-#define VM_V_FREE_RESERVED	5	/* cnt.v_free_reserved */
-#define VM_V_INACTIVE_TARGET	6	/* cnt.v_inactive_target */
-#define	VM_V_CACHE_MIN		7	/* cnt.v_cache_min */
-#define	VM_V_CACHE_MAX		8	/* cnt.v_cache_max */
-#define VM_V_PAGEOUT_FREE_MIN	9	/* cnt.v_pageout_free_min */
+#define VM_V_FREE_MIN		3	/* vm_cnt.v_free_min */
+#define VM_V_FREE_TARGET	4	/* vm_cnt.v_free_target */
+#define VM_V_FREE_RESERVED	5	/* vm_cnt.v_free_reserved */
+#define VM_V_INACTIVE_TARGET	6	/* vm_cnt.v_inactive_target */
+#define	VM_OBSOLETE_7		7	/* unused, formerly v_cache_min */
+#define	VM_OBSOLETE_8		8	/* unused, formerly v_cache_max */
+#define VM_V_PAGEOUT_FREE_MIN	9	/* vm_cnt.v_pageout_free_min */
 #define	VM_OBSOLETE_10		10	/* pageout algorithm */
 #define VM_SWAPPING_ENABLED	11	/* swapping enabled */
-#define	VM_MAXID		12	/* number of valid vm ids */
+#define VM_OVERCOMMIT		12	/* vm.overcommit */
+#define	VM_MAXID		13	/* number of valid vm ids */
 
 /*
  * Structure for swap device statistics
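
The renumbered MIB identifiers above are consumed through sysctl(3).
A small illustration, assuming vm.overcommit is exported under the
VM_OVERCOMMIT identifier as the comment suggests and that the patched
header is installed as <vm/vm_param.h>:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <vm/vm_param.h>	/* VM_OVERCOMMIT, from the header above */
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_VM, VM_OVERCOMMIT };
	int value;
	size_t len = sizeof(value);

	if (sysctl(mib, 2, &value, &len, NULL, 0) == -1) {
		perror("sysctl");
		return (1);
	}
	printf("vm.overcommit = %d\n", value);
	return (0);
}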

Modified: trunk/sys/vm/vm_phys.c
===================================================================
--- trunk/sys/vm/vm_phys.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_phys.c 331614 2018-03-27 13:09:35Z kib $");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
@@ -49,13 +49,14 @@
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
-#if MAXMEMDOM > 1
 #include <sys/proc.h>
-#endif
 #include <sys/queue.h>
+#include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
+#include <sys/tree.h>
 #include <sys/vmmeter.h>
+#include <sys/seq.h>
 
 #include <ddb/ddb.h>
 
@@ -66,10 +67,15 @@
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
+#include <vm/vm_domain.h>
+
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");
 
+#ifdef VM_NUMA_ALLOC
 struct mem_affinity *mem_affinity;
+int *mem_locality;
+#endif
 
 int vm_ndomains = 1;
 
@@ -76,13 +82,25 @@
 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 int vm_phys_nsegs;
 
-#define VM_PHYS_FICTITIOUS_NSEGS	8
-static struct vm_phys_fictitious_seg {
+struct vm_phys_fictitious_seg;
+static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
+    struct vm_phys_fictitious_seg *);
+
+RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
+    RB_INITIALIZER(_vm_phys_fictitious_tree);
+
+struct vm_phys_fictitious_seg {
+	RB_ENTRY(vm_phys_fictitious_seg) node;
+	/* Memory region data */
 	vm_paddr_t	start;
 	vm_paddr_t	end;
 	vm_page_t	first_page;
-} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
-static struct mtx vm_phys_fictitious_reg_mtx;
+};
+
+RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
+    vm_phys_fictitious_cmp);
+
+static struct rwlock vm_phys_fictitious_reg_lock;
 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 
 static struct vm_freelist
@@ -127,21 +145,139 @@
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
+#ifdef VM_NUMA_ALLOC
+static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
+#endif
+
 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
     &vm_ndomains, 0, "Number of physical memory domains available.");
 
+/*
+ * Default to first-touch + round-robin.
+ */
+static struct mtx vm_default_policy_mtx;
+MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
+    MTX_DEF);
+#ifdef VM_NUMA_ALLOC
+static struct vm_domain_policy vm_default_policy =
+    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+#else
+/* Use round-robin so the domain policy code will only try once per allocation */
+static struct vm_domain_policy vm_default_policy =
+    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
+#endif
+
 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
     int order);
+static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
+    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+    vm_paddr_t boundary);
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
-static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order);
 
+static int
+sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
+{
+	char policy_name[32];
+	int error;
+
+	mtx_lock(&vm_default_policy_mtx);
+
+	/* Map policy to output string */
+	switch (vm_default_policy.p.policy) {
+	case VM_POLICY_FIRST_TOUCH:
+		strcpy(policy_name, "first-touch");
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		strcpy(policy_name, "first-touch-rr");
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		strcpy(policy_name, "rr");
+		break;
+	}
+	mtx_unlock(&vm_default_policy_mtx);
+
+	error = sysctl_handle_string(oidp, &policy_name[0],
+	    sizeof(policy_name), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	mtx_lock(&vm_default_policy_mtx);
+	/* Set: match on the subset of policies that make sense as a default */
+	if (strcmp("first-touch-rr", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+	} else if (strcmp("first-touch", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_FIRST_TOUCH, 0);
+	} else if (strcmp("rr", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_ROUND_ROBIN, 0);
+	} else {
+		error = EINVAL;
+		goto finish;
+	}
+
+	error = 0;
+finish:
+	mtx_unlock(&vm_default_policy_mtx);
+	return (error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
+    0, 0, sysctl_vm_default_policy, "A",
+    "Default policy (rr, first-touch, first-touch-rr)");
+
+/*
+ * Red-black tree helpers for vm fictitious range management.
+ */
+static inline int
+vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
+    struct vm_phys_fictitious_seg *range)
+{
+
+	KASSERT(range->start != 0 && range->end != 0,
+	    ("Invalid range passed on search for vm_fictitious page"));
+	if (p->start >= range->end)
+		return (1);
+	if (p->start < range->start)
+		return (-1);
+
+	return (0);
+}
+
+static int
+vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
+    struct vm_phys_fictitious_seg *p2)
+{
+
+	/* Check if this is a search for a page */
+	if (p1->end == 0)
+		return (vm_phys_fictitious_in_range(p1, p2));
+
+	KASSERT(p2->end != 0,
+    ("Invalid range passed as second parameter to vm fictitious comparison"));
+
+	/* Searching to add a new range */
+	if (p1->end <= p2->start)
+		return (-1);
+	if (p1->start >= p2->end)
+		return (1);
+
+	panic("Trying to add overlapping vm fictitious ranges:\n"
+	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
+	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
+}
+
 static __inline int
 vm_rr_selectdomain(void)
 {
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	struct thread *td;
 
 	td = curthread;
@@ -154,6 +290,53 @@
 #endif
 }
 
+/*
+ * Initialise a VM domain iterator.
+ *
+ * Check the thread policy, then the proc policy,
+ * then default to the system policy.
+ *
+ * Later on the various layers will have this logic
+ * plumbed into them and the phys code will be explicitly
+ * handed a VM domain policy to use.
+ */
+static void
+vm_policy_iterator_init(struct vm_domain_iterator *vi)
+{
+#ifdef VM_NUMA_ALLOC
+	struct vm_domain_policy lcl;
+#endif
+
+	vm_domain_iterator_init(vi);
+
+#ifdef VM_NUMA_ALLOC
+	/* Copy out the thread policy */
+	vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
+	if (lcl.p.policy != VM_POLICY_NONE) {
+		/* Thread policy is present; use it */
+		vm_domain_iterator_set_policy(vi, &lcl);
+		return;
+	}
+
+	vm_domain_policy_localcopy(&lcl,
+	    &curthread->td_proc->p_vm_dom_policy);
+	if (lcl.p.policy != VM_POLICY_NONE) {
+		/* Process policy is present; use it */
+		vm_domain_iterator_set_policy(vi, &lcl);
+		return;
+	}
+#endif
+	/* Use system default policy */
+	vm_domain_iterator_set_policy(vi, &vm_default_policy);
+}
+
+static void
+vm_policy_iterator_finish(struct vm_domain_iterator *vi)
+{
+
+	vm_domain_iterator_cleanup(vi);
+}
+
 boolean_t
 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
 {
@@ -243,6 +426,54 @@
 	return (error);
 }
 
+/*
+ * Return affinity, or -1 if there's no affinity information.
+ */
+int
+vm_phys_mem_affinity(int f, int t)
+{
+
+#ifdef VM_NUMA_ALLOC
+	if (mem_locality == NULL)
+		return (-1);
+	if (f >= vm_ndomains || t >= vm_ndomains)
+		return (-1);
+	return (mem_locality[f * vm_ndomains + t]);
+#else
+	return (-1);
+#endif
+}
+
+#ifdef VM_NUMA_ALLOC
+/*
+ * Outputs the VM locality table.
+ */
+static int
+sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	int error, i, j;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+
+	sbuf_printf(&sbuf, "\n");
+
+	for (i = 0; i < vm_ndomains; i++) {
+		sbuf_printf(&sbuf, "%d: ", i);
+		for (j = 0; j < vm_ndomains; j++) {
+			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
+		}
+		sbuf_printf(&sbuf, "\n");
+	}
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+#endif
+
 static void
 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 {
@@ -289,6 +520,7 @@
 static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 {
+#ifdef VM_NUMA_ALLOC
 	int i;
 
 	if (mem_affinity == NULL) {
@@ -313,6 +545,9 @@
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
+#else
+	_vm_phys_create_seg(start, end, 0);
+#endif
 }
 
 /*
@@ -473,7 +708,8 @@
 			}
 		}
 	}
-	mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
+
+	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 }
 
 /*
@@ -495,36 +731,6 @@
 }
 
 /*
- * Initialize a physical page and add it to the free lists.
- */
-void
-vm_phys_add_page(vm_paddr_t pa)
-{
-	vm_page_t m;
-	struct vm_domain *vmd;
-
-	cnt.v_page_count++;
-	m = vm_phys_paddr_to_vm_page(pa);
-	m->busy_lock = VPB_UNBUSIED;
-	m->phys_addr = pa;
-	m->queue = PQ_NONE;
-	m->segind = vm_phys_paddr_to_segind(pa);
-	vmd = vm_phys_domain(m);
-	vmd->vmd_page_count++;
-	vmd->vmd_segs |= 1UL << m->segind;
-	m->flags = PG_FREE;
-	KASSERT(m->order == VM_NFREEORDER,
-	    ("vm_phys_add_page: page %p has unexpected order %d",
-	    m, m->order));
-	m->pool = VM_FREEPOOL_DEFAULT;
-	pmap_page_init(m);
-	mtx_lock(&vm_page_queue_free_mtx);
-	vm_phys_freecnt_adj(m, 1);
-	vm_phys_free_pages(m, 0);
-	mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
  * Allocate a contiguous, power of two-sized set of physical pages
  * from the free lists.
  *
@@ -534,7 +740,8 @@
 vm_phys_alloc_pages(int pool, int order)
 {
 	vm_page_t m;
-	int dom, domain, flind;
+	int domain, flind;
+	struct vm_domain_iterator vi;
 
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
@@ -541,8 +748,9 @@
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_pages: order %d is out of range", order));
 
-	for (dom = 0; dom < vm_ndomains; dom++) {
-		domain = vm_rr_selectdomain();
+	vm_policy_iterator_init(&vi);
+
+	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			m = vm_phys_alloc_domain_pages(domain, flind, pool,
 			    order);
@@ -550,6 +758,8 @@
 				return (m);
 		}
 	}
+
+	vm_policy_iterator_finish(&vi);
 	return (NULL);
 }
 
@@ -564,7 +774,8 @@
 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
 {
 	vm_page_t m;
-	int dom, domain;
+	struct vm_domain_iterator vi;
+	int domain;
 
 	KASSERT(freelist < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
@@ -573,13 +784,17 @@
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
-	for (dom = 0; dom < vm_ndomains; dom++) {
-		domain = vm_rr_selectdomain();
+
+	vm_policy_iterator_init(&vi);
+
+	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 		m = vm_phys_alloc_domain_pages(domain,
 		    vm_freelist_to_flind[freelist], pool, order);
 		if (m != NULL)
 			return (m);
 	}
+
+	vm_policy_iterator_finish(&vi);
 	return (NULL);
 }
 
@@ -643,23 +858,39 @@
 vm_page_t
 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
 {
-	struct vm_phys_fictitious_seg *seg;
+	struct vm_phys_fictitious_seg tmp, *seg;
 	vm_page_t m;
-	int segind;
 
 	m = NULL;
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (pa >= seg->start && pa < seg->end) {
-			m = &seg->first_page[atop(pa - seg->start)];
-			KASSERT((m->flags & PG_FICTITIOUS) != 0,
-			    ("%p not fictitious", m));
-			break;
-		}
-	}
+	tmp.start = pa;
+	tmp.end = 0;
+
+	rw_rlock(&vm_phys_fictitious_reg_lock);
+	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+	rw_runlock(&vm_phys_fictitious_reg_lock);
+	if (seg == NULL)
+		return (NULL);
+
+	m = &seg->first_page[atop(pa - seg->start)];
+	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
+
 	return (m);
 }
 
+static inline void
+vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
+    long page_count, vm_memattr_t memattr)
+{
+	long i;
+
+	bzero(range, page_count * sizeof(*range));
+	for (i = 0; i < page_count; i++) {
+		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
+		range[i].oflags &= ~VPO_UNMANAGED;
+		range[i].busy_lock = VPB_UNBUSIED;
+	}
+}
+
 int
 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr)
@@ -666,104 +897,145 @@
 {
 	struct vm_phys_fictitious_seg *seg;
 	vm_page_t fp;
-	long i, page_count;
-	int segind;
+	long page_count;
 #ifdef VM_PHYSSEG_DENSE
-	long pi;
-	boolean_t malloced;
+	long pi, pe;
+	long dpage_count;
 #endif
 
+	KASSERT(start < end,
+	    ("Start of segment isn't less than end (start: %jx end: %jx)",
+	    (uintmax_t)start, (uintmax_t)end));
+
 	page_count = (end - start) / PAGE_SIZE;
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
-	if (pi >= first_page && pi < vm_page_array_size + first_page) {
-		if (atop(end) >= vm_page_array_size + first_page)
-			return (EINVAL);
+	pe = atop(end);
+	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		fp = &vm_page_array[pi - first_page];
-		malloced = FALSE;
-	} else
+		if ((pe - first_page) > vm_page_array_size) {
+			/*
+			 * We have a segment that starts inside
+			 * of vm_page_array, but ends outside of it.
+			 *
+			 * Use vm_page_array pages for those that are
+			 * inside of the vm_page_array range, and
+			 * allocate the remaining ones.
+			 */
+			dpage_count = vm_page_array_size - (pi - first_page);
+			vm_phys_fictitious_init_range(fp, start, dpage_count,
+			    memattr);
+			page_count -= dpage_count;
+			start += ptoa(dpage_count);
+			goto alloc;
+		}
+		/*
+		 * We can allocate the full range from vm_page_array,
+		 * so there's no need to register the range in the tree.
+		 */
+		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+		return (0);
+	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+		/*
+		 * We have a segment that ends inside of vm_page_array,
+		 * but starts outside of it.
+		 */
+		fp = &vm_page_array[0];
+		dpage_count = pe - first_page;
+		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
+		    memattr);
+		end -= ptoa(dpage_count);
+		page_count -= dpage_count;
+		goto alloc;
+	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+		/*
+		 * Trying to register a fictitious range that expands before
+		 * and after vm_page_array.
+		 */
+		return (EINVAL);
+	} else {
+alloc:
 #endif
-	{
 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
-		    M_WAITOK | M_ZERO);
+		    M_WAITOK);
 #ifdef VM_PHYSSEG_DENSE
-		malloced = TRUE;
-#endif
 	}
-	for (i = 0; i < page_count; i++) {
-		vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
-		fp[i].oflags &= ~VPO_UNMANAGED;
-		fp[i].busy_lock = VPB_UNBUSIED;
-	}
-	mtx_lock(&vm_phys_fictitious_reg_mtx);
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (seg->start == 0 && seg->end == 0) {
-			seg->start = start;
-			seg->end = end;
-			seg->first_page = fp;
-			mtx_unlock(&vm_phys_fictitious_reg_mtx);
-			return (0);
-		}
-	}
-	mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
-	if (malloced)
 #endif
-		free(fp, M_FICT_PAGES);
-	return (EBUSY);
+	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+
+	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
+	seg->start = start;
+	seg->end = end;
+	seg->first_page = fp;
+
+	rw_wlock(&vm_phys_fictitious_reg_lock);
+	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
+	rw_wunlock(&vm_phys_fictitious_reg_lock);
+
+	return (0);
 }
 
 void
 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
 {
-	struct vm_phys_fictitious_seg *seg;
-	vm_page_t fp;
-	int segind;
+	struct vm_phys_fictitious_seg *seg, tmp;
 #ifdef VM_PHYSSEG_DENSE
-	long pi;
+	long pi, pe;
 #endif
 
+	KASSERT(start < end,
+	    ("Start of segment isn't less than end (start: %jx end: %jx)",
+	    (uintmax_t)start, (uintmax_t)end));
+
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
-#endif
-
-	mtx_lock(&vm_phys_fictitious_reg_mtx);
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (seg->start == start && seg->end == end) {
-			seg->start = seg->end = 0;
-			fp = seg->first_page;
-			seg->first_page = NULL;
-			mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
-			if (pi < first_page || atop(end) >= vm_page_array_size)
-#endif
-				free(fp, M_FICT_PAGES);
+	pe = atop(end);
+	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
+		if ((pe - first_page) <= vm_page_array_size) {
+			/*
+			 * This segment was allocated using vm_page_array
+			 * only, there's nothing to do since those pages
+			 * were never added to the tree.
+			 */
 			return;
 		}
+		/*
+		 * We have a segment that starts inside
+		 * of vm_page_array, but ends outside of it.
+		 *
+		 * Calculate how many pages were added to the
+		 * tree and free them.
+		 */
+		start = ptoa(first_page + vm_page_array_size);
+	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+		/*
+		 * We have a segment that ends inside of vm_page_array,
+		 * but starts outside of it.
+		 */
+		end = ptoa(first_page);
+	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+		/* Since it's not possible to register such a range, panic. */
+		panic(
+		    "Unregistering not registered fictitious range [%#jx:%#jx]",
+		    (uintmax_t)start, (uintmax_t)end);
 	}
-	mtx_unlock(&vm_phys_fictitious_reg_mtx);
-	KASSERT(0, ("Unregistering not registered fictitious range"));
-}
+#endif
+	tmp.start = start;
+	tmp.end = 0;
 
-/*
- * Find the segment containing the given physical address.
- */
-static int
-vm_phys_paddr_to_segind(vm_paddr_t pa)
-{
-	struct vm_phys_seg *seg;
-	int segind;
-
-	for (segind = 0; segind < vm_phys_nsegs; segind++) {
-		seg = &vm_phys_segs[segind];
-		if (pa >= seg->start && pa < seg->end)
-			return (segind);
+	rw_wlock(&vm_phys_fictitious_reg_lock);
+	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+	if (seg->start != start || seg->end != end) {
+		rw_wunlock(&vm_phys_fictitious_reg_lock);
+		panic(
+		    "Unregistering not registered fictitious range [%#jx:%#jx]",
+		    (uintmax_t)start, (uintmax_t)end);
 	}
-	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
-	    (uintmax_t)pa);
+	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
+	rw_wunlock(&vm_phys_fictitious_reg_lock);
+	free(seg->first_page, M_FICT_PAGES);
+	free(seg, M_FICT_PAGES);
 }
 
 /*
@@ -853,6 +1125,56 @@
 }
 
 /*
+ * Scan physical memory between the specified addresses "low" and "high" for a
+ * run of contiguous physical pages that satisfy the specified conditions, and
+ * return the lowest page in the run.  The specified "alignment" determines
+ * the alignment of the lowest physical page in the run.  If the specified
+ * "boundary" is non-zero, then the run of physical pages cannot span a
+ * physical address that is a multiple of "boundary".
+ *
+ * "npages" must be greater than zero.  Both "alignment" and "boundary" must
+ * be a power of two.
+ */
+vm_page_t
+vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int options)
+{
+	vm_paddr_t pa_end;
+	vm_page_t m_end, m_run, m_start;
+	struct vm_phys_seg *seg;
+	int segind;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	if (low >= high)
+		return (NULL);
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		if (seg->start >= high)
+			break;
+		if (low >= seg->end)
+			continue;
+		if (low <= seg->start)
+			m_start = seg->first_page;
+		else
+			m_start = &seg->first_page[atop(low - seg->start)];
+		if (high < seg->end)
+			pa_end = high;
+		else
+			pa_end = seg->end;
+		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
+			continue;
+		m_end = &seg->first_page[atop(pa_end - seg->start)];
+		m_run = vm_page_scan_contig(npages, m_start, m_end,
+		    alignment, boundary, options);
+		if (m_run != NULL)
+			return (m_run);
+	}
+	return (NULL);
+}
+
+/*
  * Set the pool for a contiguous, power of two-sized set of physical pages. 
  */
 void
@@ -946,7 +1268,7 @@
 	for (;;) {
 		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
 			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
-				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
+				if ((m_tmp->flags & PG_ZERO) == 0) {
 					vm_phys_unfree_page(m_tmp);
 					vm_phys_freecnt_adj(m, -1);
 					mtx_unlock(&vm_page_queue_free_mtx);
@@ -990,85 +1312,125 @@
 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
+	vm_paddr_t pa_end, pa_start;
+	vm_page_t m_run;
+	struct vm_domain_iterator vi;
+	struct vm_phys_seg *seg;
+	int domain, segind;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	if (low >= high)
+		return (NULL);
+	vm_policy_iterator_init(&vi);
+restartdom:
+	if (vm_domain_iterator_run(&vi, &domain) != 0) {
+		vm_policy_iterator_finish(&vi);
+		return (NULL);
+	}
+	m_run = NULL;
+	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
+		seg = &vm_phys_segs[segind];
+		if (seg->start >= high || seg->domain != domain)
+			continue;
+		if (low >= seg->end)
+			break;
+		if (low <= seg->start)
+			pa_start = seg->start;
+		else
+			pa_start = low;
+		if (high < seg->end)
+			pa_end = high;
+		else
+			pa_end = seg->end;
+		if (pa_end - pa_start < ptoa(npages))
+			continue;
+		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
+		    alignment, boundary);
+		if (m_run != NULL)
+			break;
+	}
+	if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
+		goto restartdom;
+	vm_policy_iterator_finish(&vi);
+	return (m_run);
+}
+
+/*
+ * Allocate a run of contiguous physical pages from the free list for the
+ * specified segment.
+ */
+static vm_page_t
+vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+{
 	struct vm_freelist *fl;
-	struct vm_phys_seg *seg;
-	vm_paddr_t pa, pa_last, size;
+	vm_paddr_t pa, pa_end, size;
 	vm_page_t m, m_ret;
 	u_long npages_end;
-	int dom, domain, flind, oind, order, pind;
+	int oind, order, pind;
 
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	size = npages << PAGE_SHIFT;
-	KASSERT(size != 0,
-	    ("vm_phys_alloc_contig: size must not be 0"));
-	KASSERT((alignment & (alignment - 1)) == 0,
-	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
-	KASSERT((boundary & (boundary - 1)) == 0,
-	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
 	/* Compute the queue that is the best fit for npages. */
 	for (order = 0; (1 << order) < npages; order++);
-	dom = 0;
-restartdom:
-	domain = vm_rr_selectdomain();
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
-			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-				fl = &vm_phys_free_queues[domain][flind][pind][0];
-				TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+	/* Search for a run satisfying the specified conditions. */
+	size = npages << PAGE_SHIFT;
+	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
+	    oind++) {
+		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+			fl = (*seg->free_queues)[pind];
+			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+				/*
+				 * Is the size of this allocation request
+				 * larger than the largest block size?
+				 */
+				if (order >= VM_NFREEORDER) {
 					/*
-					 * A free list may contain physical pages
-					 * from one or more segments.
+					 * Determine if a sufficient number of
+					 * subsequent blocks to satisfy the
+					 * allocation request are free.
 					 */
-					seg = &vm_phys_segs[m_ret->segind];
-					if (seg->start > high ||
-					    low >= seg->end)
+					pa = VM_PAGE_TO_PHYS(m_ret);
+					pa_end = pa + size;
+					if (pa_end < pa)
 						continue;
-
-					/*
-					 * Is the size of this allocation request
-					 * larger than the largest block size?
-					 */
-					if (order >= VM_NFREEORDER) {
-						/*
-						 * Determine if a sufficient number
-						 * of subsequent blocks to satisfy
-						 * the allocation request are free.
-						 */
-						pa = VM_PAGE_TO_PHYS(m_ret);
-						pa_last = pa + size;
-						for (;;) {
-							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
-							if (pa >= pa_last)
-								break;
-							if (pa < seg->start ||
-							    pa >= seg->end)
-								break;
-							m = &seg->first_page[atop(pa - seg->start)];
-							if (m->order != VM_NFREEORDER - 1)
-								break;
-						}
-						/* If not, continue to the next block. */
-						if (pa < pa_last)
-							continue;
+					for (;;) {
+						pa += 1 << (PAGE_SHIFT +
+						    VM_NFREEORDER - 1);
+						if (pa >= pa_end ||
+						    pa < seg->start ||
+						    pa >= seg->end)
+							break;
+						m = &seg->first_page[atop(pa -
+						    seg->start)];
+						if (m->order != VM_NFREEORDER -
+						    1)
+							break;
 					}
+					/* If not, go to the next block. */
+					if (pa < pa_end)
+						continue;
+				}
 
-					/*
-					 * Determine if the blocks are within the given range,
-					 * satisfy the given alignment, and do not cross the
-					 * given boundary.
-					 */
-					pa = VM_PAGE_TO_PHYS(m_ret);
-					if (pa >= low &&
-					    pa + size <= high &&
-					    (pa & (alignment - 1)) == 0 &&
-					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
-						goto done;
-				}
+				/*
+				 * Determine if the blocks are within the
+				 * given range, satisfy the given alignment,
+				 * and do not cross the given boundary.
+				 */
+				pa = VM_PAGE_TO_PHYS(m_ret);
+				pa_end = pa + size;
+				if (pa >= low && pa_end <= high &&
+				    (pa & (alignment - 1)) == 0 &&
+				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
+					goto done;
 			}
 		}
 	}
-	if (++dom < vm_ndomains)
-		goto restartdom;
 	return (NULL);
 done:
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
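
The range test in vm_phys_alloc_seg_contig() above accepts a candidate
run only if its first address satisfies the requested alignment and the
run does not cross a "boundary"-aligned address.  A stand-alone
restatement of that predicate; rounddown2() here is a local stand-in
for the kernel macro, and per the function's assertions both alignment
and boundary are powers of two (boundary == 0 effectively disables the
boundary check).

#include <stdbool.h>
#include <stdint.h>

#define rounddown2(x, y)	((x) & ~((y) - 1))

static bool
run_acceptable(uint64_t pa, uint64_t size, uint64_t alignment,
    uint64_t boundary)
{
	uint64_t pa_end = pa + size;

	/*
	 * pa and pa_end - 1 sharing all bits above the boundary means the
	 * run stays within one boundary-sized block; boundary == 0 makes
	 * the mask zero and disables the check.
	 */
	return ((pa & (alignment - 1)) == 0 &&
	    rounddown2(pa ^ (pa_end - 1), boundary) == 0);
}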

Modified: trunk/sys/vm/vm_phys.h
===================================================================
--- trunk/sys/vm/vm_phys.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_phys.h 329381 2018-02-16 16:16:33Z mjg $
  */
 
 /*
@@ -62,6 +62,7 @@
 };
 
 extern struct mem_affinity *mem_affinity;
+extern int *mem_locality;
 extern int vm_ndomains;
 extern struct vm_phys_seg vm_phys_segs[];
 extern int vm_phys_nsegs;
@@ -69,7 +70,6 @@
 /*
  * The following functions are only to be used by the virtual memory system.
  */
-void vm_phys_add_page(vm_paddr_t pa);
 void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary);
@@ -84,9 +84,12 @@
 void vm_phys_free_pages(vm_page_t m, int order);
 void vm_phys_init(void);
 vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int options);
 void vm_phys_set_pool(int pool, vm_page_t m, int order);
 boolean_t vm_phys_unfree_page(vm_page_t m);
 boolean_t vm_phys_zero_pages_idle(void);
+int vm_phys_mem_affinity(int f, int t);
 
 /*
  *	vm_phys_domain:
@@ -96,7 +99,7 @@
 static inline struct vm_domain *
 vm_phys_domain(vm_page_t m)
 {
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	int domn, segind;
 
 	/* XXXKIB try to assert that the page is managed */
@@ -110,13 +113,13 @@
 #endif
 }
 
-static inline void
+static inline u_int
 vm_phys_freecnt_adj(vm_page_t m, int adj)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	cnt.v_free_count += adj;
 	vm_phys_domain(m)->vmd_free_count += adj;
+	return (vm_cnt.v_free_count += adj);
 }
 
 #endif	/* _KERNEL */

Modified: trunk/sys/vm/vm_radix.c
===================================================================
--- trunk/sys/vm/vm_radix.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -50,7 +50,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_radix.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include "opt_ddb.h"
 
@@ -299,21 +299,19 @@
 	 * are needed to store them.
 	 */
 	if (!uma_zone_reserve_kva(vm_radix_node_zone,
-	    ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
 	    sizeof(struct vm_radix_node))))
 		panic("%s: unable to reserve KVA", __func__);
 }
-SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD,
     vm_radix_reserve_kva, NULL);
 #endif
 
 /*
  * Initialize the UMA slab zone.
- * Until vm_radix_prealloc() is called, the zone will be served by the
- * UMA boot-time pre-allocated pool of pages.
  */
 void
-vm_radix_init(void)
+vm_radix_zinit(void)
 {
 
 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
@@ -342,8 +340,6 @@
 
 	index = page->pindex;
 
-restart:
-
 	/*
 	 * The owner of record for root is not really important because it
 	 * will never be used.
@@ -361,32 +357,10 @@
 				panic("%s: key %jx is already present",
 				    __func__, (uintmax_t)index);
 			clev = vm_radix_keydiff(m->pindex, index);
-
-			/*
-			 * During node allocation the trie that is being
-			 * walked can be modified because of recursing radix
-			 * trie operations.
-			 * If this is the case, the recursing functions signal
-			 * such situation and the insert operation must
-			 * start from scratch again.
-			 * The freed radix node will then be in the UMA
-			 * caches very likely to avoid the same situation
-			 * to happen.
-			 */
-			rtree->rt_flags |= RT_INSERT_INPROG;
 			tmp = vm_radix_node_get(vm_radix_trimkey(index,
 			    clev + 1), 2, clev);
-			rtree->rt_flags &= ~RT_INSERT_INPROG;
-			if (tmp == NULL) {
-				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+			if (tmp == NULL)
 				return (ENOMEM);
-			}
-			if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
-				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
-				tmp->rn_count = 0;
-				vm_radix_node_put(tmp);
-				goto restart;
-			}
 			*parentp = tmp;
 			vm_radix_addpage(tmp, index, clev, page);
 			vm_radix_addpage(tmp, m->pindex, clev, m);
@@ -410,21 +384,9 @@
 	 */
 	newind = rnode->rn_owner;
 	clev = vm_radix_keydiff(newind, index);
-
-	/* See the comments above. */
-	rtree->rt_flags |= RT_INSERT_INPROG;
 	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
-	rtree->rt_flags &= ~RT_INSERT_INPROG;
-	if (tmp == NULL) {
-		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+	if (tmp == NULL)
 		return (ENOMEM);
-	}
-	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
-		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
-		tmp->rn_count = 0;
-		vm_radix_node_put(tmp);
-		goto restart;
-	}
 	*parentp = tmp;
 	vm_radix_addpage(tmp, index, clev, page);
 	slot = vm_radix_slot(newind, clev);
@@ -699,10 +661,10 @@
 }
 
 /*
- * Remove the specified index from the tree.
- * Panics if the key is not present.
+ * Remove the specified index from the trie, and return the value stored at
+ * that index.  If the index is not present, return NULL.
  */
-void
+vm_page_t
 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
 {
 	struct vm_radix_node *rnode, *parent;
@@ -709,41 +671,27 @@
 	vm_page_t m;
 	int i, slot;
 
-	/*
-	 * Detect if a page is going to be removed from a trie which is
-	 * already undergoing another trie operation.
-	 * Right now this is only possible for vm_radix_remove() recursing
-	 * into vm_radix_insert().
-	 * If this is the case, the caller must be notified about this
-	 * situation.  It will also takecare to update the RT_TRIE_MODIFIED
-	 * accordingly.
-	 * The RT_TRIE_MODIFIED bit is set here because the remove operation
-	 * will always succeed.
-	 */
-	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
-		rtree->rt_flags |= RT_TRIE_MODIFIED;
-
 	rnode = vm_radix_getroot(rtree);
 	if (vm_radix_isleaf(rnode)) {
 		m = vm_radix_topage(rnode);
 		if (m->pindex != index)
-			panic("%s: invalid key found", __func__);
+			return (NULL);
 		vm_radix_setroot(rtree, NULL);
-		return;
+		return (m);
 	}
 	parent = NULL;
 	for (;;) {
 		if (rnode == NULL)
-			panic("vm_radix_remove: impossible to locate the key");
+			return (NULL);
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		if (vm_radix_isleaf(rnode->rn_child[slot])) {
 			m = vm_radix_topage(rnode->rn_child[slot]);
 			if (m->pindex != index)
-				panic("%s: invalid key found", __func__);
+				return (NULL);
 			rnode->rn_child[slot] = NULL;
 			rnode->rn_count--;
 			if (rnode->rn_count > 1)
-				break;
+				return (m);
 			for (i = 0; i < VM_RADIX_COUNT; i++)
 				if (rnode->rn_child[i] != NULL)
 					break;
@@ -760,7 +708,7 @@
 			rnode->rn_count--;
 			rnode->rn_child[i] = NULL;
 			vm_radix_node_put(rnode);
-			break;
+			return (m);
 		}
 		parent = rnode;
 		rnode = rnode->rn_child[slot];
@@ -777,9 +725,6 @@
 {
 	struct vm_radix_node *root;
 
-	KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
-	    ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
-
 	root = vm_radix_getroot(rtree);
 	if (root == NULL)
 		return;
@@ -831,6 +776,12 @@
 	panic("%s: original replacing page not found", __func__);
 }
 
+void
+vm_radix_wait(void)
+{
+	uma_zwait(vm_radix_node_zone);
+}
+
 #ifdef DDB
 /*
  * Show details about the given radix node.
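
Two interface changes above are worth calling out: vm_radix_remove() no
longer panics when the key is absent; it returns the page stored at the
index, or NULL if there was none, and vm_radix_wait() simply blocks on the
radix-node UMA zone via uma_zwait().  A hedged caller sketch for the new
remove semantics (the surrounding function, object locking, and error
convention are assumptions for illustration):

	vm_page_t mrem;

	VM_OBJECT_ASSERT_WLOCKED(object);
	mrem = vm_radix_remove(&object->rtree, pindex);
	if (mrem == NULL) {
		/*
		 * Nothing was stored at "pindex"; the caller can now
		 * handle that case instead of panicking.
		 */
		return (ENOENT);	/* assumed error convention */
	}
	/* "mrem" is the page that used to live at "pindex". */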

Modified: trunk/sys/vm/vm_radix.h
===================================================================
--- trunk/sys/vm/vm_radix.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_radix.h 327785 2018-01-10 20:39:26Z markj $
  */
 
 #ifndef _VM_RADIX_H_
@@ -36,15 +36,30 @@
 
 #ifdef _KERNEL
 
-void		vm_radix_init(void);
 int		vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+void		vm_radix_wait(void);
 boolean_t	vm_radix_is_singleton(struct vm_radix *rtree);
 vm_page_t	vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
 void		vm_radix_reclaim_allnodes(struct vm_radix *rtree);
-void		vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t	vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
+void		vm_radix_zinit(void);
 
+static __inline void
+vm_radix_init(struct vm_radix *rtree)
+{
+
+	rtree->rt_root = 0;
+}
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+	return (rtree->rt_root == 0);
+}
+
 #endif /* _KERNEL */
 #endif /* !_VM_RADIX_H_ */
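
vm_radix_init() is now a per-trie initializer and vm_radix_is_empty() a
constant-time emptiness test, while the one-time zone setup moved to
vm_radix_zinit().  The sketch below shows one way a caller might pair
vm_radix_insert()'s ENOMEM return with vm_radix_wait(); the retry policy
is an assumption for illustration, not taken from this change:

/*
 * Illustrative only: insert "m" at its index, sleeping for radix nodes
 * when the zone is exhausted.
 */
static int
radix_insert_wait(vm_object_t object, vm_page_t m)
{
	int error;

	VM_OBJECT_ASSERT_WLOCKED(object);
	while ((error = vm_radix_insert(&object->rtree, m)) == ENOMEM) {
		VM_OBJECT_WUNLOCK(object);
		vm_radix_wait();	/* uma_zwait() on the node zone */
		VM_OBJECT_WLOCK(object);
	}
	return (error);
}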

Modified: trunk/sys/vm/vm_reserv.c
===================================================================
--- trunk/sys/vm/vm_reserv.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -1,7 +1,7 @@
 /* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
- * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
+ * Copyright (c) 2007-2011 Alan L. Cox <alc at cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_reserv.c 351826 2019-09-04 19:31:37Z ray $");
 
 #include "opt_vm.h"
 
@@ -52,6 +52,7 @@
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -63,7 +64,7 @@
 
 /*
  * The reservation system supports the speculative allocation of large physical
- * pages ("superpages").  Speculative allocation enables the fully-automatic
+ * pages ("superpages").  Speculative allocation enables the fully automatic
  * utilization of superpages by the virtual memory system.  In other words, no
  * programmatic directives are required to use superpages.
  */
@@ -94,6 +95,61 @@
     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
 
 /*
+ * The size of a population map entry
+ */
+typedef	u_long		popmap_t;
+
+/*
+ * The number of bits in a population map entry
+ */
+#define	NBPOPMAP	(NBBY * sizeof(popmap_t))
+
+/*
+ * The number of population map entries in a reservation
+ */
+#define	NPOPMAP		howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
+
+/*
+ * Clear a bit in the population map.
+ */
+static __inline void
+popmap_clear(popmap_t popmap[], int i)
+{
+
+	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
+}
+
+/*
+ * Set a bit in the population map.
+ */
+static __inline void
+popmap_set(popmap_t popmap[], int i)
+{
+
+	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
+}
+
+/*
+ * Is a bit in the population map clear?
+ */
+static __inline boolean_t
+popmap_is_clear(popmap_t popmap[], int i)
+{
+
+	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
+}
+
+/*
+ * Is a bit in the population map set?
+ */
+static __inline boolean_t
+popmap_is_set(popmap_t popmap[], int i)
+{
+
+	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
+}
+
+/*
  * The reservation structure
  *
  * A reservation structure is constructed whenever a large physical page is
@@ -101,11 +157,11 @@
  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  * within that object.  The reservation's "popcnt" tracks the number of these
  * small physical pages that are in use at any given time.  When and if the
- * reservation is not fully utilized, it appears in the queue of partially-
+ * reservation is not fully utilized, it appears in the queue of partially
  * populated reservations.  The reservation always appears on the containing
  * object's list of reservations.
  *
- * A partially-populated reservation can be broken and reclaimed at any time.
+ * A partially populated reservation can be broken and reclaimed at any time.
  */
 struct vm_reserv {
 	TAILQ_ENTRY(vm_reserv) partpopq;
@@ -115,6 +171,7 @@
 	vm_page_t	pages;			/* first page of a superpage */
 	int		popcnt;			/* # of pages in use */
 	char		inpartpopq;
+	popmap_t	popmap[NPOPMAP];	/* bit vector of used pages */
 };
 
 /*
@@ -141,11 +198,11 @@
 static vm_reserv_t vm_reserv_array;
 
 /*
- * The partially-populated reservation queue
+ * The partially populated reservation queue
  *
- * This queue enables the fast recovery of an unused cached or free small page
- * from a partially-populated reservation.  The reservation at the head of
- * this queue is the least-recently-changed, partially-populated reservation.
+ * This queue enables the fast recovery of an unused free small page from a
+ * partially populated reservation.  The reservation at the head of this queue
+ * is the least recently changed, partially populated reservation.
  *
  * Access to this queue is synchronized by the free page queue lock.
  */
@@ -162,26 +219,60 @@
 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
 
+static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+    sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
+
 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
-    sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
+    sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
 
 static long vm_reserv_reclaimed;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
 
-static void		vm_reserv_depopulate(vm_reserv_t rv);
+static void		vm_reserv_break(vm_reserv_t rv);
+static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
 static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
 static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
 			    vm_pindex_t pindex);
-static void		vm_reserv_populate(vm_reserv_t rv);
+static void		vm_reserv_populate(vm_reserv_t rv, int index);
 static void		vm_reserv_reclaim(vm_reserv_t rv);
 
 /*
- * Describes the current state of the partially-populated reservation queue.
+ * Returns the current number of full reservations.
+ *
+ * Since the number of full reservations is computed without acquiring the
+ * free page queue lock, the returned value may be inexact.
  */
 static int
+sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
+{
+	vm_paddr_t paddr;
+	struct vm_phys_seg *seg;
+	vm_reserv_t rv;
+	int fullpop, segind;
+
+	fullpop = 0;
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+		    VM_LEVEL_0_SIZE <= seg->end) {
+			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
+			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
+			paddr += VM_LEVEL_0_SIZE;
+		}
+	}
+	return (sysctl_handle_int(oidp, &fullpop, 0, req));
+}
+
+/*
+ * Describes the current state of the partially populated reservation queue.
+ */
+static int
 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
@@ -213,18 +304,21 @@
 /*
  * Reduces the given reservation's population count.  If the population count
  * becomes zero, the reservation is destroyed.  Additionally, moves the
- * reservation to the tail of the partially-populated reservations queue if the
+ * reservation to the tail of the partially populated reservation queue if the
  * population count is non-zero.
  *
  * The free page queue lock must be held.
  */
 static void
-vm_reserv_depopulate(vm_reserv_t rv)
+vm_reserv_depopulate(vm_reserv_t rv, int index)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_depopulate: reserv %p is free", rv));
+	KASSERT(popmap_is_set(rv->popmap, index),
+	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
+	    index));
 	KASSERT(rv->popcnt > 0,
 	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 	if (rv->inpartpopq) {
@@ -236,6 +330,7 @@
 		    rv));
 		rv->pages->psind = 0;
 	}
+	popmap_clear(rv->popmap, index);
 	rv->popcnt--;
 	if (rv->popcnt == 0) {
 		LIST_REMOVE(rv, objq);
@@ -271,17 +366,20 @@
 
 /*
  * Increases the given reservation's population count.  Moves the reservation
- * to the tail of the partially-populated reservation queue.
+ * to the tail of the partially populated reservation queue.
  *
  * The free page queue must be locked.
  */
 static void
-vm_reserv_populate(vm_reserv_t rv)
+vm_reserv_populate(vm_reserv_t rv, int index)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_populate: reserv %p is free", rv));
+	KASSERT(popmap_is_clear(rv->popmap, index),
+	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
+	    index));
 	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 	    ("vm_reserv_populate: reserv %p is already full", rv));
 	KASSERT(rv->pages->psind == 0,
@@ -290,6 +388,7 @@
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
 	}
+	popmap_set(rv->popmap, index);
 	rv->popcnt++;
 	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 		rv->inpartpopq = TRUE;
@@ -308,14 +407,18 @@
  * physical address boundary that is a multiple of that value.  Both
  * "alignment" and "boundary" must be a power of two.
  *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
  * The object and free page queue must be locked.
  */
 vm_page_t
 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
-    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+    vm_page_t mpred)
 {
 	vm_paddr_t pa, size;
-	vm_page_t m, m_ret, mpred, msucc;
+	vm_page_t m, m_ret, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
 	u_long allocpages, maxpages, minpages;
@@ -352,10 +455,11 @@
 	/*
 	 * Look for an existing reservation.
 	 */
-	mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	if (mpred != NULL) {
+		KASSERT(mpred->object == object,
+		    ("vm_reserv_alloc_contig: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
-		    ("vm_reserv_alloc_contig: pindex already allocated"));
+		    ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
 		rv = vm_reserv_from_page(mpred);
 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 			goto found;
@@ -364,7 +468,7 @@
 		msucc = TAILQ_FIRST(&object->memq);
 	if (msucc != NULL) {
 		KASSERT(msucc->pindex > pindex,
-		    ("vm_reserv_alloc_contig: pindex already allocated"));
+		    ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
 		rv = vm_reserv_from_page(msucc);
 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 			goto found;
@@ -460,9 +564,13 @@
 		KASSERT(!rv->inpartpopq,
 		    ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
 		    rv));
+		for (i = 0; i < NPOPMAP; i++)
+			KASSERT(rv->popmap[i] == 0,
+		    ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted",
+			    rv));
 		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 		for (i = 0; i < n; i++)
-			vm_reserv_populate(rv);
+			vm_reserv_populate(rv, index + i);
 		npages -= n;
 		if (m_ret == NULL) {
 			m_ret = &rv->pages[index];
@@ -489,15 +597,15 @@
 		return (NULL);
 	/* Handle vm_page_rename(m, new_object, ...). */
 	for (i = 0; i < npages; i++)
-		if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
+		if (popmap_is_set(rv->popmap, index + i))
 			return (NULL);
 	for (i = 0; i < npages; i++)
-		vm_reserv_populate(rv);
+		vm_reserv_populate(rv, index + i);
 	return (m);
 }
 
 /*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a page from an existing or newly created reservation.
  *
  * The page "mpred" must immediately precede the offset "pindex" within the
  * specified object.
@@ -510,6 +618,7 @@
 	vm_page_t m, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
+	int i, index;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -598,22 +707,93 @@
 	    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
 	KASSERT(!rv->inpartpopq,
 	    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
-	vm_reserv_populate(rv);
-	return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
+	for (i = 0; i < NPOPMAP; i++)
+		KASSERT(rv->popmap[i] == 0,
+		    ("vm_reserv_alloc_page: reserv %p's popmap is corrupted",
+		    rv));
+	index = VM_RESERV_INDEX(object, pindex);
+	vm_reserv_populate(rv, index);
+	return (&rv->pages[index]);
 
 	/*
 	 * Found a matching reservation.
 	 */
 found:
-	m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
+	index = VM_RESERV_INDEX(object, pindex);
+	m = &rv->pages[index];
 	/* Handle vm_page_rename(m, new_object, ...). */
-	if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+	if (popmap_is_set(rv->popmap, index))
 		return (NULL);
-	vm_reserv_populate(rv);
+	vm_reserv_populate(rv, index);
 	return (m);
 }
 
 /*
+ * Breaks the given reservation.  All free pages in the reservation
+ * are returned to the physical memory allocator.  The reservation's
+ * population count and map are reset to their initial state.
+ *
+ * The given reservation must not be in the partially populated reservation
+ * queue.  The free page queue lock must be held.
+ */
+static void
+vm_reserv_break(vm_reserv_t rv)
+{
+	int begin_zeroes, hi, i, lo;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	KASSERT(rv->object != NULL,
+	    ("vm_reserv_break: reserv %p is free", rv));
+	KASSERT(!rv->inpartpopq,
+	    ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
+	LIST_REMOVE(rv, objq);
+	rv->object = NULL;
+	rv->pages->psind = 0;
+	i = hi = 0;
+	do {
+		/* Find the next 0 bit.  Any previous 0 bits are < "hi". */
+		lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+		if (lo == 0) {
+			/* Redundantly clears bits < "hi". */
+			rv->popmap[i] = 0;
+			rv->popcnt -= NBPOPMAP - hi;
+			while (++i < NPOPMAP) {
+				lo = ffsl(~rv->popmap[i]);
+				if (lo == 0) {
+					rv->popmap[i] = 0;
+					rv->popcnt -= NBPOPMAP;
+				} else
+					break;
+			}
+			if (i == NPOPMAP)
+				break;
+			hi = 0;
+		}
+		KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
+		/* Convert from ffsl() to ordinary bit numbering. */
+		lo--;
+		if (lo > 0) {
+			/* Redundantly clears bits < "hi". */
+			rv->popmap[i] &= ~((1UL << lo) - 1);
+			rv->popcnt -= lo - hi;
+		}
+		begin_zeroes = NBPOPMAP * i + lo;
+		/* Find the next 1 bit. */
+		do
+			hi = ffsl(rv->popmap[i]);
+		while (hi == 0 && ++i < NPOPMAP);
+		if (i != NPOPMAP)
+			/* Convert from ffsl() to ordinary bit numbering. */
+			hi--;
+		vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
+		    hi - begin_zeroes);
+	} while (i < NPOPMAP);
+	KASSERT(rv->popcnt == 0,
+	    ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
+	vm_reserv_broken++;
+}
+
+/*
  * Breaks all reservations belonging to the given object.
  */
 void
@@ -620,7 +800,6 @@
 vm_reserv_break_all(vm_object_t object)
 {
 	vm_reserv_t rv;
-	int i;
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
@@ -630,18 +809,7 @@
 			TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 			rv->inpartpopq = FALSE;
 		}
-		LIST_REMOVE(rv, objq);
-		rv->object = NULL;
-		for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		KASSERT(rv->popcnt == 0,
-		    ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
-		    rv));
-		vm_reserv_broken++;
+		vm_reserv_break(rv);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -661,10 +829,7 @@
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (FALSE);
-	if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE)
-		vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages,
-		    VM_LEVEL_0_ORDER);
-	vm_reserv_depopulate(rv);
+	vm_reserv_depopulate(rv, m - rv->pages);
 	return (TRUE);
 }
 
@@ -678,15 +843,18 @@
 vm_reserv_init(void)
 {
 	vm_paddr_t paddr;
-	int i;
+	struct vm_phys_seg *seg;
+	int segind;
 
 	/*
 	 * Initialize the reservation array.  Specifically, initialize the
 	 * "pages" field for every element that has an underlying superpage.
 	 */
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
-		paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
-		while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+		    VM_LEVEL_0_SIZE <= seg->end) {
 			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
 			    PHYS_TO_VM_PAGE(paddr);
 			paddr += VM_LEVEL_0_SIZE;
@@ -695,77 +863,50 @@
 }
 
 /*
- * Returns a reservation level if the given page belongs to a fully-populated
- * reservation and -1 otherwise.
+ * Returns true if the given page belongs to a reservation and that page is
+ * free.  Otherwise, returns false.
  */
+bool
+vm_reserv_is_page_free(vm_page_t m)
+{
+	vm_reserv_t rv;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	rv = vm_reserv_from_page(m);
+	if (rv->object == NULL)
+		return (false);
+	return (popmap_is_clear(rv->popmap, m - rv->pages));
+}
+
+/*
+ * If the given page belongs to a reservation, returns the level of that
+ * reservation.  Otherwise, returns -1.
+ */
 int
-vm_reserv_level_iffullpop(vm_page_t m)
+vm_reserv_level(vm_page_t m)
 {
 	vm_reserv_t rv;
 
 	rv = vm_reserv_from_page(m);
-	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
+	return (rv->object != NULL ? 0 : -1);
 }
 
 /*
- * Prepare for the reactivation of a cached page.
- *
- * First, suppose that the given page "m" was allocated individually, i.e., not
- * as part of a reservation, and cached.  Then, suppose a reservation
- * containing "m" is allocated by the same object.  Although "m" and the
- * reservation belong to the same object, "m"'s pindex may not match the
- * reservation's.
- *
- * The free page queue must be locked.
+ * Returns a reservation level if the given page belongs to a fully populated
+ * reservation and -1 otherwise.
  */
-boolean_t
-vm_reserv_reactivate_page(vm_page_t m)
+int
+vm_reserv_level_iffullpop(vm_page_t m)
 {
 	vm_reserv_t rv;
-	int i, m_index;
 
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	rv = vm_reserv_from_page(m);
-	if (rv->object == NULL)
-		return (FALSE);
-	KASSERT((m->flags & PG_CACHED) != 0,
-	    ("vm_reserv_uncache_page: page %p is not cached", m));
-	if (m->object == rv->object &&
-	    m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
-		vm_reserv_populate(rv);
-	else {
-		KASSERT(rv->inpartpopq,
-		    ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE",
-		    rv));
-		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
-		rv->inpartpopq = FALSE;
-		LIST_REMOVE(rv, objq);
-		rv->object = NULL;
-		/* Don't vm_phys_free_pages(m, 0). */
-		m_index = m - rv->pages;
-		for (i = 0; i < m_index; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		for (i++; i < VM_LEVEL_0_NPAGES; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		KASSERT(rv->popcnt == 0,
-		    ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted",
-		    rv));
-		vm_reserv_broken++;
-	}
-	return (TRUE);
+	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 }
 
 /*
- * Breaks the given partially-populated reservation, releasing its cached and
- * free pages to the physical memory allocator.
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
  *
  * The free page queue lock must be held.
  */
@@ -772,32 +913,20 @@
 static void
 vm_reserv_reclaim(vm_reserv_t rv)
 {
-	int i;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->inpartpopq,
-	    ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
+	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
 	TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 	rv->inpartpopq = FALSE;
-	KASSERT(rv->object != NULL,
-	    ("vm_reserv_reclaim: reserv %p is free", rv));
-	LIST_REMOVE(rv, objq);
-	rv->object = NULL;
-	for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
-		if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-			vm_phys_free_pages(&rv->pages[i], 0);
-		else
-			rv->popcnt--;
-	}
-	KASSERT(rv->popcnt == 0,
-	    ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
+	vm_reserv_break(rv);
 	vm_reserv_reclaimed++;
 }
 
 /*
- * Breaks the reservation at the head of the partially-populated reservation
- * queue, releasing its cached and free pages to the physical memory
- * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
+ * Breaks the reservation at the head of the partially populated reservation
+ * queue, releasing its free pages to the physical memory allocator.  Returns
+ * TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
@@ -815,11 +944,10 @@
 }
 
 /*
- * Searches the partially-populated reservation queue for the least recently
- * active reservation with unused pages, i.e., cached or free, that satisfy the
- * given request for contiguous physical memory.  If a satisfactory reservation
- * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
- * otherwise.
+ * Searches the partially populated reservation queue for the least recently
+ * changed reservation with free pages that satisfy the given request for
+ * contiguous physical memory.  If a satisfactory reservation is found, it is
+ * broken.  Returns TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
@@ -827,9 +955,9 @@
 vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
-	vm_paddr_t pa, pa_length, size;
+	vm_paddr_t pa, size;
 	vm_reserv_t rv;
-	int i;
+	int hi, i, lo, low_index, next_free;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (npages > VM_LEVEL_0_NPAGES - 1)
@@ -838,30 +966,72 @@
 	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 		if (pa + PAGE_SIZE - size < low) {
-			/* this entire reservation is too low; go to next */
+			/* This entire reservation is too low; go to next. */
 			continue;
 		}
-		pa_length = 0;
-		for (i = 0; i < VM_LEVEL_0_NPAGES; i++)
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
-				pa_length += PAGE_SIZE;
-				if (pa_length == PAGE_SIZE) {
-					pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
-					if (pa + size > high) {
-						/* skip to next reservation */
-						break;
-					} else if (pa < low ||
-					    (pa & (alignment - 1)) != 0 ||
-					    ((pa ^ (pa + size - 1)) &
-					    ~(boundary - 1)) != 0)
-						pa_length = 0;
+		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
+		if (pa + size > high) {
+			/* This entire reservation is too high; go to next. */
+			continue;
+		}
+		if (pa < low) {
+			/* Start the search for free pages at "low". */
+			low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
+			i = low_index / NBPOPMAP;
+			hi = low_index % NBPOPMAP;
+		} else
+			i = hi = 0;
+		do {
+			/* Find the next free page. */
+			lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+			while (lo == 0 && ++i < NPOPMAP)
+				lo = ffsl(~rv->popmap[i]);
+			if (i == NPOPMAP)
+				break;
+			/* Convert from ffsl() to ordinary bit numbering. */
+			lo--;
+			next_free = NBPOPMAP * i + lo;
+			pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
+			KASSERT(pa >= low,
+			    ("vm_reserv_reclaim_contig: pa is too low"));
+			if (pa + size > high) {
+				/* The rest of this reservation is too high. */
+				break;
+			} else if ((pa & (alignment - 1)) != 0 ||
+			    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
+				/*
+				 * The current page doesn't meet the alignment
+				 * and/or boundary requirements.  Continue
+				 * searching this reservation until the rest
+				 * of its free pages are either excluded or
+				 * exhausted.
+				 */
+				hi = lo + 1;
+				if (hi >= NBPOPMAP) {
+					hi = 0;
+					i++;
 				}
-				if (pa_length >= size) {
+				continue;
+			}
+			/* Find the next used page. */
+			hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
+			while (hi == 0 && ++i < NPOPMAP) {
+				if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
+				    size) {
 					vm_reserv_reclaim(rv);
 					return (TRUE);
 				}
-			} else
-				pa_length = 0;
+				hi = ffsl(rv->popmap[i]);
+			}
+			/* Convert from ffsl() to ordinary bit numbering. */
+			if (i != NPOPMAP)
+				hi--;
+			if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
+			    size) {
+				vm_reserv_reclaim(rv);
+				return (TRUE);
+			}
+		} while (i < NPOPMAP);
 	}
 	return (FALSE);
 }
@@ -892,6 +1062,23 @@
 }
 
 /*
+ * Returns the size (in bytes) of a reservation of the specified level.
+ */
+int
+vm_reserv_size(int level)
+{
+
+	switch (level) {
+	case 0:
+		return (VM_LEVEL_0_SIZE);
+	case -1:
+		return (PAGE_SIZE);
+	default:
+		return (0);
+	}
+}
+
+/*
  * Allocates the virtual and physical memory required by the reservation
  * management system's data structures, in particular, the reservation array.
  */
@@ -925,4 +1112,18 @@
 	return (new_end);
 }
 
+/*
+ * Returns the superpage containing the given page.
+ */
+vm_page_t
+vm_reserv_to_superpage(vm_page_t m)
+{
+	vm_reserv_t rv;
+
+	VM_OBJECT_ASSERT_LOCKED(m->object);
+	rv = vm_reserv_from_page(m);
+	return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
+	    rv->pages : NULL);
+}
+
 #endif	/* VM_NRESERVLEVEL > 0 */
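
The popmap introduced above is a plain bit vector with one bit per small
page in the reservation; vm_reserv_break() and vm_reserv_reclaim_contig()
scan it with ffsl(), skipping a whole word at a time when the word is
fully populated.  A small standalone C sketch of that scan (userland,
illustrative; 512 stands in for VM_LEVEL_0_NPAGES, and ffsl() is taken
from <strings.h> as on FreeBSD):

#include <stdio.h>
#include <strings.h>	/* ffsl() */

#define	NBBY		8
typedef unsigned long	popmap_t;
#define	NBPOPMAP	(NBBY * sizeof(popmap_t))
#define	NPAGES		512		/* stand-in for VM_LEVEL_0_NPAGES */
#define	NPOPMAP		(NPAGES / NBPOPMAP)

static popmap_t popmap[NPOPMAP];

static void
popmap_set(popmap_t pm[], int i)
{

	pm[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
}

/* Index of the first clear (free) bit at or after "start", or -1. */
static int
popmap_find_clear(popmap_t pm[], int start)
{
	int i, lo;

	i = start / NBPOPMAP;
	lo = ffsl(~(((1UL << (start % NBPOPMAP)) - 1) | pm[i]));
	while (lo == 0 && ++i < NPOPMAP)
		lo = ffsl(~pm[i]);
	if (i == NPOPMAP)
		return (-1);
	return (NBPOPMAP * i + lo - 1);	/* ffsl() is 1-based */
}

int
main(void)
{

	popmap_set(popmap, 0);		/* pages 0 and 1 are in use */
	popmap_set(popmap, 1);
	printf("first free page index: %d\n", popmap_find_clear(popmap, 0));
	return (0);
}

With pages 0 and 1 populated, the scan reports index 2 as the first free
page, mirroring how vm_reserv_reclaim_contig() locates candidate runs.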

Modified: trunk/sys/vm/vm_reserv.h
===================================================================
--- trunk/sys/vm/vm_reserv.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_reserv.h 324399 2017-10-07 20:22:04Z alc $
  */
 
 /*
@@ -48,21 +48,24 @@
  */
 vm_page_t	vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
 		    u_long npages, vm_paddr_t low, vm_paddr_t high,
-		    u_long alignment, vm_paddr_t boundary);
+		    u_long alignment, vm_paddr_t boundary, vm_page_t mpred);
 vm_page_t	vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
 		    vm_page_t mpred);
 void		vm_reserv_break_all(vm_object_t object);
 boolean_t	vm_reserv_free_page(vm_page_t m);
 void		vm_reserv_init(void);
+bool		vm_reserv_is_page_free(vm_page_t m);
+int		vm_reserv_level(vm_page_t m);
 int		vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t	vm_reserv_reactivate_page(vm_page_t m);
 boolean_t	vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
 		    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 boolean_t	vm_reserv_reclaim_inactive(void);
 void		vm_reserv_rename(vm_page_t m, vm_object_t new_object,
 		    vm_object_t old_object, vm_pindex_t old_object_offset);
+int		vm_reserv_size(int level);
 vm_paddr_t	vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end,
 		    vm_paddr_t high_water);
+vm_page_t	vm_reserv_to_superpage(vm_page_t m);
 
 #endif	/* VM_NRESERVLEVEL > 0 */
 #endif	/* _KERNEL */
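
vm_reserv_alloc_contig() now takes the predecessor page "mpred" from its
caller rather than performing the vm_radix_lookup_le() itself (that lookup
was removed from the function above).  A hedged caller fragment; the
enclosing allocation routine and its fallback policy are assumed:

	vm_page_t m, mpred;

	VM_OBJECT_ASSERT_WLOCKED(object);
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	/* The caller supplies the page that precedes "pindex", if any. */
	mpred = vm_radix_lookup_le(&object->rtree, pindex);
	m = vm_reserv_alloc_contig(object, pindex, npages, low, high,
	    alignment, boundary, mpred);
	if (m == NULL) {
		/* Fall back to vm_phys_alloc_contig() or reclaim. */
	}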

Added: trunk/sys/vm/vm_swapout.c
===================================================================
--- trunk/sys/vm/vm_swapout.c	                        (rev 0)
+++ trunk/sys/vm/vm_swapout.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,955 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution at CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout.c 338335 2018-08-27 09:39:34Z kib $");
+
+#include "opt_kstack_pages.h"
+#include "opt_kstack_max_pages.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/_kstack_cache.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/mount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/swap_pager.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+/* the kernel process "vm_daemon" */
+static void vm_daemon(void);
+static struct proc *vmproc;
+
+static struct kproc_desc vm_kp = {
+	"vmdaemon",
+	vm_daemon,
+	&vmproc
+};
+SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
+
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
+
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
+    &vm_swap_enabled, 0,
+    "Enable entire process swapout");
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
+    &vm_swap_idle_enabled, 0,
+    "Allow swapout on idle criteria");
+
+/*
+ * Swap_idle_threshold1 is the guaranteed swapped-in time for a process.
+ */
+static int swap_idle_threshold1 = 2;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
+    &swap_idle_threshold1, 0,
+    "Guaranteed swapped in time for a process");
+
+/*
+ * Swap_idle_threshold2 is the time that a process can be idle before
+ * it will be swapped out, if idle swapping is enabled.
+ */
+static int swap_idle_threshold2 = 10;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
+    &swap_idle_threshold2, 0,
+    "Time before a process will be swapped out");
+
+static int vm_pageout_req_swapout;	/* XXX */
+static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
+
+static int swapped_cnt;
+static int swap_inprogress;	/* Pending swap-ins done outside swapper. */
+static int last_swapin;
+
+static void swapclear(struct proc *);
+static int swapout(struct proc *);
+static void vm_swapout_map_deactivate_pages(vm_map_t, long);
+static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
+static void swapout_procs(int action);
+static void vm_req_vmdaemon(int req);
+static void vm_thread_swapout(struct thread *td);
+
+/*
+ *	vm_swapout_object_deactivate_pages
+ *
+ *	Deactivate enough pages to satisfy the inactive target
+ *	requirements.
+ *
+ *	The object and map must be locked.
+ */
+static void
+vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
+    long desired)
+{
+	vm_object_t backing_object, object;
+	vm_page_t p;
+	int act_delta, remove_mode;
+
+	VM_OBJECT_ASSERT_LOCKED(first_object);
+	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
+		return;
+	for (object = first_object;; object = backing_object) {
+		if (pmap_resident_count(pmap) <= desired)
+			goto unlock_return;
+		VM_OBJECT_ASSERT_LOCKED(object);
+		if ((object->flags & OBJ_UNMANAGED) != 0 ||
+		    object->paging_in_progress != 0)
+			goto unlock_return;
+
+		remove_mode = 0;
+		if (object->shadow_count > 1)
+			remove_mode = 1;
+		/*
+		 * Scan the object's entire memory queue.
+		 */
+		TAILQ_FOREACH(p, &object->memq, listq) {
+			if (pmap_resident_count(pmap) <= desired)
+				goto unlock_return;
+			if (should_yield())
+				goto unlock_return;
+			if (vm_page_busied(p))
+				continue;
+			PCPU_INC(cnt.v_pdpages);
+			vm_page_lock(p);
+			if (p->wire_count != 0 || p->hold_count != 0 ||
+			    !pmap_page_exists_quick(pmap, p)) {
+				vm_page_unlock(p);
+				continue;
+			}
+			act_delta = pmap_ts_referenced(p);
+			if ((p->aflags & PGA_REFERENCED) != 0) {
+				if (act_delta == 0)
+					act_delta = 1;
+				vm_page_aflag_clear(p, PGA_REFERENCED);
+			}
+			if (!vm_page_active(p) && act_delta != 0) {
+				vm_page_activate(p);
+				p->act_count += act_delta;
+			} else if (vm_page_active(p)) {
+				if (act_delta == 0) {
+					p->act_count -= min(p->act_count,
+					    ACT_DECLINE);
+					if (!remove_mode && p->act_count == 0) {
+						pmap_remove_all(p);
+						vm_page_deactivate(p);
+					} else
+						vm_page_requeue(p);
+				} else {
+					vm_page_activate(p);
+					if (p->act_count < ACT_MAX -
+					    ACT_ADVANCE)
+						p->act_count += ACT_ADVANCE;
+					vm_page_requeue(p);
+				}
+			} else if (vm_page_inactive(p))
+				pmap_remove_all(p);
+			vm_page_unlock(p);
+		}
+		if ((backing_object = object->backing_object) == NULL)
+			goto unlock_return;
+		VM_OBJECT_RLOCK(backing_object);
+		if (object != first_object)
+			VM_OBJECT_RUNLOCK(object);
+	}
+unlock_return:
+	if (object != first_object)
+		VM_OBJECT_RUNLOCK(object);
+}
+
+/*
+ * deactivate some number of pages in a map, try to do it fairly, but
+ * that is really hard to do.
+ */
+static void
+vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
+{
+	vm_map_entry_t tmpe;
+	vm_object_t obj, bigobj;
+	int nothingwired;
+
+	if (!vm_map_trylock_read(map))
+		return;
+
+	bigobj = NULL;
+	nothingwired = TRUE;
+
+	/*
+	 * first, search out the biggest object, and try to free pages from
+	 * that.
+	 */
+	tmpe = map->header.next;
+	while (tmpe != &map->header) {
+		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+			obj = tmpe->object.vm_object;
+			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
+				if (obj->shadow_count <= 1 &&
+				    (bigobj == NULL ||
+				     bigobj->resident_page_count <
+				     obj->resident_page_count)) {
+					if (bigobj != NULL)
+						VM_OBJECT_RUNLOCK(bigobj);
+					bigobj = obj;
+				} else
+					VM_OBJECT_RUNLOCK(obj);
+			}
+		}
+		if (tmpe->wired_count > 0)
+			nothingwired = FALSE;
+		tmpe = tmpe->next;
+	}
+
+	if (bigobj != NULL) {
+		vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
+		VM_OBJECT_RUNLOCK(bigobj);
+	}
+	/*
+	 * Next, hunt around for other pages to deactivate.  We actually
+	 * do this search sort of wrong -- .text first is not the best idea.
+	 */
+	tmpe = map->header.next;
+	while (tmpe != &map->header) {
+		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
+			break;
+		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+			obj = tmpe->object.vm_object;
+			if (obj != NULL) {
+				VM_OBJECT_RLOCK(obj);
+				vm_swapout_object_deactivate_pages(map->pmap,
+				    obj, desired);
+				VM_OBJECT_RUNLOCK(obj);
+			}
+		}
+		tmpe = tmpe->next;
+	}
+
+	/*
+	 * Remove all mappings if a process is swapped out; this will free
+	 * page table pages.
+	 */
+	if (desired == 0 && nothingwired) {
+		pmap_remove(vm_map_pmap(map), vm_map_min(map),
+		    vm_map_max(map));
+	}
+
+	vm_map_unlock_read(map);
+}
+
+/*
+ * Swap out requests
+ */
+#define VM_SWAP_NORMAL 1
+#define VM_SWAP_IDLE 2
+
+void
+vm_swapout_run(void)
+{
+
+	if (vm_swap_enabled)
+		vm_req_vmdaemon(VM_SWAP_NORMAL);
+}
+
+/*
+ * Idle process swapout -- run once per second when pagedaemons are
+ * reclaiming pages.
+ */
+void
+vm_swapout_run_idle(void)
+{
+	static long lsec;
+
+	if (!vm_swap_idle_enabled || time_second == lsec)
+		return;
+	vm_req_vmdaemon(VM_SWAP_IDLE);
+	lsec = time_second;
+}
+
+static void
+vm_req_vmdaemon(int req)
+{
+	static int lastrun = 0;
+
+	mtx_lock(&vm_daemon_mtx);
+	vm_pageout_req_swapout |= req;
+	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
+		wakeup(&vm_daemon_needed);
+		lastrun = ticks;
+	}
+	mtx_unlock(&vm_daemon_mtx);
+}
+
+static void
+vm_daemon(void)
+{
+	struct rlimit rsslim;
+	struct proc *p;
+	struct thread *td;
+	struct vmspace *vm;
+	int breakout, swapout_flags, tryagain, attempts;
+#ifdef RACCT
+	uint64_t rsize, ravailable;
+#endif
+
+	while (TRUE) {
+		mtx_lock(&vm_daemon_mtx);
+		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
+#ifdef RACCT
+		    racct_enable ? hz : 0
+#else
+		    0
+#endif
+		);
+		swapout_flags = vm_pageout_req_swapout;
+		vm_pageout_req_swapout = 0;
+		mtx_unlock(&vm_daemon_mtx);
+		if (swapout_flags)
+			swapout_procs(swapout_flags);
+
+		/*
+		 * Scan the processes for those exceeding their rlimits or
+		 * that are swapped out -- deactivate their pages.
+		 */
+		tryagain = 0;
+		attempts = 0;
+again:
+		attempts++;
+		sx_slock(&allproc_lock);
+		FOREACH_PROC_IN_SYSTEM(p) {
+			vm_pindex_t limit, size;
+
+			/*
+			 * if this is a system process or if we have already
+			 * looked at this process, skip it.
+			 */
+			PROC_LOCK(p);
+			if (p->p_state != PRS_NORMAL ||
+			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+				PROC_UNLOCK(p);
+				continue;
+			}
+			/*
+			 * if the process is in a non-running type state,
+			 * don't touch it.
+			 */
+			breakout = 0;
+			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
+				if (!TD_ON_RUNQ(td) &&
+				    !TD_IS_RUNNING(td) &&
+				    !TD_IS_SLEEPING(td) &&
+				    !TD_IS_SUSPENDED(td)) {
+					thread_unlock(td);
+					breakout = 1;
+					break;
+				}
+				thread_unlock(td);
+			}
+			if (breakout) {
+				PROC_UNLOCK(p);
+				continue;
+			}
+			/*
+			 * get a limit
+			 */
+			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+			limit = OFF_TO_IDX(
+			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
+
+			/*
+			 * Let processes that are swapped out really be
+			 * swapped out: set the limit to nothing (this will
+			 * force a swap-out).
+			 */
+			if ((p->p_flag & P_INMEM) == 0)
+				limit = 0;	/* XXX */
+			vm = vmspace_acquire_ref(p);
+			_PHOLD_LITE(p);
+			PROC_UNLOCK(p);
+			if (vm == NULL) {
+				PRELE(p);
+				continue;
+			}
+			sx_sunlock(&allproc_lock);
+
+			size = vmspace_resident_count(vm);
+			if (size >= limit) {
+				vm_swapout_map_deactivate_pages(
+				    &vm->vm_map, limit);
+				size = vmspace_resident_count(vm);
+			}
+#ifdef RACCT
+			if (racct_enable) {
+				rsize = IDX_TO_OFF(size);
+				PROC_LOCK(p);
+				if (p->p_state == PRS_NORMAL)
+					racct_set(p, RACCT_RSS, rsize);
+				ravailable = racct_get_available(p, RACCT_RSS);
+				PROC_UNLOCK(p);
+				if (rsize > ravailable) {
+					/*
+					 * Don't be overly aggressive; this
+					 * might be an innocent process,
+					 * and the limit could've been exceeded
+					 * by some memory hog.  Don't try
+					 * to deactivate more than 1/4th
+					 * of process' resident set size.
+					 */
+					if (attempts <= 8) {
+						if (ravailable < rsize -
+						    (rsize / 4)) {
+							ravailable = rsize -
+							    (rsize / 4);
+						}
+					}
+					vm_swapout_map_deactivate_pages(
+					    &vm->vm_map,
+					    OFF_TO_IDX(ravailable));
+					/* Update RSS usage after paging out. */
+					size = vmspace_resident_count(vm);
+					rsize = IDX_TO_OFF(size);
+					PROC_LOCK(p);
+					if (p->p_state == PRS_NORMAL)
+						racct_set(p, RACCT_RSS, rsize);
+					PROC_UNLOCK(p);
+					if (rsize > ravailable)
+						tryagain = 1;
+				}
+			}
+#endif
+			vmspace_free(vm);
+			sx_slock(&allproc_lock);
+			PRELE(p);
+		}
+		sx_sunlock(&allproc_lock);
+		if (tryagain != 0 && attempts <= 10) {
+			maybe_yield();
+			goto again;
+		}
+	}
+}
+
+/*
+ * Allow a thread's kernel stack to be paged out.
+ */
+static void
+vm_thread_swapout(struct thread *td)
+{
+	vm_object_t ksobj;
+	vm_page_t m;
+	int i, pages;
+
+	cpu_thread_swapout(td);
+	pages = td->td_kstack_pages;
+	ksobj = td->td_kstack_obj;
+	pmap_qremove(td->td_kstack, pages);
+	VM_OBJECT_WLOCK(ksobj);
+	for (i = 0; i < pages; i++) {
+		m = vm_page_lookup(ksobj, i);
+		if (m == NULL)
+			panic("vm_thread_swapout: kstack already missing?");
+		vm_page_dirty(m);
+		vm_page_lock(m);
+		vm_page_unwire(m, PQ_INACTIVE);
+		vm_page_unlock(m);
+	}
+	VM_OBJECT_WUNLOCK(ksobj);
+}
+
+/*
+ * Bring the kernel stack for a specified thread back in.
+ */
+static void
+vm_thread_swapin(struct thread *td, int oom_alloc)
+{
+	vm_object_t ksobj;
+	vm_page_t ma[KSTACK_MAX_PAGES];
+	int a, count, i, j, pages, rv;
+
+	pages = td->td_kstack_pages;
+	ksobj = td->td_kstack_obj;
+	VM_OBJECT_WLOCK(ksobj);
+	(void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma,
+	    pages);
+	for (i = 0; i < pages;) {
+		vm_page_assert_xbusied(ma[i]);
+		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
+			vm_page_xunbusy(ma[i]);
+			i++;
+			continue;
+		}
+		vm_object_pip_add(ksobj, 1);
+		for (j = i + 1; j < pages; j++)
+			if (ma[j]->valid == VM_PAGE_BITS_ALL)
+				break;
+		rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
+		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
+		count = min(a + 1, j - i);
+		rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
+		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
+		    __func__, td->td_proc->p_pid));
+		vm_object_pip_wakeup(ksobj);
+		for (j = i; j < i + count; j++)
+			vm_page_xunbusy(ma[j]);
+		i += count;
+	}
+	VM_OBJECT_WUNLOCK(ksobj);
+	pmap_qenter(td->td_kstack, ma, pages);
+	cpu_thread_swapin(td);
+}
+
+void
+faultin(struct proc *p)
+{
+	struct thread *td;
+	int oom_alloc;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * If another process is swapping in this process,
+	 * just wait until it finishes.
+	 */
+	if (p->p_flag & P_SWAPPINGIN) {
+		while (p->p_flag & P_SWAPPINGIN)
+			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+		return;
+	}
+
+	if ((p->p_flag & P_INMEM) == 0) {
+		oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
+		    VM_ALLOC_NORMAL;
+
+		/*
+		 * Don't let another thread swap process p out while we are
+		 * busy swapping it in.
+		 */
+		++p->p_lock;
+		p->p_flag |= P_SWAPPINGIN;
+		PROC_UNLOCK(p);
+		sx_xlock(&allproc_lock);
+		MPASS(swapped_cnt > 0);
+		swapped_cnt--;
+		if (curthread != &thread0)
+			swap_inprogress++;
+		sx_xunlock(&allproc_lock);
+
+		/*
+		 * We hold no lock here because the list of threads
+		 * can not change while all threads in the process are
+		 * swapped out.
+		 */
+		FOREACH_THREAD_IN_PROC(p, td)
+			vm_thread_swapin(td, oom_alloc);
+
+		if (curthread != &thread0) {
+			sx_xlock(&allproc_lock);
+			MPASS(swap_inprogress > 0);
+			swap_inprogress--;
+			last_swapin = ticks;
+			sx_xunlock(&allproc_lock);
+		}
+		PROC_LOCK(p);
+		swapclear(p);
+		p->p_swtick = ticks;
+
+		/* Allow other threads to swap p out now. */
+		wakeup(&p->p_flag);
+		--p->p_lock;
+	}
+}
+
+/*
+ * This swapin algorithm attempts to swap-in processes only if there
+ * is enough space for them.  Of course, if a process waits for a long
+ * time, it will be swapped in anyway.
+ */
+
+static struct proc *
+swapper_selector(bool wkilled_only)
+{
+	struct proc *p, *res;
+	struct thread *td;
+	int ppri, pri, slptime, swtime;
+
+	sx_assert(&allproc_lock, SA_SLOCKED);
+	if (swapped_cnt == 0)
+		return (NULL);
+	res = NULL;
+	ppri = INT_MIN;
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
+		    P_SWAPPINGIN | P_INMEM)) != 0) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
+			/*
+			 * A swapped-out process might have mapped a
+			 * large portion of the system's pages as
+			 * anonymous memory.  There is no way to release
+			 * the memory other than to kill the process, for
+			 * which we need to swap it in.
+			 */
+			return (p);
+		}
+		if (wkilled_only) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		swtime = (ticks - p->p_swtick) / hz;
+		FOREACH_THREAD_IN_PROC(p, td) {
+			/*
+			 * An otherwise runnable thread of a process
+			 * swapped out has only the TDI_SWAPPED bit set.
+			 */
+			thread_lock(td);
+			if (td->td_inhibitors == TDI_SWAPPED) {
+				slptime = (ticks - td->td_slptick) / hz;
+				pri = swtime + slptime;
+				if ((td->td_flags & TDF_SWAPINREQ) == 0)
+					pri -= p->p_nice * 8;
+				/*
+				 * if this thread is higher priority
+				 * and there is enough space, then select
+				 * this process instead of the previous
+				 * selection.
+				 */
+				if (pri > ppri) {
+					res = p;
+					ppri = pri;
+				}
+			}
+			thread_unlock(td);
+		}
+		PROC_UNLOCK(p);
+	}
+
+	if (res != NULL)
+		PROC_LOCK(res);
+	return (res);
+}
+
+#define	SWAPIN_INTERVAL	(MAXSLP * hz / 2)
+
+/*
+ * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
+ * interval, assuming that there is:
+ * - no memory shortage;
+ * - no parallel swap-ins;
+ * - no other swap-ins in the current SWAPIN_INTERVAL.
+ */
+static bool
+swapper_wkilled_only(void)
+{
+
+	return (vm_page_count_min() || swap_inprogress > 0 ||
+	    (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
+}
+
+void
+swapper(void)
+{
+	struct proc *p;
+
+	for (;;) {
+		sx_slock(&allproc_lock);
+		p = swapper_selector(swapper_wkilled_only());
+		sx_sunlock(&allproc_lock);
+
+		if (p == NULL) {
+			tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
+		} else {
+			PROC_LOCK_ASSERT(p, MA_OWNED);
+
+			/*
+			 * Another process may be bringing or may have
+			 * already brought this process in while we
+			 * traverse all threads.  Or, this process may
+			 * traverse all threads.  Or, this process may
+			 * have exited or may even be swapped out
+			 * again.
+			if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
+			    P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
+				faultin(p);
+			}
+			PROC_UNLOCK(p);
+		}
+	}
+}
+
+/*
+ * First, if any processes have been sleeping or stopped for at least
+ * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
+ * no such processes exist, then the longest-sleeping or stopped
+ * process is swapped out.  Finally, and only as a last resort, if
+ * there are no sleeping or stopped processes, the longest-resident
+ * process is swapped out.
+ */
+static void
+swapout_procs(int action)
+{
+	struct proc *p;
+	struct thread *td;
+	int slptime;
+	bool didswap, doswap;
+
+	MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
+
+	didswap = false;
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		/*
+		 * Filter out not yet fully constructed processes.  Do
+		 * not swap out held processes.  Avoid processes which
+		 * are system, exiting, execing, traced, already swapped
+		 * out or are in the process of being swapped in or out.
+		 */
+		PROC_LOCK(p);
+		if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
+		    (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
+		    P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
+		    P_INMEM) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+
+		/*
+		 * Further consideration of this process for swap out
+		 * requires iterating over its threads.  We release
+		 * allproc_lock here so that process creation and
+		 * destruction are not blocked while we iterate.
+		 *
+		 * To later reacquire allproc_lock and resume
+		 * iteration over the allproc list, we will first have
+		 * to release the lock on the process.  We place a
+		 * hold on the process so that it remains in the
+		 * allproc list while it is unlocked.
+		 */
+		_PHOLD_LITE(p);
+		sx_sunlock(&allproc_lock);
+
+		/*
+		 * Do not swapout a realtime process.
+		 * Guarantee swap_idle_threshold1 time in memory.
+		 * If the system is under memory stress, or if we are
+		 * swapping idle processes >= swap_idle_threshold2,
+		 * then swap the process out.
+		 */
+		doswap = true;
+		FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			slptime = (ticks - td->td_slptick) / hz;
+			if (PRI_IS_REALTIME(td->td_pri_class) ||
+			    slptime < swap_idle_threshold1 ||
+			    !thread_safetoswapout(td) ||
+			    ((action & VM_SWAP_NORMAL) == 0 &&
+			    slptime < swap_idle_threshold2))
+				doswap = false;
+			thread_unlock(td);
+			if (!doswap)
+				break;
+		}
+		if (doswap && swapout(p) == 0)
+			didswap = true;
+
+		PROC_UNLOCK(p);
+		if (didswap) {
+			sx_xlock(&allproc_lock);
+			swapped_cnt++;
+			sx_downgrade(&allproc_lock);
+		} else
+			sx_slock(&allproc_lock);
+		PRELE(p);
+	}
+	sx_sunlock(&allproc_lock);
+
+	/*
+	 * If we swapped something out, and another process needed memory,
+	 * then wakeup the sched process.
+	 */
+	if (didswap)
+		wakeup(&proc0);
+}
+
+static void
+swapclear(struct proc *p)
+{
+	struct thread *td;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		td->td_flags |= TDF_INMEM;
+		td->td_flags &= ~TDF_SWAPINREQ;
+		TD_CLR_SWAPPED(td);
+		if (TD_CAN_RUN(td))
+			if (setrunnable(td)) {
+#ifdef INVARIANTS
+				/*
+				 * XXX: We just cleared TDI_SWAPPED
+				 * above and set TDF_INMEM, so this
+				 * should never happen.
+				 */
+				panic("not waking up swapper");
+#endif
+			}
+		thread_unlock(td);
+	}
+	p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
+	p->p_flag |= P_INMEM;
+}
+
+static int
+swapout(struct proc *p)
+{
+	struct thread *td;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * The states of this process and its threads may have changed
+	 * by now.  Assuming that there is only one pageout daemon thread,
+	 * this process should still be in memory.
+	 */
+	KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
+	    P_INMEM, ("swapout: lost a swapout race?"));
+
+	/*
+	 * Remember the resident count.
+	 */
+	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+
+	/*
+	 * Check and mark all threads before we proceed.
+	 */
+	p->p_flag &= ~P_INMEM;
+	p->p_flag |= P_SWAPPINGOUT;
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		if (!thread_safetoswapout(td)) {
+			thread_unlock(td);
+			swapclear(p);
+			return (EBUSY);
+		}
+		td->td_flags &= ~TDF_INMEM;
+		TD_SET_SWAPPED(td);
+		thread_unlock(td);
+	}
+	td = FIRST_THREAD_IN_PROC(p);
+	++td->td_ru.ru_nswap;
+	PROC_UNLOCK(p);
+
+	/*
+	 * This list is stable because all threads are now prevented from
+	 * running.  The list is only modified in the context of a running
+	 * thread in this process.
+	 */
+	FOREACH_THREAD_IN_PROC(p, td)
+		vm_thread_swapout(td);
+
+	PROC_LOCK(p);
+	p->p_flag &= ~P_SWAPPINGOUT;
+	p->p_swtick = ticks;
+	return (0);
+}


Property changes on: trunk/sys/vm/vm_swapout.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
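
The swapout policy above reduces to a per-thread test: a process is eligible only if none of its threads is realtime, every thread has slept at least swap_idle_threshold1 seconds and is safe to swap, and, for idle-only scans, every thread has also slept at least swap_idle_threshold2 seconds.  The following is a minimal userland model of that test, with illustrative threshold values and stand-in structures rather than the kernel's; it is a sketch, not the committed code.

/*
 * Minimal userland model of the per-thread eligibility test used by
 * swapout_procs() above.  Thresholds and "thread" fields are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define VM_SWAP_NORMAL 0x01
#define VM_SWAP_IDLE   0x02

/* Illustrative values; the kernel exposes these as sysctls. */
static int swap_idle_threshold1 = 2;	/* seconds guaranteed in memory */
static int swap_idle_threshold2 = 10;	/* idle time before idle swapout */

struct model_thread {
	bool	realtime;	/* PRI_IS_REALTIME() stand-in */
	bool	safe;		/* thread_safetoswapout() stand-in */
	int	slptime;	/* seconds asleep */
};

/* Returns true if every thread permits swapping the process out. */
static bool
proc_swappable(int action, const struct model_thread *tds, int ntds)
{
	int i;

	for (i = 0; i < ntds; i++) {
		if (tds[i].realtime ||
		    tds[i].slptime < swap_idle_threshold1 ||
		    !tds[i].safe ||
		    ((action & VM_SWAP_NORMAL) == 0 &&
		    tds[i].slptime < swap_idle_threshold2))
			return (false);
	}
	return (true);
}

int
main(void)
{
	struct model_thread tds[2] = {
		{ .realtime = false, .safe = true, .slptime = 5 },
		{ .realtime = false, .safe = true, .slptime = 12 },
	};

	printf("VM_SWAP_NORMAL: %d\n", proc_swappable(VM_SWAP_NORMAL, tds, 2));
	printf("VM_SWAP_IDLE:   %d\n", proc_swappable(VM_SWAP_IDLE, tds, 2));
	return (0);
}
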
Added: trunk/sys/vm/vm_swapout_dummy.c
===================================================================
--- trunk/sys/vm/vm_swapout_dummy.c	                        (rev 0)
+++ trunk/sys/vm/vm_swapout_dummy.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,123 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution at CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout_dummy.c 325647 2017-11-10 13:17:40Z kib $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+static int vm_swap_enabled = 0;
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD,
+    &vm_swap_enabled, 0,
+    "Enable entire process swapout");
+
+static int vm_swap_idle_enabled = 0;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD,
+    &vm_swap_idle_enabled, 0,
+    "Allow swapout on idle criteria");
+
+void
+vm_swapout_run(void)
+{
+}
+
+void
+vm_swapout_run_idle(void)
+{
+}
+
+void
+faultin(struct proc *p)
+{
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	if ((p->p_flag & P_INMEM) == 0)
+		panic("faultin: proc %p swapped out with NO_SWAPPING", p);
+}
+
+void
+swapper(void)
+{
+
+	for (;;)
+		tsleep(&proc0, PVM, "swapin", MAXSLP * hz);
+}


Property changes on: trunk/sys/vm/vm_swapout_dummy.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/vm/vm_unix.c
===================================================================
--- trunk/sys/vm/vm_unix.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_unix.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -44,7 +44,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_unix.c 341467 2018-12-04 15:04:48Z emaste $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
@@ -72,9 +72,7 @@
  */
 /* ARGSUSED */
 int
-sys_obreak(td, uap)
-	struct thread *td;
-	struct obreak_args *uap;
+sys_obreak(struct thread *td, struct obreak_args *uap)
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
 	vm_map_t map = &vm->vm_map;
@@ -84,11 +82,9 @@
 	int error = 0;
 	boolean_t do_map_wirefuture;
 
-	PROC_LOCK(td->td_proc);
-	datalim = lim_cur(td->td_proc, RLIMIT_DATA);
-	lmemlim = lim_cur(td->td_proc, RLIMIT_MEMLOCK);
-	vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM);
-	PROC_UNLOCK(td->td_proc);
+	datalim = lim_cur(td, RLIMIT_DATA);
+	lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
+	vmemlim = lim_cur(td, RLIMIT_VMEM);
 
 	do_map_wirefuture = FALSE;
 	new = round_page((vm_offset_t)uap->nsize);
@@ -167,7 +163,7 @@
 #endif
 		prot = VM_PROT_RW;
 #ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
 		if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
 			prot |= VM_PROT_EXECUTE;
 #endif
@@ -248,9 +244,7 @@
  */
 /* ARGSUSED */
 int
-sys_ovadvise(td, uap)
-	struct thread *td;
-	struct ovadvise_args *uap;
+sys_ovadvise(struct thread *td, struct ovadvise_args *uap)
 {
 	/* START_GIANT_OPTIONAL */
 	/* END_GIANT_OPTIONAL */

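The sys_obreak() change above drops the per-process lock around the resource-limit lookups; the check itself is still a comparison of the page-rounded request against RLIMIT_DATA (plus the memlock/vmem limits).  A rough userland analogue of the data-limit part, using getrlimit() and a simple power-of-two page rounding, is sketched below; the requested size is hypothetical.

/*
 * Userland analogue (sketch) of the RLIMIT_DATA check sys_obreak()
 * performs before growing the data segment.
 */
#include <sys/resource.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	struct rlimit datalim;
	long pagesize;
	uintmax_t want, rounded;

	if (getrlimit(RLIMIT_DATA, &datalim) != 0) {
		perror("getrlimit");
		return (1);
	}
	pagesize = sysconf(_SC_PAGESIZE);

	want = 10 * 1024 * 1024;	/* hypothetical new data size */
	rounded = (want + pagesize - 1) & ~((uintmax_t)pagesize - 1);

	if (rounded > (uintmax_t)datalim.rlim_cur)
		printf("request %ju exceeds RLIMIT_DATA %ju\n",
		    rounded, (uintmax_t)datalim.rlim_cur);
	else
		printf("request %ju fits within RLIMIT_DATA %ju\n",
		    rounded, (uintmax_t)datalim.rlim_cur);
	return (0);
}
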
Modified: trunk/sys/vm/vm_zeroidle.c
===================================================================
--- trunk/sys/vm/vm_zeroidle.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_zeroidle.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_zeroidle.c 267992 2014-06-28 03:56:17Z hselasky $");
 
 #include <opt_sched.h>
 
@@ -56,10 +56,9 @@
 #include <vm/vm_phys.h>
 
 static int idlezero_enable_default = 0;
-TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default);
 /* Defer setting the enable flag until the kthread is running. */
 static int idlezero_enable = 0;
-SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0,
+SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RWTUN, &idlezero_enable, 0,
     "Allow the kernel to use idle cpu cycles to zero-out pages");
 /*
  * Implement the pre-zeroed page mechanism.
@@ -85,9 +84,9 @@
 	 * fast sleeps.  We also do not want to be continuously zeroing
 	 * pages because doing so may flush our L1 and L2 caches too much.
 	 */
-	if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count))
+	if (zero_state && vm_page_zero_count >= ZIDLE_LO(vm_cnt.v_free_count))
 		return (0);
-	if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+	if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
 		return (0);
 	return (1);
 }
@@ -99,7 +98,7 @@
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	zero_state = 0;
 	if (vm_phys_zero_pages_idle()) {
-		if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+		if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
 			zero_state = 1;
 	}
 }

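The idle-zero logic above is a hysteresis: once the pre-zeroed page count reaches the high watermark, zeroing stops and does not resume until the count falls back below the low watermark, which avoids cache-thrashing bursts of zeroing.  The small standalone model below illustrates that behaviour; the watermark fractions are only illustrative, not the actual ZIDLE_LO/ZIDLE_HI definitions.

/*
 * Sketch of the zero-idle hysteresis: stop at the high watermark,
 * resume only below the low watermark.
 */
#include <stdbool.h>
#include <stdio.h>

static int zero_state;		/* 1 while the pre-zero pool is "full enough" */

static unsigned
zidle_lo(unsigned freecnt) { return (freecnt * 2 / 3); }	/* assumed */
static unsigned
zidle_hi(unsigned freecnt) { return (freecnt * 4 / 5); }	/* assumed */

static bool
should_zero(unsigned zero_count, unsigned free_count)
{
	if (zero_state && zero_count >= zidle_lo(free_count))
		return (false);
	if (zero_count >= zidle_hi(free_count))
		return (false);
	return (true);
}

int
main(void)
{
	unsigned free_count = 1000, zero_count = 0;

	while (should_zero(zero_count, free_count)) {
		zero_count += 50;		/* "zero" a batch of pages */
		if (zero_count >= zidle_hi(free_count))
			zero_state = 1;
	}
	printf("stopped at %u pre-zeroed pages (state %d)\n",
	    zero_count, zero_state);
	return (0);
}
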
Modified: trunk/sys/vm/vnode_pager.c
===================================================================
--- trunk/sys/vm/vnode_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -52,8 +52,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vnode_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
@@ -83,21 +85,27 @@
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
-static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
+static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+    int *, vop_getpages_iodone_t, void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
+static int vnode_pager_generic_getpages_done(struct buf *);
+static void vnode_pager_generic_getpages_done_async(struct buf *);
 
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
+	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
 int vnode_pbuf_freecnt;
+int vnode_async_pbuf_freecnt;
 
 /* Create the VM system backing object for this vnode */
 int
@@ -157,14 +165,26 @@
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_WLOCK(obj);
+	umtx_shm_object_terminated(obj);
 	if (obj->ref_count == 0) {
 		/*
 		 * don't double-terminate the object
 		 */
-		if ((obj->flags & OBJ_DEAD) == 0)
+		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_terminate(obj);
-		else
+		} else {
+			/*
+			 * Waiters were already handled during object
+			 * termination.  The exclusive vnode lock hopefully
+			 * prevented new waiters from referencing the dying
+			 * object.
+			 */
+			KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
+			    ("OBJ_DISCONNECTWNT set obj %p flags %x",
+			    obj, obj->flags));
+			vp->v_object = NULL;
 			VM_OBJECT_WUNLOCK(obj);
+		}
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
@@ -172,7 +192,7 @@
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
-	vp->v_object = NULL;
+	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
 
 
@@ -241,9 +261,12 @@
 		VI_UNLOCK(vp);
 	} else {
 		object->ref_count++;
+#if VM_NRESERVLEVEL > 0
+		vm_object_color(object, 0);
+#endif
 		VM_OBJECT_WUNLOCK(object);
 	}
-	vref(vp);
+	vrefact(vp);
 	return (object);
 }
 
@@ -251,8 +274,7 @@
  *	The object must be locked.
  */
 static void
-vnode_pager_dealloc(object)
-	vm_object_t object;
+vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 	int refs;
@@ -287,11 +309,8 @@
 }
 
 static boolean_t
-vnode_pager_haspage(object, pindex, before, after)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *before;
-	int *after;
+vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
@@ -338,16 +357,21 @@
 			*before += poff;
 		}
 		if (after) {
-			int numafter;
+			/*
+			 * The BMAP vop can report a partial block in the
+			 * 'after', but must not report blocks after EOF.
+			 * Assert the latter, and truncate 'after' in case
+			 * of the former.
+			 */
+			KASSERT((reqblock + *after) * pagesperblock <
+			    roundup2(object->size, pagesperblock),
+			    ("%s: reqblock %jd after %d size %ju", __func__,
+			    (intmax_t )reqblock, *after,
+			    (uintmax_t )object->size));
 			*after *= pagesperblock;
-			numafter = pagesperblock - (poff + 1);
-			if (IDX_TO_OFF(pindex + numafter) >
-			    object->un_pager.vnp.vnp_size) {
-				numafter =
-		    		    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
-				    pindex;
-			}
-			*after += numafter;
+			*after += pagesperblock - (poff + 1);
+			if (pindex + *after >= object->size)
+				*after = object->size - 1 - pindex;
 		}
 	} else {
 		if (before) {
@@ -370,9 +394,7 @@
  * operation (possibly at object termination time), so we must be careful.
  */
 void
-vnode_pager_setsize(vp, nsize)
-	struct vnode *vp;
-	vm_ooffset_t nsize;
+vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 {
 	vm_object_t object;
 	vm_page_t m;
@@ -445,10 +467,6 @@
 			 * replacement from working properly.
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
-		} else if ((nsize & PAGE_MASK) &&
-		    vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
-			vm_page_cache_free(object, OFF_TO_IDX(nsize),
-			    nobjsize);
 		}
 	}
 	object->un_pager.vnp.vnp_size = nsize;
@@ -497,9 +515,7 @@
  * small block filesystem vnode pager input
  */
 static int
-vnode_pager_input_smlfs(object, m)
-	vm_object_t object;
-	vm_page_t m;
+vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
@@ -591,9 +607,7 @@
  * old style vnode pager input routine
  */
 static int
-vnode_pager_input_old(object, m)
-	vm_object_t object;
-	vm_page_t m;
+vnode_pager_input_old(vm_object_t object, vm_page_t m)
 {
 	struct uio auio;
 	struct iovec aiov;
@@ -666,19 +680,15 @@
  * backing vp's VOP_GETPAGES.
  */
 static int
-vnode_pager_getpages(object, m, count, reqpage)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	int reqpage;
+vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
+	struct vnode *vp;
 	int rtval;
-	struct vnode *vp;
-	int bytes = count * PAGE_SIZE;
 
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
-	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
+	rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 	VM_OBJECT_WLOCK(object);
@@ -685,261 +695,373 @@
 	return rtval;
 }
 
+static int
+vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
+{
+	struct vnode *vp;
+	int rtval;
+
+	vp = object->handle;
+	VM_OBJECT_WUNLOCK(object);
+	rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
+	KASSERT(rtval != EOPNOTSUPP,
+	    ("vnode_pager: FS getpages_async not implemented\n"));
+	VM_OBJECT_WLOCK(object);
+	return (rtval);
+}
+
 /*
+ * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
+ * local filesystems, where partially valid pages can only occur at
+ * the end of file.
+ */
+int
+vnode_pager_local_getpages(struct vop_getpages_args *ap)
+{
+
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+}
+
+int
+vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
+{
+
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
+}
+
+/*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
  */
 int
-vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
-	struct vnode *vp;
-	vm_page_t *m;
-	int bytecount;
-	int reqpage;
+vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
+    int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	vm_object_t object;
 	struct bufobj *bo;
 	struct buf *bp;
-	struct mount *mp;
-	vm_offset_t kva;
-	daddr_t firstaddr, reqblock;
-	off_t foff, nextoff, tfoff, pib;
-	int pbefore, pafter, i, size, bsize, first, last;
-	int count, error, before, after, secmask;
+	off_t foff;
+	int bsize, pagesperblock, *freecnt;
+	int error, before, after, rbehind, rahead, poff, i;
+	int bytecount, secmask;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
-	    ("vnode_pager_generic_getpages does not support devices"));
+	    ("%s does not support devices", __func__));
+
 	if (vp->v_iflag & VI_DOOMED)
 		return (VM_PAGER_BAD);
 
 	object = vp->v_object;
-	count = bytecount / PAGE_SIZE;
+	foff = IDX_TO_OFF(m[0]->pindex);
 	bsize = vp->v_mount->mnt_stat.f_iosize;
+	pagesperblock = bsize / PAGE_SIZE;
 
-	/* get the UNDERLYING device for the file with VOP_BMAP() */
+	KASSERT(foff < object->un_pager.vnp.vnp_size,
+	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
+	KASSERT(count <= sizeof(bp->b_pages),
+	    ("%s: requested %d pages", __func__, count));
 
 	/*
-	 * originally, we did not check for an error return value -- assuming
-	 * an fs always has a bmap entry point -- that assumption is wrong!!!
+	 * The last page has valid blocks.  Invalid part can only
+	 * exist at the end of file, and the page is made fully valid
+	 * by zeroing in vm_pager_get_pages().
 	 */
-	foff = IDX_TO_OFF(m[reqpage]->pindex);
+	if (m[count - 1]->valid != 0 && --count == 0) {
+		if (iodone != NULL)
+			iodone(arg, m, 1, 0);
+		return (VM_PAGER_OK);
+	}
 
 	/*
-	 * if we can't bmap, use old VOP code
+	 * Synchronous and asynchronous paging operations use different
+	 * free pbuf counters.  This is done to avoid asynchronous requests
+	 * to consume all pbufs.
+	 * Allocate the pbuf at the very beginning of the function, so that
+	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
+	 * but sleep.
 	 */
-	error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
-	    &reqblock, &after, &before);
+	freecnt = iodone != NULL ?
+	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
+	bp = getpbuf(freecnt);
+
+	/*
+	 * Get the underlying device blocks for the file with VOP_BMAP().
+	 * If the file system doesn't support VOP_BMAP, use old way of
+	 * getting pages via VOP_READ.
+	 */
+	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
+		relpbuf(bp, freecnt);
 		VM_OBJECT_WLOCK(object);
-		
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		PCPU_INC(cnt.v_vnodein);
-		PCPU_INC(cnt.v_vnodepgsin);
-		error = vnode_pager_input_old(object, m[reqpage]);
+		for (i = 0; i < count; i++) {
+			PCPU_INC(cnt.v_vnodein);
+			PCPU_INC(cnt.v_vnodepgsin);
+			error = vnode_pager_input_old(object, m[i]);
+			if (error)
+				break;
+		}
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
-		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
+		relpbuf(bp, freecnt);
 		return (VM_PAGER_ERROR);
+	}
 
-		/*
-		 * if the blocksize is smaller than a page size, then use
-		 * special small filesystem code.  NFS sometimes has a small
-		 * blocksize, but it can handle large reads itself.
-		 */
-	} else if ((PAGE_SIZE / bsize) > 1 &&
-	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
-		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
-		PCPU_INC(cnt.v_vnodein);
-		PCPU_INC(cnt.v_vnodepgsin);
-		return (vnode_pager_input_smlfs(object, m[reqpage]));
+	/*
+	 * If the file system supports BMAP, but blocksize is smaller
+	 * than a page size, then use special small filesystem code.
+	 */
+	if (pagesperblock == 0) {
+		relpbuf(bp, freecnt);
+		for (i = 0; i < count; i++) {
+			PCPU_INC(cnt.v_vnodein);
+			PCPU_INC(cnt.v_vnodepgsin);
+			error = vnode_pager_input_smlfs(object, m[i]);
+			if (error)
+				break;
+		}
+		return (error);
 	}
 
 	/*
-	 * If we have a completely valid page available to us, we can
-	 * clean up and return.  Otherwise we have to re-read the
-	 * media.
+	 * A sparse file can be encountered only for a single page request,
+	 * which may not be preceded by a call to vm_pager_haspage().
 	 */
-	VM_OBJECT_WLOCK(object);
-	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
+	if (bp->b_blkno == -1) {
+		KASSERT(count == 1,
+		    ("%s: array[%d] request to a sparse file %p", __func__,
+		    count, vp));
+		relpbuf(bp, freecnt);
+		pmap_zero_page(m[0]);
+		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
+		    __func__, m[0]));
+		VM_OBJECT_WLOCK(object);
+		m[0]->valid = VM_PAGE_BITS_ALL;
 		VM_OBJECT_WUNLOCK(object);
-		return VM_PAGER_OK;
-	} else if (reqblock == -1) {
-		pmap_zero_page(m[reqpage]);
-		KASSERT(m[reqpage]->dirty == 0,
-		    ("vnode_pager_generic_getpages: page %p is dirty", m));
-		m[reqpage]->valid = VM_PAGE_BITS_ALL;
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
-	m[reqpage]->valid = 0;
-	VM_OBJECT_WUNLOCK(object);
 
-	pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
-	pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
-	pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
-	first = reqpage < pbefore ? 0 : reqpage - pbefore;
-	last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
-	if (first > 0 || last + 1 < count) {
+	bp->b_blkno += (foff % bsize) / DEV_BSIZE;
+
+	/* Recalculate blocks available after/before to pages. */
+	poff = (foff % bsize) / PAGE_SIZE;
+	before *= pagesperblock;
+	before += poff;
+	after *= pagesperblock;
+	after += pagesperblock - (poff + 1);
+	if (m[0]->pindex + after >= object->size)
+		after = object->size - 1 - m[0]->pindex;
+	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
+	    __func__, count, after + 1));
+	after -= count - 1;
+
+	/* Trim requested rbehind/rahead to possible values. */   
+	rbehind = a_rbehind ? *a_rbehind : 0;
+	rahead = a_rahead ? *a_rahead : 0;
+	rbehind = min(rbehind, before);
+	rbehind = min(rbehind, m[0]->pindex);
+	rahead = min(rahead, after);
+	rahead = min(rahead, object->size - m[count - 1]->pindex);
+	KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages),
+	    ("%s: behind %d ahead %d count %d", __func__,
+	    rbehind, rahead, count));
+
+	/*
+	 * Fill in the bp->b_pages[] array with requested and optional   
+	 * read behind or read ahead pages.  Read behind pages are looked
+	 * up in a backward direction, down to a first cached page.  Same
+	 * for read ahead pages, but there is no need to shift the array
+	 * in case of encountering a cached page.
+	 */
+	i = bp->b_npages = 0;
+	if (rbehind) {
+		vm_pindex_t startpindex, tpindex;
+		vm_page_t p;
+
 		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < first; i++) {
-			vm_page_lock(m[i]);
-			vm_page_free(m[i]);
-			vm_page_unlock(m[i]);
+		startpindex = m[0]->pindex - rbehind;
+		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
+		    p->pindex >= startpindex)
+			startpindex = p->pindex + 1;
+
+		/* tpindex is unsigned; beware of numeric underflow. */
+		for (tpindex = m[0]->pindex - 1;
+		    tpindex >= startpindex && tpindex < m[0]->pindex;
+		    tpindex--, i++) {
+			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+			if (p == NULL) {
+				/* Shift the array. */
+				for (int j = 0; j < i; j++)
+					bp->b_pages[j] = bp->b_pages[j + 
+					    tpindex + 1 - startpindex]; 
+				break;
+			}
+			bp->b_pages[tpindex - startpindex] = p;
 		}
-		for (i = last + 1; i < count; i++) {
-			vm_page_lock(m[i]);
-			vm_page_free(m[i]);
-			vm_page_unlock(m[i]);
+
+		bp->b_pgbefore = i;
+		bp->b_npages += i;
+		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
+	} else
+		bp->b_pgbefore = 0;
+
+	/* Requested pages. */
+	for (int j = 0; j < count; j++, i++)
+		bp->b_pages[i] = m[j];
+	bp->b_npages += count;
+
+	if (rahead) {
+		vm_pindex_t endpindex, tpindex;
+		vm_page_t p;
+
+		if (!VM_OBJECT_WOWNED(object))
+			VM_OBJECT_WLOCK(object);
+		endpindex = m[count - 1]->pindex + rahead + 1;
+		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
+		    p->pindex < endpindex)
+			endpindex = p->pindex;
+		if (endpindex > object->size)
+			endpindex = object->size;
+
+		for (tpindex = m[count - 1]->pindex + 1;
+		    tpindex < endpindex; i++, tpindex++) {
+			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			bp->b_pages[i] = p;
 		}
-		VM_OBJECT_WUNLOCK(object);
-	}
 
-	/*
-	 * here on direct device I/O
-	 */
-	firstaddr = reqblock;
-	firstaddr += pib / DEV_BSIZE;
-	firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;
+		bp->b_pgafter = i - bp->b_npages;
+		bp->b_npages = i;
+	} else
+		bp->b_pgafter = 0;
 
-	/*
-	 * The first and last page have been calculated now, move
-	 * input pages to be zero based, and adjust the count.
-	 */
-	m += first;
-	reqpage -= first;
-	count = last - first + 1;
+	if (VM_OBJECT_WOWNED(object))
+		VM_OBJECT_WUNLOCK(object);
 
-	/*
-	 * calculate the file virtual address for the transfer
-	 */
-	foff = IDX_TO_OFF(m[0]->pindex);
+	/* Report back actual behind/ahead read. */
+	if (a_rbehind)
+		*a_rbehind = bp->b_pgbefore;
+	if (a_rahead)
+		*a_rahead = bp->b_pgafter;
 
-	/*
-	 * calculate the size of the transfer
-	 */
-	size = count * PAGE_SIZE;
-	KASSERT(count > 0, ("zero count"));
-	if ((foff + size) > object->un_pager.vnp.vnp_size)
-		size = object->un_pager.vnp.vnp_size - foff;
-	KASSERT(size > 0, ("zero size"));
+	KASSERT(bp->b_npages <= sizeof(bp->b_pages),
+	    ("%s: buf %p overflowed", __func__, bp));
 
 	/*
-	 * round up physical size for real devices.
+	 * Recalculate first offset and bytecount with regard to read behind.
+	 * Truncate bytecount to vnode real size and round up physical size
+	 * for real devices.
 	 */
+	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+	bytecount = bp->b_npages << PAGE_SHIFT;
+	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
+		bytecount = object->un_pager.vnp.vnp_size - foff;
 	secmask = bo->bo_bsize - 1;
 	KASSERT(secmask < PAGE_SIZE && secmask > 0,
-	    ("vnode_pager_generic_getpages: sector size %d too large",
-	    secmask + 1));
-	size = (size + secmask) & ~secmask;
+	    ("%s: sector size %d too large", __func__, secmask + 1));
+	bytecount = (bytecount + secmask) & ~secmask;
 
-	bp = getpbuf(&vnode_pbuf_freecnt);
-	kva = (vm_offset_t)bp->b_data;
-
 	/*
-	 * and map the pages to be read into the kva, if the filesystem
+	 * And map the pages to be read into the kva, if the filesystem
 	 * requires mapped buffers.
 	 */
-	mp = vp->v_mount;
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
+	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
 	    unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
-		bp->b_kvabase = unmapped_buf;
 		bp->b_offset = 0;
-		bp->b_flags |= B_UNMAPPED;
-		bp->b_npages = count;
-		for (i = 0; i < count; i++)
-			bp->b_pages[i] = m[i];
-	} else
-		pmap_qenter(kva, m, count);
+	} else {
+		bp->b_data = bp->b_kvabase;
+		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+	}
 
-	/* build a minimal buffer header */
+	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
-	bp->b_iodone = bdone;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
-	bp->b_blkno = firstaddr;
 	pbgetbo(bo, bp);
 	bp->b_vp = vp;
-	bp->b_bcount = size;
-	bp->b_bufsize = size;
-	bp->b_runningbufspace = bp->b_bufsize;
+	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
+	bp->b_iooffset = dbtob(bp->b_blkno);
+
 	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
-
 	PCPU_INC(cnt.v_vnodein);
-	PCPU_ADD(cnt.v_vnodepgsin, count);
+	PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages);
 
-	/* do the input */
-	bp->b_iooffset = dbtob(bp->b_blkno);
-	bstrategy(bp);
+	if (iodone != NULL) { /* async */
+		bp->b_pgiodone = iodone;
+		bp->b_caller1 = arg;
+		bp->b_iodone = vnode_pager_generic_getpages_done_async;
+		bp->b_flags |= B_ASYNC;
+		BUF_KERNPROC(bp);
+		bstrategy(bp);
+		return (VM_PAGER_OK);
+	} else {
+		bp->b_iodone = bdone;
+		bstrategy(bp);
+		bwait(bp, PVM, "vnread");
+		error = vnode_pager_generic_getpages_done(bp);
+		for (i = 0; i < bp->b_npages; i++)
+			bp->b_pages[i] = NULL;
+		bp->b_vp = NULL;
+		pbrelbo(bp);
+		relpbuf(bp, &vnode_pbuf_freecnt);
+		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+	}
+}
 
-	bwait(bp, PVM, "vnread");
+static void
+vnode_pager_generic_getpages_done_async(struct buf *bp)
+{
+	int error;
 
-	if ((bp->b_ioflags & BIO_ERROR) != 0)
-		error = EIO;
+	error = vnode_pager_generic_getpages_done(bp);
+	/* Run the iodone upon the requested range. */
+	bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore,
+	    bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
+	for (int i = 0; i < bp->b_npages; i++)
+		bp->b_pages[i] = NULL;
+	bp->b_vp = NULL;
+	pbrelbo(bp);
+	relpbuf(bp, &vnode_async_pbuf_freecnt);
+}
 
-	if (error == 0 && size != count * PAGE_SIZE) {
-		if ((bp->b_flags & B_UNMAPPED) != 0) {
-			bp->b_flags &= ~B_UNMAPPED;
-			pmap_qenter(kva, m, count);
+static int
+vnode_pager_generic_getpages_done(struct buf *bp)
+{
+	vm_object_t object;
+	off_t tfoff, nextoff;
+	int i, error;
+
+	error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
+	object = bp->b_vp->v_object;
+
+	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
+		if (!buf_mapped(bp)) {
+			bp->b_data = bp->b_kvabase;
+			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
+			    bp->b_npages);
 		}
-		bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
+		bzero(bp->b_data + bp->b_bcount,
+		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
 	}
-	if ((bp->b_flags & B_UNMAPPED) == 0)
-		pmap_qremove(kva, count);
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
-		bp->b_data = (caddr_t)kva;
-		bp->b_kvabase = (caddr_t)kva;
-		bp->b_flags &= ~B_UNMAPPED;
-		for (i = 0; i < count; i++)
-			bp->b_pages[i] = NULL;
+	if (buf_mapped(bp)) {
+		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+		bp->b_data = unmapped_buf;
 	}
 
-	/*
-	 * free the buffer header back to the swap buffer pool
-	 */
-	bp->b_vp = NULL;
-	pbrelbo(bp);
-	relpbuf(bp, &vnode_pbuf_freecnt);
-
 	VM_OBJECT_WLOCK(object);
-	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
+	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+	    i < bp->b_npages; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
-		mt = m[i];
+		mt = bp->b_pages[i];
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
@@ -947,11 +1069,9 @@
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			KASSERT(mt->dirty == 0,
-			    ("vnode_pager_generic_getpages: page %p is dirty",
-			    mt));
+			    ("%s: page %p is dirty", __func__, mt));
 			KASSERT(!pmap_page_is_mapped(mt),
-			    ("vnode_pager_generic_getpages: page %p is mapped",
-			    mt));
+			    ("%s: page %p is mapped", __func__, mt));
 		} else {
 			/*
 			 * Read did not fill up entire page.
@@ -964,18 +1084,17 @@
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
-			    ("vnode_pager_generic_getpages: page %p is dirty",
-			    mt));
+			    ("%s: page %p is dirty", __func__, mt));
 		}
-		
-		if (i != reqpage)
+
+		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
 			vm_page_readahead_finish(mt);
 	}
 	VM_OBJECT_WUNLOCK(object);
-	if (error) {
-		printf("vnode_pager_getpages: I/O read error\n");
-	}
-	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+	if (error != 0)
+		printf("%s: I/O read error %d\n", __func__, error);
+
+	return (error);
 }
 
 /*
@@ -1006,7 +1125,7 @@
 	 * daemon up.  This should probably be addressed XXX.
 	 */
 
-	if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min)
+	if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min)
 		flags |= VM_PAGER_PUT_SYNC;
 
 	/*
@@ -1014,19 +1133,36 @@
 	 */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
-	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0);
+	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
 	KASSERT(rtval != EOPNOTSUPP, 
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_WLOCK(object);
 }
 
+static int
+vn_off2bidx(vm_ooffset_t offset)
+{
 
+	return ((offset & PAGE_MASK) / DEV_BSIZE);
+}
+
+static bool
+vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
+{
+
+	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
+	    offset < IDX_TO_OFF(m->pindex + 1),
+	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
+	    (uintmax_t)offset));
+	return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
+}
+
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_PUTPAGES.
  *
  * This is typically called indirectly via the pageout daemon and
- * clustering has already typically occured, so in general we ask the
+ * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * then delayed.
  */
@@ -1034,18 +1170,14 @@
 vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
     int flags, int *rtvals)
 {
-	int i;
 	vm_object_t object;
 	vm_page_t m;
-	int count;
-
-	int maxsize, ncount;
-	vm_ooffset_t poffset;
+	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
 	struct uio auio;
 	struct iovec aiov;
-	int error;
-	int ioflags;
-	int ppscheck = 0;
+	off_t prev_resid, wrsz;
+	int count, error, i, maxsize, ncount, pgoff, ppscheck;
+	bool in_hole;
 	static struct timeval lastfail;
 	static int curfail;
 
@@ -1056,10 +1188,11 @@
 		rtvals[i] = VM_PAGER_ERROR;
 
 	if ((int64_t)ma[0]->pindex < 0) {
-		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
-		    (long)ma[0]->pindex, (u_long)ma[0]->dirty);
+		printf("vnode_pager_generic_putpages: "
+		    "attempt to write meta-data 0x%jx(%lx)\n",
+		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
-		return VM_PAGER_BAD;
+		return (VM_PAGER_BAD);
 	}
 
 	maxsize = count * PAGE_SIZE;
@@ -1069,7 +1202,7 @@
 
 	/*
 	 * If the page-aligned write is larger then the actual file we
-	 * have to invalidate pages occuring beyond the file EOF.  However,
+	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
@@ -1079,14 +1212,20 @@
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
-	VM_OBJECT_WLOCK(object);
+	VM_OBJECT_RLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
+		if (!VM_OBJECT_TRYUPGRADE(object)) {
+			VM_OBJECT_RUNLOCK(object);
+			VM_OBJECT_WLOCK(object);
+			if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
+				goto downgrade;
+		}
 		if (object->un_pager.vnp.vnp_size > poffset) {
-			int pgoff;
-
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
+				pgoff = roundup2(pgoff, DEV_BSIZE);
+
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
@@ -1097,6 +1236,7 @@
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 		("vnode_pager_generic_putpages: page %p is not read-only", m));
+				MPASS(m->dirty != 0);
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
@@ -1104,64 +1244,152 @@
 			maxsize = 0;
 			ncount = 0;
 		}
-		if (ncount < count) {
-			for (i = ncount; i < count; i++) {
-				rtvals[i] = VM_PAGER_BAD;
+		for (i = ncount; i < count; i++)
+			rtvals[i] = VM_PAGER_BAD;
+downgrade:
+		VM_OBJECT_LOCK_DOWNGRADE(object);
+	}
+
+	auio.uio_iov = &aiov;
+	auio.uio_segflg = UIO_NOCOPY;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = NULL;
+	maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
+
+	for (prev_offset = poffset; prev_offset < maxblksz;) {
+		/* Skip clean blocks. */
+		for (in_hole = true; in_hole && prev_offset < maxblksz;) {
+			m = ma[OFF_TO_IDX(prev_offset - poffset)];
+			for (i = vn_off2bidx(prev_offset);
+			    i < sizeof(vm_page_bits_t) * NBBY &&
+			    prev_offset < maxblksz; i++) {
+				if (vn_dirty_blk(m, prev_offset)) {
+					in_hole = false;
+					break;
+				}
+				prev_offset += DEV_BSIZE;
 			}
 		}
+		if (in_hole)
+			goto write_done;
+
+		/* Find longest run of dirty blocks. */
+		for (next_offset = prev_offset; next_offset < maxblksz;) {
+			m = ma[OFF_TO_IDX(next_offset - poffset)];
+			for (i = vn_off2bidx(next_offset);
+			    i < sizeof(vm_page_bits_t) * NBBY &&
+			    next_offset < maxblksz; i++) {
+				if (!vn_dirty_blk(m, next_offset))
+					goto start_write;
+				next_offset += DEV_BSIZE;
+			}
+		}
+start_write:
+		if (next_offset > poffset + maxsize)
+			next_offset = poffset + maxsize;
+
+		/*
+		 * Getting here requires finding a dirty block in the
+		 * 'skip clean blocks' loop.
+		 */
+		MPASS(prev_offset < next_offset);
+
+		VM_OBJECT_RUNLOCK(object);
+		aiov.iov_base = NULL;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = prev_offset;
+		prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
+		    prev_offset;
+		error = VOP_WRITE(vp, &auio,
+		    vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
+
+		wrsz = prev_resid - auio.uio_resid;
+		if (wrsz == 0) {
+			if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
+				vn_printf(vp, "vnode_pager_putpages: "
+				    "zero-length write at %ju resid %zd\n",
+				    auio.uio_offset, auio.uio_resid);
+			}
+			VM_OBJECT_RLOCK(object);
+			break;
+		}
+
+		/* Adjust the starting offset for next iteration. */
+		prev_offset += wrsz;
+		MPASS(auio.uio_offset == prev_offset);
+
+		ppscheck = 0;
+		if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
+		    &curfail, 1)) != 0)
+			vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
+			    error);
+		if (auio.uio_resid != 0 && (ppscheck != 0 ||
+		    ppsratecheck(&lastfail, &curfail, 1) != 0))
+			vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
+			    "at %ju\n", auio.uio_resid,
+			    (uintmax_t)ma[0]->pindex);
+		VM_OBJECT_RLOCK(object);
+		if (error != 0 || auio.uio_resid != 0)
+			break;
 	}
-	VM_OBJECT_WUNLOCK(object);
+write_done:
+	/* Mark completely processed pages. */
+	for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
+		rtvals[i] = VM_PAGER_OK;
+	/* Mark partial EOF page. */
+	if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
+		rtvals[i++] = VM_PAGER_OK;
+	/* Unwritten pages in range, free bonus if the page is clean. */
+	for (; i < ncount; i++)
+		rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
+	VM_OBJECT_RUNLOCK(object);
+	PCPU_ADD(cnt.v_vnodepgsout, i);
+	PCPU_INC(cnt.v_vnodeout);
+	return (rtvals[0]);
+}
 
+int
+vnode_pager_putpages_ioflags(int pager_flags)
+{
+	int ioflags;
+
 	/*
-	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
-	 * rather then a bdwrite() to prevent paging I/O from saturating 
-	 * the buffer cache.  Dummy-up the sequential heuristic to cause
-	 * large ranges to cluster.  If neither IO_SYNC or IO_ASYNC is set,
-	 * the system decides how to cluster.
+	 * Pageouts are already clustered, use IO_ASYNC to force a
+	 * bawrite() rather then a bdwrite() to prevent paging I/O
+	 * from saturating the buffer cache.  Dummy-up the sequential
+	 * heuristic to cause large ranges to cluster.  If neither
+	 * IO_SYNC or IO_ASYNC is set, the system decides how to
+	 * cluster.
 	 */
 	ioflags = IO_VMIO;
-	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
+	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
 		ioflags |= IO_SYNC;
-	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
+	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
-	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
+	ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
+	ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
 	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
-
-	aiov.iov_base = (caddr_t) 0;
-	aiov.iov_len = maxsize;
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	auio.uio_offset = poffset;
-	auio.uio_segflg = UIO_NOCOPY;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_resid = maxsize;
-	auio.uio_td = (struct thread *) 0;
-	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
-	PCPU_INC(cnt.v_vnodeout);
-	PCPU_ADD(cnt.v_vnodepgsout, ncount);
-
-	if (error) {
-		if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
-			printf("vnode_pager_putpages: I/O error %d\n", error);
-	}
-	if (auio.uio_resid) {
-		if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
-			printf("vnode_pager_putpages: residual I/O %zd at %lu\n",
-			    auio.uio_resid, (u_long)ma[0]->pindex);
-	}
-	for (i = 0; i < ncount; i++) {
-		rtvals[i] = VM_PAGER_OK;
-	}
-	return rtvals[0];
+	return (ioflags);
 }
 
+/*
+ * vnode_pager_undirty_pages().
+ *
+ * A helper to mark pages as clean after pageout that was possibly
+ * done with a short write.  The lpos argument specifies the page run
+ * length in bytes, and the written argument specifies how many bytes
+ * were actually written.  eof is the offset past the last valid byte
+ * in the vnode using the absolute file position of the first byte in
+ * the run as the base from which it is computed.
+ */
 void
-vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written)
+vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
+    int lpos)
 {
 	vm_object_t obj;
-	int i, pos;
+	int i, pos, pos_devb;
 
-	if (written == 0)
+	if (written == 0 && eof >= lpos)
 		return;
 	obj = ma[0]->object;
 	VM_OBJECT_WLOCK(obj);
@@ -1175,6 +1403,37 @@
 			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
 		}
 	}
+	if (eof >= lpos) /* avoid truncation */
+		goto done;
+	for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
+		if (pos != trunc_page(pos)) {
+			/*
+			 * The page contains the last valid byte in
+			 * the vnode, mark the rest of the page as
+			 * clean, potentially making the whole page
+			 * clean.
+			 */
+			pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
+			vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
+			    pos_devb);
+
+			/*
+			 * If the page was cleaned, report the pageout
+			 * on it as successful.  msync() no longer
+			 * needs to write out the page, endlessly
+			 * creating write requests and dirty buffers.
+			 */
+			if (ma[i]->dirty == 0)
+				rtvals[i] = VM_PAGER_OK;
+
+			pos = round_page(pos);
+		} else {
+			/* vm_pageout_flush() clears dirty */
+			rtvals[i] = VM_PAGER_BAD;
+			pos += PAGE_SIZE;
+		}
+	}
+done:
 	VM_OBJECT_WUNLOCK(obj);
 }
 

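The rewritten vnode_pager_generic_getpages() above converts the block-level before/after counts reported by VOP_BMAP() into page counts, clamps them against the object size, and then trims the caller's requested read-behind/read-ahead to what is actually possible.  That arithmetic can be modelled in isolation; the standalone sketch below uses stand-in page and block sizes and is not the kernel code itself.

/*
 * Userland model of the read-behind/read-ahead clamping done by the
 * new vnode_pager_generic_getpages().
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE	4096

static int
imin(int a, int b) { return (a < b ? a : b); }

/*
 * pindex       - index of the first requested page in the object
 * count        - number of requested pages
 * objsize      - object size in pages
 * bsize        - filesystem block size in bytes
 * before/after - whole blocks the FS reports contiguous around the request
 */
static void
clamp_readaround(int pindex, int count, int objsize, int bsize,
    int before, int after, int *rbehind, int *rahead)
{
	int pagesperblock, poff;

	pagesperblock = bsize / MODEL_PAGE_SIZE;
	poff = pindex % pagesperblock;

	before = before * pagesperblock + poff;
	after = after * pagesperblock + pagesperblock - (poff + 1);
	if (pindex + after >= objsize)
		after = objsize - 1 - pindex;
	after -= count - 1;

	*rbehind = imin(imin(*rbehind, before), pindex);
	*rahead = imin(imin(*rahead, after),
	    objsize - (pindex + count - 1));
}

int
main(void)
{
	int rbehind = 16, rahead = 16;

	clamp_readaround(10, 2, 64, 32768, 1, 1, &rbehind, &rahead);
	printf("rbehind %d rahead %d\n", rbehind, rahead);
	return (0);
}
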
Modified: trunk/sys/vm/vnode_pager.h
===================================================================
--- trunk/sys/vm/vnode_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vnode_pager.h	8.1 (Berkeley) 6/11/93
- * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $
+ * $FreeBSD: stable/11/sys/vm/vnode_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_VNODE_PAGER_
@@ -42,14 +42,17 @@
 #ifdef _KERNEL
 
 int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m,
-					  int count, int reqpage);
+    int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone,
+    void *arg);
 int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m,
-					  int count, boolean_t sync,
-					  int *rtvals);
-
+    int count, int flags, int *rtvals);
+int vnode_pager_local_getpages(struct vop_getpages_args *ap);
+int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap);
+int vnode_pager_putpages_ioflags(int pager_flags);
 void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end);
-void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written);
+void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written,
+    off_t eof, int lpos);
 void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end);
 

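The vnode_pager_generic_putpages() rewrite shown earlier stops writing whole page runs and instead walks DEV_BSIZE-sized chunks, skipping clean chunks and issuing one VOP_WRITE() per maximal run of dirty chunks.  The standalone model below shows the scan only, with stand-in sizes and a one-byte-per-page dirty mask in place of vm_page_bits_t.

/*
 * Userland model of the dirty-block run scan: write maximal runs of
 * dirty DEV_BSIZE chunks rather than whole pages.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MODEL_PAGE_SIZE	4096
#define MODEL_DEV_BSIZE	512

/* One uint8_t per page: bit i set => chunk i of that page is dirty. */
static bool
blk_dirty(const uint8_t *dirty, long off)
{
	long page = off / MODEL_PAGE_SIZE;
	int bit = (off % MODEL_PAGE_SIZE) / MODEL_DEV_BSIZE;

	return ((dirty[page] & (1u << bit)) != 0);
}

int
main(void)
{
	/* Two pages: chunks 2-5 of page 0 and 0-1 of page 1 are dirty. */
	uint8_t dirty[2] = { 0x3c, 0x03 };
	long maxoff = 2 * MODEL_PAGE_SIZE, off = 0, start;

	while (off < maxoff) {
		/* Skip clean chunks. */
		while (off < maxoff && !blk_dirty(dirty, off))
			off += MODEL_DEV_BSIZE;
		if (off >= maxoff)
			break;
		/* Extend over the run of dirty chunks. */
		start = off;
		while (off < maxoff && blk_dirty(dirty, off))
			off += MODEL_DEV_BSIZE;
		printf("write [%ld, %ld)\n", start, off);
	}
	return (0);
}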

From laffer1 at midnightbsd.org  Sat Feb  8 14:38:54 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:38:54 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12315] trunk/sys/tools/vnode_if.awk: sync
 with FreeBSD 11-stable
Message-ID: <202002081938.018Jcseu062935@stargazer.midnightbsd.org>

Revision: 12315
          http://svnweb.midnightbsd.org/src/?rev=12315
Author:   laffer1
Date:     2020-02-08 14:38:54 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/tools/vnode_if.awk

Modified: trunk/sys/tools/vnode_if.awk
===================================================================
--- trunk/sys/tools/vnode_if.awk	2020-02-08 19:35:48 UTC (rev 12314)
+++ trunk/sys/tools/vnode_if.awk	2020-02-08 19:38:54 UTC (rev 12315)
@@ -30,7 +30,7 @@
 
 #
 #	@(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
-# $FreeBSD: stable/10/sys/tools/vnode_if.awk 289798 2015-10-23 07:40:43Z avg $
+# $FreeBSD: stable/11/sys/tools/vnode_if.awk 331722 2018-03-29 02:50:57Z eadler $
 # $MidnightBSD$
 #
 # Script to produce VFS front-end sugar.
@@ -166,8 +166,6 @@
 
 if (cfile) {
 	printc(common_head \
-	    "#include \"opt_kdtrace.h\"\n" \
-	    "\n" \
 	    "#include <sys/param.h>\n" \
 	    "#include <sys/event.h>\n" \
 	    "#include <sys/kernel.h>\n" \


From laffer1 at midnightbsd.org  Sat Feb  8 14:39:08 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:39:08 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12316] trunk/sys/ufs: sync with FreeBSD
 11-stable
Message-ID: <202002081939.018Jd8ZR062993@stargazer.midnightbsd.org>

Revision: 12316
          http://svnweb.midnightbsd.org/src/?rev=12316
Author:   laffer1
Date:     2020-02-08 14:39:08 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/ufs/ffs/ffs_alloc.c
    trunk/sys/ufs/ffs/ffs_balloc.c
    trunk/sys/ufs/ffs/ffs_extern.h
    trunk/sys/ufs/ffs/ffs_inode.c
    trunk/sys/ufs/ffs/ffs_rawread.c
    trunk/sys/ufs/ffs/ffs_snapshot.c
    trunk/sys/ufs/ffs/ffs_softdep.c
    trunk/sys/ufs/ffs/ffs_subr.c
    trunk/sys/ufs/ffs/ffs_suspend.c
    trunk/sys/ufs/ffs/ffs_tables.c
    trunk/sys/ufs/ffs/ffs_vfsops.c
    trunk/sys/ufs/ffs/ffs_vnops.c
    trunk/sys/ufs/ffs/fs.h
    trunk/sys/ufs/ffs/softdep.h
    trunk/sys/ufs/ufs/README.acls
    trunk/sys/ufs/ufs/README.extattr
    trunk/sys/ufs/ufs/acl.h
    trunk/sys/ufs/ufs/dinode.h
    trunk/sys/ufs/ufs/dir.h
    trunk/sys/ufs/ufs/dirhash.h
    trunk/sys/ufs/ufs/extattr.h
    trunk/sys/ufs/ufs/gjournal.h
    trunk/sys/ufs/ufs/inode.h
    trunk/sys/ufs/ufs/quota.h
    trunk/sys/ufs/ufs/ufs_acl.c
    trunk/sys/ufs/ufs/ufs_bmap.c
    trunk/sys/ufs/ufs/ufs_dirhash.c
    trunk/sys/ufs/ufs/ufs_extattr.c
    trunk/sys/ufs/ufs/ufs_extern.h
    trunk/sys/ufs/ufs/ufs_gjournal.c
    trunk/sys/ufs/ufs/ufs_inode.c
    trunk/sys/ufs/ufs/ufs_lookup.c
    trunk/sys/ufs/ufs/ufs_quota.c
    trunk/sys/ufs/ufs/ufs_vfsops.c
    trunk/sys/ufs/ufs/ufs_vnops.c
    trunk/sys/ufs/ufs/ufsmount.h

Modified: trunk/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_alloc.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_alloc.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_alloc.c 306630 2016-10-03 10:15:16Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_alloc.c 344861 2019-03-06 23:59:56Z mckusick $");
 
 #include "opt_quota.h"
 
@@ -164,13 +164,13 @@
 #endif
 
 	*bnp = 0;
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
-		    devtoname(ip->i_dev), (long)fs->fs_bsize, size,
+		    devtoname(ump->um_dev), (long)fs->fs_bsize, size,
 		    fs->fs_fsmnt);
 		panic("ffs_alloc: bad size");
 	}
@@ -261,9 +261,9 @@
 	int64_t delta;
 
 	vp = ITOV(ip);
-	fs = ip->i_fs;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	bp = NULL;
-	ump = ip->i_ump;
 	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
@@ -274,7 +274,7 @@
 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 		printf(
 		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
-		    devtoname(ip->i_dev), (long)fs->fs_bsize, osize,
+		    devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
 		    nsize, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad size");
 	}
@@ -289,7 +289,7 @@
 	}
 	if (bprev == 0) {
 		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
-		    devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev,
+		    devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
 		    fs->fs_fsmnt);
 		panic("ffs_realloccg: bad bprev");
 	}
@@ -384,7 +384,7 @@
 		break;
 	default:
 		printf("dev = %s, optim = %ld, fs = %s\n",
-		    devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt);
+		    devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
@@ -392,7 +392,7 @@
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
-			ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
+			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
 			    ip->i_number, vp->v_type, NULL);
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
@@ -482,9 +482,19 @@
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
+	struct ufsmount *ump;
 
-	if (doreallocblks == 0)
+	/*
+	 * If the underlying device can do deletes, then skip reallocating
+	 * the blocks of this file into contiguous sequences. Devices that
+	 * benefit from BIO_DELETE also benefit from not moving the data.
+	 * These devices are flash and therefore work less well with this
+	 * optimization. Also skip if reallocblks has been disabled globally.
+	 */
+	ump = ap->a_vp->v_mount->mnt_data;
+	if (ump->um_candelete || doreallocblks == 0)
 		return (ENOSPC);
+
 	/*
 	 * We can't wait in softdep prealloc as it may fsync and recurse
 	 * here.  Instead we simply fail to reallocate blocks if this
@@ -493,7 +503,7 @@
 	if (DOINGSOFTDEP(ap->a_vp))
 		if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
 			return (ENOSPC);
-	if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
+	if (ump->um_fstype == UFS1)
 		return (ffs_reallocblks_ufs1(ap));
 	return (ffs_reallocblks_ufs2(ap));
 }
@@ -520,8 +530,8 @@
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
@@ -718,7 +728,7 @@
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (!DOINGSOFTDEP(vp))
-			ffs_blkfree(ump, fs, ip->i_devvp,
+			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
@@ -769,8 +779,8 @@
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
@@ -895,7 +905,7 @@
 	 */
 #ifdef DEBUG
 	if (prtrealloc)
-		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
+		printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
@@ -966,7 +976,7 @@
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (!DOINGSOFTDEP(vp))
-			ffs_blkfree(ump, fs, ip->i_devvp,
+			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
@@ -1031,8 +1041,8 @@
 
 	*vpp = NULL;
 	pip = VTOI(pvp);
-	fs = pip->i_fs;
-	ump = pip->i_ump;
+	ump = ITOUMP(pip);
+	fs = ump->um_fs;
 
 	UFS_LOCK(ump);
 	reclaimed = 0;
@@ -1079,8 +1089,8 @@
 	ip = VTOI(*vpp);
 	if (ip->i_mode) {
 dup_alloc:
-		printf("mode = 0%o, inum = %lu, fs = %s\n",
-		    ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt);
+		printf("mode = 0%o, inum = %ju, fs = %s\n",
+		    ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
 		panic("ffs_valloc: dup alloc");
 	}
 	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
@@ -1093,8 +1103,8 @@
 	/*
 	 * Set up a new generation number for this inode.
 	 */
-	if (ip->i_gen == 0 || ++ip->i_gen == 0)
-		ip->i_gen = arc4random() / 2 + 1;
+	while (ip->i_gen == 0 || ++ip->i_gen == 0)
+		ip->i_gen = arc4random();
 	DIP_SET(ip, i_gen, ip->i_gen);
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		vfs_timestamp(&ts);
@@ -1105,10 +1115,12 @@
 	ip->i_flag = 0;
 	(*vpp)->v_vflag = 0;
 	(*vpp)->v_type = VNON;
-	if (fs->fs_magic == FS_UFS2_MAGIC)
+	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		(*vpp)->v_op = &ffs_vnodeops2;
-	else
+		ip->i_flag |= IN_UFS2;
+	} else {
 		(*vpp)->v_op = &ffs_vnodeops1;
+	}
 	return (0);
 noinodes:
 	if (reclaimed == 0) {
@@ -1149,8 +1161,8 @@
 	u_int mincg, minndir;
 	u_int maxcontigdirs;
 
-	mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED);
-	fs = pip->i_fs;
+	mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
+	fs = ITOFS(pip);
 
 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
@@ -1217,16 +1229,17 @@
 	 * We scan from our preferred cylinder group forward looking
 	 * for a cylinder group that meets our criterion. If we get
 	 * to the final cylinder group and do not find anything,
-	 * we start scanning backwards from our preferred cylinder
-	 * group. The ideal would be to alternate looking forward
-	 * and backward, but that is just too complex to code for
-	 * the gain it would get. The most likely place where the
-	 * backward scan would take effect is when we start near
-	 * the end of the filesystem and do not find anything from
-	 * where we are to the end. In that case, scanning backward
-	 * will likely find us a suitable cylinder group much closer
-	 * to our desired location than if we were to start scanning
-	 * forward from the beginning of the filesystem.
+	 * we start scanning forwards from the beginning of the
+	 * filesystem. While it might seem sensible to start scanning
+	 * backwards or even to alternate looking forward and backward,
+	 * this approach fails badly when the filesystem is nearly full.
+	 * Specifically, we first search all the areas that have no space
+	 * and finally try the one preceding that. We repeat this on
+	 * every request and in the case of the final block end up
+	 * searching the entire filesystem. By jumping to the front
+	 * of the filesystem, our future forward searches always look
+	 * in new cylinder groups, so we find every possible block after
+	 * one pass over the filesystem.
 	 */
 	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
@@ -1297,8 +1310,8 @@
 	ufs2_daddr_t pref;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
-	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
-	fs = ip->i_fs;
+	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
+	fs = ITOFS(ip);
 	/*
 	 * Allocation of indirect blocks is indicated by passing negative
 	 * values in indx: -1 for single indirect, -2 for double indirect,
@@ -1341,7 +1354,7 @@
 	/*
 	 * If we are at the beginning of a file, or we have already allocated
 	 * the maximum number of blocks per cylinder group, or we do not
-	 * have a block allocated immediately preceeding us, then we need
+	 * have a block allocated immediately preceding us, then we need
 	 * to decide where to start allocating new blocks.
 	 */
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
@@ -1402,8 +1415,8 @@
 	ufs2_daddr_t pref;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
-	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
-	fs = ip->i_fs;
+	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
+	fs = ITOFS(ip);
 	/*
 	 * Allocation of indirect blocks is indicated by passing negative
 	 * values in indx: -1 for single indirect, -2 for double indirect,
@@ -1446,7 +1459,7 @@
 	/*
 	 * If we are at the beginning of a file, or we have already allocated
 	 * the maximum number of blocks per cylinder group, or we do not
-	 * have a block allocated immediately preceeding us, then we need
+	 * have a block allocated immediately preceding us, then we need
 	 * to decide where to start allocating new blocks.
 	 */
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
@@ -1516,12 +1529,12 @@
 	ufs2_daddr_t result;
 	u_int i, icg = cg;
 
-	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
+	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 #ifdef INVARIANTS
 	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_hashalloc: allocation on suspended filesystem");
 #endif
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	/*
 	 * 1: preferred cylinder group
 	 */
@@ -1579,8 +1592,8 @@
 	int i, error;
 	u_int8_t *blksfree;
 
-	ump = ip->i_ump;
-	fs = ip->i_fs;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 		return (0);
 	frags = numfrags(fs, nsize);
@@ -1590,8 +1603,8 @@
 		return (0);
 	}
 	UFS_UNLOCK(ump);
-	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
-		(int)fs->fs_cgsize, NOCRED, &bp);
+	error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
+	    (int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		goto fail;
 	cgp = (struct cg *)bp->b_data;
@@ -1663,13 +1676,13 @@
 	int i, allocsiz, error, frags;
 	u_int8_t *blksfree;
 
-	ump = ip->i_ump;
-	fs = ip->i_fs;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
 		return (0);
 	UFS_UNLOCK(ump);
-	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
-		(int)fs->fs_cgsize, NOCRED, &bp);
+	error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
+	    (int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		goto fail;
 	cgp = (struct cg *)bp->b_data;
@@ -1765,8 +1778,8 @@
 	u_int8_t *blksfree;
 	int i, cgbpref;
 
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	cgp = (struct cg *)bp->b_data;
 	blksfree = cg_blksfree(cgp);
@@ -1851,12 +1864,12 @@
 	int32_t *lp;
 	u_int8_t *blksfree;
 
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	if (fs->fs_maxcluster[cg] < len)
 		return (0);
 	UFS_UNLOCK(ump);
-	if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
+	if (bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
 	    NOCRED, &bp))
 		goto fail_lock;
 	cgp = (struct cg *)bp->b_data;
@@ -1955,13 +1968,23 @@
 {
 	struct fs *fs;
 
-	fs = ip->i_fs;
-	return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
+	fs = ITOFS(ip);
+	return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
 	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
 	    gbflags));
 }
 
 /*
+ * Synchronous inode initialization is needed only when barrier writes do not
+ * work as advertised, and will impose a heavy cost on file creation in a newly
+ * created filesystem.
+ */
+static int doasyncinodeinit = 1;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
+    &doasyncinodeinit, 0,
+    "Perform inode block initialization using asynchronous writes");
+
+/*
  * Determine whether an inode can be allocated.
  *
  * Check to see if an inode is available, and if it is,
@@ -1987,13 +2010,13 @@
 	int error, start, len, i;
 	u_int32_t old_initediblk;
 
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 check_nifree:
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
-	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+	error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
@@ -2070,9 +2093,11 @@
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
-			dp2->di_gen = arc4random() / 2 + 1;
+			while (dp2->di_gen == 0)
+				dp2->di_gen = arc4random();
 			dp2++;
 		}
+
 		/*
 		 * Rather than adding a soft updates dependency to ensure
 		 * that the new inode block is written before it is claimed
@@ -2082,7 +2107,10 @@
 		 * written. The barrier write should only slow down bulk
 		 * loading of newly created filesystems.
 		 */
-		babarrierwrite(ibp);
+		if (doasyncinodeinit)
+			babarrierwrite(ibp);
+		else
+			bwrite(ibp);
 
 		/*
 		 * After the inode block is written, try to update the
@@ -2090,7 +2118,7 @@
 		 * to it, then leave it unchanged as the other thread
 		 * has already set it correctly.
 		 */
-		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+		error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		    (int)fs->fs_cgsize, NOCRED, &bp);
 		UFS_LOCK(ump);
 		ACTIVECLEAR(fs, cg);
@@ -2155,7 +2183,8 @@
 	cg = dtog(fs, bno);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
-		dev = VTOI(devvp)->i_devvp->v_rdev;
+		MPASS(devvp->v_mount->mnt_data == ump);
+		dev = ump->um_devvp->v_rdev;
 		cgblkno = fragstoblks(fs, cgtod(fs, cg));
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
@@ -2386,7 +2415,7 @@
 	int i, error, frags, free;
 	u_int8_t *blksfree;
 
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("bsize = %ld, size = %ld, fs = %s\n",
 		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
@@ -2394,7 +2423,7 @@
 	}
 	if ((u_int)bno >= fs->fs_size)
 		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
-	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
+	error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		panic("ffs_checkblk: cg bread failed");
@@ -2428,6 +2457,7 @@
 	ino_t ino;
 	int mode;
 {
+	struct ufsmount *ump;
 	struct inode *ip;
 
 	if (DOINGSOFTDEP(pvp)) {
@@ -2435,8 +2465,8 @@
 		return (0);
 	}
 	ip = VTOI(pvp);
-	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
-	    NULL));
+	ump = VFSTOUFS(pvp->v_mount);
+	return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL));
 }
 
 /*
@@ -2463,7 +2493,8 @@
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
-		dev = VTOI(devvp)->i_devvp->v_rdev;
+		MPASS(devvp->v_mount->mnt_data == ump);
+		dev = ump->um_devvp->v_rdev;
 		cgbno = fragstoblks(fs, cgtod(fs, cg));
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
@@ -2658,6 +2689,8 @@
  *	the count to zero will cause the inode to be freed.
  * adjblkcnt(inode, amt) - adjust the number of blocks used by the
  *	inode by the specified amount.
+ * adjsize(inode, size) - set the size of the inode to the
+ *	specified size.
  * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) -
  *	adjust the superblock summary.
  * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
@@ -2699,6 +2732,9 @@
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");
 
+static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR,
+	sysctl_ffs_fsck, "Set the inode size");
+
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of directories");
 
@@ -2756,13 +2792,12 @@
 	struct thread *td = curthread;
 	struct fsck_cmd cmd;
 	struct ufsmount *ump;
-	struct vnode *vp, *vpold, *dvp, *fdvp;
+	struct vnode *vp, *dvp, *fdvp;
 	struct inode *ip, *dp;
 	struct mount *mp;
 	struct fs *fs;
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
-	struct filedesc *fdp;
 	struct file *fp, *vfp;
 	cap_rights_t rights;
 	int filetype, error;
@@ -2774,7 +2809,7 @@
 		return (error);
 	if (cmd.version != FFS_CMD_VERSION)
 		return (ERPCMISMATCH);
-	if ((error = getvnode(td->td_proc->p_fd, cmd.handle,
+	if ((error = getvnode(td, cmd.handle,
 	    cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
 		return (error);
 	vp = fp->f_data;
@@ -2851,6 +2886,23 @@
 		vput(vp);
 		break;
 
+	case FFS_SET_SIZE:
+#ifdef DEBUG
+		if (fsckcmds) {
+			printf("%s: set inode %jd size to %jd\n",
+			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
+			    (intmax_t)cmd.size);
+		}
+#endif /* DEBUG */
+		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
+			break;
+		ip = VTOI(vp);
+		DIP_SET(ip, i_size, cmd.size);
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
+		error = ffs_update(vp, 1);
+		vput(vp);
+		break;
+
 	case FFS_DIR_FREE:
 		filetype = IFDIR;
 		/* fall through */
@@ -2977,12 +3029,7 @@
 			break;
 		}
 		VOP_UNLOCK(vp, 0);
-		fdp = td->td_proc->p_fd;
-		FILEDESC_XLOCK(fdp);
-		vpold = fdp->fd_cdir;
-		fdp->fd_cdir = vp;
-		FILEDESC_XUNLOCK(fdp);
-		vrele(vpold);
+		pwd_chdir(td, vp);
 		break;
 
 	case FFS_SET_DOTDOT:
@@ -3057,7 +3104,7 @@
 			break;
 		AUDIT_ARG_VNODE1(vp);
 		ip = VTOI(vp);
-		if (ip->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(ip))
 			error = copyin((void *)(intptr_t)cmd.size, ip->i_din1,
 			    sizeof(struct ufs1_dinode));
 		else
@@ -3077,7 +3124,7 @@
 			error = EPERM;
 			break;
 		}
-		if (VTOI(vp)->i_ump != ump) {
+		if (ITOUMP(VTOI(vp)) != ump) {
 			error = EINVAL;
 			break;
 		}
@@ -3089,7 +3136,7 @@
 			    (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
-		if ((error = getvnode(td->td_proc->p_fd, cmd.value,
+		if ((error = getvnode(td, cmd.value,
 		    cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0)
 			break;
 		if (vfp->f_vnode->v_type != VCHR) {
@@ -3174,11 +3221,11 @@
 		return (EINVAL);
 	}
 	ip = VTOI(vp);
-	if (ip->i_devvp != devvp) {
+	if (ITODEVVP(ip) != devvp) {
 		vput(vp);
 		return (EINVAL);
 	}
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	vput(vp);
 	foffset_lock_uio(fp, uio, flags);
 	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
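
The FFS_SET_SIZE node added above is reached from userland the same way as the
existing adjblkcnt/adjrefcnt nodes: a struct fsck_cmd is written to the
vfs.ffs.setsize sysctl while holding a descriptor opened on the target
filesystem (the handler checks CAP_FSCK rights on it).  A minimal sketch of
such a caller follows; the helper name and the fd/inum/newsize variables are
illustrative, only the fsck_cmd fields and the sysctl name come from the
handler in this diff.

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <ufs/ufs/dinode.h>
	#include <ufs/ffs/fs.h>		/* struct fsck_cmd, FFS_CMD_VERSION */

	#include <err.h>
	#include <string.h>

	/* Illustrative only: rewrite the recorded size of inode "inum". */
	static void
	set_inode_size(int fd, ino_t inum, off_t newsize)
	{
		struct fsck_cmd cmd;

		memset(&cmd, 0, sizeof(cmd));
		cmd.version = FFS_CMD_VERSION;
		cmd.handle = fd;	/* open descriptor on the mounted fs */
		cmd.value = inum;	/* inode whose size is changed */
		cmd.size = newsize;	/* installed via DIP_SET(ip, i_size, ...) */
		if (sysctlbyname("vfs.ffs.setsize", NULL, NULL,
		    &cmd, sizeof(cmd)) == -1)
			err(1, "vfs.ffs.setsize");
	}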

Modified: trunk/sys/ufs/ffs/ffs_balloc.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_balloc.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_balloc.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_balloc.c 304672 2016-08-23 07:55:32Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -70,6 +70,7 @@
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
+#include <sys/vmmeter.h>
 
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
@@ -112,8 +113,8 @@
 
 	ip = VTOI(vp);
 	dp = ip->i_din1;
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	fs = ITOFS(ip);
+	ump = ITOUMP(ip);
 	lbn = lblkno(fs, startoffset);
 	size = blkoff(fs, startoffset) + size;
 	reclaimed = 0;
@@ -549,7 +550,7 @@
 		}
 		lbns_remfree++;
 #endif
-		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
 		    ip->i_number, vp->v_type, NULL);
 	}
 	return (error);
@@ -585,8 +586,8 @@
 
 	ip = VTOI(vp);
 	dp = ip->i_din2;
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	fs = ITOFS(ip);
+	ump = ITOUMP(ip);
 	lbn = lblkno(fs, startoffset);
 	size = blkoff(fs, startoffset) + size;
 	reclaimed = 0;
@@ -1144,7 +1145,7 @@
 		}
 		lbns_remfree++;
 #endif
-		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
+		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
 		    ip->i_number, vp->v_type, NULL);
 	}
 	return (error);
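
Most of the churn in ffs_alloc.c and ffs_balloc.c above is one mechanical
conversion: instead of chasing the per-inode back pointers ip->i_fs, ip->i_ump
and ip->i_devvp, or testing ip->i_ump->um_fstype directly, the code now goes
through the ITOUMP()/ITOFS()/ITODEVVP()/ITOVFS() and I_IS_UFS1() accessors
from ufs/ufs/inode.h.  A condensed before/after of the pattern, assembled from
the hunks above as an illustration rather than an actual hunk:

	struct ufsmount *ump;
	struct vnode *devvp;
	struct fs *fs;

	/* Old style: cached back pointers hanging off the in-core inode. */
	fs = ip->i_fs;
	ump = ip->i_ump;
	devvp = ip->i_devvp;
	if (ip->i_ump->um_fstype == UFS1)
		/* ... UFS1-specific handling ... */;

	/* New style: resolve everything through the ufsmount. */
	ump = ITOUMP(ip);
	fs = ump->um_fs;		/* or ITOFS(ip) */
	devvp = ump->um_devvp;		/* or ITODEVVP(ip) */
	if (I_IS_UFS1(ip))
		/* ... UFS1-specific handling ... */;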

Modified: trunk/sys/ufs/ffs/ffs_extern.h
===================================================================
--- trunk/sys/ufs/ffs/ffs_extern.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_extern.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_extern.h	8.6 (Berkeley) 3/30/95
- * $FreeBSD: stable/10/sys/ufs/ffs/ffs_extern.h 306175 2016-09-22 10:42:40Z kib $
+ * $FreeBSD: stable/11/sys/ufs/ffs/ffs_extern.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _UFS_FFS_EXTERN_H
@@ -78,7 +78,6 @@
 int	ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
 int	ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
 void	ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
-int	ffs_mountroot(void);
 void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
 int	ffs_own_mount(const struct mount *mp);
 int	ffs_reallocblks(struct vop_reallocblks_args *);
@@ -179,6 +178,11 @@
  * deadlock when flushing snapshot inodes while holding snaplk.
  */
 #define	NO_INO_UPDT		0x00000001
+/*
+ * Request a data-only sync from ffs_syncvnode(), touching even less
+ * metadata than NO_INO_UPDT does.
+ */
+#define	DATA_ONLY		0x00000002
 
 int	ffs_rdonly(struct inode *);
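
The DATA_ONLY flag added above complements NO_INO_UPDT as a modifier for
ffs_syncvnode().  A caller that only needs the file's data blocks flushed,
without the extra metadata writes that NO_INO_UPDT still permits, would pass
it in the flags argument; the call shape below is an illustration, not a call
site from this diff:

	/* Flush file data only and wait for it to reach stable storage. */
	error = ffs_syncvnode(vp, MNT_WAIT, DATA_ONLY);
	if (error != 0)
		return (error);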
 

Modified: trunk/sys/ufs/ffs/ffs_inode.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_inode.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_inode.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -31,22 +31,24 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_inode.c 300600 2016-05-24 10:41:34Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_inode.c 349308 2019-06-23 14:49:30Z asomers $");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/mount.h>
-#include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
-#include <sys/vnode.h>
 #include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/stat.h>
 #include <sys/vmmeter.h>
-#include <sys/stat.h>
+#include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -91,8 +93,8 @@
 	if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
 		return (0);
 	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
-	fs = ip->i_fs;
-	if (fs->fs_ronly && ip->i_ump->um_fsckpid == 0)
+	fs = ITOFS(ip);
+	if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0)
 		return (0);
 	/*
 	 * If we are updating a snapshot and another process is currently
@@ -109,14 +111,12 @@
 	if (IS_SNAPSHOT(ip))
 		flags = GB_LOCK_NOWAIT;
 loop:
-	error = breadn_flags(ip->i_devvp,
+	error = breadn_flags(ITODEVVP(ip),
 	     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	     (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp);
 	if (error != 0) {
-		if (error != EBUSY) {
-			brelse(bp);
+		if (error != EBUSY)
 			return (error);
-		}
 		KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot"));
 		/*
 		 * Wait for our inode block to become available.
@@ -144,12 +144,17 @@
 		softdep_update_inodeblock(ip, bp, waitfor);
 	else if (ip->i_effnlink != ip->i_nlink)
 		panic("ffs_update: bad link cnt");
-	if (ip->i_ump->um_fstype == UFS1)
+	if (I_IS_UFS1(ip)) {
 		*((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
-	else
+		/* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */
+		random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), 1, RANDOM_FS_ATIME);
+	} else {
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
+		/* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */
+		random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), 1, RANDOM_FS_ATIME);
+	}
 	if (waitfor && !DOINGASYNC(vp))
 		error = bwrite(bp);
 	else if (vm_page_count_severe() || buf_dirty_count_severe()) {
@@ -181,7 +186,7 @@
 	struct inode *ip;
 	ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
 	ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
-	ufs2_daddr_t count, blocksreleased = 0, datablocks;
+	ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno;
 	struct bufobj *bo;
 	struct fs *fs;
 	struct buf *bp;
@@ -189,12 +194,12 @@
 	int softdeptrunc, journaltrunc;
 	int needextclean, extblocks;
 	int offset, size, level, nblocks;
-	int i, error, allerror;
+	int i, error, allerror, indiroff;
 	off_t osize;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = VFSTOUFS(vp->v_mount);
+	fs = ump->um_fs;
 	bo = &vp->v_bufobj;
 
 	ASSERT_VOP_LOCKED(vp, "ffs_truncate");
@@ -265,7 +270,7 @@
 			for (i = 0; i < NXADDR; i++) {
 				if (oldblks[i] == 0)
 					continue;
-				ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
+				ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
 				    sblksize(fs, osize, i), ip->i_number,
 				    vp->v_type, NULL);
 			}
@@ -326,16 +331,57 @@
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		return (ffs_update(vp, !DOINGASYNC(vp)));
 	}
-	if (DOINGSOFTDEP(vp)) {
+	/*
+	 * Lookup block number for a given offset. Zero length files
+	 * have no blocks, so return a blkno of -1.
+	 */
+	lbn = lblkno(fs, length - 1);
+	if (length == 0) {
+		blkno = -1;
+	} else if (lbn < NDADDR) {
+		blkno = DIP(ip, i_db[lbn]);
+	} else {
+		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
+		    cred, BA_METAONLY, &bp);
+		if (error)
+			return (error);
+		indiroff = (lbn - NDADDR) % NINDIR(fs);
+		if (I_IS_UFS1(ip))
+			blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
+		else
+			blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
+		/*
+		 * If the block number is non-zero, then the indirect block
+		 * must have been previously allocated and need not be written.
+		 * If the block number is zero, then we may have allocated
+		 * the indirect block and hence need to write it out.
+		 */
+		if (blkno != 0)
+			brelse(bp);
+		else if (DOINGSOFTDEP(vp) || DOINGASYNC(vp))
+			bdwrite(bp);
+		else
+			bwrite(bp);
+	}
+	/*
+	 * If the block number at the new end of the file is zero,
+	 * then we must allocate it to ensure that the last block of 
+	 * the file is allocated. Soft updates does not handle this
+	 * case, so here we have to clean up the soft updates data
+	 * structures describing the allocation past the truncation
+	 * point. Finding and deallocating those structures is a lot of
+	 * work. Since partial truncation with a hole at the end occurs
+	 * rarely, we solve the problem by syncing the file so that it
+	 * will have no soft updates data structures left.
+	 */
+	if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
+		return (error);
+	if (blkno != 0 && DOINGSOFTDEP(vp)) {
 		if (softdeptrunc == 0 && journaltrunc == 0) {
 			/*
-			 * If a file is only partially truncated, then
-			 * we have to clean up the data structures
-			 * describing the allocation past the truncation
-			 * point. Finding and deallocating those structures
-			 * is a lot of work. Since partial truncation occurs
-			 * rarely, we solve the problem by syncing the file
-			 * so that it will have no data structures left.
+			 * If soft updates cannot handle this truncation,
+			 * clean up soft dependency data structures and
+			 * fall through to the synchronous truncation.
 			 */
 			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 				return (error);
@@ -355,15 +401,17 @@
 		}
 	}
 	/*
-	 * Shorten the size of the file. If the file is not being
-	 * truncated to a block boundary, the contents of the
-	 * partial block following the end of the file must be
-	 * zero'ed in case it ever becomes accessible again because
-	 * of subsequent file growth. Directories however are not
+	 * Shorten the size of the file. If the last block of the
+	 * shortened file is unallocated, we must allocate it.
+	 * Additionally, if the file is not being truncated to a
+	 * block boundary, the contents of the partial block
+	 * following the end of the file must be zero'ed in
+	 * case it ever becomes accessible again because of
+	 * subsequent file growth. Directories however are not
 	 * zero'ed as they should grow back initialized to empty.
 	 */
 	offset = blkoff(fs, length);
-	if (offset == 0) {
+	if (blkno != 0 && offset == 0) {
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);
 	} else {
@@ -387,7 +435,7 @@
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);
 		size = blksize(fs, ip, lbn);
-		if (vp->v_type != VDIR)
+		if (vp->v_type != VDIR && offset != 0)
 			bzero((char *)bp->b_data + offset,
 			    (u_int)(size - offset));
 		/* Kirk's code has reallocbuf(bp, size, 1) here */
@@ -450,7 +498,7 @@
 	ip->i_size = osize;
 	DIP_SET(ip, i_size, osize);
 
-	error = vtruncbuf(vp, cred, length, fs->fs_bsize);
+	error = vtruncbuf(vp, length, fs->fs_bsize);
 	if (error && (allerror == 0))
 		allerror = error;
 
@@ -470,7 +518,7 @@
 			blocksreleased += count;
 			if (lastiblock[level] < 0) {
 				DIP_SET(ip, i_ib[level], 0);
-				ffs_blkfree(ump, fs, ip->i_devvp, bn,
+				ffs_blkfree(ump, fs, ump->um_devvp, bn,
 				    fs->fs_bsize, ip->i_number,
 				    vp->v_type, NULL);
 				blocksreleased += nblocks;
@@ -491,7 +539,7 @@
 			continue;
 		DIP_SET(ip, i_db[i], 0);
 		bsize = blksize(fs, ip, i);
-		ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+		ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
 		    vp->v_type, NULL);
 		blocksreleased += btodb(bsize);
 	}
@@ -523,7 +571,7 @@
 			 * required for the storage we're keeping.
 			 */
 			bn += numfrags(fs, newspace);
-			ffs_blkfree(ump, fs, ip->i_devvp, bn,
+			ffs_blkfree(ump, fs, ump->um_devvp, bn,
 			   oldspace - newspace, ip->i_number, vp->v_type, NULL);
 			blocksreleased += btodb(oldspace - newspace);
 		}
@@ -582,7 +630,7 @@
 	ufs2_daddr_t *countp;
 {
 	struct buf *bp;
-	struct fs *fs = ip->i_fs;
+	struct fs *fs;
 	struct vnode *vp;
 	caddr_t copy = NULL;
 	int i, nblocks, error = 0, allerror = 0;
@@ -590,8 +638,10 @@
 	ufs2_daddr_t blkcount, factor, blocksreleased = 0;
 	ufs1_daddr_t *bap1 = NULL;
 	ufs2_daddr_t *bap2 = NULL;
-#	define BAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? bap1[i] : bap2[i])
+#define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i])
 
+	fs = ITOFS(ip);
+
 	/*
 	 * Calculate index in current block of last
 	 * block to be kept.  -1 indicates the entire
@@ -613,6 +663,13 @@
 	vp = ITOV(ip);
 	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
 	if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;	/* pay for read */
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
@@ -631,7 +688,7 @@
 		return (error);
 	}
 
-	if (ip->i_ump->um_fstype == UFS1)
+	if (I_IS_UFS1(ip))
 		bap1 = (ufs1_daddr_t *)bp->b_data;
 	else
 		bap2 = (ufs2_daddr_t *)bp->b_data;
@@ -639,7 +696,7 @@
 		copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK);
 		bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize);
 		for (i = last + 1; i < NINDIR(fs); i++)
-			if (ip->i_ump->um_fstype == UFS1)
+			if (I_IS_UFS1(ip))
 				bap1[i] = 0;
 			else
 				bap2[i] = 0;
@@ -650,7 +707,7 @@
 			if (error)
 				allerror = error;
 		}
-		if (ip->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(ip))
 			bap1 = (ufs1_daddr_t *)copy;
 		else
 			bap2 = (ufs2_daddr_t *)copy;
@@ -670,7 +727,7 @@
 				allerror = error;
 			blocksreleased += blkcount;
 		}
-		ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
+		ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
 		    ip->i_number, vp->v_type, NULL);
 		blocksreleased += nblocks;
 	}
@@ -704,6 +761,6 @@
 ffs_rdonly(struct inode *ip)
 {
 
-	return (ip->i_ump->um_fs->fs_ronly != 0);
+	return (ITOFS(ip)->fs_ronly != 0);
 }
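
The reworked ffs_truncate() above keys off whether the block underlying the
new end of file is allocated.  Reduced to the direct-block case (the indirect
case additionally pulls the pointer out of the indirect block via UFS_BALLOC
with BA_METAONLY), the decision is roughly the following sketch, condensed
from the hunks above for illustration only:

	/* Sketch; assumes the new last block is a direct block (lbn < NDADDR). */
	lbn = lblkno(fs, length - 1);
	blkno = (length == 0) ? -1 : DIP(ip, i_db[lbn]);
	/*
	 * A hole at the new EOF (blkno == 0) is not handled by the soft
	 * updates fast path, so the vnode is synced first and the
	 * truncation falls through to the synchronous code.
	 */
	if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
		return (error);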
 

Modified: trunk/sys/ufs/ffs/ffs_rawread.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_rawread.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_rawread.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_rawread.c 318267 2017-05-14 12:00:00Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_rawread.c 318266 2017-05-14 11:51:30Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -63,8 +63,7 @@
 				 off_t offset,
 				 size_t len,
 				 struct thread *td,
-				 struct buf *bp,
-				 caddr_t sa);
+				 struct buf *bp);
 static int ffs_rawread_main(struct vnode *vp,
 			    struct uio *uio);
 
@@ -191,8 +190,7 @@
 		      off_t offset,
 		      size_t len,
 		      struct thread *td,
-		      struct buf *bp,
-		      caddr_t sa)
+		      struct buf *bp)
 {
 	int error;
 	u_int iolen;
@@ -207,7 +205,7 @@
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	
 	ip = VTOI(vp);
-	dp = ip->i_devvp;
+	dp = ITODEVVP(ip);
 
 	iolen = ((vm_offset_t) udata) & PAGE_MASK;
 	bp->b_bcount = len;
@@ -220,7 +218,6 @@
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = bdone;
 	bp->b_data = udata;
-	bp->b_saveaddr = sa;
 	blockno = offset / bsize;
 	blockoff = (offset % bsize) / DEV_BSIZE;
 	if ((daddr_t) blockno != blockno) {
@@ -273,7 +270,6 @@
 {
 	int error, nerror;
 	struct buf *bp, *nbp, *tbp;
-	caddr_t sa, nsa, tsa;
 	u_int iolen;
 	caddr_t udata;
 	long resid;
@@ -295,8 +291,6 @@
 	
 	bp = NULL;
 	nbp = NULL;
-	sa = NULL;
-	nsa = NULL;
 	
 	while (resid > 0) {
 		
@@ -303,10 +297,9 @@
 		if (bp == NULL) { /* Setup first read */
 			/* XXX: Leave some bufs for swap */
 			bp = getpbuf(&ffsrawbufcnt);
-			sa = bp->b_data;
 			pbgetvp(vp, bp);
 			error = ffs_rawread_readahead(vp, udata, offset,
-						     resid, td, bp, sa);
+						     resid, td, bp);
 			if (error != 0)
 				break;
 			
@@ -317,7 +310,6 @@
 				else
 					nbp = NULL;
 				if (nbp != NULL) {
-					nsa = nbp->b_data;
 					pbgetvp(vp, nbp);
 					
 					nerror = ffs_rawread_readahead(vp, 
@@ -328,8 +320,7 @@
 								       resid -
 								       bp->b_bufsize,
 								       td,
-								       nbp,
-								       nsa);
+								       nbp);
 					if (nerror) {
 						pbrelvp(nbp);
 						relpbuf(nbp, &ffsrawbufcnt);
@@ -362,8 +353,7 @@
 						      offset,
 						      bp->b_bufsize - iolen,
 						      td,
-						      bp,
-						      sa);
+						      bp);
 			if (error != 0)
 				break;
 		} else if (nbp != NULL) { /* Complete read with readahead */
@@ -372,10 +362,6 @@
 			bp = nbp;
 			nbp = tbp;
 			
-			tsa = sa;
-			sa = nsa;
-			nsa = tsa;
-			
 			if (resid <= bp->b_bufsize) { /* No more readaheads */
 				pbrelvp(nbp);
 				relpbuf(nbp, &ffsrawbufcnt);
@@ -389,8 +375,7 @@
 							       resid -
 							       bp->b_bufsize,
 							       td,
-							       nbp,
-							       nsa);
+							       nbp);
 				if (nerror != 0) {
 					pbrelvp(nbp);
 					relpbuf(nbp, &ffsrawbufcnt);
@@ -401,7 +386,7 @@
 			break;		
 		}  else if (resid > 0) { /* More to read, no readahead */
 			error = ffs_rawread_readahead(vp, udata, offset,
-						      resid, td, bp, sa);
+						      resid, td, bp);
 			if (error != 0)
 				break;
 		}
@@ -450,7 +435,7 @@
 
 		/* Only handle sector aligned reads */
 		ip = VTOI(vp);
-		secsize = ip->i_devvp->v_bufobj.bo_bsize;
+		secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
 		if ((uio->uio_offset & (secsize - 1)) == 0 &&
 		    (uio->uio_resid & (secsize - 1)) == 0) {
 			
@@ -470,7 +455,7 @@
 				}
 				
 				partialbytes = ((unsigned int) ip->i_size) %
-					ip->i_fs->fs_bsize;
+				    ITOFS(ip)->fs_bsize;
 				blockbytes = (int) filebytes - partialbytes;
 				if (blockbytes > 0) {
 					skipbytes = uio->uio_resid -

Modified: trunk/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_snapshot.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_snapshot.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_snapshot.c 322132 2017-08-07 02:29:09Z mckusick $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_snapshot.c 342819 2019-01-06 22:34:47Z mckusick $");
 
 #include "opt_quota.h"
 
@@ -301,9 +301,10 @@
 		return (error);
 	}
 	vp = nd.ni_vp;
+	vnode_create_vobject(nd.ni_vp, fs->fs_size, td);
 	vp->v_vflag |= VV_SYSTEM;
 	ip = VTOI(vp);
-	devvp = ip->i_devvp;
+	devvp = ITODEVVP(ip);
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
@@ -559,7 +560,7 @@
 		}
 		VI_UNLOCK(xvp);
 		if (snapdebug)
-			vprint("ffs_snapshot: busy vnode", xvp);
+			vn_printf(xvp, "ffs_snapshot: busy vnode ");
 		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
 		    vat.va_nlink > 0) {
 			VOP_UNLOCK(xvp, 0);
@@ -588,7 +589,7 @@
 			}
 		}
 		snaplistsize += 1;
-		if (xp->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY, 1);
 		else
@@ -621,7 +622,7 @@
 			goto out1;
 		}
 		xp = VTOI(xvp);
-		if (xp->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY, 0);
 		else
@@ -707,7 +708,7 @@
 	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
 		if (xp == ip)
 			break;
-		if (xp->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(xp))
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP, 0);
 		else
@@ -736,7 +737,7 @@
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
-	if (ip->i_ump->um_fstype == UFS1)
+	if (I_IS_UFS1(ip))
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
 		    BLK_SNAP, 0);
 	else
@@ -888,9 +889,9 @@
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
-		(int)fs->fs_cgsize, KERNCRED, &bp);
+	fs = ITOFS(ip);
+	error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, cg)),
+	    (int)fs->fs_cgsize, KERNCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
@@ -900,7 +901,7 @@
 		brelse(bp);
 		return (EIO);
 	}
-	UFS_LOCK(ip->i_ump);
+	UFS_LOCK(ITOUMP(ip));
 	ACTIVESET(fs, cg);
 	/*
 	 * Recomputation of summary information might not have been performed
@@ -909,7 +910,7 @@
 	 * fsck is slightly more consistent.
 	 */
 	fs->fs_cs(fs, cg) = cgp->cg_cs;
-	UFS_UNLOCK(ip->i_ump);
+	UFS_UNLOCK(ITOUMP(ip));
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
@@ -953,7 +954,7 @@
 			}
 			indiroff = 0;
 		}
-		if (ip->i_ump->um_fstype == UFS1) {
+		if (I_IS_UFS1(ip)) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
@@ -1258,7 +1259,7 @@
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
+		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
 		    vp->v_type, NULL);
 	}
 	return (0);
@@ -1542,7 +1543,7 @@
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
+		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
 		    vp->v_type, NULL);
 	}
 	return (0);
@@ -1566,7 +1567,7 @@
 	 * Find snapshot in incore list.
 	 */
 	xp = NULL;
-	sn = ip->i_devvp->v_rdev->si_snapdata;
+	sn = ITODEVVP(ip)->v_rdev->si_snapdata;
 	if (sn != NULL)
 		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 			if (xp == ip)
@@ -1579,8 +1580,8 @@
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
@@ -1612,8 +1613,8 @@
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	devvp = ip->i_devvp;
+	fs = ITOFS(ip);
+	devvp = ITODEVVP(ip);
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
@@ -1651,7 +1652,7 @@
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP_SET(ip, i_db[blkno], 0);
 		else if ((dblk == blkstofrags(fs, blkno) &&
-		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
+		     ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
 		     ip->i_number, vp->v_type, NULL))) {
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 			    btodb(fs->fs_bsize));
@@ -1669,7 +1670,7 @@
 		else
 			last = fs->fs_size - blkno;
 		for (loc = 0; loc < last; loc++) {
-			if (ip->i_ump->um_fstype == UFS1) {
+			if (I_IS_UFS1(ip)) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == 0)
 					continue;
@@ -1676,7 +1677,7 @@
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
-				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
+				     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
 				     fs->fs_bsize, ip->i_number, vp->v_type,
 				     NULL))) {
 					ip->i_din1->di_blocks -=
@@ -1691,7 +1692,7 @@
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
-			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
+			     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
 			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
@@ -1786,7 +1787,7 @@
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
-			if (ip->i_ump->um_fstype == UFS1)
+			if (I_IS_UFS1(ip))
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
@@ -1811,7 +1812,7 @@
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
-			} else if (ip->i_ump->um_fstype == UFS1) {
+			} else if (I_IS_UFS1(ip)) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
@@ -1859,7 +1860,7 @@
 			}
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], bno);
-			} else if (ip->i_ump->um_fstype == UFS1) {
+			} else if (I_IS_UFS1(ip)) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
@@ -1991,15 +1992,19 @@
 			continue;
 		}
 		ip = VTOI(vp);
-		if (!IS_SNAPSHOT(ip) || ip->i_size ==
+		if (vp->v_type != VREG) {
+			reason = "non-file snapshot";
+		} else if (!IS_SNAPSHOT(ip)) {
+			reason = "non-snapshot";
+		} else if (ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
-			if (!IS_SNAPSHOT(ip)) {
-				reason = "non-snapshot";
-			} else {
-				reason = "old format snapshot";
-				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
-				(void)ffs_syncvnode(vp, MNT_WAIT, 0);
-			}
+			reason = "old format snapshot";
+			(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
+			(void)ffs_syncvnode(vp, MNT_WAIT, 0);
+		} else {
+			reason = NULL;
+		}
+		if (reason != NULL) {
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
@@ -2141,7 +2146,7 @@
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
 		return (0);
-	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
+	fs = ITOFS(TAILQ_FIRST(&sn->sn_head));
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	upper = sn->sn_listsize - 1;
@@ -2268,7 +2273,7 @@
 		return (0);		/* No snapshot */
 	}
 	ip = TAILQ_FIRST(&sn->sn_head);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	upper = sn->sn_listsize - 1;
@@ -2342,7 +2347,7 @@
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
-			if (ip->i_ump->um_fstype == UFS1)
+			if (I_IS_UFS1(ip))
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
@@ -2498,15 +2503,19 @@
 {
 	struct inode *ip = VTOI(vp);
 	struct bio *bip;
+	struct fs *fs;
 
+	ip = VTOI(vp);
+	fs = ITOFS(ip);
+
 	bip = g_alloc_bio();
 	bip->bio_cmd = BIO_READ;
-	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
+	bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn)));
 	bip->bio_data = bp->b_data;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_done = NULL;
 
-	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
+	g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private);
 	bp->b_error = biowait(bip, "snaprdb");
 	g_destroy_bio(bip);
 	return (bp->b_error);

Modified: trunk/sys/ufs/ffs/ffs_softdep.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_softdep.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_softdep.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_softdep.c 324612 2017-10-13 22:40:57Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_softdep.c 357034 2020-01-23 06:24:11Z mckusick $");
 
 #include "opt_ffs.h"
 #include "opt_quota.h"
@@ -70,6 +70,7 @@
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
@@ -901,8 +902,10 @@
 	    struct pagedep **);
 static	void pause_timer(void *);
 static	int request_cleanup(struct mount *, int);
+static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
 static	void schedule_cleanup(struct mount *);
-static void softdep_ast_cleanup_proc(void);
+static void softdep_ast_cleanup_proc(struct thread *);
+static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
 static	int process_worklist_item(struct mount *, int, int);
 static	void process_removes(struct vnode *);
 static	void process_truncates(struct vnode *);
@@ -1105,7 +1108,7 @@
 	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
 		if (wk->wk_type == D_JSEGDEP)
 			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
-		if (wk->wk_type == D_FREEDEP)
+		else if (wk->wk_type == D_FREEDEP)
 			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 	}
 
@@ -1534,10 +1537,10 @@
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(wk->wk_mp);
-	WORKLIST_REMOVE(wk);
 	if (ump->softdep_worklist_tail == wk)
 		ump->softdep_worklist_tail =
 		    (struct worklist *)wk->wk_list.le_prev;
+	WORKLIST_REMOVE(wk);
 	ump->softdep_on_worklist -= 1;
 }
 
@@ -1835,11 +1838,11 @@
 		wake_worklist(wk);
 		add_to_worklist(wk, WK_HEAD);
 	}
-	LIST_REMOVE(&sentinel, wk_list);
 	/* Sentinel could've become the tail from remove_from_worklist. */
 	if (ump->softdep_worklist_tail == &sentinel)
 		ump->softdep_worklist_tail =
 		    (struct worklist *)sentinel.wk_list.le_prev;
+	LIST_REMOVE(&sentinel, wk_list);
 	PRELE(curproc);
 	return (matchcnt);
 }
@@ -2893,7 +2896,6 @@
 	if (ump->softdep_journal_tail == wk)
 		ump->softdep_journal_tail =
 		    (struct worklist *)wk->wk_list.le_prev;
-
 	WORKLIST_REMOVE(wk);
 	ump->softdep_on_journal -= 1;
 }
@@ -3994,7 +3996,7 @@
 	struct jmvref *jmvref;
 
 	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
-	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
+	workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
 	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
 	jmvref->jm_parent = dp->i_number;
 	jmvref->jm_ino = ino;
@@ -4021,7 +4023,7 @@
 	struct jremref *jremref;
 
 	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
-	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
+	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
 	jremref->jr_state = ATTACHED;
 	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
 	   nlink, ip->i_mode);
@@ -4057,7 +4059,7 @@
 	struct jaddref *jaddref;
 
 	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
-	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
+	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
 	jaddref->ja_state = ATTACHED;
 	jaddref->ja_mkdir = NULL;
 	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
@@ -4645,7 +4647,7 @@
 
 	KASSERT(ip->i_nlink >= ip->i_effnlink,
 	    ("inodedep_lookup_ip: bad delta"));
-	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
+	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
 	    &inodedep);
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
@@ -4668,12 +4670,12 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_create called on non-softdep filesystem"));
 	KASSERT(ip->i_nlink == 1,
 	    ("softdep_setup_create: Invalid link count."));
 	dvp = ITOV(dp);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
@@ -4682,7 +4684,7 @@
 		    ("softdep_setup_create: No addref structure present."));
 	}
 	softdep_prelink(dvp, NULL);
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4700,7 +4702,7 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
 	dvp = ITOV(dp);
 	jaddref = NULL;
@@ -4711,13 +4713,13 @@
 	if (DOINGSUJ(dvp))
 		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 		    dp->i_effnlink - 1, dp->i_mode);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(dp);
 	if (jaddref)
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
 	softdep_prelink(dvp, ITOV(ip));
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4735,7 +4737,7 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_link called on non-softdep filesystem"));
 	dvp = ITOV(dp);
 	jaddref = NULL;
@@ -4742,13 +4744,13 @@
 	if (DOINGSUJ(dvp))
 		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
 		    ip->i_mode);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (jaddref)
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 		    if_deps);
 	softdep_prelink(dvp, ITOV(ip));
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4768,7 +4770,7 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_mkdir called on non-softdep filesystem"));
 	dvp = ITOV(dp);
 	dotaddref = dotdotaddref = NULL;
@@ -4780,7 +4782,7 @@
 		    dp->i_effnlink - 1, dp->i_mode);
 		dotdotaddref->ja_state |= MKDIR_PARENT;
 	}
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
@@ -4798,7 +4800,7 @@
 		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
 		    &dotdotaddref->ja_ref, if_deps);
 	softdep_prelink(ITOV(dp), NULL);
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4812,14 +4814,14 @@
 {
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_rmdir called on non-softdep filesystem"));
 	dvp = ITOV(dp);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
 	softdep_prelink(dvp, ITOV(ip));
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4833,14 +4835,14 @@
 {
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_setup_unlink called on non-softdep filesystem"));
 	dvp = ITOV(dp);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
 	softdep_prelink(dvp, ITOV(ip));
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4856,10 +4858,10 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
 	    ("softdep_revert_create called on non-softdep filesystem"));
 	dvp = ITOV(dp);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
@@ -4868,7 +4870,7 @@
 		    ("softdep_revert_create: addref parent mismatch"));
 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 	}
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4884,10 +4886,10 @@
 	struct jaddref *jaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_link called on non-softdep filesystem"));
 	dvp = ITOV(dp);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(ip);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
@@ -4896,7 +4898,7 @@
 		    ("softdep_revert_link: addref parent mismatch"));
 		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 	}
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -4913,11 +4915,11 @@
 	struct jaddref *dotaddref;
 	struct vnode *dvp;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_mkdir called on non-softdep filesystem"));
 	dvp = ITOV(dp);
 
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	inodedep = inodedep_lookup_ip(dp);
 	if (DOINGSUJ(dvp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
@@ -4939,7 +4941,7 @@
 		    ("softdep_revert_mkdir: dot addref parent mismatch"));
 		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
 	}
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /* 
@@ -4951,12 +4953,12 @@
 	struct inode *ip;
 {
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 	    ("softdep_revert_rmdir called on non-softdep filesystem"));
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ITOUMP(dp));
 	(void) inodedep_lookup_ip(ip);
 	(void) inodedep_lookup_ip(dp);
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ITOUMP(dp));
 }
 
 /*
@@ -5007,10 +5009,10 @@
 	struct mount *mp;
 	struct fs *fs;
 
-	mp = UFSTOVFS(ip->i_ump);
+	mp = ITOVFS(ip);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
-	fs = ip->i_ump->um_fs;
+	fs = VFSTOUFS(mp)->um_fs;
 	jaddref = NULL;
 
 	/*
@@ -5042,7 +5044,7 @@
 	bmsafemap = malloc(sizeof(struct bmsafemap),
 	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ITOUMP(ip));
 	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
 		panic("softdep_setup_inomapdep: dependency %p for new"
 		    "inode already exists", inodedep);
@@ -5057,7 +5059,7 @@
 	}
 	inodedep->id_bmsafemap = bmsafemap;
 	inodedep->id_state &= ~DEPCOMPLETE;
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ITOUMP(ip));
 }
 
 /*
@@ -5279,7 +5281,7 @@
 	ufs_lbn_t lbn;
 
 	lbn = bp->b_lblkno;
-	mp = UFSTOVFS(ip->i_ump);
+	mp = ITOVFS(ip);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
 	if (oldblkno && oldblkno != newblkno)
@@ -5291,7 +5293,7 @@
 	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
 	    "off %jd newsize %ld oldsize %d",
 	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ITOUMP(ip));
 	if (off >= NDADDR) {
 		if (lbn > 0)
 			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
@@ -5363,7 +5365,7 @@
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		if (oldadp != NULL && oldadp->ad_offset == off)
 			allocdirect_merge(adphead, adp, oldadp);
-		FREE_LOCK(ip->i_ump);
+		FREE_LOCK(ITOUMP(ip));
 		return;
 	}
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
@@ -5377,7 +5379,7 @@
 	if (oldadp->ad_offset == off)
 		allocdirect_merge(adphead, adp, oldadp);
 
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ITOUMP(ip));
 }
 
 /*
@@ -5541,10 +5543,10 @@
 	struct jfreefrag *jfreefrag;
 	struct fs *fs;
 
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
 	    M_SOFTDEP_FLAGS);
-	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
+	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
 	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
 	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
 	jfreefrag->fr_ino = ip->i_number;
@@ -5567,16 +5569,18 @@
 	ufs_lbn_t lbn;
 {
 	struct freefrag *freefrag;
+	struct ufsmount *ump;
 	struct fs *fs;
 
 	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
 	    ip->i_number, blkno, size, lbn);
-	fs = ip->i_fs;
+	ump = ITOUMP(ip);
+	fs = ump->um_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 	freefrag = malloc(sizeof(struct freefrag),
 	    M_FREEFRAG, M_SOFTDEP_FLAGS);
-	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
+	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
 	freefrag->ff_state = ATTACHED;
 	LIST_INIT(&freefrag->ff_jwork);
 	freefrag->ff_inum = ip->i_number;
@@ -5584,7 +5588,7 @@
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
 
-	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
+	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
 		freefrag->ff_jdep = (struct worklist *)
 		    newjfreefrag(freefrag, ip, blkno, size, lbn);
 	} else {
@@ -5656,9 +5660,11 @@
 	struct jnewblk *jnewblk;
 	struct newblk *newblk;
 	struct mount *mp;
+	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 
-	mp = UFSTOVFS(ip->i_ump);
+	mp = ITOVFS(ip);
+	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocext called on non-softdep filesystem"));
 	KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
@@ -5670,7 +5676,7 @@
 	else
 		freefrag = NULL;
 
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ump);
 	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocext: lost block");
 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
@@ -5721,7 +5727,7 @@
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		if (oldadp != NULL && oldadp->ad_offset == off)
 			allocdirect_merge(adphead, adp, oldadp);
-		FREE_LOCK(ip->i_ump);
+		FREE_LOCK(ump);
 		return;
 	}
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
@@ -5734,7 +5740,7 @@
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_offset == off)
 		allocdirect_merge(adphead, adp, oldadp);
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -5779,11 +5785,11 @@
 	struct jnewblk *jnewblk;
 
 	if (oldblkno)
-		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
+		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
 	else
 		freefrag = NULL;
-	ACQUIRE_LOCK(ip->i_ump);
-	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
+	ACQUIRE_LOCK(ITOUMP(ip));
+	if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
 		panic("new_allocindir: lost block");
 	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 	    ("newallocindir: newblk already initialized"));
@@ -5823,8 +5829,10 @@
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct mount *mp;
+	struct ufsmount *ump;
 
-	mp = UFSTOVFS(ip->i_ump);
+	mp = ITOVFS(ip);
+	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
 	KASSERT(lbn == nbp->b_lblkno,
@@ -5845,7 +5853,7 @@
 		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 	if (freefrag)
 		handle_workitem_freefrag(freefrag);
 }
@@ -5864,9 +5872,11 @@
 {
 	struct inodedep *inodedep;
 	struct allocindir *aip;
+	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
 	CTR3(KTR_SUJ,
 	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
@@ -5874,12 +5884,11 @@
 	lbn = nbp->b_lblkno;
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
-	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
-	    &inodedep);
+	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
 		panic("softdep_setup_allocindir_meta: Block already existed");
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 }
 
 static void
@@ -5921,7 +5930,7 @@
 	LOCK_OWNED(ump);
 	indirdep = NULL;
 	newindirdep = NULL;
-	fs = ip->i_fs;
+	fs = ump->um_fs;
 	for (;;) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			if (wk->wk_type != D_INDIRDEP)
@@ -5943,7 +5952,7 @@
 		    M_INDIRDEP, M_SOFTDEP_FLAGS);
 		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
 		newindirdep->ir_state = ATTACHED;
-		if (ip->i_ump->um_fstype == UFS1)
+		if (I_IS_UFS1(ip))
 			newindirdep->ir_state |= UFS1FMT;
 		TAILQ_INIT(&newindirdep->ir_trunc);
 		newindirdep->ir_saveddata = NULL;
@@ -5958,7 +5967,7 @@
 		}
 		newindirdep->ir_freeblks = NULL;
 		newindirdep->ir_savebp =
-		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
+		    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 		newindirdep->ir_bp = bp;
 		BUF_KERNPROC(newindirdep->ir_savebp);
 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
@@ -5996,10 +6005,12 @@
 	struct allocindir *oldaip;
 	struct freefrag *freefrag;
 	struct mount *mp;
+	struct ufsmount *ump;
 
-	LOCK_OWNED(ip->i_ump);
-	mp = UFSTOVFS(ip->i_ump);
-	fs = ip->i_fs;
+	mp = ITOVFS(ip);
+	ump = VFSTOUFS(mp);
+	LOCK_OWNED(ump);
+	fs = ump->um_fs;
 	if (bp->b_lblkno >= 0)
 		panic("setup_allocindir_phase2: not indir blk");
 	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
@@ -6084,6 +6095,7 @@
 	int i;
 	int needj;
 {
+	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 	int frags;
 
@@ -6091,9 +6103,10 @@
 	if (blkno == 0)
 		return;
 	DIP_SET(ip, i_db[i], 0);
-	frags = sblksize(ip->i_fs, ip->i_size, i);
-	frags = numfrags(ip->i_fs, frags);
-	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
+	ump = ITOUMP(ip);
+	frags = sblksize(ump->um_fs, ip->i_size, i);
+	frags = numfrags(ump->um_fs, frags);
+	newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
 }
 
 static inline void
@@ -6103,6 +6116,7 @@
 	int i;
 	int needj;
 {
+	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 	int frags;
 
@@ -6110,9 +6124,10 @@
 	if (blkno == 0)
 		return;
 	ip->i_din2->di_extb[i] = 0;
-	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
-	frags = numfrags(ip->i_fs, frags);
-	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
+	ump = ITOUMP(ip);
+	frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
+	frags = numfrags(ump->um_fs, frags);
+	newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
 }
 
 static inline void
@@ -6123,6 +6138,7 @@
 	ufs_lbn_t lbn;
 	int needj;
 {
+	struct ufsmount *ump;
 	ufs2_daddr_t blkno;
 
 	blkno = DIP(ip, i_ib[i]);
@@ -6129,7 +6145,8 @@
 	if (blkno == 0)
 		return;
 	DIP_SET(ip, i_ib[i], 0);
-	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
+	ump = ITOUMP(ip);
+	newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
 	    0, needj);
 }
 
@@ -6152,7 +6169,7 @@
 	freeblks->fb_inum = ip->i_number;
 	freeblks->fb_vtype = ITOV(ip)->v_type;
 	freeblks->fb_modrev = DIP(ip, i_modrev);
-	freeblks->fb_devvp = ip->i_devvp;
+	freeblks->fb_devvp = ITODEVVP(ip);
 	freeblks->fb_chkcnt = 0;
 	freeblks->fb_len = 0;
 
@@ -6207,6 +6224,7 @@
 	struct freework *freework;
 	struct newblk *newblk;
 	struct mount *mp;
+	struct ufsmount *ump;
 	struct buf *bp;
 	uint8_t *start;
 	uint8_t *end;
@@ -6220,6 +6238,7 @@
 	if (blkno == 0)
 		return (0);
 	mp = freeblks->fb_list.wk_mp;
+	ump = VFSTOUFS(mp);
 	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
 	if ((bp->b_flags & B_CACHE) == 0) {
 		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
@@ -6229,6 +6248,13 @@
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 		error = bufwait(bp);
 		if (error) {
@@ -6237,22 +6263,21 @@
 		}
 	}
 	level = lbn_level(lbn);
-	lbnadd = lbn_offset(ip->i_fs, level);
+	lbnadd = lbn_offset(ump->um_fs, level);
 	/*
 	 * Compute the offset of the last block we want to keep.  Store
 	 * in the freework the first block we want to completely free.
 	 */
 	off = (lastlbn - -(lbn + level)) / lbnadd;
-	if (off + 1 == NINDIR(ip->i_fs))
+	if (off + 1 == NINDIR(ump->um_fs))
 		goto nowork;
-	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
-	    0);
+	freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
 	/*
 	 * Link the freework into the indirdep.  This will prevent any new
 	 * allocations from proceeding until we are finished with the
 	 * truncate and the block is written.
 	 */
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ump);
 	indirdep = indirdep_lookup(mp, ip, bp);
 	if (indirdep->ir_freeblks)
 		panic("setup_trunc_indir: indirdep already truncated.");
@@ -6264,12 +6289,12 @@
 	 * live on this newblk.
 	 */
 	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
-		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
+		newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk);
 		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
 			trunc_indirdep(indirn, freeblks, bp, off);
 	} else
 		trunc_indirdep(indirdep, freeblks, bp, off);
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 	/*
 	 * Creation is protected by the buf lock. The saveddata is only
 	 * needed if a full truncation follows a partial truncation but it
@@ -6280,7 +6305,7 @@
 		    M_SOFTDEP_FLAGS);
 nowork:
 	/* Fetch the blkno of the child and the zero start offset. */
-	if (ip->i_ump->um_fstype == UFS1) {
+	if (I_IS_UFS1(ip)) {
 		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
 		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
 	} else {
@@ -6490,9 +6515,9 @@
 	ufs_lbn_t tmpval, lbn, lastlbn;
 	int frags, lastoff, iboff, allocblock, needj, error, i;
 
-	fs = ip->i_fs;
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
+	fs = ump->um_fs;
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
 	vp = ITOV(ip);
@@ -6572,13 +6597,13 @@
 			blkno = DIP(ip, i_db[lastlbn]);
 			if (blkno && oldfrags != frags) {
 				oldfrags -= frags;
-				oldfrags = numfrags(ip->i_fs, oldfrags);
-				blkno += numfrags(ip->i_fs, frags);
+				oldfrags = numfrags(fs, oldfrags);
+				blkno += numfrags(fs, frags);
 				newfreework(ump, freeblks, NULL, lastlbn,
 				    blkno, oldfrags, 0, needj);
 				if (needj)
 					adjust_newfreework(freeblks,
-					    numfrags(ip->i_fs, frags));
+					    numfrags(fs, frags));
 			} else if (blkno == 0)
 				allocblock = 1;
 		}
@@ -6595,7 +6620,7 @@
 		DIP_SET(ip, i_size, ip->i_size);
 		datablocks = DIP(ip, i_blocks) - extblocks;
 		if (length != 0)
-			datablocks = blkcount(ip->i_fs, datablocks, length);
+			datablocks = blkcount(fs, datablocks, length);
 		freeblks->fb_len = length;
 	}
 	if ((flags & IO_EXT) != 0) {
@@ -6622,7 +6647,7 @@
 	 */
 	ufs_itimes(vp);
 	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
-	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, cred, &bp);
 	if (error) {
 		brelse(bp);
@@ -6762,20 +6787,22 @@
 	struct inode *ip;
 {
 	struct jfsync *jfsync;
+	struct ufsmount *ump;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_journal_fsync called on non-softdep filesystem"));
 	if ((ip->i_flag & IN_TRUNCATED) == 0)
 		return;
 	ip->i_flag &= ~IN_TRUNCATED;
 	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
-	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
+	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
 	jfsync->jfs_size = ip->i_size;
 	jfsync->jfs_ino = ip->i_number;
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ump);
 	add_to_journal(&jfsync->jfs_list);
 	jwait(&jfsync->jfs_list, MNT_WAIT);
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -6827,7 +6854,7 @@
 	ufs_lbn_t tmpval;
 	ufs_lbn_t lbn;
 
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
@@ -6834,7 +6861,14 @@
 	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
 	    ip->i_number, length);
 	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
-	fs = ip->i_fs;
+	fs = ump->um_fs;
+	if ((error = bread(ump->um_devvp,
+	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
+		brelse(bp);
+		softdep_error("softdep_setup_freeblocks", error);
+		return;
+	}
 	freeblks = newfreeblks(mp, ip);
 	extblocks = 0;
 	datablocks = 0;
@@ -6867,16 +6901,10 @@
 	UFS_UNLOCK(ump);
 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
 	/*
-	 * Push the zero'ed inode to to its disk buffer so that we are free
+	 * Push the zero'ed inode to its disk buffer so that we are free
 	 * to delete its dependencies below. Once the dependencies are gone
 	 * the buffer can be safely released.
 	 */
-	if ((error = bread(ip->i_devvp,
-	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
-	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
-		brelse(bp);
-		softdep_error("softdep_setup_freeblocks", error);
-	}
 	if (ump->um_fstype == UFS1) {
 		dp1 = ((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number));
@@ -6969,7 +6997,7 @@
 	off_t end, extend;
 
 	vp = ITOV(ip);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
 	if ((flags & IO_EXT) != 0)
 		vn_pages_remove(vp, extend, 0);
@@ -7219,9 +7247,9 @@
 	struct worklist *wk, *wkn;
 	struct ufsmount *ump;
 
-	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
+	ump = softdep_bp_to_mp(bp);
+	if (ump == NULL)
 		goto done;
-	ump = VFSTOUFS(wk->wk_mp);
 	ACQUIRE_LOCK(ump);
 	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
 		switch (wk->wk_type) {
@@ -7505,7 +7533,7 @@
 	struct freeblks *freeblks;
 	struct ufsmount *ump;
 
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
 	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_freefile called on non-softdep filesystem"));
 	/*
@@ -7516,10 +7544,10 @@
 	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 	freefile->fx_mode = mode;
 	freefile->fx_oldinum = ino;
-	freefile->fx_devvp = ip->i_devvp;
+	freefile->fx_devvp = ump->um_devvp;
 	LIST_INIT(&freefile->fx_jwork);
 	UFS_LOCK(ump);
-	ip->i_fs->fs_pendinginodes += 1;
+	ump->um_fs->fs_pendinginodes += 1;
 	UFS_UNLOCK(ump);
 
 	/*
@@ -8439,8 +8467,8 @@
 	struct mount *mp;
 	int isindir;
 
-	ump = dp->i_ump;
-	mp = UFSTOVFS(ump);
+	mp = ITOVFS(dp);
+	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_directory_add called on non-softdep filesystem"));
 	/*
@@ -8453,7 +8481,7 @@
 	}
 	jaddref = NULL;
 	mkdir1 = mkdir2 = NULL;
-	fs = dp->i_fs;
+	fs = ump->um_fs;
 	lbn = lblkno(fs, diroffset);
 	offset = blkoff(fs, diroffset);
 	dap = malloc(sizeof(struct diradd), M_DIRADD,
@@ -8606,10 +8634,12 @@
 	struct diradd *dap;
 	struct direct *de;
 	struct mount *mp;
+	struct ufsmount *ump;
 	ufs_lbn_t lbn;
 	int flags;
 
-	mp = UFSTOVFS(dp->i_ump);
+	mp = ITOVFS(dp);
+	ump = VFSTOUFS(mp);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_change_directoryentry_offset called on "
 	     "non-softdep filesystem"));
@@ -8627,11 +8657,11 @@
 		    dp->i_offset + (oldloc - base),
 		    dp->i_offset + (newloc - base));
 	}
-	lbn = lblkno(dp->i_fs, dp->i_offset);
-	offset = blkoff(dp->i_fs, dp->i_offset);
+	lbn = lblkno(ump->um_fs, dp->i_offset);
+	offset = blkoff(ump->um_fs, dp->i_offset);
 	oldoffset = offset + (oldloc - base);
 	newoffset = offset + (newloc - base);
-	ACQUIRE_LOCK(dp->i_ump);
+	ACQUIRE_LOCK(ump);
 	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
 		goto done;
 	dap = diradd_lookup(pagedep, oldoffset);
@@ -8653,7 +8683,7 @@
 		add_to_journal(&jmvref->jm_list);
 	}
 	bcopy(oldloc, newloc, entrysize);
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -8896,9 +8926,11 @@
 {
 	struct dirrem *dirrem, *prevdirrem;
 	struct inodedep *inodedep;
+	struct ufsmount *ump;
 	int direct;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_setup_remove called on non-softdep filesystem"));
 	/*
 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
@@ -8910,8 +8942,7 @@
 	 * Add the dirrem to the inodedep's pending remove list for quick
 	 * discovery later.
 	 */
-	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
-	    &inodedep) == 0)
+	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
 		panic("softdep_setup_remove: Lost inodedep.");
 	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
 	dirrem->dm_state |= ONDEPLIST;
@@ -8931,7 +8962,7 @@
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 		    dm_next);
-		FREE_LOCK(ip->i_ump);
+		FREE_LOCK(ump);
 	} else {
 		if (prevdirrem != NULL)
 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
@@ -8938,7 +8969,7 @@
 			    prevdirrem, dm_next);
 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
-		FREE_LOCK(ip->i_ump);
+		FREE_LOCK(ump);
 		if (direct)
 			handle_workitem_remove(dirrem, 0);
 	}
@@ -8980,8 +9011,7 @@
 	struct diradd *dap;
 	struct worklist *wk;
 
-	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
-	    &pagedep) == 0)
+	if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
 		return (jremref);
 	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
 	if (dap == NULL)
@@ -9013,9 +9043,10 @@
 	struct ufsmount *ump;
 	struct mkdir *mkdir;
 	struct diradd *dap;
+	struct mount *mp;
 
-	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
-	    &inodedep) == 0)
+	mp = ITOVFS(ip);
+	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 		return (jremref);
 	dap = inodedep->id_mkdiradd;
 	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
@@ -9030,8 +9061,7 @@
 	if ((jaddref = mkdir->md_jaddref) != NULL) {
 		mkdir->md_jaddref = NULL;
 		jaddref->ja_state &= ~MKDIR_PARENT;
-		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
-		    &inodedep) == 0)
+		if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
 			panic("cancel_mkdir_dotdot: Lost parent inodedep");
 		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
 			journal_jremref(dirrem, jremref, inodedep);
@@ -9102,6 +9132,7 @@
 	struct jremref *dotremref;
 	struct jremref *dotdotremref;
 	struct vnode *dvp;
+	struct ufsmount *ump;
 
 	/*
 	 * Whiteouts have no deletion dependencies.
@@ -9109,6 +9140,8 @@
 	if (ip == NULL)
 		panic("newdirrem: whiteout");
 	dvp = ITOV(dp);
+	ump = ITOUMP(dp);
+
 	/*
 	 * If the system is over its limit and our filesystem is
 	 * responsible for more than our share of that usage and
@@ -9116,11 +9149,11 @@
 	 * Limiting the number of dirrem structures will also limit
 	 * the number of freefile and freeblks structures.
 	 */
-	ACQUIRE_LOCK(ip->i_ump);
-	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM))
-		schedule_cleanup(ITOV(dp)->v_mount);
+	ACQUIRE_LOCK(ump);
+	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
+		schedule_cleanup(UFSTOVFS(ump));
 	else
-		FREE_LOCK(ip->i_ump);
+		FREE_LOCK(ump);
 	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
 	    M_ZERO);
 	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
@@ -9150,10 +9183,10 @@
 			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
 			    ip->i_effnlink + 1);
 	}
-	ACQUIRE_LOCK(ip->i_ump);
-	lbn = lblkno(dp->i_fs, dp->i_offset);
-	offset = blkoff(dp->i_fs, dp->i_offset);
-	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
+	ACQUIRE_LOCK(ump);
+	lbn = lblkno(ump->um_fs, dp->i_offset);
+	offset = blkoff(ump->um_fs, dp->i_offset);
+	pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
 	    &pagedep);
 	dirrem->dm_pagedep = pagedep;
 	dirrem->dm_offset = offset;
@@ -9260,9 +9293,11 @@
 	struct inodedep *inodedep;
 	struct jaddref *jaddref;
 	struct mount *mp;
+	struct ufsmount *ump;
 
-	offset = blkoff(dp->i_fs, dp->i_offset);
-	mp = UFSTOVFS(dp->i_ump);
+	mp = ITOVFS(dp);
+	ump = VFSTOUFS(mp);
+	offset = blkoff(ump->um_fs, dp->i_offset);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	   ("softdep_setup_directory_change called on non-softdep filesystem"));
 
@@ -9312,7 +9347,7 @@
 			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 				add_to_worklist(&dirrem->dm_list, 0);
 		}
-		FREE_LOCK(dp->i_ump);
+		FREE_LOCK(ump);
 		return;
 	}
 	/*
@@ -9386,7 +9421,7 @@
 	 */
 	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
 		merge_diradd(inodedep, dap);
-	FREE_LOCK(dp->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -9400,16 +9435,17 @@
 	struct inode *ip;	/* the inode with the increased link count */
 {
 	struct inodedep *inodedep;
+	struct ufsmount *ump;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_change_linkcnt called on non-softdep filesystem"));
-	ACQUIRE_LOCK(ip->i_ump);
-	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
-	    &inodedep);
+	ACQUIRE_LOCK(ump);
+	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("softdep_change_linkcnt: bad delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -9741,14 +9777,20 @@
 	/*
 	 * Move all dependencies waiting on the remove to complete
 	 * from the dirrem to the inode inowait list to be completed
-	 * after the inode has been updated and written to disk.  Any
-	 * marked MKDIR_PARENT are saved to be completed when the .. ref
-	 * is removed.
+	 * after the inode has been updated and written to disk.
+	 *
+	 * Any marked MKDIR_PARENT are saved to be completed when the 
+	 * dotdot ref is removed unless DIRCHG is specified.  For
+	 * directory change operations there will be no further
+	 * directory writes and the jsegdeps need to be moved along
+	 * with the rest to be completed when the inode is free or
+	 * stable in the inode free list.
 	 */
 	LIST_INIT(&dotdotwk);
 	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
 		WORKLIST_REMOVE(wk);
-		if (wk->wk_state & MKDIR_PARENT) {
+		if ((dirrem->dm_state & DIRCHG) == 0 &&
+		    wk->wk_state & MKDIR_PARENT) {
 			wk->wk_state &= ~MKDIR_PARENT;
 			WORKLIST_INSERT(&dotdotwk, wk);
 			continue;
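The hunk above changes how handle_workitem_remove() splits the journal work attached to a dirrem: items marked MKDIR_PARENT are normally parked on a side list until the dotdot reference goes away, but for DIRCHG (directory-change) removes everything is pushed through together. A rough userland sketch of that list-partitioning idiom, using <sys/queue.h> with hypothetical item and flag names rather than the real softdep structures:

    #include <sys/queue.h>

    #define MKDIR_PARENT 0x01   /* hypothetical flag bits for the sketch */
    #define DIRCHG       0x02

    struct item {
            int state;
            LIST_ENTRY(item) link;
    };
    LIST_HEAD(itemhead, item);

    /*
     * Drain 'src' into 'dst', except that MKDIR_PARENT entries are
     * diverted to 'deferred' unless the operation is a DIRCHG remove.
     */
    static void
    partition_work(struct itemhead *src, struct itemhead *dst,
        struct itemhead *deferred, int opstate)
    {
            struct item *it;

            while ((it = LIST_FIRST(src)) != NULL) {
                    LIST_REMOVE(it, link);
                    if ((opstate & DIRCHG) == 0 &&
                        (it->state & MKDIR_PARENT) != 0) {
                            it->state &= ~MKDIR_PARENT;
                            LIST_INSERT_HEAD(deferred, it, link);
                            continue;
                    }
                    LIST_INSERT_HEAD(dst, it, link);
            }
    }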
@@ -9938,9 +9980,9 @@
 		panic("softdep_disk_io_initiation: Writing buffer with "
 		    "background write in progress: %p", bp);
 
-	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
+	ump = softdep_bp_to_mp(bp);
+	if (ump == NULL)
 		return;
-	ump = VFSTOUFS(wk->wk_mp);
 
 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
 	PHOLD(curproc);			/* Don't swap out kernel stack */
@@ -10181,22 +10223,22 @@
 		prevlbn = adp->ad_offset;
 		if (adp->ad_offset < NDADDR &&
 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
-			panic("%s: direct pointer #%jd mismatch %d != %jd",
-			    "softdep_write_inodeblock",
+			panic("initiate_write_inodeblock_ufs1: "
+			    "direct pointer #%jd mismatch %d != %jd",
 			    (intmax_t)adp->ad_offset,
 			    dp->di_db[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_offset >= NDADDR &&
 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
-			panic("%s: indirect pointer #%jd mismatch %d != %jd",
-			    "softdep_write_inodeblock",
+			panic("initiate_write_inodeblock_ufs1: "
+			    "indirect pointer #%jd mismatch %d != %jd",
 			    (intmax_t)adp->ad_offset - NDADDR,
 			    dp->di_ib[adp->ad_offset - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
-			panic("softdep_write_inodeblock: Unknown state 0x%x",
-			    adp->ad_state);
+			panic("initiate_write_inodeblock_ufs1: "
+			    "Unknown state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
@@ -10219,7 +10261,8 @@
 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
-				panic("softdep_write_inodeblock: lost dep1");
+				panic("initiate_write_inodeblock_ufs1: "
+				    "lost dep1");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
@@ -10227,7 +10270,8 @@
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
-				panic("softdep_write_inodeblock: lost dep2");
+				panic("initiate_write_inodeblock_ufs1: "
+				    "lost dep2");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
@@ -10349,18 +10393,18 @@
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef INVARIANTS
 		if (deplist != 0 && prevlbn >= adp->ad_offset)
-			panic("softdep_write_inodeblock: lbn order");
+			panic("initiate_write_inodeblock_ufs2: lbn order");
 		prevlbn = adp->ad_offset;
 		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
-			panic("%s: direct pointer #%jd mismatch %jd != %jd",
-			    "softdep_write_inodeblock",
+			panic("initiate_write_inodeblock_ufs2: "
+			    "ext pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset,
 			    (intmax_t)dp->di_extb[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
-			panic("softdep_write_inodeblock: Unknown state 0x%x",
-			    adp->ad_state);
+			panic("initiate_write_inodeblock_ufs2: Unknown "
+			    "state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
@@ -10381,7 +10425,8 @@
 		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
-				panic("softdep_write_inodeblock: lost dep1");
+				panic("initiate_write_inodeblock_ufs2: "
+				    "lost dep1");
 #endif /* INVARIANTS */
 			dp->di_extb[i] = 0;
 		}
@@ -10414,22 +10459,22 @@
 		prevlbn = adp->ad_offset;
 		if (adp->ad_offset < NDADDR &&
 		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
-			panic("%s: direct pointer #%jd mismatch %jd != %jd",
-			    "softdep_write_inodeblock",
+			panic("initiate_write_inodeblock_ufs2: "
+			    "direct pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset,
 			    (intmax_t)dp->di_db[adp->ad_offset],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_offset >= NDADDR &&
 		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
-			panic("%s indirect pointer #%jd mismatch %jd != %jd",
-			    "softdep_write_inodeblock:",
+			panic("initiate_write_inodeblock_ufs2: "
+			    "indirect pointer #%jd mismatch %jd != %jd",
 			    (intmax_t)adp->ad_offset - NDADDR,
 			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_offset;
 		if ((adp->ad_state & ATTACHED) == 0)
-			panic("softdep_write_inodeblock: Unknown state 0x%x",
-			    adp->ad_state);
+			panic("initiate_write_inodeblock_ufs2: Unknown "
+			     "state 0x%x", adp->ad_state);
 #endif /* INVARIANTS */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
@@ -10452,7 +10497,8 @@
 		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
 #ifdef INVARIANTS
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
-				panic("softdep_write_inodeblock: lost dep2");
+				panic("initiate_write_inodeblock_ufs2: "
+				    "lost dep2");
 #endif /* INVARIANTS */
 			dp->di_db[i] = 0;
 		}
@@ -10460,7 +10506,8 @@
 #ifdef INVARIANTS
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
-				panic("softdep_write_inodeblock: lost dep3");
+				panic("initiate_write_inodeblock_ufs2: "
+				    "lost dep3");
 #endif /* INVARIANTS */
 			dp->di_ib[i] = 0;
 		}
@@ -10940,6 +10987,10 @@
 	struct freeblks *freeblks;
 	struct buf *sbp;
 
+	ump = softdep_bp_to_mp(bp);
+	if (ump == NULL)
+		return;
+
 	/*
 	 * If an error occurred while doing the write, then the data
 	 * has not hit the disk and the dependencies cannot be processed.
@@ -10946,6 +10997,7 @@
 	 * But we do have to go through and roll forward any dependencies
 	 * that were rolled back before the disk write.
 	 */
+	ACQUIRE_LOCK(ump);
 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			switch (wk->wk_type) {
@@ -10973,18 +11025,16 @@
 				continue;
 			}
 		}
+		FREE_LOCK(ump);
 		return;
 	}
-	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
-		return;
-	ump = VFSTOUFS(wk->wk_mp);
 	LIST_INIT(&reattach);
+
 	/*
-	 * This lock must not be released anywhere in this code segment.
+	 * Ump SU lock must not be released anywhere in this code segment.
 	 */
 	sbp = NULL;
 	owk = NULL;
-	ACQUIRE_LOCK(ump);
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		atomic_add_long(&dep_write[wk->wk_type], 1);
@@ -11487,7 +11537,8 @@
 		panic("handle_written_inodeblock: bad size");
 	if (inodedep->id_savednlink > LINK_MAX)
 		panic("handle_written_inodeblock: Invalid link count "
-		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
+		    "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
+		    inodedep);
 	if (fstype == UFS1) {
 		if (dp1->di_nlink != inodedep->id_savednlink) { 
 			dp1->di_nlink = inodedep->id_savednlink;
@@ -12104,21 +12155,22 @@
 	struct inode *ip;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
+	struct ufsmount *ump;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_load_inodeblock called on non-softdep filesystem"));
 	/*
 	 * Check for alternate nlink count.
 	 */
 	ip->i_effnlink = ip->i_nlink;
-	ACQUIRE_LOCK(ip->i_ump);
-	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
-	    &inodedep) == 0) {
-		FREE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(ump);
+	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
+		FREE_LOCK(ump);
 		return;
 	}
 	ip->i_effnlink -= inodedep->id_nlinkdelta;
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(ump);
 }
 
 /*
@@ -12146,11 +12198,11 @@
 	struct fs *fs;
 	int error;
 
-	ump = ip->i_ump;
+	ump = ITOUMP(ip);
 	mp = UFSTOVFS(ump);
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_update_inodeblock called on non-softdep filesystem"));
-	fs = ip->i_fs;
+	fs = ump->um_fs;
 	/*
 	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
 	 * does not have access to the in-core ip so must write directly into
@@ -12315,9 +12367,9 @@
 	ufs_lbn_t lbn;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
-	ump = ip->i_ump;
 	mp = vp->v_mount;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
 	if (MOUNTEDSOFTDEP(mp) == 0)
 		return (0);
 	ACQUIRE_LOCK(ump);
@@ -12384,24 +12436,13 @@
 		FREE_LOCK(ump);
 		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
 		    FFSV_FORCEINSMQ)) {
-			error = vfs_busy(mp, MBF_NOWAIT);
-			if (error != 0) {
-				vfs_ref(mp);
-				VOP_UNLOCK(vp, 0);
-				error = vfs_busy(mp, 0);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-				vfs_rel(mp);
-				if (error != 0)
-					return (ENOENT);
-				if (vp->v_iflag & VI_DOOMED) {
-					vfs_unbusy(mp);
-					return (ENOENT);
-				}
-			}
+			/*
+			 * Unmount cannot proceed after unlock because
+			 * caller must have called vn_start_write().
+			 */
 			VOP_UNLOCK(vp, 0);
 			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
 			    &pvp, FFSV_FORCEINSMQ);
-			vfs_unbusy(mp);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (vp->v_iflag & VI_DOOMED) {
 				if (error == 0)
@@ -12590,13 +12631,13 @@
 	int error;
 
 	ip = VTOI(vp);
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 	    ("softdep_sync_metadata called on non-softdep filesystem"));
 	/*
 	 * Ensure that any direct block dependencies have been cleared,
 	 * truncations are started, and inode references are journaled.
 	 */
-	ACQUIRE_LOCK(ip->i_ump);
+	ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
 	/*
 	 * Write all journal records to prevent rollbacks on devvp.
 	 */
@@ -12608,7 +12649,7 @@
 	 * indirect blocks.
 	 */
 	process_truncates(vp);
-	FREE_LOCK(ip->i_ump);
+	FREE_LOCK(VFSTOUFS(vp->v_mount));
 
 	return (error);
 }
@@ -12643,7 +12684,7 @@
 			return (EBUSY);
 		return (0);
 	}
-	ump = VTOI(vp)->i_ump;
+	ump = VFSTOUFS(vp->v_mount);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * As we hold the buffer locked, none of its dependencies
@@ -13226,10 +13267,9 @@
 {
 	struct ufsmount *ump;
 	struct mount *mp;
-	struct vnode *lvp, *mvp;
 	long starttime;
 	ufs2_daddr_t needed;
-	int error;
+	int error, failed_vnode;
 
 	/*
 	 * If we are being called because of a process doing a
@@ -13281,7 +13321,7 @@
 	 *
 	 * Additionally, if we are unprivileged and allocating space,
 	 * we need to ensure that we clean up enough blocks to get the
-	 * needed number of blocks over the threshhold of the minimum
+	 * needed number of blocks over the threshold of the minimum
 	 * number of blocks required to be kept free by the filesystem
 	 * (fs_minfree).
 	 */
@@ -13320,43 +13360,90 @@
 	 * to the worklist that we can then process to reap additional
 	 * resources. We walk the vnodes associated with the mount point
 	 * until we get the needed worklist requests that we can reap.
+	 *
+	 * If there are several threads all needing to clean the same
+	 * mount point, only one is allowed to walk the mount list.
+	 * When several threads all try to walk the same mount list,
+	 * they end up competing with each other and often end up in
+	 * livelock. This approach ensures that forward progress is
+	 * made at the cost of occasional ENOSPC errors being returned
+	 * that might otherwise have been avoided.
 	 */
+	error = 1;
 	if ((resource == FLUSH_BLOCKS_WAIT && 
 	     fs->fs_cstotal.cs_nbfree <= needed) ||
 	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
 	     fs->fs_cstotal.cs_nifree <= needed)) {
-		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
-			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
-				VI_UNLOCK(lvp);
-				continue;
+		ACQUIRE_LOCK(ump);
+		if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
+			ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
+			FREE_LOCK(ump);
+			failed_vnode = softdep_request_cleanup_flush(mp, ump);
+			ACQUIRE_LOCK(ump);
+			ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
+			FREE_LOCK(ump);
+			if (ump->softdep_on_worklist > 0) {
+				stat_cleanup_retries += 1;
+				if (!failed_vnode)
+					goto retry;
 			}
-			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
-			    curthread))
-				continue;
-			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
-				vput(lvp);
-				continue;
-			}
-			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
-			vput(lvp);
+		} else {
+			FREE_LOCK(ump);
+			error = 0;
 		}
-		lvp = ump->um_devvp;
-		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
-			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
-			VOP_UNLOCK(lvp, 0);
-		}
-		if (ump->softdep_on_worklist > 0) {
-			stat_cleanup_retries += 1;
-			goto retry;
-		}
 		stat_cleanup_failures += 1;
 	}
 	if (time_second - starttime > stat_cleanup_high_delay)
 		stat_cleanup_high_delay = time_second - starttime;
 	UFS_LOCK(ump);
-	return (1);
+	return (error);
 }
 
+/*
+ * Scan the vnodes for the specified mount point flushing out any
+ * vnodes that can be locked without waiting. Finally, try to flush
+ * the device associated with the mount point if it can be locked
+ * without waiting.
+ *
+ * We return 0 if we were able to lock every vnode in our scan.
+ * If we had to skip one or more vnodes, we return 1.
+ */
+static int
+softdep_request_cleanup_flush(mp, ump)
+	struct mount *mp;
+	struct ufsmount *ump;
+{
+	struct thread *td;
+	struct vnode *lvp, *mvp;
+	int failed_vnode;
+
+	failed_vnode = 0;
+	td = curthread;
+	MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
+		if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
+			VI_UNLOCK(lvp);
+			continue;
+		}
+		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
+		    td) != 0) {
+			failed_vnode = 1;
+			continue;
+		}
+		if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
+			vput(lvp);
+			continue;
+		}
+		(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
+		vput(lvp);
+	}
+	lvp = ump->um_devvp;
+	if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+		VOP_FSYNC(lvp, MNT_NOWAIT, td);
+		VOP_UNLOCK(lvp, 0);
+	}
+	return (failed_vnode);
+}
+
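The comment in softdep_request_cleanup() spells out why only one thread is allowed to walk the vnode list: the FLUSH_RC_ACTIVE bit is a single-worker guard, and latecomers simply return instead of fighting over the same vnode locks and livelocking. A minimal pthread analogue of that guard (names and the simulated scan are made up for illustration, not the kernel API):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static bool cleanup_active = false;     /* plays the role of FLUSH_RC_ACTIVE */

    static void
    expensive_scan(void)
    {
            puts("scanning...");            /* stand-in for the vnode walk */
    }

    /* Returns true if this thread did the scan, false if it yielded. */
    static bool
    request_cleanup(void)
    {
            pthread_mutex_lock(&lk);
            if (cleanup_active) {           /* someone else is already flushing */
                    pthread_mutex_unlock(&lk);
                    return (false);
            }
            cleanup_active = true;
            pthread_mutex_unlock(&lk);

            expensive_scan();               /* run without holding the lock */

            pthread_mutex_lock(&lk);
            cleanup_active = false;
            pthread_mutex_unlock(&lk);
            return (true);
    }

    int
    main(void)
    {
            return (request_cleanup() ? 0 : 1);
    }

As the comment notes, the price of yielding is that a thread may report ENOSPC in cases the old retry-everyone approach would have papered over.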
 static bool
 softdep_excess_items(struct ufsmount *ump, int item)
 {
@@ -13397,15 +13484,13 @@
 }
 
 static void
-softdep_ast_cleanup_proc(void)
+softdep_ast_cleanup_proc(struct thread *td)
 {
-	struct thread *td;
 	struct mount *mp;
 	struct ufsmount *ump;
 	int error;
 	bool req;
 
-	td = curthread;
 	while ((mp = td->td_su) != NULL) {
 		td->td_su = NULL;
 		error = vfs_busy(mp, MBF_NOWAIT);
@@ -13443,6 +13528,10 @@
 		}
 		vfs_unbusy(mp);
 	}
+	if ((mp = td->td_su) != NULL) {
+		td->td_su = NULL;
+		vfs_rel(mp);
+	}
 }
 
 /*
@@ -13688,7 +13777,7 @@
 	/*
 	 * Find the last inode in the block with dependencies.
 	 */
-	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
+	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
 		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 			break;
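The rounddown2() change is purely cosmetic: for a power-of-two INOPB(fs) the macro (from FreeBSD's <sys/param.h>) expands to the same mask the old code spelled out by hand. A quick standalone check of that equivalence with made-up numbers:

    #include <assert.h>
    #include <stdio.h>

    /* Same definition FreeBSD uses; only valid when y is a power of two. */
    #define rounddown2(x, y) ((x) & ~((y) - 1))

    int
    main(void)
    {
            unsigned long ino = 123457;
            unsigned long inopb = 64;       /* hypothetical inodes per block */

            assert(rounddown2(ino, inopb) == (ino & ~(inopb - 1)));
            printf("first inode of the block: %lu\n", rounddown2(ino, inopb));
            return (0);
    }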
@@ -13764,12 +13853,14 @@
 {
 	struct buf *bp;
 	struct fs *fs;
+	struct ufsmount *ump;
 	int error;
 
-	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
+	ump = ITOUMP(ip);
+	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 	    ("softdep_inode_append called on non-softdep filesystem"));
-	fs = ip->i_fs;
-	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+	fs = ump->um_fs;
+	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, cred, &bp);
 	if (error) {
 		bqrelse(bp);
@@ -13797,6 +13888,58 @@
 	FREE_LOCK(ump);
 }
 
+static struct ufsmount *
+softdep_bp_to_mp(bp)
+	struct buf *bp;
+{
+	struct mount *mp;
+	struct vnode *vp;
+
+	if (LIST_EMPTY(&bp->b_dep))
+		return (NULL);
+	vp = bp->b_vp;
+	KASSERT(vp != NULL,
+	    ("%s, buffer with dependencies lacks vnode", __func__));
+
+	/*
+	 * The ump mount point is stable after we get a correct
+	 * pointer, since bp is locked and this prevents unmount from
+	 * proceeding.  But to get to it, we cannot dereference bp->b_dep
+	 * head wk_mp, because we do not yet own SU ump lock and
+	 * workitem might be freed while dereferenced.
+	 */
+retry:
+	switch (vp->v_type) {
+	case VCHR:
+		VI_LOCK(vp);
+		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
+		VI_UNLOCK(vp);
+		if (mp == NULL)
+			goto retry;
+		break;
+	case VREG:
+	case VDIR:
+	case VLNK:
+	case VFIFO:
+	case VSOCK:
+		mp = vp->v_mount;
+		break;
+	case VBLK:
+		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
+		/* FALLTHROUGH */
+	case VNON:
+	case VBAD:
+	case VMARKER:
+		mp = NULL;
+		break;
+	default:
+		vn_printf(vp, "unknown vnode type");
+		mp = NULL;
+		break;
+	}
+	return (VFSTOUFS(mp));
+}
+
 /*
  * Function to determine if the buffer has outstanding dependencies
  * that will cause a roll-back if the buffer is written. If wantcount
@@ -13822,10 +13965,10 @@
 	struct diradd *dap;
 	int i, retval;
 
+	ump = softdep_bp_to_mp(bp);
+	if (ump == NULL)
+		return (0);
 	retval = 0;
-	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
-		return (0);
-	ump = VFSTOUFS(wk->wk_mp);
 	ACQUIRE_LOCK(ump);
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
@@ -13960,7 +14103,7 @@
 	}
 out:
 	FREE_LOCK(ump);
-	return retval;
+	return (retval);
 }
 
 /*
@@ -13982,7 +14125,7 @@
 		error = BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
 		/*
-		 * Even if we sucessfully acquire bp here, we have dropped
+		 * Even if we successfully acquire bp here, we have dropped
 		 * lock, which may violates our guarantee.
 		 */
 		if (error == 0)
@@ -14009,11 +14152,7 @@
 		BUF_UNLOCK(bp);
 		if (waitfor != MNT_WAIT)
 			return (NULL);
-		/*
-		 * The lock argument must be bp->b_vp's mutex in
-		 * this case.
-		 */
-#ifdef	DEBUG_VFS_LOCKS
+#ifdef DEBUG_VFS_LOCKS
 		if (bp->b_vp->v_type != VCHR)
 			ASSERT_BO_WLOCKED(bp->b_bufobj);
 #endif
@@ -14170,25 +14309,14 @@
 
 /*
  * Wait for pending output on a vnode to complete.
- * Must be called with vnode lock and interlock locked.
- *
- * XXX: Should just be a call to bufobj_wwait().
  */
 static void
 drain_output(vp)
 	struct vnode *vp;
 {
-	struct bufobj *bo;
 
-	bo = &vp->v_bufobj;
 	ASSERT_VOP_LOCKED(vp, "drain_output");
-	ASSERT_BO_WLOCKED(bo);
-
-	while (bo->bo_numoutput) {
-		bo->bo_flag |= BO_WWAIT;
-		msleep((caddr_t)&bo->bo_numoutput,
-		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
-	}
+	(void)bufobj_wwait(&vp->v_bufobj, 0, 0);
 }
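drain_output() loses its hand-rolled sleep loop: waiting for bo_numoutput to drain is exactly what bufobj_wwait() does, which is what the deleted XXX comment had suggested all along. The shape of that "wait until an outstanding-I/O counter reaches zero" pattern, as a userland sketch with a pthread condition variable (hypothetical types, not the kernel bufobj API):

    #include <pthread.h>

    struct outq {
            pthread_mutex_t lock;
            pthread_cond_t  drained;
            int             numoutput;      /* outstanding writes */
    };

    /* Caller holds q->lock, mirroring the bufobj lock requirement. */
    static void
    outq_wwait(struct outq *q)
    {
            while (q->numoutput > 0)
                    pthread_cond_wait(&q->drained, &q->lock);
    }

    /* I/O completion side: decrement and wake waiters at zero. */
    static void
    outq_done(struct outq *q)
    {
            pthread_mutex_lock(&q->lock);
            if (--q->numoutput == 0)
                    pthread_cond_broadcast(&q->drained);
            pthread_mutex_unlock(&q->lock);
    }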
 
 /*
@@ -14230,13 +14358,14 @@
 static void
 inodedep_print(struct inodedep *inodedep, int verbose)
 {
-	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
+	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd"
 	    " saveino %p\n",
 	    inodedep, inodedep->id_fs, inodedep->id_state,
 	    (intmax_t)inodedep->id_ino,
 	    (intmax_t)fsbtodb(inodedep->id_fs,
 	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
-	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
+	    (intmax_t)inodedep->id_nlinkdelta,
+	    (intmax_t)inodedep->id_savednlink,
 	    inodedep->id_savedino1);
 
 	if (verbose == 0)

Modified: trunk/sys/ufs/ffs/ffs_subr.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_subr.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_subr.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_subr.c 207141 2010-04-24 07:05:35Z jeff $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_subr.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 
@@ -56,10 +56,6 @@
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ffs/fs.h>
 
-#ifdef KDB
-void	ffs_checkoverlap(struct buf *, struct inode *);
-#endif
-
 /*
  * Return buffer with the contents of block "offset" from the beginning of
  * directory "ip".  If "res" is non-zero, fill it in with a pointer to the
@@ -79,7 +75,7 @@
 	int bsize, error;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	lbn = lblkno(fs, offset);
 	bsize = blksize(fs, ip, lbn);
 
@@ -107,7 +103,7 @@
 	ino_t ino;
 {
 
-	if (ip->i_ump->um_fstype == UFS1) {
+	if (I_IS_UFS1(ip)) {
 		*ip->i_din1 =
 		    *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino));
 		ip->i_mode = ip->i_din1->di_mode;
@@ -166,37 +162,6 @@
 	}
 }
 
-#ifdef KDB
-void
-ffs_checkoverlap(bp, ip)
-	struct buf *bp;
-	struct inode *ip;
-{
-	struct buf *ebp, *ep;
-	ufs2_daddr_t start, last;
-	struct vnode *vp;
-
-	ebp = &buf[nbuf];
-	start = bp->b_blkno;
-	last = start + btodb(bp->b_bcount) - 1;
-	for (ep = buf; ep < ebp; ep++) {
-		if (ep == bp || (ep->b_flags & B_INVAL) ||
-		    ep->b_vp == NULLVP)
-			continue;
-		vp = ip->i_devvp;
-		/* look for overlap */
-		if (ep->b_bcount == 0 || ep->b_blkno > last ||
-		    ep->b_blkno + btodb(ep->b_bcount) <= start)
-			continue;
-		vprint("Disk overlap", vp);
-		printf("\tstart %jd, end %jd overlap start %jd, end %jd\n",
-		    (intmax_t)start, (intmax_t)last, (intmax_t)ep->b_blkno,
-		    (intmax_t)(ep->b_blkno + btodb(ep->b_bcount) - 1));
-		panic("ffs_checkoverlap: Disk buffer overlap");
-	}
-}
-#endif /* KDB */
-
 /*
  * block operations
  *

Modified: trunk/sys/ufs/ffs/ffs_suspend.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_suspend.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_suspend.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -27,14 +27,15 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/ufs/ffs/ffs_suspend.c 306175 2016-09-22 10:42:40Z kib $
+ * $FreeBSD: stable/11/sys/ufs/ffs/ffs_suspend.c 337483 2018-08-08 18:51:39Z kib $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_suspend.c 306175 2016-09-22 10:42:40Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_suspend.c 337483 2018-08-08 18:51:39Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/buf.h>
 #include <sys/ioccom.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
@@ -214,6 +215,31 @@
 }
 
 static void
+ffs_susp_unsuspend(struct mount *mp)
+{
+	struct ufsmount *ump;
+
+	sx_assert(&ffs_susp_lock, SA_XLOCKED);
+
+	/*
+	 * XXX: The status is kept per-process; the vfs_write_resume() routine
+	 * 	asserts that the resuming thread is the same one that called
+	 * 	vfs_write_suspend().  The cdevpriv data, however, is attached
+	 * 	to the file descriptor, e.g. is inherited during fork.  Thus,
+	 * 	it's possible that the resuming process will be different from
+	 * 	the one that started the suspension.
+	 *
+	 * 	Work around by fooling the check in vfs_write_resume().
+	 */
+	mp->mnt_susp_owner = curthread;
+
+	vfs_write_resume(mp, 0);
+	ump = VFSTOUFS(mp);
+	ump->um_writesuspended = 0;
+	vfs_unbusy(mp);
+}
+
+static void
 ffs_susp_dtor(void *data)
 {
 	struct fs *fs;
@@ -239,22 +265,7 @@
 	if (error != 0)
 		panic("failed to unsuspend writes on %s", fs->fs_fsmnt);
 
-	/*
-	 * XXX: The status is kept per-process; the vfs_write_resume() routine
-	 * 	asserts that the resuming thread is the same one that called
-	 * 	vfs_write_suspend().  The cdevpriv data, however, is attached
-	 * 	to the file descriptor, e.g. is inherited during fork.  Thus,
-	 * 	it's possible that the resuming process will be different from
-	 * 	the one that started the suspension.
-	 *
-	 * 	Work around by fooling the check in vfs_write_resume().
-	 */
-	mp->mnt_susp_owner = curthread;
-
-	vfs_write_resume(mp, 0);
-	vfs_unbusy(mp);
-	ump->um_writesuspended = 0;
-
+	ffs_susp_unsuspend(mp);
 	sx_xunlock(&ffs_susp_lock);
 }
 
@@ -294,7 +305,8 @@
 			break;
 		}
 		error = devfs_set_cdevpriv(mp, ffs_susp_dtor);
-		KASSERT(error == 0, ("devfs_set_cdevpriv failed"));
+		if (error != 0)
+			ffs_susp_unsuspend(mp);
 		break;
 	case UFSRESUME:
 		error = devfs_get_cdevpriv((void **)&mp);

Modified: trunk/sys/ufs/ffs/ffs_tables.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_tables.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_tables.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_tables.c 139825 2005-01-07 02:29:27Z imp $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_tables.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <ufs/ufs/dinode.h>

Modified: trunk/sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_vfsops.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_vfsops.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_vfsops.c 309208 2016-11-27 09:14:52Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_vfsops.c 357030 2020-01-23 06:06:32Z mckusick $");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
@@ -55,6 +55,7 @@
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
+#include <sys/vmmeter.h>
 
 #include <security/mac/mac_framework.h>
 
@@ -149,7 +150,7 @@
 	struct fs *fs;
 	pid_t fsckpid = 0;
 	int error, error1, flags;
-	uint64_t mntorflags;
+	uint64_t mntorflags, saved_mnt_flag;
 	accmode_t accmode;
 	struct nameidata ndp;
 	char *fspec;
@@ -240,7 +241,6 @@
 			if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
 			    (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0)
 				return (error);
-			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * Return to normal read-only mode.
@@ -247,7 +247,6 @@
 			 */
 			error = g_access(ump->um_cp, 0, -1, 0);
 			g_topology_unlock();
-			PICKUP_GIANT();
 			ump->um_fsckpid = 0;
 		}
 		if (fs->fs_ronly == 0 &&
@@ -295,7 +294,6 @@
 			}
 			if (MOUNTEDSOFTDEP(mp))
 				softdep_unmount(mp);
-			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * Drop our write and exclusive access.
@@ -302,7 +300,6 @@
 			 */
 			g_access(ump->um_cp, 0, -1, -1);
 			g_topology_unlock();
-			PICKUP_GIANT();
 			fs->fs_ronly = 1;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
@@ -360,7 +357,6 @@
 					return (EPERM);
 				}
 			}
-			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * Request exclusive write access.
@@ -367,30 +363,44 @@
 			 */
 			error = g_access(ump->um_cp, 0, 1, 1);
 			g_topology_unlock();
-			PICKUP_GIANT();
 			if (error)
 				return (error);
 			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
 				return (error);
+			error = vfs_write_suspend_umnt(mp);
+			if (error != 0)
+				return (error);
 			fs->fs_ronly = 0;
 			MNT_ILOCK(mp);
-			mp->mnt_flag &= ~MNT_RDONLY;
+			saved_mnt_flag = MNT_RDONLY;
+			if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag &
+			    MNT_ASYNC) != 0)
+				saved_mnt_flag |= MNT_ASYNC;
+			mp->mnt_flag &= ~saved_mnt_flag;
 			MNT_IUNLOCK(mp);
 			fs->fs_mtime = time_second;
 			/* check to see if we need to start softdep */
 			if ((fs->fs_flags & FS_DOSOFTDEP) &&
 			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
-				vn_finished_write(mp);
+				fs->fs_ronly = 1;
+				MNT_ILOCK(mp);
+				mp->mnt_flag |= saved_mnt_flag;
+				MNT_IUNLOCK(mp);
+				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			fs->fs_clean = 0;
 			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
-				vn_finished_write(mp);
+				fs->fs_ronly = 1;
+				MNT_ILOCK(mp);
+				mp->mnt_flag |= saved_mnt_flag;
+				MNT_IUNLOCK(mp);
+				vfs_write_resume(mp, 0);
 				return (error);
 			}
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
-			vn_finished_write(mp);
+			vfs_write_resume(mp, 0);
 		}
 		/*
 		 * Soft updates is incompatible with "async",
@@ -434,7 +444,6 @@
 			}
 			KASSERT(MOUNTEDSOFTDEP(mp) == 0,
 			    ("soft updates enabled on read-only file system"));
-			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * Request write access.
@@ -441,7 +450,6 @@
 			 */
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
-			PICKUP_GIANT();
 			if (error) {
 				vfs_mount_error(mp,
 				    "Checker activation failed on %s",
@@ -540,7 +548,6 @@
 			    ("soft updates enabled on read-only file system"));
 			ump = VFSTOUFS(mp);
 			fs = ump->um_fs;
-			DROP_GIANT();
 			g_topology_lock();
 			/*
 			 * Request write access.
@@ -547,7 +554,6 @@
 			 */
 			error = g_access(ump->um_cp, 0, 1, 0);
 			g_topology_unlock();
-			PICKUP_GIANT();
 			if (error) {
 				printf("WARNING: %s: Checker activation "
 				    "failed\n", fs->fs_fsmnt);
@@ -798,11 +804,9 @@
 		VOP_UNLOCK(devvp, 0);
 		return (EBUSY);
 	}
-	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
 	g_topology_unlock();
-	PICKUP_GIANT();
 	if (error != 0) {
 		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 		VOP_UNLOCK(devvp, 0);
@@ -849,7 +853,7 @@
 		goto out;
 	}
 	fs->fs_fmod = 0;
-	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indicies */
+	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indices */
 	fs->fs_flags &= ~FS_UNCLEAN;
 	if (fs->fs_clean == 0) {
 		fs->fs_flags |= FS_UNCLEAN;
@@ -1117,11 +1121,9 @@
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
-		DROP_GIANT();
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
-		PICKUP_GIANT();
 	}
 	if (ump) {
 		mtx_destroy(UFS_MTX(ump));
@@ -1307,7 +1309,6 @@
 		taskqueue_drain_all(ump->um_trim_tq);
 		taskqueue_free(ump->um_trim_tq);
 	}
-	DROP_GIANT();
 	g_topology_lock();
 	if (ump->um_fsckpid > 0) {
 		/*
@@ -1318,7 +1319,6 @@
 	}
 	g_vfs_close(ump->um_cp);
 	g_topology_unlock();
-	PICKUP_GIANT();
 	atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
 	vrele(ump->um_devvp);
 	dev_rel(ump->um_dev);
@@ -1334,6 +1334,10 @@
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
+	if (td->td_su == mp) {
+		td->td_su = NULL;
+		vfs_rel(mp);
+	}
 	return (error);
 
 fail:
@@ -1480,8 +1484,12 @@
 
 	allerror = 0;
 	td = curthread;
-	if ((mp->mnt_flag & MNT_NOATIME) != 0)
-		goto qupdate;
+	if ((mp->mnt_flag & MNT_NOATIME) != 0) {
+#ifdef QUOTA
+		qsync(mp);
+#endif
+		goto sbupdate;
+	}
 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
@@ -1503,6 +1511,9 @@
 		if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
 		    td)) != 0)
 			continue;
+#ifdef QUOTA
+		qsyncvp(vp);
+#endif
 		if (sync_doupdate(ip))
 			error = ffs_update(vp, 0);
 		if (error != 0)
@@ -1509,12 +1520,7 @@
 			allerror = error;
 		vput(vp);
 	}
-
-qupdate:
-#ifdef QUOTA
-	qsync(mp);
-#endif
-
+sbupdate:
 	if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 &&
 	    (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0)
 		allerror = error;
@@ -1607,6 +1613,9 @@
 			}
 			continue;
 		}
+#ifdef QUOTA
+		qsyncvp(vp);
+#endif
 		if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
 			allerror = error;
 		vput(vp);
@@ -1621,9 +1630,6 @@
 		if (allerror == 0 && count)
 			goto loop;
 	}
-#ifdef QUOTA
-	qsync(mp);
-#endif
 
 	devvp = ump->um_devvp;
 	bo = &devvp->v_bufobj;
@@ -1687,7 +1693,6 @@
 	struct ufsmount *ump;
 	struct buf *bp;
 	struct vnode *vp;
-	struct cdev *dev;
 	int error;
 
 	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
@@ -1711,7 +1716,6 @@
 	 */
 
 	ump = VFSTOUFS(mp);
-	dev = ump->um_dev;
 	fs = ump->um_fs;
 	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
 
@@ -1732,11 +1736,10 @@
 	vp->v_bufobj.bo_bsize = fs->fs_bsize;
 	ip->i_vnode = vp;
 	ip->i_ump = ump;
-	ip->i_fs = fs;
-	ip->i_dev = dev;
 	ip->i_number = ino;
 	ip->i_ea_refs = 0;
 	ip->i_nextclustercg = -1;
+	ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2;
 #ifdef QUOTA
 	{
 		int i;
@@ -1773,7 +1776,7 @@
 		*vpp = NULL;
 		return (error);
 	}
-	if (ip->i_ump->um_fstype == UFS1)
+	if (I_IS_UFS1(ip))
 		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
 	else
 		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
@@ -1788,10 +1791,8 @@
 	 * Initialize the vnode from the inode, check for aliases.
 	 * Note that the underlying vnode may have changed.
 	 */
-	if (ip->i_ump->um_fstype == UFS1)
-		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
-	else
-		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
+	error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2,
+	    &vp);
 	if (error) {
 		vput(vp);
 		*vpp = NULL;
@@ -1811,7 +1812,8 @@
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
-		ip->i_gen = arc4random() / 2 + 1;
+		while (ip->i_gen == 0)
+			ip->i_gen = arc4random();
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			ip->i_flag |= IN_MODIFIED;
 			DIP_SET(ip, i_gen, ip->i_gen);
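The new generation-number loop trades the old "halve the range and add one" trick for the full 32-bit range while still guaranteeing a non-zero value. The same idiom in a standalone program (arc4random(3) is in libc on the BSDs and in recent glibc):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>     /* arc4random() */

    int
    main(void)
    {
            uint32_t gen = 0;

            while (gen == 0)        /* keep the full range, just never 0 */
                    gen = arc4random();
            printf("generation %u\n", gen);
            return (0);
    }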
@@ -1843,6 +1845,7 @@
  *
  * Have to be really careful about stale file handles:
  * - check that the inode number is valid
+ * - for UFS2 check that the inode number is initialized
  * - call ffs_vget() to get the locked inode
  * - check for an unallocated inode (i_mode == 0)
  * - check that the given client host has export rights and return
@@ -1856,13 +1859,37 @@
 	struct vnode **vpp;
 {
 	struct ufid *ufhp;
+	struct ufsmount *ump;
 	struct fs *fs;
+	struct cg *cgp;
+	struct buf *bp;
+	ino_t ino;
+	u_int cg;
+	int error;
 
 	ufhp = (struct ufid *)fhp;
-	fs = VFSTOUFS(mp)->um_fs;
-	if (ufhp->ufid_ino < ROOTINO ||
-	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
+	ino = ufhp->ufid_ino;
+	ump = VFSTOUFS(mp);
+	fs = ump->um_fs;
+	if (ino < ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg)
 		return (ESTALE);
+	/*
+	 * Need to check if inode is initialized because UFS2 does lazy
+	 * initialization and nfs_fhtovp can offer arbitrary inode numbers.
+	 */
+	if (fs->fs_magic != FS_UFS2_MAGIC)
+		return (ufs_fhtovp(mp, ufhp, flags, vpp));
+	cg = ino_to_cg(fs, ino);
+	error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
+		(int)fs->fs_cgsize, NOCRED, &bp);
+	if (error)
+		return (error);
+	cgp = (struct cg *)bp->b_data;
+	if (!cg_chkmagic(cgp) || ino >= cg * fs->fs_ipg + cgp->cg_initediblk) {
+		brelse(bp);
+		return (ESTALE);
+	}
+	brelse(bp);
 	return (ufs_fhtovp(mp, ufhp, flags, vpp));
 }
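The ffs_fhtovp() change closes a hole specific to UFS2's lazy inode initialization: an NFS client can present a handle whose inode number lies past cg_initediblk and has never been written, so the cylinder-group descriptor is consulted and such handles are rejected as stale up front. The bound itself is simple arithmetic; a toy check with hypothetical geometry:

    #include <stdio.h>

    int
    main(void)
    {
            /* hypothetical: 32768 inodes per cg, cg 3, 4096 initialized */
            unsigned ipg = 32768, cg = 3, initediblk = 4096;
            unsigned ino = cg * ipg + 5000;     /* number from a client handle */

            if (ino >= cg * ipg + initediblk)
                    printf("ino %u: ESTALE, only %u inodes initialized in cg %u\n",
                        ino, initediblk, cg);
            else
                    printf("ino %u: plausible, proceed to ufs_fhtovp()\n", ino);
            return (0);
    }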
 
@@ -1950,13 +1977,13 @@
 	}
 	bp = sbbp;
 	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
-	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
+	    (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
 		fs->fs_sblockloc = SBLOCK_UFS1;
 	}
 	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
-	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
+	    (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
 		printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n",
 		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
 		fs->fs_sblockloc = SBLOCK_UFS2;
@@ -2032,7 +2059,6 @@
 	/*
 	 * Process dependencies then return any unfinished ones.
 	 */
-	pbrelvp(bp);
 	if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0)
 		buf_complete(bp);
 #ifdef SOFTUPDATES
@@ -2045,6 +2071,7 @@
 	 */
 	bp->b_flags |= B_NOCACHE;
 	bp->b_flags &= ~B_CACHE;
+	pbrelvp(bp);
 
 	/*
 	 * Prevent brelse() from trying to keep and re-dirtying bp on
@@ -2138,7 +2165,7 @@
 		if (newbp == NULL)
 			goto normal_write;
 
-		KASSERT((bp->b_flags & B_UNMAPPED) == 0, ("Unmapped cg"));
+		KASSERT(buf_mapped(bp), ("Unmapped cg"));
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags |= BV_BKGRDINPROG;

Modified: trunk/sys/ufs/ffs/ffs_vnops.c
===================================================================
--- trunk/sys/ufs/ffs/ffs_vnops.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/ffs_vnops.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -63,7 +63,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_vnops.c 284201 2015-06-10 02:14:33Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_vnops.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/bio.h>
@@ -78,6 +78,7 @@
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
+#include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
@@ -103,9 +104,10 @@
 #ifdef DIRECTIO
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
+static vop_fdatasync_t	ffs_fdatasync;
 static vop_fsync_t	ffs_fsync;
+static vop_getpages_t	ffs_getpages;
 static vop_lock1_t	ffs_lock;
-static vop_getpages_t	ffs_getpages;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
@@ -120,12 +122,13 @@
 static vop_setextattr_t	ffs_setextattr;
 static vop_vptofh_t	ffs_vptofh;
 
-
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
+	.vop_fdatasync =	ffs_fdatasync,
 	.vop_getpages =		ffs_getpages,
+	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
@@ -136,6 +139,7 @@
 struct vop_vector ffs_fifoops1 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
+	.vop_fdatasync =	ffs_fdatasync,
 	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
 	.vop_vptofh =		ffs_vptofh,
 };
@@ -144,7 +148,9 @@
 struct vop_vector ffs_vnodeops2 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
+	.vop_fdatasync =	ffs_fdatasync,
 	.vop_getpages =		ffs_getpages,
+	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
@@ -161,6 +167,7 @@
 struct vop_vector ffs_fifoops2 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
+	.vop_fdatasync =	ffs_fdatasync,
 	.vop_lock1 =		ffs_lock,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_strategy =		ffsext_strategy,
@@ -216,10 +223,10 @@
 {
 	struct inode *ip;
 	struct bufobj *bo;
-	struct buf *bp;
-	struct buf *nbp;
+	struct buf *bp, *nbp;
 	ufs_lbn_t lbn;
-	int error, wait, passes;
+	int error, passes;
+	bool still_dirty, wait;
 
 	ip = VTOI(vp);
 	ip->i_flag &= ~IN_NEEDSYNC;
@@ -238,8 +245,8 @@
 	 */
 	error = 0;
 	passes = 0;
-	wait = 0;	/* Always do an async pass first. */
-	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
+	wait = false;	/* Always do an async pass first. */
+	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
 	BO_LOCK(bo);
 loop:
 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
@@ -254,15 +261,23 @@
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
-		/* Flush indirects in order. */
+		/*
+		 * Flush indirects in order, if requested.
+		 *
+		 * Note that if only datasync is requested, we can
+		 * skip indirect blocks when softupdates are not
+		 * active.  Otherwise we must flush them with data,
+		 * since dependencies prevent data block writes.
+		 */
 		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
-		    lbn_level(bp->b_lblkno) >= passes)
+		    (lbn_level(bp->b_lblkno) >= passes ||
+		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
 			continue;
 		if (bp->b_lblkno > lbn)
 			panic("ffs_syncvnode: syncing truncated data.");
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 			BO_UNLOCK(bo);
-		} else if (wait != 0) {
+		} else if (wait) {
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) != 0) {
@@ -330,31 +345,59 @@
 	 * these will be done with one sync and one async pass.
 	 */
 	if (bo->bo_dirty.bv_cnt > 0) {
-		/* Write the inode after sync passes to flush deps. */
-		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
-			BO_UNLOCK(bo);
-			ffs_update(vp, 1);
-			BO_LOCK(bo);
+		if ((flags & DATA_ONLY) == 0) {
+			still_dirty = true;
+		} else {
+			/*
+			 * For data-only sync, dirty indirect buffers
+			 * are ignored.
+			 */
+			still_dirty = false;
+			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
+				if (bp->b_lblkno > -NDADDR) {
+					still_dirty = true;
+					break;
+				}
+			}
 		}
-		/* switch between sync/async. */
-		wait = !wait;
-		if (wait == 1 || ++passes < NIADDR + 2)
-			goto loop;
+
+		if (still_dirty) {
+			/* Write the inode after sync passes to flush deps. */
+			if (wait && DOINGSOFTDEP(vp) &&
+			    (flags & NO_INO_UPDT) == 0) {
+				BO_UNLOCK(bo);
+				ffs_update(vp, 1);
+				BO_LOCK(bo);
+			}
+			/* switch between sync/async. */
+			wait = !wait;
+			if (wait || ++passes < NIADDR + 2)
+				goto loop;
 #ifdef INVARIANTS
-		if (!vn_isdisk(vp, NULL))
-			vprint("ffs_fsync: dirty", vp);
+			if (!vn_isdisk(vp, NULL))
+				vn_printf(vp, "ffs_fsync: dirty ");
 #endif
+		}
 	}
 	BO_UNLOCK(bo);
 	error = 0;
-	if ((flags & NO_INO_UPDT) == 0)
-		error = ffs_update(vp, 1);
-	if (DOINGSUJ(vp))
-		softdep_journal_fsync(VTOI(vp));
+	if ((flags & DATA_ONLY) == 0) {
+		if ((flags & NO_INO_UPDT) == 0)
+			error = ffs_update(vp, 1);
+		if (DOINGSUJ(vp))
+			softdep_journal_fsync(VTOI(vp));
+	}
 	return (error);
 }
 
 static int
+ffs_fdatasync(struct vop_fdatasync_args *ap)
+{
+
+	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
+}
+
+static int
 ffs_lock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
@@ -477,7 +520,7 @@
 	if (orig_resid == 0)
 		return (0);
 	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	if (uio->uio_offset < ip->i_size &&
 	    uio->uio_offset >= fs->fs_maxfilesize)
 		return (EOVERFLOW);
@@ -559,15 +602,6 @@
 		}
 
 		/*
-		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
-		 * will cause us to attempt to release the buffer later on
-		 * and will cause the buffer cache to attempt to free the
-		 * underlying pages.
-		 */
-		if (ioflag & IO_DIRECT)
-			bp->b_flags |= B_DIRECT;
-
-		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
@@ -581,7 +615,7 @@
 			xfersize = size;
 		}
 
-		if ((bp->b_flags & B_UNMAPPED) == 0) {
+		if (buf_mapped(bp)) {
 			error = vn_io_fault_uiomove((char *)bp->b_data +
 			    blkoffset, (int)xfersize, uio);
 		} else {
@@ -591,25 +625,7 @@
 		if (error)
 			break;
 
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			/*
-			 * If there are no dependencies, and it's VMIO,
-			 * then we don't need the buf, mark it available
-			 * for freeing.  For non-direct VMIO reads, the VM
-			 * has the data.
-			 */
-			bp->b_flags |= B_RELBUF;
-			brelse(bp);
-		} else {
-			/*
-			 * Otherwise let whoever
-			 * made the request take care of
-			 * freeing it. We just queue
-			 * it onto another list.
-			 */
-			bqrelse(bp);
-		}
+		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
@@ -618,15 +634,8 @@
 	 * and on normal completion has not set a new value into it.
 	 * so it must have come from a 'break' statement
 	 */
-	if (bp != NULL) {
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			bp->b_flags |= B_RELBUF;
-			brelse(bp);
-		} else {
-			bqrelse(bp);
-		}
-	}
+	if (bp != NULL)
+		vfs_bio_brelse(bp, ioflag);
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
@@ -700,7 +709,7 @@
 
 	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 		return (EFBIG);
 	/*
@@ -744,8 +753,6 @@
 			vnode_pager_setsize(vp, ip->i_size);
 			break;
 		}
-		if (ioflag & IO_DIRECT)
-			bp->b_flags |= B_DIRECT;
 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 			bp->b_flags |= B_NOCACHE;
 
@@ -758,7 +765,7 @@
 		if (size < xfersize)
 			xfersize = size;
 
-		if ((bp->b_flags & B_UNMAPPED) == 0) {
+		if (buf_mapped(bp)) {
 			error = vn_io_fault_uiomove((char *)bp->b_data +
 			    blkoffset, (int)xfersize, uio);
 		} else {
@@ -785,11 +792,9 @@
 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 		    fs->fs_bsize == xfersize)
 			vfs_bio_clrbuf(bp);
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			bp->b_flags |= B_RELBUF;
-		}
 
+		vfs_bio_set_flags(bp, ioflag);
+
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
@@ -848,48 +853,6 @@
 }
 
 /*
- * get page routine
- */
-static int
-ffs_getpages(ap)
-	struct vop_getpages_args *ap;
-{
-	int i;
-	vm_page_t mreq;
-	int pcount;
-
-	pcount = round_page(ap->a_count) / PAGE_SIZE;
-	mreq = ap->a_m[ap->a_reqpage];
-
-	/*
-	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
-	 * then the entire page is valid.  Since the page may be mapped,
-	 * user programs might reference data beyond the actual end of file
-	 * occuring within the page.  We have to zero that data.
-	 */
-	VM_OBJECT_WLOCK(mreq->object);
-	if (mreq->valid) {
-		if (mreq->valid != VM_PAGE_BITS_ALL)
-			vm_page_zero_invalid(mreq, TRUE);
-		for (i = 0; i < pcount; i++) {
-			if (i != ap->a_reqpage) {
-				vm_page_lock(ap->a_m[i]);
-				vm_page_free(ap->a_m[i]);
-				vm_page_unlock(ap->a_m[i]);
-			}
-		}
-		VM_OBJECT_WUNLOCK(mreq->object);
-		return VM_PAGER_OK;
-	}
-	VM_OBJECT_WUNLOCK(mreq->object);
-
-	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
-					    ap->a_count,
-					    ap->a_reqpage);
-}
-
-
-/*
  * Extended attribute area reading.
  */
 static int
@@ -906,7 +869,7 @@
 	int error;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	dp = ip->i_din2;
 
 #ifdef INVARIANTS
@@ -978,15 +941,6 @@
 		}
 
 		/*
-		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
-		 * will cause us to attempt to release the buffer later on
-		 * and will cause the buffer cache to attempt to free the
-		 * underlying pages.
-		 */
-		if (ioflag & IO_DIRECT)
-			bp->b_flags |= B_DIRECT;
-
-		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
@@ -1004,26 +958,7 @@
 					(int)xfersize, uio);
 		if (error)
 			break;
-
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			/*
-			 * If there are no dependencies, and it's VMIO,
-			 * then we don't need the buf, mark it available
-			 * for freeing.  For non-direct VMIO reads, the VM
-			 * has the data.
-			 */
-			bp->b_flags |= B_RELBUF;
-			brelse(bp);
-		} else {
-			/*
-			 * Otherwise let whoever
-			 * made the request take care of
-			 * freeing it. We just queue
-			 * it onto another list.
-			 */
-			bqrelse(bp);
-		}
+		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
@@ -1032,15 +967,8 @@
 	 * and on normal completion has not set a new value into it.
 	 * so it must have come from a 'break' statement
 	 */
-	if (bp != NULL) {
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			bp->b_flags |= B_RELBUF;
-			brelse(bp);
-		} else {
-			bqrelse(bp);
-		}
-	}
+	if (bp != NULL)
+		vfs_bio_brelse(bp, ioflag);
 	return (error);
 }
 
@@ -1060,7 +988,7 @@
 	int blkoffset, error, flags, size, xfersize;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	dp = ip->i_din2;
 
 #ifdef INVARIANTS
@@ -1109,8 +1037,6 @@
 		 */
 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 			vfs_bio_clrbuf(bp);
-		if (ioflag & IO_DIRECT)
-			bp->b_flags |= B_DIRECT;
 
 		if (uio->uio_offset + xfersize > dp->di_extsize)
 			dp->di_extsize = uio->uio_offset + xfersize;
@@ -1121,11 +1047,9 @@
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
-		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
-		   (LIST_EMPTY(&bp->b_dep))) {
-			bp->b_flags |= B_RELBUF;
-		}
 
+		vfs_bio_set_flags(bp, ioflag);
+
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
@@ -1232,7 +1156,7 @@
 	u_char *eae;
 
 	ip = VTOI(vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 	dp = ip->i_din2;
 	easize = dp->di_extsize;
 	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
@@ -1386,8 +1310,7 @@
 
 	vp = ap->a_vp;
 	lbn = ap->a_bp->b_lblkno;
-	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
-	    lbn < 0 && lbn >= -NXADDR)
+	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR)
 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
 	if (vp->v_type == VFIFO)
 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
@@ -1463,7 +1386,7 @@
 	u_char *eae, *p;
 
 	ip = VTOI(ap->a_vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
@@ -1666,7 +1589,7 @@
 	u_char *eae, *p;
 
 	ip = VTOI(ap->a_vp);
-	fs = ip->i_fs;
+	fs = ITOFS(ip);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
@@ -1786,3 +1709,38 @@
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
+
+SYSCTL_DECL(_vfs_ffs);
+static int use_buf_pager = 0;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
+    "Always use buffer pager instead of bmap");
+
+static daddr_t
+ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
+{
+
+	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
+}
+
+static int
+ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
+{
+
+	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
+}
+
+static int
+ffs_getpages(struct vop_getpages_args *ap)
+{
+	struct vnode *vp;
+	struct ufsmount *um;
+
+	vp = ap->a_vp;
+	um = VFSTOUFS(vp->v_mount);
+
+	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
+		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
+		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
+	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
+}
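
For reference, the new ffs_fdatasync() entry added above is what backs the
fdatasync(2) system call on FFS: it calls ffs_syncvnode() with DATA_ONLY, so
the inode update and (when soft updates are not active) the indirect blocks
can be skipped.  A minimal userland sketch of the calling side, illustrative
only and not part of this change (the file path is arbitrary):

#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	int fd;

	fd = open("/tmp/datafile", O_WRONLY | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");
	if (write(fd, "payload", 7) != 7)
		err(1, "write");
	/*
	 * fdatasync(2) reaches ffs_fdatasync() via VOP_FDATASYNC and
	 * forces only the file data to stable storage; fsync(2) would
	 * also flush the inode (timestamps, etc.).
	 */
	if (fdatasync(fd) == -1)
		err(1, "fdatasync");
	close(fd);
	return (0);
}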

Modified: trunk/sys/ufs/ffs/fs.h
===================================================================
--- trunk/sys/ufs/ffs/fs.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/fs.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -11,7 +11,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)fs.h	8.13 (Berkeley) 3/21/95
- * $FreeBSD: stable/10/sys/ufs/ffs/fs.h 322860 2017-08-24 21:44:23Z mckusick $
+ * $FreeBSD: stable/11/sys/ufs/ffs/fs.h 356905 2020-01-20 08:28:54Z eugen $
  */
 
 #ifndef _UFS_FFS_FS_H_
@@ -220,7 +220,8 @@
 #define	FFS_UNLINK		14	/* remove a name in the filesystem */
 #define	FFS_SET_INODE		15	/* update an on-disk inode */
 #define	FFS_SET_BUFOUTPUT	16	/* set buffered writing on descriptor */
-#define	FFS_MAXID		16	/* number of valid ffs ids */
+#define	FFS_SET_SIZE		17	/* set inode size */
+#define	FFS_MAXID		17	/* number of valid ffs ids */
 
 /*
  * Command structure passed in to the filesystem to adjust filesystem values.
@@ -238,9 +239,7 @@
  * A recovery structure placed at the end of the boot block area by newfs
  * that can be used by fsck to search for alternate superblocks.
  */
-#define RESID	(4096 - 20)	/* disk sector size minus recovery area size */
 struct fsrecovery {
-	char	block[RESID];	/* unused part of sector */
 	int32_t	fsr_magic;	/* magic number */
 	int32_t	fsr_fsbtodb;	/* fsbtodb and dbtofsb shift constant */
 	int32_t	fsr_sblkno;	/* offset of super-block in filesys */
@@ -416,8 +415,8 @@
  * flag to enforce that inconsistent filesystems be mounted read-only.
  * The FS_INDEXDIRS flag when set indicates that the kernel maintains
  * on-disk auxiliary indexes (such as B-trees) for speeding directory
- * accesses. Kernels that do not support auxiliary indicies clear the
- * flag to indicate that the indicies need to be rebuilt (by fsck) before
+ * accesses. Kernels that do not support auxiliary indices clear the
+ * flag to indicate that the indices need to be rebuilt (by fsck) before
  * they can be used.
  *
  * FS_ACLS indicates that POSIX.1e ACLs are administratively enabled

Modified: trunk/sys/ufs/ffs/softdep.h
===================================================================
--- trunk/sys/ufs/ffs/softdep.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ffs/softdep.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)softdep.h	9.7 (McKusick) 6/21/00
- * $FreeBSD: stable/10/sys/ufs/ffs/softdep.h 307534 2016-10-17 21:49:54Z mckusick $
+ * $FreeBSD: stable/11/sys/ufs/ffs/softdep.h 320057 2017-06-17 17:10:50Z kib $
  */
 
 #include <sys/queue.h>
@@ -133,7 +133,7 @@
 #define	INPROGRESS	0x001000 /* dirrem, freeblks, freefrag, freefile only */
 #define	UFS1FMT		0x002000 /* indirdep only */
 #define	EXTDATA		0x004000 /* allocdirect only */
-#define ONWORKLIST	0x008000
+#define	ONWORKLIST	0x008000
 #define	IOWAITING	0x010000 /* Thread is waiting for IO to complete. */
 #define	ONDEPLIST	0x020000 /* Structure is on a dependency list. */
 #define	UNLINKED	0x040000 /* inodedep has been unlinked. */
@@ -1066,6 +1066,7 @@
 #define FLUSH_EXIT	0x0001	/* time to exit */
 #define FLUSH_CLEANUP	0x0002	/* need to clear out softdep structures */
 #define	FLUSH_STARTING	0x0004	/* flush thread not yet started */
+#define	FLUSH_RC_ACTIVE	0x0008	/* a thread is flushing the mount point */
 
 /*
  * Keep the old names from when these were in the ufsmount structure.

Modified: trunk/sys/ufs/ufs/README.acls
===================================================================
--- trunk/sys/ufs/ufs/README.acls	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/README.acls	2020-02-08 19:39:08 UTC (rev 12316)
@@ -1,4 +1,4 @@
-$FreeBSD: stable/10/sys/ufs/ufs/README.acls 105456 2002-10-19 16:09:16Z rwatson $
+$FreeBSD: stable/11/sys/ufs/ufs/README.acls 105456 2002-10-19 16:09:16Z rwatson $
 
   UFS Access Control Lists Copyright
 

Modified: trunk/sys/ufs/ufs/README.extattr
===================================================================
--- trunk/sys/ufs/ufs/README.extattr	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/README.extattr	2020-02-08 19:39:08 UTC (rev 12316)
@@ -1,4 +1,4 @@
-$FreeBSD: stable/10/sys/ufs/ufs/README.extattr 105417 2002-10-18 21:11:36Z rwatson $
+$FreeBSD: stable/11/sys/ufs/ufs/README.extattr 105417 2002-10-18 21:11:36Z rwatson $
 
   UFS Extended Attributes Copyright
 

Modified: trunk/sys/ufs/ufs/acl.h
===================================================================
--- trunk/sys/ufs/ufs/acl.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/acl.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/ufs/ufs/acl.h 200796 2009-12-21 19:39:10Z trasz $
+ * $FreeBSD: stable/11/sys/ufs/ufs/acl.h 200796 2009-12-21 19:39:10Z trasz $
  */
 /*
  * Developed by the TrustedBSD Project.

Modified: trunk/sys/ufs/ufs/dinode.h
===================================================================
--- trunk/sys/ufs/ufs/dinode.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/dinode.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -63,7 +63,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)dinode.h	8.3 (Berkeley) 1/21/94
- * $FreeBSD: stable/10/sys/ufs/ufs/dinode.h 259223 2013-12-11 19:25:17Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/dinode.h 257029 2013-10-24 00:33:29Z pfg $
  */
 
 #ifndef _UFS_UFS_DINODE_H_

Modified: trunk/sys/ufs/ufs/dir.h
===================================================================
--- trunk/sys/ufs/ufs/dir.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/dir.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -16,7 +16,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)dir.h	8.2 (Berkeley) 1/21/94
- * $FreeBSD: stable/10/sys/ufs/ufs/dir.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/dir.h 347475 2019-05-10 23:46:42Z mckusick $
  */
 
 #ifndef _UFS_UFS_DIR_H_
@@ -106,13 +106,11 @@
  * The DIRSIZ macro gives the minimum record length which will hold
  * the directory entry.  This requires the amount of space in struct direct
  * without the d_name field, plus enough space for the name with a terminating
- * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
- *
- * 
+ * null byte (dp->d_namlen + 1), rounded up to a 4 byte boundary.
  */
-#define	DIRECTSIZ(namlen)						\
-	(((uintptr_t)&((struct direct *)0)->d_name +			\
-	  ((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3)
+#define	DIR_ROUNDUP	4	/* Directory name roundup size */
+#define	DIRECTSIZ(namlen) \
+    (roundup2(__offsetof(struct direct, d_name) + (namlen) + 1, DIR_ROUNDUP))
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 #define	DIRSIZ(oldfmt, dp) \
     ((oldfmt) ? DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen))
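
As a quick check on the rewritten macro (assuming the usual 8-byte fixed
portion of struct direct, i.e. __offsetof(struct direct, d_name) == 8):
DIRECTSIZ(5) = roundup2(8 + 5 + 1, 4) = roundup2(14, 4) = 16, the same record
length the old pointer-arithmetic form produced.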

Modified: trunk/sys/ufs/ufs/dirhash.h
===================================================================
--- trunk/sys/ufs/ufs/dirhash.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/dirhash.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/ufs/ufs/dirhash.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/dirhash.h 298804 2016-04-29 20:43:51Z pfg $
  */
 
 #ifndef _UFS_UFS_DIRHASH_H_
@@ -61,7 +61,7 @@
  * together on a TAILQ list, and hashes with higher scores filter
  * towards the tail (most recently used) end of the list.
  *
- * New hash entries are given an inital score of DH_SCOREINIT and are
+ * New hash entries are given an initial score of DH_SCOREINIT and are
  * placed at the most-recently-used end of the list. This helps a lot
  * in the worst-case case scenario where every directory access is
  * to a directory that is not hashed (i.e. the working set of hash

Modified: trunk/sys/ufs/ufs/extattr.h
===================================================================
--- trunk/sys/ufs/ufs/extattr.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/extattr.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/ufs/ufs/extattr.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/extattr.h 306553 2016-10-01 09:19:43Z kib $
  */
 /*
  * Developed by the TrustedBSD Project.
@@ -134,6 +134,10 @@
 	int	uepm_flags;
 };
 
+struct vop_getextattr_args;
+struct vop_deleteextattr_args;
+struct vop_setextattr_args;
+
 void	ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm);
 void	ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm);
 int	ufs_extattr_start(struct mount *mp, struct thread *td);

Modified: trunk/sys/ufs/ufs/gjournal.h
===================================================================
--- trunk/sys/ufs/ufs/gjournal.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/gjournal.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/ufs/ufs/gjournal.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/gjournal.h 262678 2014-03-02 02:52:34Z pfg $
  */
 
 #ifndef _UFS_UFS_GJOURNAL_H_

Modified: trunk/sys/ufs/ufs/inode.h
===================================================================
--- trunk/sys/ufs/ufs/inode.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/inode.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)inode.h	8.9 (Berkeley) 5/14/95
- * $FreeBSD: stable/10/sys/ufs/ufs/inode.h 283640 2015-05-28 00:11:36Z mckusick $
+ * $FreeBSD: stable/11/sys/ufs/ufs/inode.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _UFS_UFS_INODE_H_
@@ -67,14 +67,25 @@
 struct inode {
 	TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */
 	struct	vnode  *i_vnode;/* Vnode associated with this inode. */
-	struct	ufsmount *i_ump;/* Ufsmount point associated with this inode. */
+	struct 	ufsmount *i_ump;/* Ufsmount point associated with this inode. */
+	struct	 dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */
+	union {
+		struct dirhash *dirhash; /* Hashing for large directories. */
+		daddr_t *snapblklist;    /* Collect expunged snapshot blocks. */
+	} i_un;
+	/*
+	 * The real copy of the on-disk inode.
+	 */
+	union {
+		struct ufs1_dinode *din1;	/* UFS1 on-disk dinode. */
+		struct ufs2_dinode *din2;	/* UFS2 on-disk dinode. */
+	} dinode_u;
+
+	ino_t	  i_number;	/* The identity of the inode. */
 	u_int32_t i_flag;	/* flags, see below */
-	struct cdev *i_dev;	/* Device associated with the inode. */
-	ino_t	  i_number;	/* The identity of the inode. */
 	int	  i_effnlink;	/* i_nlink when I/O completes */
 
-	struct	 fs *i_fs;	/* Associated filesystem superblock. */
-	struct	 dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */
+
 	/*
 	 * Side effects; used during directory lookup.
 	 */
@@ -83,11 +94,6 @@
 	doff_t	  i_diroff;	/* Offset in dir, where we found last entry. */
 	doff_t	  i_offset;	/* Offset of free space in directory. */
 
-	union {
-		struct dirhash *dirhash; /* Hashing for large directories. */
-		daddr_t *snapblklist;    /* Collect expunged snapshot blocks. */
-	} i_un;
-
 	int	i_nextclustercg; /* last cg searched for cluster */
 
 	/*
@@ -101,20 +107,13 @@
 	/*
 	 * Copies from the on-disk dinode itself.
 	 */
-	u_int16_t i_mode;	/* IFMT, permissions; see below. */
-	int16_t	  i_nlink;	/* File link count. */
 	u_int64_t i_size;	/* File byte count. */
+	u_int64_t i_gen;	/* Generation number. */
 	u_int32_t i_flags;	/* Status flags (chflags). */
-	u_int64_t i_gen;	/* Generation number. */
 	u_int32_t i_uid;	/* File owner. */
 	u_int32_t i_gid;	/* File group. */
-	/*
-	 * The real copy of the on-disk inode.
-	 */
-	union {
-		struct ufs1_dinode *din1;	/* UFS1 on-disk dinode. */
-		struct ufs2_dinode *din2;	/* UFS2 on-disk dinode. */
-	} dinode_u;
+	u_int16_t i_mode;	/* IFMT, permissions; see below. */
+	int16_t	  i_nlink;	/* File link count. */
 };
 /*
  * These flags are kept in i_flag.
@@ -124,16 +123,16 @@
 #define	IN_UPDATE	0x0004		/* Modification time update request. */
 #define	IN_MODIFIED	0x0008		/* Inode has been modified. */
 #define	IN_NEEDSYNC	0x0010		/* Inode requires fsync. */
-#define	IN_LAZYMOD	0x0040		/* Modified, but don't write yet. */
-#define	IN_LAZYACCESS	0x0100		/* Process IN_ACCESS after the
+#define	IN_LAZYMOD	0x0020		/* Modified, but don't write yet. */
+#define	IN_LAZYACCESS	0x0040		/* Process IN_ACCESS after the
 					   suspension finished */
-#define	IN_EA_LOCKED	0x0200
-#define	IN_EA_LOCKWAIT	0x0400
+#define	IN_EA_LOCKED	0x0080
+#define	IN_EA_LOCKWAIT	0x0100
 
-#define	IN_TRUNCATED	0x0800		/* Journaled truncation pending. */
+#define	IN_TRUNCATED	0x0200		/* Journaled truncation pending. */
 
-#define	i_devvp i_ump->um_devvp
-#define	i_umbufobj i_ump->um_bo
+#define	IN_UFS2		0x0400		/* UFS2 vs UFS1 */
+
 #define	i_dirhash i_un.dirhash
 #define	i_snapblklist i_un.snapblklist
 #define	i_din1 dinode_u.din1
@@ -140,23 +139,42 @@
 #define	i_din2 dinode_u.din2
 
 #ifdef _KERNEL
+
+#define	ITOUMP(ip)	((ip)->i_ump)
+#define	ITODEV(ip)	(ITOUMP(ip)->um_dev)
+#define	ITODEVVP(ip)	(ITOUMP(ip)->um_devvp)
+#define	ITOFS(ip)	(ITOUMP(ip)->um_fs)
+#define	ITOVFS(ip)	((ip)->i_vnode->v_mount)
+
+static inline _Bool
+I_IS_UFS1(const struct inode *ip)
+{
+
+	return ((ip->i_flag & IN_UFS2) == 0);
+}
+
+static inline _Bool
+I_IS_UFS2(const struct inode *ip)
+{
+
+	return ((ip->i_flag & IN_UFS2) != 0);
+}
+
 /*
  * The DIP macro is used to access fields in the dinode that are
  * not cached in the inode itself.
  */
-#define	DIP(ip, field) \
-	(((ip)->i_ump->um_fstype == UFS1) ? \
-	(ip)->i_din1->d##field : (ip)->i_din2->d##field)
-#define	DIP_SET(ip, field, val) do { \
-	if ((ip)->i_ump->um_fstype == UFS1) \
-		(ip)->i_din1->d##field = (val); \
-	else \
-		(ip)->i_din2->d##field = (val); \
+#define	DIP(ip, field)	(I_IS_UFS1(ip) ? (ip)->i_din1->d##field : \
+    (ip)->i_din2->d##field)
+#define	DIP_SET(ip, field, val) do {				\
+	if (I_IS_UFS1(ip))					\
+		(ip)->i_din1->d##field = (val); 		\
+	else							\
+		(ip)->i_din2->d##field = (val); 		\
 	} while (0)
 
-#define	SHORTLINK(ip) \
-	(((ip)->i_ump->um_fstype == UFS1) ? \
-	(caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db)
+#define	SHORTLINK(ip)	(I_IS_UFS1(ip) ?			\
+    (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db)
 #define	IS_SNAPSHOT(ip)		((ip)->i_flags & SF_SNAPSHOT)
 
 /*

Modified: trunk/sys/ufs/ufs/quota.h
===================================================================
--- trunk/sys/ufs/ufs/quota.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/quota.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)quota.h	8.3 (Berkeley) 8/19/94
- * $FreeBSD: stable/10/sys/ufs/ufs/quota.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/quota.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _UFS_UFS_QUOTA_H_

Modified: trunk/sys/ufs/ufs/ufs_acl.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_acl.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_acl.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_acl.c 241011 2012-09-27 23:30:49Z mdf $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_acl.c 306553 2016-10-01 09:19:43Z kib $");
 
 #include "opt_ufs.h"
 #include "opt_quota.h"
@@ -46,6 +46,7 @@
 #include <sys/acl.h>
 #include <sys/event.h>
 #include <sys/extattr.h>
+#include <sys/proc.h>
 
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
@@ -184,7 +185,7 @@
 		 */
 		printf("ufs_getacl_nfs4(): Loaded invalid ACL ("
 		    "%d bytes), inumber %ju on %s\n", len,
-		    (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt);
+		    (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt);
 
 		return (EPERM);
 	}
@@ -193,7 +194,7 @@
 	if (error) {
 		printf("ufs_getacl_nfs4(): Loaded invalid ACL "
 		    "(failed acl_nfs4_check), inumber %ju on %s\n",
-		    (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt);
+		    (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt);
 
 		return (EPERM);
 	}
@@ -220,7 +221,7 @@
 
 /*
  * Read POSIX.1e ACL from an EA.  Return error if its not found
- * or if any other error has occured.
+ * or if any other error has occurred.
  */
 static int
 ufs_get_oldacl(acl_type_t type, struct oldacl *old, struct vnode *vp,
@@ -261,7 +262,7 @@
 		 */
 		printf("ufs_get_oldacl(): Loaded invalid ACL "
 		    "(len = %d), inumber %ju on %s\n", len,
-		    (uintmax_t)ip->i_number, ip->i_fs->fs_fsmnt);
+		    (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt);
 		return (EPERM);
 	}
 

Modified: trunk/sys/ufs/ufs/ufs_bmap.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_bmap.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_bmap.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_bmap.c 284021 2015-06-05 08:36:25Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_bmap.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -45,6 +45,7 @@
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 
@@ -78,7 +79,7 @@
 	 * to physical mapping is requested.
 	 */
 	if (ap->a_bop != NULL)
-		*ap->a_bop = &VTOI(ap->a_vp)->i_devvp->v_bufobj;
+		*ap->a_bop = &VFSTOUFS(ap->a_vp->v_mount)->um_devvp->v_bufobj;
 	if (ap->a_bnp == NULL)
 		return (0);
 
@@ -224,6 +225,13 @@
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {
@@ -232,7 +240,7 @@
 			}
 		}
 
-		if (ip->i_ump->um_fstype == UFS1) {
+		if (I_IS_UFS1(ip)) {
 			daddr = ((ufs1_daddr_t *)bp->b_data)[ap->in_off];
 			if (num == 1 && daddr && runp) {
 				for (bn = ap->in_off + 1;

Modified: trunk/sys/ufs/ufs/ufs_dirhash.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_dirhash.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_dirhash.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_dirhash.c 326846 2017-12-14 11:45:02Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_dirhash.c 326845 2017-12-14 11:41:12Z kib $");
 
 #include "opt_ufs.h"
 
@@ -86,10 +86,11 @@
 static int ufs_dirhashlowmemcount = 0;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD, 
     &ufs_dirhashlowmemcount, 0, "number of times low memory hook called");
-static int ufs_dirhashreclaimage = 60;
-SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_reclaimage, CTLFLAG_RW, 
-    &ufs_dirhashreclaimage, 0, 
-    "max time in seconds of hash inactivity before deletion in low VM events");
+static int ufs_dirhashreclaimpercent = 10;
+static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent,
+    CTLTYPE_INT | CTLFLAG_RW, 0, 0, ufsdirhash_set_reclaimpercent, "I",
+    "set percentage of dirhash cache to be removed in low VM events");
 
 
 static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
@@ -1151,7 +1152,7 @@
 	doff_t blkoff, prevoff;
 	int entrypos, i;
 
-	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
+	blkoff = rounddown2(offset, DIRBLKSIZ);	/* offset of start of block */
 	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
 	blkbuf = (char *)dirp - entrypos;
 	prevoff = blkoff;
@@ -1250,50 +1251,53 @@
 ufsdirhash_lowmem()
 {
 	struct dirhash *dh, *dh_temp;
-	int memfreed = 0;
-	/* 
-	 * Will free a *minimum* of 10% of the dirhash, but possibly much
-	 * more (depending on dirhashreclaimage). System with large dirhashes
-	 * probably also need a much larger dirhashreclaimage.
-	 * XXX: this percentage may need to be adjusted.
-	 */
-	int memwanted = ufs_dirhashmem / 10;
+	int memfreed, memwanted;
 
 	ufs_dirhashlowmemcount++;
+	memfreed = 0;
+	memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100;
 
 	DIRHASHLIST_LOCK();
-	/* 
-	 * Delete dirhashes not used for more than ufs_dirhashreclaimage 
-	 * seconds. If we can't get a lock on the dirhash, it will be skipped.
+
+	/*
+	 * Reclaim up to memwanted from the oldest dirhashes. This will allow
+	 * us to make some progress when the system is running out of memory
+	 * without compromising the dynamic aging of entries.  If the
+	 * situation does not improve, the lowmem handler will eventually be
+	 * retriggered and will free other entries in the cache.  The entries
+	 * at the head of the list should
+	 * be the oldest. If during list traversal we can't get a lock on the
+	 * dirhash, it will be skipped.
 	 */
 	TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) {
-		if (!sx_try_xlock(&dh->dh_lock))
-			continue;
-		if (time_second - dh->dh_lastused > ufs_dirhashreclaimage)
+		if (sx_try_xlock(&dh->dh_lock))
 			memfreed += ufsdirhash_destroy(dh);
-		/* Unlock if we didn't delete the dirhash */
-		else
-			ufsdirhash_release(dh);
+		if (memfreed >= memwanted)
+			break;
 	}
-
-	/* 
-	 * If not enough memory was freed, keep deleting hashes from the head 
-	 * of the dirhash list. The ones closest to the head should be the 
-	 * oldest. 
-	 */
-	if (memfreed < memwanted) {
-		TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) {
-			if (!sx_try_xlock(&dh->dh_lock))
-				continue;
-			memfreed += ufsdirhash_destroy(dh);
-			if (memfreed >= memwanted)
-				break;
-		}
-	}
 	DIRHASHLIST_UNLOCK();
 }
 
+static int
+ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS)
+{
+	int error, v;
 
+	v = ufs_dirhashreclaimpercent;
+	error = sysctl_handle_int(oidp, &v, v, req);
+	if (error)
+		return (error);
+	if (req->newptr == NULL)
+		return (error);
+	if (v == ufs_dirhashreclaimpercent)
+		return (0);
+
+	/* Refuse invalid percentages */
+	if (v < 0 || v > 100)
+		return (EINVAL);
+	ufs_dirhashreclaimpercent = v;
+	return (0);
+}
+
 void
 ufsdirhash_init()
 {
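
The time-based vfs.ufs.dirhash_reclaimage knob is gone; low-memory reclaim is
now driven by vfs.ufs.dirhash_reclaimpercent, whose handler rejects values
outside 0..100.  A small sketch of reading and setting the new knob from C,
illustrative only (sysctl vfs.ufs.dirhash_reclaimpercent=25 from the shell is
equivalent):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int pct, newpct;
	size_t len;

	len = sizeof(pct);
	/* Read the current reclaim percentage. */
	if (sysctlbyname("vfs.ufs.dirhash_reclaimpercent", &pct, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname(read)");
	printf("current reclaim percentage: %d\n", pct);

	/* Raise it to 25; the handler rejects values outside 0..100. */
	newpct = 25;
	if (sysctlbyname("vfs.ufs.dirhash_reclaimpercent", NULL, NULL,
	    &newpct, sizeof(newpct)) == -1)
		err(1, "sysctlbyname(write)");
	return (0);
}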

Modified: trunk/sys/ufs/ufs/ufs_extattr.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_extattr.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_extattr.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_extattr.c 302233 2016-06-27 21:44:27Z bdrewery $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_extattr.c 298463 2016-04-22 08:09:27Z ngie $");
 
 #include "opt_ufs.h"
 

Modified: trunk/sys/ufs/ufs/ufs_extern.h
===================================================================
--- trunk/sys/ufs/ufs/ufs_extern.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_extern.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_extern.h	8.10 (Berkeley) 5/14/95
- * $FreeBSD: stable/10/sys/ufs/ufs/ufs_extern.h 262779 2014-03-05 04:23:19Z pfg $
+ * $FreeBSD: stable/11/sys/ufs/ufs/ufs_extern.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _UFS_UFS_EXTERN_H_

Modified: trunk/sys/ufs/ufs/ufs_gjournal.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_gjournal.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_gjournal.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -26,12 +26,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_gjournal.c 306630 2016-10-03 10:15:16Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_gjournal.c 306627 2016-10-03 09:37:56Z kib $");
 
 #include "opt_ufs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/lock.h>
@@ -65,15 +66,15 @@
 	ino_t ino;
 
 	ip = VTOI(vp);
-	ump = ip->i_ump;
-	fs = ip->i_fs;
-	devvp = ip->i_devvp;
+	ump = VFSTOUFS(vp->v_mount);
+	fs = ump->um_fs;
+	devvp = ump->um_devvp;
 	ino = ip->i_number;
 
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
-		dev = VTOI(devvp)->i_devvp->v_rdev;
+		dev = VFSTOUFS(devvp->v_mount)->um_devvp->v_rdev;
 		cgbno = fragstoblks(fs, cgtod(fs, cg));
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */

Modified: trunk/sys/ufs/ufs/ufs_inode.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_inode.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_inode.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_inode.c 234612 2012-04-23 17:54:49Z trasz $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_inode.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
@@ -126,7 +126,7 @@
 		}
 	}
 	isize = ip->i_size;
-	if (ip->i_ump->um_fstype == UFS2)
+	if (I_IS_UFS2(ip))
 		isize += ip->i_din2->di_extsize;
 	if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip))
 		error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED);
@@ -215,7 +215,6 @@
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
-	struct ufsmount *ump = ip->i_ump;
 
 	ufs_prepare_reclaim(vp);
 
@@ -234,6 +233,6 @@
 	VI_LOCK(vp);
 	vp->v_data = 0;
 	VI_UNLOCK(vp);
-	UFS_IFREE(ump, ip);
+	UFS_IFREE(ITOUMP(ip), ip);
 	return (0);
 }

Modified: trunk/sys/ufs/ufs/ufs_lookup.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_lookup.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_lookup.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_lookup.c 306180 2016-09-22 10:51:47Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_lookup.c 356965 2020-01-22 01:31:02Z mckusick $");
 
 #include "opt_ufs.h"
 #include "opt_quota.h"
@@ -565,7 +565,7 @@
 	 * in the cache as to where the entry was found.
 	 */
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
-		dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);
+		dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ);
 
 	/*
 	 * If deleting, and at end of pathname, return
@@ -824,14 +824,21 @@
 	struct componentname *cnp;
 	struct direct *newdirp;
 {
+	u_int namelen;
 
-#ifdef INVARIANTS
-	if ((cnp->cn_flags & SAVENAME) == 0)
-		panic("ufs_makedirentry: missing name");
-#endif
+	namelen = (unsigned)cnp->cn_namelen;
+	KASSERT((cnp->cn_flags & SAVENAME) != 0,
+		("ufs_makedirentry: missing name"));
+	KASSERT(namelen <= MAXNAMLEN,
+		("ufs_makedirentry: name too long"));
 	newdirp->d_ino = ip->i_number;
-	newdirp->d_namlen = cnp->cn_namelen;
-	bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1);
+	newdirp->d_namlen = namelen;
+
+	/* Zero out after-name padding */
+	*(u_int32_t *)(&newdirp->d_name[namelen & ~(DIR_ROUNDUP - 1)]) = 0;
+
+	bcopy(cnp->cn_nameptr, newdirp->d_name, namelen);
+
 	if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0)
 		newdirp->d_type = IFTODT(ip->i_mode);
 	else {
@@ -1092,7 +1099,7 @@
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_checkblock(dp, dirbuf -
 		    (dp->i_offset & (DIRBLKSIZ - 1)),
-		    dp->i_offset & ~(DIRBLKSIZ - 1));
+		    rounddown2(dp->i_offset, DIRBLKSIZ));
 #endif
 
 	if (DOINGSOFTDEP(dvp)) {
@@ -1125,8 +1132,9 @@
 		error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff,
 		    IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), cr);
 		if (error != 0)
-			vn_printf(dvp, "ufs_direnter: failed to truncate "
-			    "err %d", error);
+			vn_printf(dvp,
+			    "ufs_direnter: failed to truncate, error %d\n",
+			    error);
 #ifdef UFS_DIRHASH
 		if (error == 0 && dp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(dp, dp->i_endoff);
@@ -1160,6 +1168,7 @@
 	struct inode *dp;
 	struct direct *ep, *rep;
 	struct buf *bp;
+	off_t offset;
 	int error;
 
 	dp = VTOI(dvp);
@@ -1169,6 +1178,7 @@
 	 */
 	if (ip) {
 		ip->i_effnlink--;
+		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(dvp)) {
 			softdep_setup_unlink(dp, ip);
 		} else {
@@ -1177,22 +1187,32 @@
 			ip->i_flag |= IN_CHANGE;
 		}
 	}
+	if (flags & DOWHITEOUT)
+		offset = dp->i_offset;
+	else
+		offset = dp->i_offset - dp->i_count;
+	if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) {
+		if (ip) {
+			ip->i_effnlink++;
+			ip->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(dvp)) {
+				softdep_change_linkcnt(ip);
+			} else {
+				ip->i_nlink++;
+				DIP_SET(ip, i_nlink, ip->i_nlink);
+				ip->i_flag |= IN_CHANGE;
+			}
+		}
+		return (error);
+	}
 	if (flags & DOWHITEOUT) {
 		/*
 		 * Whiteout entry: set d_ino to WINO.
 		 */
-		if ((error =
-		    UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0)
-			return (error);
 		ep->d_ino = WINO;
 		ep->d_type = DT_WHT;
 		goto out;
 	}
-
-	if ((error = UFS_BLKATOFF(dvp,
-	    (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0)
-		return (error);
-
 	/* Set 'rep' to the entry being removed. */
 	if (dp->i_count == 0)
 		rep = ep;
@@ -1209,22 +1229,27 @@
 	if (ip && rep->d_ino != ip->i_number)
 		panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n",
 		    (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino);
-	if (dp->i_count == 0) {
+	/*
+	 * Zero out the file directory entry metadata to reduce disk
+	 * scavenging disclosure.
+	 */
+	bzero(&rep->d_name[0], rep->d_namlen);
+	rep->d_namlen = 0;
+	rep->d_type = 0;
+	rep->d_ino = 0;
+
+	if (dp->i_count != 0) {
 		/*
-		 * First entry in block: set d_ino to zero.
-		 */
-		ep->d_ino = 0;
-	} else {
-		/*
 		 * Collapse new free space into previous entry.
 		 */
 		ep->d_reclen += rep->d_reclen;
+		rep->d_reclen = 0;
 	}
 #ifdef UFS_DIRHASH
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_checkblock(dp, (char *)ep -
 		    ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)),
-		    dp->i_offset & ~(DIRBLKSIZ - 1));
+		    rounddown2(dp->i_offset, DIRBLKSIZ));
 #endif
 out:
 	error = 0;
@@ -1277,6 +1302,7 @@
 	 * necessary.
 	 */
 	oip->i_effnlink--;
+	oip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vdp)) {
 		softdep_setup_unlink(dp, oip);
 	} else {
@@ -1286,13 +1312,23 @@
 	}
 
 	error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
-	if (error)
-		return (error);
-	if (ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' &&
-	    ep->d_ino != oip->i_number) {
+	if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' &&
+	    ep->d_name[0] == '.' && ep->d_ino != oip->i_number) {
 		brelse(bp);
-		return (EIDRM);
+		error = EIDRM;
 	}
+	if (error) {
+		oip->i_effnlink++;
+		oip->i_flag |= IN_CHANGE;
+		if (DOINGSOFTDEP(vdp)) {
+			softdep_change_linkcnt(oip);
+		} else {
+			oip->i_nlink++;
+			DIP_SET(oip, i_nlink, oip->i_nlink);
+			oip->i_flag |= IN_CHANGE;
+		}
+		return (error);
+	}
 	ep->d_ino = newinum;
 	if (!OFSFMT(vdp))
 		ep->d_type = newtype;
@@ -1469,7 +1505,8 @@
 			}
 		}
 		KASSERT(dd_ino == VTOI(vp1)->i_number,
-		    ("directory %d reparented\n", VTOI(vp1)->i_number));
+		    ("directory %ju reparented\n",
+		    (uintmax_t)VTOI(vp1)->i_number));
 		if (vp != tvp)
 			vput(vp);
 		vp = vp1;

Modified: trunk/sys/ufs/ufs/ufs_quota.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_quota.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_quota.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_quota.c 306178 2016-09-22 10:47:56Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_quota.c 338943 2018-09-26 14:26:29Z kib $");
 
 #include "opt_ffs.h"
 
@@ -233,13 +233,13 @@
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
 		    dq->dq_curblocks < dq->dq_bsoftlimit)
-			dq->dq_btime = time_second + ip->i_ump->um_btime[i];
+			dq->dq_btime = time_second + ITOUMP(ip)->um_btime[i];
 		dq->dq_curblocks += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		if (warn)
 			uprintf("\n%s: warning, %s disk quota exceeded\n",
-			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[i]);
 	}
 	return (0);
@@ -265,7 +265,7 @@
 			dq->dq_flags |= DQ_BLKS;
 			DQI_UNLOCK(dq);
 			uprintf("\n%s: write failed, %s disk limit reached\n",
-			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 			return (EDQUOT);
 		}
@@ -278,7 +278,7 @@
 	 */
 	if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) {
 		if (dq->dq_curblocks < dq->dq_bsoftlimit) {
-			dq->dq_btime = time_second + ip->i_ump->um_btime[type];
+			dq->dq_btime = time_second + ITOUMP(ip)->um_btime[type];
 			if (ip->i_uid == cred->cr_uid)
 				*warn = 1;
 			return (0);
@@ -290,7 +290,7 @@
 				DQI_UNLOCK(dq);
 				uprintf("\n%s: write failed, %s "
 				    "disk quota exceeded for too long\n",
-				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    ITOVFS(ip)->mnt_stat.f_mntonname,
 				    quotatypes[type]);
 				return (EDQUOT);
 			}
@@ -371,13 +371,13 @@
 		/* Reset timer when crossing soft limit */
 		if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
 		    dq->dq_curinodes < dq->dq_isoftlimit)
-			dq->dq_itime = time_second + ip->i_ump->um_itime[i];
+			dq->dq_itime = time_second + ITOUMP(ip)->um_itime[i];
 		dq->dq_curinodes += change;
 		dq->dq_flags |= DQ_MOD;
 		DQI_UNLOCK(dq);
 		if (warn)
 			uprintf("\n%s: warning, %s inode quota exceeded\n",
-			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[i]);
 	}
 	return (0);
@@ -402,7 +402,7 @@
 			dq->dq_flags |= DQ_INODS;
 			DQI_UNLOCK(dq);
 			uprintf("\n%s: write failed, %s inode limit reached\n",
-			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    ITOVFS(ip)->mnt_stat.f_mntonname,
 			    quotatypes[type]);
 			return (EDQUOT);
 		}
@@ -415,7 +415,7 @@
 	 */
 	if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) {
 		if (dq->dq_curinodes < dq->dq_isoftlimit) {
-			dq->dq_itime = time_second + ip->i_ump->um_itime[type];
+			dq->dq_itime = time_second + ITOUMP(ip)->um_itime[type];
 			if (ip->i_uid == cred->cr_uid)
 				*warn = 1;
 			return (0);
@@ -427,7 +427,7 @@
 				DQI_UNLOCK(dq);
 				uprintf("\n%s: write failed, %s "
 				    "inode quota exceeded for too long\n",
-				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    ITOVFS(ip)->mnt_stat.f_mntonname,
 				    quotatypes[type]);
 				return (EDQUOT);
 			}
@@ -446,10 +446,13 @@
 static void
 chkdquot(struct inode *ip)
 {
-	struct ufsmount *ump = ip->i_ump;
-	struct vnode *vp = ITOV(ip);
+	struct ufsmount *ump;
+	struct vnode *vp;
 	int i;
 
+	ump = ITOUMP(ip);
+	vp = ITOV(ip);
+
 	/*
 	 * Disk quotas must be turned off for system files.  Currently
 	 * these are snapshots and quota files.
@@ -470,7 +473,7 @@
 			continue;
 		if (ip->i_dquot[i] == NODQUOT) {
 			UFS_UNLOCK(ump);
-			vprint("chkdquot: missing dquot", ITOV(ip));
+			vn_printf(ITOV(ip), "chkdquot: missing dquot ");
 			panic("chkdquot: missing dquot");
 		}
 	}
@@ -708,6 +711,34 @@
 	return (error);
 }
 
+static int
+quotaoff_inchange1(struct thread *td, struct mount *mp, int type)
+{
+	int error;
+	bool need_resume;
+
+	/*
+	 * mp is already suspended on unmount.  If not, suspend it, to
+	 * avoid the situation where the quotaoff operation eventually
+	 * fails because SU structures still keep references on dquots
+	 * while the vnodes' references are already cleaned up.  That
+	 * would cause a quota accounting leak and trigger asserts.
+	 * Note that the thread has already called vn_start_write().
+	 */
+	if (mp->mnt_susp_owner == td) {
+		need_resume = false;
+	} else {
+		error = vfs_write_suspend_umnt(mp);
+		if (error != 0)
+			return (error);
+		need_resume = true;
+	}
+	error = quotaoff1(td, mp, type);
+	if (need_resume)
+		vfs_write_resume(mp, VR_START_WRITE);
+	return (error);
+}
+
 /*
  * Turns off quotas, assumes that ump->um_qflags are already checked
  * and QTF_CLOSING is set to indicate operation in progress. Fixes
@@ -717,10 +748,9 @@
 quotaoff_inchange(struct thread *td, struct mount *mp, int type)
 {
 	struct ufsmount *ump;
-	int i;
-	int error;
+	int error, i;
 
-	error = quotaoff1(td, mp, type);
+	error = quotaoff_inchange1(td, mp, type);
 
 	ump = VFSTOUFS(mp);
 	UFS_LOCK(ump);
@@ -1040,11 +1070,9 @@
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
-	UFS_LOCK(ump);
 	for (i = 0; i < MAXQUOTAS; i++)
 		if (ump->um_quotas[i] != NULLVP)
 			break;
-	UFS_UNLOCK(ump);
 	if (i == MAXQUOTAS)
 		return (0);
 	/*
@@ -1089,11 +1117,9 @@
 	 * Check if the mount point has any quotas.
 	 * If not, simply return.
 	 */
-	UFS_LOCK(ump);
 	for (i = 0; i < MAXQUOTAS; i++)
 		if (ump->um_quotas[i] != NULLVP)
 			break;
-	UFS_UNLOCK(ump);
 	if (i == MAXQUOTAS)
 		return (0);
 	/*

Modified: trunk/sys/ufs/ufs/ufs_vfsops.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_vfsops.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_vfsops.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_vfsops.c 278150 2015-02-03 11:54:33Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_vfsops.c 338943 2018-09-26 14:26:29Z kib $");
 
 #include "opt_quota.h"
 #include "opt_ufs.h"
@@ -93,7 +93,8 @@
 	void *arg;
 {
 #ifndef QUOTA
-	if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON)
+	if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON ||
+	    (cmds >> SUBCMDSHIFT) == Q_QUOTAOFF)
 		vfs_unbusy(mp);
 
 	return (EOPNOTSUPP);
@@ -116,13 +117,13 @@
 			break;
 
 		default:
-			if (cmd == Q_QUOTAON)
+			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
 				vfs_unbusy(mp);
 			return (EINVAL);
 		}
 	}
 	if ((u_int)type >= MAXQUOTAS) {
-		if (cmd == Q_QUOTAON)
+		if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
 			vfs_unbusy(mp);
 		return (EINVAL);
 	}
@@ -133,7 +134,11 @@
 		break;
 
 	case Q_QUOTAOFF:
+		vfs_ref(mp);
+		vfs_unbusy(mp);
+		vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 		error = quotaoff(td, mp, type);
+		vn_finished_write(mp);
 		break;
 
 	case Q_SETQUOTA32:

Modified: trunk/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- trunk/sys/ufs/ufs/ufs_vnops.c	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufs_vnops.c	2020-02-08 19:39:08 UTC (rev 12316)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/ufs/ufs/ufs_vnops.c 332750 2018-04-19 02:50:15Z pfg $");
+__FBSDID("$FreeBSD: stable/11/sys/ufs/ufs/ufs_vnops.c 346032 2019-04-08 15:52:13Z sjg $");
 
 #include "opt_quota.h"
 #include "opt_suiddir.h"
@@ -123,7 +123,6 @@
 static vop_whiteout_t	ufs_whiteout;
 static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
-static vop_pathconf_t	ufsfifo_pathconf;
 
 SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
 
@@ -325,9 +324,6 @@
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 	int error;
-#ifdef QUOTA
-	int relocked;
-#endif
 #ifdef UFS_ACL
 	struct acl *acl;
 	acl_type_t type;
@@ -350,32 +346,14 @@
 			 * Inode is accounted in the quotas only if struct
 			 * dquot is attached to it. VOP_ACCESS() is called
 			 * from vn_open_cred() and provides a convenient
-			 * point to call getinoquota().
+			 * point to call getinoquota().  The lock mode is
+			 * exclusive when the file is opening for write.
 			 */
-			if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
-
-				/*
-				 * Upgrade vnode lock, since getinoquota()
-				 * requires exclusive lock to modify inode.
-				 */
-				relocked = 1;
-				vhold(vp);
-				vn_lock(vp, LK_UPGRADE | LK_RETRY);
-				VI_LOCK(vp);
-				if (vp->v_iflag & VI_DOOMED) {
-					vdropl(vp);
-					error = ENOENT;
-					goto relock;
-				}
-				vdropl(vp);
-			} else
-				relocked = 0;
-			error = getinoquota(ip);
-relock:
-			if (relocked)
-				vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
-			if (error != 0)
-				return (error);
+			if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
+				error = getinoquota(ip);
+				if (error != 0)
+					return (error);
+			}
 #endif
 			break;
 		default:
@@ -385,8 +363,7 @@
 
 	/*
 	 * If immutable bit set, nobody gets to write it.  "& ~VADMIN_PERMS"
-	 * is here, because without it, * it would be impossible for the owner
-	 * to remove the IMMUTABLE flag.
+	 * permits the owner of the file to remove the IMMUTABLE flag.
 	 */
 	if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) &&
 	    (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
@@ -458,7 +435,7 @@
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
-	if (ip->i_ump->um_fstype == UFS1) {
+	if (I_IS_UFS1(ip)) {
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 	} else {
@@ -469,13 +446,13 @@
 	/*
 	 * Copy from inode table
 	 */
-	vap->va_fsid = dev2udev(ip->i_dev);
+	vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
-	if (ip->i_ump->um_fstype == UFS1) {
+	if (I_IS_UFS1(ip)) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
@@ -653,8 +630,7 @@
 			DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec);
 			DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec);
 		}
-		if (vap->va_birthtime.tv_sec != VNOVAL &&
-		    ip->i_ump->um_fstype == UFS2) {
+		if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) {
 			ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec;
 		}
@@ -951,8 +927,8 @@
 	struct inode *dip;
 
 	dip = VTOI(dvp);
-	uprintf("%s: Bad link count %d on parent inode %d in file system %s\n",
-	    funcname, dip->i_effnlink, dip->i_number,
+	uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n",
+	    funcname, dip->i_effnlink, (intmax_t)dip->i_number,
 	    dvp->v_mount->mnt_stat.f_mntonname);
 }
 
@@ -1362,7 +1338,7 @@
 	 *    expunge the original entry's existence.
 	 */
 	if (tip == NULL) {
-		if (tdp->i_dev != fip->i_dev)
+		if (ITODEV(tdp) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		if (doingdirectory && newparent) {
 			/*
@@ -1386,7 +1362,7 @@
 		    tdp->i_endoff < tdp->i_size)
 			endoff = tdp->i_endoff;
 	} else {
-		if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev)
+		if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
@@ -1547,8 +1523,9 @@
 		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC,
 		    tcnp->cn_cred);
 		if (error != 0)
-			vn_printf(tdvp, "ufs_rename: failed to truncate "
-			    "err %d", error);
+			vn_printf(tdvp,
+			    "ufs_rename: failed to truncate, error %d\n",
+			    error);
 #ifdef UFS_DIRHASH
 		else if (tdp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(tdp, endoff);
@@ -2240,7 +2217,7 @@
 			dstdp.d_fileno = dp->d_ino;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
-			dstdp.d_name[dstdp.d_namlen] = '\0';
+			dirent_terminate(&dstdp);
 			if (dstdp.d_reclen > uio->uio_resid) {
 				if (uio->uio_resid == startresid)
 					error = EINVAL;
@@ -2323,12 +2300,9 @@
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
-	struct bufobj *bo;
-	struct inode *ip;
 	ufs2_daddr_t blkno;
 	int error;
 
-	ip = VTOI(vp);
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL);
 		bp->b_blkno = blkno;
@@ -2346,8 +2320,7 @@
 		return (0);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
-	bo = ip->i_umbufobj;
-	BO_STRATEGY(bo, bp);
+	BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp);
 	return (0);
 }
 
@@ -2364,7 +2337,7 @@
 	struct inode *ip = VTOI(vp);
 
 	printf("\tino %lu, on dev %s", (u_long)ip->i_number,
-	    devtoname(ip->i_dev));
+	    devtoname(ITODEV(ip)));
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
@@ -2414,30 +2387,6 @@
 }
 
 /*
- * Return POSIX pathconf information applicable to fifos.
- */
-static int
-ufsfifo_pathconf(ap)
-	struct vop_pathconf_args /* {
-		struct vnode *a_vp;
-		int a_name;
-		int *a_retval;
-	} */ *ap;
-{
-
-	switch (ap->a_name) {
-	case _PC_ACL_EXTENDED:
-	case _PC_ACL_NFS4:
-	case _PC_ACL_PATH_MAX:
-	case _PC_MAC_PRESENT:
-		return (ufs_pathconf(ap));
-	default:
-		return (fifo_specops.vop_pathconf(ap));
-	}
-	/* NOTREACHED */
-}
-
-/*
  * Return POSIX pathconf information applicable to ufs filesystems.
  */
 static int
@@ -2452,17 +2401,14 @@
 
 	error = 0;
 	switch (ap->a_name) {
-	case _PC_LINK_MAX:
-		*ap->a_retval = LINK_MAX;
-		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
-	case _PC_PATH_MAX:
-		*ap->a_retval = PATH_MAX;
-		break;
 	case _PC_PIPE_BUF:
-		*ap->a_retval = PIPE_BUF;
+		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
+			*ap->a_retval = PIPE_BUF;
+		else
+			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
@@ -2470,28 +2416,20 @@
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
+#ifdef UFS_ACL
 	case _PC_ACL_EXTENDED:
-#ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
-#else
-		*ap->a_retval = 0;
-#endif
 		break;
-
 	case _PC_ACL_NFS4:
-#ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
-#else
-		*ap->a_retval = 0;
+		break;
 #endif
-		break;
-
 	case _PC_ACL_PATH_MAX:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS))
@@ -2502,24 +2440,17 @@
 		*ap->a_retval = 3;
 #endif
 		break;
+#ifdef MAC
 	case _PC_MAC_PRESENT:
-#ifdef MAC
 		if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
-#else
-		*ap->a_retval = 0;
+		break;
 #endif
-		break;
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
-	case _PC_ASYNC_IO:
-		/* _PC_ASYNC_IO should have been handled by upper layers. */
-		KASSERT(0, ("_PC_ASYNC_IO should not get here"));
-		error = EINVAL;
-		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
@@ -2549,7 +2480,7 @@
 		break;
 
 	default:
-		error = EINVAL;
+		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
@@ -2571,6 +2502,11 @@
 	vp = *vpp;
 	ip = VTOI(vp);
 	vp->v_type = IFTOVT(ip->i_mode);
+	/*
+	 * Only unallocated inodes should be of type VNON.
+	 */
+	if (ip->i_mode != 0 && vp->v_type == VNON)
+		return (EINVAL);
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 	ASSERT_VOP_LOCKED(vp, "ufs_vinit");
@@ -2822,7 +2758,7 @@
 	.vop_inactive =		ufs_inactive,
 	.vop_kqfilter =		ufsfifo_kqfilter,
 	.vop_markatime =	ufs_markatime,
-	.vop_pathconf = 	ufsfifo_pathconf,
+	.vop_pathconf = 	ufs_pathconf,
 	.vop_print =		ufs_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ufs_reclaim,
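
The ufs_pathconf() rework above narrows _PC_PIPE_BUF to directories and
FIFOs (regular files now get EINVAL) and hands any name it does not
recognize to vop_stdpathconf().  A minimal userland sketch of the
interface this change affects; the directory path is chosen only for
illustration:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long v;

	/* Query _PC_PIPE_BUF on a directory; a regular file would fail. */
	errno = 0;
	v = pathconf("/tmp", _PC_PIPE_BUF);	/* illustrative path */
	if (v == -1 && errno != 0)
		perror("pathconf");
	else
		printf("_PC_PIPE_BUF = %ld\n", v);
	return (0);
}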

Modified: trunk/sys/ufs/ufs/ufsmount.h
===================================================================
--- trunk/sys/ufs/ufs/ufsmount.h	2020-02-08 19:38:54 UTC (rev 12315)
+++ trunk/sys/ufs/ufs/ufsmount.h	2020-02-08 19:39:08 UTC (rev 12316)
@@ -28,14 +28,12 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufsmount.h	8.6 (Berkeley) 3/30/95
- * $FreeBSD: stable/10/sys/ufs/ufs/ufsmount.h 297787 2016-04-10 16:32:21Z kib $
+ * $FreeBSD: stable/11/sys/ufs/ufs/ufsmount.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _UFS_UFS_UFSMOUNT_H_
 #define	_UFS_UFS_UFSMOUNT_H_
 
-#include <sys/buf.h>	/* XXX For struct workhead. */
-
 /*
  * Arguments to mount UFS-based filesystems
  */
@@ -111,8 +109,8 @@
 #define	UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd)
 #define	UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc)
 #define	UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb))
-#define	UFS_RDONLY(aa) ((aa)->i_ump->um_rdonly(aa))
-#define	UFS_SNAPGONE(aa) ((aa)->i_ump->um_snapgone(aa))
+#define	UFS_RDONLY(aa) (ITOUMP(aa)->um_rdonly(aa))
+#define	UFS_SNAPGONE(aa) (ITOUMP(aa)->um_snapgone(aa))
 
 #define	UFS_LOCK(aa)	mtx_lock(&(aa)->um_lock)
 #define	UFS_UNLOCK(aa)	mtx_unlock(&(aa)->um_lock)


From laffer1 at midnightbsd.org  Sat Feb  8 14:40:32 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:40:32 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12317] trunk/sys/tools/embed_mfs.sh: sync
 with FreeBSD 11-stable
Message-ID: <202002081940.018JeW9h063726@stargazer.midnightbsd.org>

Revision: 12317
          http://svnweb.midnightbsd.org/src/?rev=12317
Author:   laffer1
Date:     2020-02-08 14:40:31 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/tools/embed_mfs.sh

Modified: trunk/sys/tools/embed_mfs.sh
===================================================================
--- trunk/sys/tools/embed_mfs.sh	2020-02-08 19:39:08 UTC (rev 12316)
+++ trunk/sys/tools/embed_mfs.sh	2020-02-08 19:40:31 UTC (rev 12317)
@@ -23,18 +23,62 @@
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
-# $FreeBSD: stable/10/sys/tools/embed_mfs.sh 175984 2008-02-05 10:46:30Z raj $ 
+# $FreeBSD: stable/11/sys/tools/embed_mfs.sh 331691 2018-03-28 17:19:04Z emaste $ 
 # $MidnightBSD$
 #
-# Embed the MFS image into the kernel body (expects space reserved via 
-# MD_ROOT_SIZE)
+# Embed an MFS image into the kernel body or the loader body (expects space
+# reserved via MD_ROOT_SIZE (kernel) or MD_IMAGE_SIZE (loader))
 #
-# $1: kernel filename
+# $1: kernel or loader filename
 # $2: MFS image filename
 #
 
-obs=`strings -at d $1 | grep "MFS Filesystem goes here" | awk '{print $1}'`
-dd if=$2 ibs=8192 of=$1 obs=${obs} oseek=1 conv=notrunc 2> /dev/null
+if [ $# -ne 2 ]; then
+	echo "usage: $(basename $0) target mfs_image"
+	exit 0
+fi
+if [ ! -w "$1" ]; then
+	echo $1 not writable
+	exit 1
+fi
 
-strings $1 | grep 'MFS Filesystem had better STOP here' > /dev/null || \
-	(rm $1 && echo "MFS image too large" && false)
+mfs_size=`stat -f '%z' $2 2> /dev/null`
+# If we can't determine MFS image size - bail.
+[ -z ${mfs_size} ] && echo "Can't determine MFS image size" && exit 1
+
+err_no_mfs="Can't locate mfs section within "
+
+if file -b $1 | grep -q '^ELF ..-bit .SB executable'; then
+
+	sec_info=`elfdump -c $1 2> /dev/null | grep -A 5 -E "sh_name: oldmfs$"`
+	# If we can't find the mfs section within the given kernel - bail.
+	[ -z "${sec_info}" ] && echo "${err_no_mfs} $1" && exit 1
+
+	sec_size=`echo "${sec_info}" | awk '/sh_size/ {print $2}' 2>/dev/null`
+	sec_start=`echo "${sec_info}" | \
+	    awk '/sh_offset/ {print $2}' 2>/dev/null`
+
+else
+
+	# Try to find the start byte of the MFS start flag, otherwise bail.
+	sec_start=`strings -at d $1 | grep "MFS Filesystem goes here"` || \
+	    { echo "${err_no_mfs} $1"; exit 1; }
+	sec_start=`echo ${sec_start} | awk '{print $1}'`
+
+	# Try to find the start byte of the MFS end flag, otherwise bail.
+	sec_end=`strings -at d $1 | \
+	    grep "MFS Filesystem had better STOP here"` || \
+	    { echo "${err_no_mfs} $1"; exit 1; }
+	sec_end=`echo ${sec_end} | awk '{print $1}'`
+
+	#calculate MFS section size
+	sec_size=`expr ${sec_end} - ${sec_start}`
+
+fi
+
+# If the mfs section size is smaller than the mfs image - bail.
+[ ${sec_size} -lt ${mfs_size} ] && echo "MFS image too large" && exit 1
+
+# Dump the mfs image into the mfs section
+dd if=$2 ibs=8192 of=$1 obs=${sec_start} oseek=1 conv=notrunc 2> /dev/null && \
+    echo "MFS image embedded into $1" && exit 0


From laffer1 at midnightbsd.org  Sat Feb  8 14:41:46 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:41:46 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12318] trunk/sys/tools/fw_stub.awk: sync
 with FreeBSD 11-stable
Message-ID: <202002081941.018JfkIk063810@stargazer.midnightbsd.org>

Revision: 12318
          http://svnweb.midnightbsd.org/src/?rev=12318
Author:   laffer1
Date:     2020-02-08 14:41:45 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/tools/fw_stub.awk

Property Changed:
----------------
    trunk/sys/tools/fw_stub.awk

Modified: trunk/sys/tools/fw_stub.awk
===================================================================
--- trunk/sys/tools/fw_stub.awk	2020-02-08 19:40:31 UTC (rev 12317)
+++ trunk/sys/tools/fw_stub.awk	2020-02-08 19:41:45 UTC (rev 12318)
@@ -25,8 +25,8 @@
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
-# $FreeBSD: src/sys/tools/fw_stub.awk,v 1.6.2.1 2009/11/02 09:47:41 fjoe Exp $
-# $MidnightBSD: src/sys/tools/fw_stub.awk,v 1.3 2012/01/11 04:11:27 laffer1 Exp $
+# $FreeBSD: stable/11/sys/tools/fw_stub.awk 289399 2015-10-16 00:38:05Z bdrewery $
+# $MidnightBSD$
 
 #
 # Script to generate module .c file from a list of firmware images
@@ -157,7 +157,7 @@
 		printc("\
 		TUNABLE_LONG_FETCH(\"legal." opt_l ".license_ack\", &" opt_l "_license_ack);\
 		if (!" opt_l "_license_ack) {\
-			printf(\"" opt_m ": You need to read the LICENSE file in /usr/share/doc/legal/" opt_l "/.\\n\");\
+			printf(\"" opt_m ": You need to read the LICENSE file in /usr/share/doc/legal/" opt_l ".LICENSE.\\n\");\
 			printf(\"" opt_m ": If you agree with the license, set legal." opt_l ".license_ack=1 in /boot/loader.conf.\\n\");\
 			return(EPERM);\
 		}\n");


Property changes on: trunk/sys/tools/fw_stub.awk
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property

From laffer1 at midnightbsd.org  Sat Feb  8 14:43:52 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:43:52 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12319] trunk/sys/tools/fdt/make_dtb.sh: sync
 with FreeBSD 11-stable
Message-ID: <202002081943.018JhqsJ063931@stargazer.midnightbsd.org>

Revision: 12319
          http://svnweb.midnightbsd.org/src/?rev=12319
Author:   laffer1
Date:     2020-02-08 14:43:52 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/tools/fdt/make_dtb.sh

Modified: trunk/sys/tools/fdt/make_dtb.sh
===================================================================
--- trunk/sys/tools/fdt/make_dtb.sh	2020-02-08 19:41:45 UTC (rev 12318)
+++ trunk/sys/tools/fdt/make_dtb.sh	2020-02-08 19:43:52 UTC (rev 12319)
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-# $FreeBSD: stable/10/sys/tools/fdt/make_dtb.sh 273715 2014-10-27 00:47:55Z ian $
+# $FreeBSD: stable/11/sys/tools/fdt/make_dtb.sh 318196 2017-05-11 20:30:44Z gonzo $
 # $MidnightBSD$
 
 # Script generates dtb file ($3) from dts source ($2) in build tree S ($1)
@@ -21,5 +21,5 @@
     dtb=${dtb_path}/`basename $d .dts`.dtb
     echo "converting $d -> $dtb"
     cpp -P -x assembler-with-cpp -I $S/gnu/dts/include -I $S/boot/fdt/dts/${MACHINE} -I $S/gnu/dts/${MACHINE} -include $d /dev/null | 
-	dtc -O dtb -o $dtb -b 0 -p 1024 -i $S/boot/fdt/dts/${MACHINE} -i $S/gnu/dts/${MACHINE}
+	dtc -@ -O dtb -o $dtb -b 0 -p 1024 -i $S/boot/fdt/dts/${MACHINE} -i $S/gnu/dts/${MACHINE}
 done


From laffer1 at midnightbsd.org  Sat Feb  8 14:46:23 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:46:23 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12320] trunk/sys/teken: sync with FreeBSD
 11-stable
Message-ID: <202002081946.018JkNAu064732@stargazer.midnightbsd.org>

Revision: 12320
          http://svnweb.midnightbsd.org/src/?rev=12320
Author:   laffer1
Date:     2020-02-08 14:46:22 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/teken/demo/Makefile
    trunk/sys/teken/demo/teken_demo.c
    trunk/sys/teken/libteken/Makefile
    trunk/sys/teken/libteken/teken.3
    trunk/sys/teken/sequences
    trunk/sys/teken/stress/Makefile
    trunk/sys/teken/stress/teken_stress.c
    trunk/sys/teken/teken.c
    trunk/sys/teken/teken.h
    trunk/sys/teken/teken_scs.h
    trunk/sys/teken/teken_subr.h
    trunk/sys/teken/teken_subr_compat.h

Modified: trunk/sys/teken/demo/Makefile
===================================================================
--- trunk/sys/teken/demo/Makefile	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/demo/Makefile	2020-02-08 19:46:22 UTC (rev 12320)
@@ -1,5 +1,5 @@
 # $MidnightBSD$
-# $FreeBSD: stable/10/sys/teken/demo/Makefile 226341 2011-10-13 14:20:27Z ed $
+# $FreeBSD: stable/11/sys/teken/demo/Makefile 226341 2011-10-13 14:20:27Z ed $
 
 PROG=	teken_demo
 LDADD=	-lncursesw -lteken -lutil

Modified: trunk/sys/teken/demo/teken_demo.c
===================================================================
--- trunk/sys/teken/demo/teken_demo.c	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/demo/teken_demo.c	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/demo/teken_demo.c 262861 2014-03-06 18:30:56Z jhb $
+ * $FreeBSD: stable/11/sys/teken/demo/teken_demo.c 286797 2015-08-15 08:29:13Z ed $
  */
 
 #include <sys/ioctl.h>
@@ -38,7 +38,7 @@
 #include <unistd.h>
 
 #include <ncurses.h>
-#if defined(__MidnightBSD__)
+#if defined(__FreeBSD__)
 #include <libutil.h>
 #elif defined(__linux__)
 #include <pty.h>
@@ -73,7 +73,7 @@
 
 #define NCOLS	80
 #define NROWS	24
-struct pixel buffer[NCOLS][NROWS];
+static struct pixel buffer[NCOLS][NROWS];
 
 static int ptfd;
 

Modified: trunk/sys/teken/libteken/Makefile
===================================================================
--- trunk/sys/teken/libteken/Makefile	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/libteken/Makefile	2020-02-08 19:46:22 UTC (rev 12320)
@@ -1,5 +1,5 @@
 # $MidnightBSD$
-# $FreeBSD: stable/10/sys/teken/libteken/Makefile 221698 2011-05-09 16:27:39Z ed $
+# $FreeBSD: stable/11/sys/teken/libteken/Makefile 221698 2011-05-09 16:27:39Z ed $
 
 LIB=	teken
 SHLIB_MAJOR= 0

Modified: trunk/sys/teken/libteken/teken.3
===================================================================
--- trunk/sys/teken/libteken/teken.3	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/libteken/teken.3	2020-02-08 19:46:22 UTC (rev 12320)
@@ -23,9 +23,9 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: stable/10/sys/teken/libteken/teken.3 223574 2011-06-26 18:25:10Z ed $
+.\" $FreeBSD: stable/11/sys/teken/libteken/teken.3 330916 2018-03-14 07:47:26Z eadler $
 .\"
-.Dd May 9, 2011
+.Dd Mar 13, 2017
 .Dt TEKEN 3
 .Os
 .Sh NAME
@@ -58,6 +58,8 @@
 .Ft const char *
 .Fn teken_get_sequence "teken_t *t" "unsigned int id"
 .Ft teken_color_t
+.Fn teken_256to16 "teken_color_t color"
+.Ft teken_color_t
 .Fn teken_256to8 "teken_color_t color"
 .Ft void
 .Fn teken_get_defattr_cons25 "teken_t *t" "int *fg" "int *bg"
@@ -164,10 +166,22 @@
 any modern applications.
 .Pp
 The
+.Fn teken_256to16
+function converts an xterm-256 256-color code to an xterm 16-color code
+whose color with default palettes is as similar as possible (not very
+similar).
+The lower 3 bits of the result are the ANSI color and the next lowest
+bit is brightness.
+Other layers (hardare and software) that only support 16 colors can use
+this to avoid knowing the details of 256-color codes.
+.Pp
+The
 .Fn teken_256to8
-function converts a color code to one of the 8 primary colors, allowing
-the terminal to be rendered on graphics hardware that only supports 8 or
-16 colors (e.g. VGA).
+function is similar to
+.Fn teken_256to16
+except it converts to an ANSI 8-color code.
+This is more accurate than discarding the brightness bit in the result of
+.Fn teken_256to16 .
 .Pp
 The
 .Fn teken_get_defattr_cons25
@@ -189,7 +203,7 @@
 .Sh SEE ALSO
 .Xr ncurses 3 ,
 .Xr termcap 3 ,
-.Xr syscons 4 .
+.Xr syscons 4
 .Sh HISTORY
 The
 .Nm
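
The teken_256to16() entry documented above, together with the reworked
teken_256to8() text, describes a pure conversion API.  A minimal sketch of
calling both, assuming the header installs as <teken.h> and the program is
linked against libteken (-lteken); the input value 196 is just an
illustrative xterm-256 code:

#include <stdio.h>
#include <teken.h>

int
main(void)
{
	teken_color_t c = 196;	/* arbitrary xterm-256 color index */

	/* 16-color result: ANSI color in the low 3 bits, TC_LIGHT in bit 3. */
	printf("256 -> 16: %u\n", (unsigned int)teken_256to16(c));
	/* 8-color result: one of the plain ANSI colors. */
	printf("256 -> 8:  %u\n", (unsigned int)teken_256to8(c));
	return (0);
}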

Modified: trunk/sys/teken/sequences
===================================================================
--- trunk/sys/teken/sequences	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/sequences	2020-02-08 19:46:22 UTC (rev 12320)
@@ -23,7 +23,7 @@
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
-# $FreeBSD: stable/10/sys/teken/sequences 214817 2010-11-05 00:56:21Z ed $
+# $FreeBSD: stable/11/sys/teken/sequences 214817 2010-11-05 00:56:21Z ed $
 # $MidnightBSD$
 
 # File format is as follows:

Modified: trunk/sys/teken/stress/Makefile
===================================================================
--- trunk/sys/teken/stress/Makefile	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/stress/Makefile	2020-02-08 19:46:22 UTC (rev 12320)
@@ -1,5 +1,5 @@
 # $MidnightBSD$
-# $FreeBSD: stable/10/sys/teken/stress/Makefile 221698 2011-05-09 16:27:39Z ed $
+# $FreeBSD: stable/11/sys/teken/stress/Makefile 221698 2011-05-09 16:27:39Z ed $
 
 PROG=	teken_stress
 LDADD=	-lteken

Modified: trunk/sys/teken/stress/teken_stress.c
===================================================================
--- trunk/sys/teken/stress/teken_stress.c	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/stress/teken_stress.c	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/stress/teken_stress.c 226100 2011-10-07 12:42:03Z ed $
+ * $FreeBSD: stable/11/sys/teken/stress/teken_stress.c 226100 2011-10-07 12:42:03Z ed $
  */
 
 #include <sys/cdefs.h>

Modified: trunk/sys/teken/teken.c
===================================================================
--- trunk/sys/teken/teken.c	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/teken.c	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,17 +24,17 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/teken.c 287776 2015-09-14 09:12:28Z ed $
+ * $FreeBSD: stable/11/sys/teken/teken.c 330916 2018-03-14 07:47:26Z eadler $
  */
 
 #include <sys/cdefs.h>
-#if defined(__MidnightBSD__) && defined(_KERNEL)
+#if defined(__FreeBSD__) && defined(_KERNEL)
 #include <sys/param.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/systm.h>
 #define	teken_assert(x)		MPASS(x)
-#else /* !(__MidnightBSD__ && _KERNEL) */
+#else /* !(__FreeBSD__ && _KERNEL) */
 #include <sys/types.h>
 #include <assert.h>
 #include <limits.h>
@@ -42,7 +42,7 @@
 #include <stdio.h>
 #include <string.h>
 #define	teken_assert(x)		assert(x)
-#endif /* __MidnightBSD__ && _KERNEL */
+#endif /* __FreeBSD__ && _KERNEL */
 
 /* debug messages */
 #define	teken_printf(x,...)
@@ -453,55 +453,203 @@
 	return (0);
 }
 
+#define	k	TC_BLACK
+#define	b	TC_BLUE
+#define	y	TC_BROWN
+#define	c	TC_CYAN
+#define	g	TC_GREEN
+#define	m	TC_MAGENTA
+#define	r	TC_RED
+#define	w	TC_WHITE
+#define	K	(TC_BLACK | TC_LIGHT)
+#define	B	(TC_BLUE | TC_LIGHT)
+#define	Y	(TC_BROWN | TC_LIGHT)
+#define	C	(TC_CYAN | TC_LIGHT)
+#define	G	(TC_GREEN | TC_LIGHT)
+#define	M	(TC_MAGENTA | TC_LIGHT)
+#define	R	(TC_RED | TC_LIGHT)
+#define	W	(TC_WHITE | TC_LIGHT)
+
+/**
+ * The xterm-256 color map has steps of 0x28 (in the range 0-0xff), except
+ * for the first step which is 0x5f.  Scale to the range 0-6 by dividing
+ * by 0x28 and rounding down.  The range of 0-5 cannot represent the
+ * larger first step.
+ *
+ * This table is generated by the following rules:
+ * - if all components are equal, the result is black for (0, 0, 0) and
+ *   (2, 2, 2), else white; otherwise:
+ * - subtract the smallest component from all components
+ * - if this gives only one nonzero component, then that is the color
+ * - else if one component is 2 or more larger than the other nonzero one,
+ *   then that component gives the color
+ * - else there are 2 nonzero components.  The color is that of a small
+ *   equal mixture of these components (cyan, yellow or magenta).  E.g.,
+ *   (0, 5, 6) (Turquoise2) is a much purer cyan than (0, 2, 3)
+ *   (DeepSkyBlue4), but we map both to cyan since we can't represent
+ *   delicate shades of either blue or cyan and blue would be worse.
+ *   Here it is important that components of 1 never occur.  Blue would
+ *   be twice as large as green in (0, 1, 2).
+ */
+static const teken_color_t teken_256to8tab[] = {
+	/* xterm normal colors: */
+	k, r, g, y, b, m, c, w,
+
+	/* xterm bright colors: */
+	k, r, g, y, b, m, c, w,
+
+	/* Red0 submap. */
+	k, b, b, b, b, b,
+	g, c, c, b, b, b,
+	g, c, c, c, b, b,
+	g, g, c, c, c, b,
+	g, g, g, c, c, c,
+	g, g, g, g, c, c,
+
+	/* Red2 submap. */
+	r, m, m, b, b, b,
+	y, k, b, b, b, b,
+	y, g, c, c, b, b,
+	g, g, c, c, c, b,
+	g, g, g, c, c, c,
+	g, g, g, g, c, c,
+
+	/* Red3 submap. */
+	r, m, m, m, b, b,
+	y, r, m, m, b, b,
+	y, y, w, b, b, b,
+	y, y, g, c, c, b,
+	g, g, g, c, c, c,
+	g, g, g, g, c, c,
+
+	/* Red4 submap. */
+	r, r, m, m, m, b,
+	r, r, m, m, m, b,
+	y, y, r, m, m, b,
+	y, y, y, w, b, b,
+	y, y, y, g, c, c,
+	g, g, g, g, c, c,
+
+	/* Red5 submap. */
+	r, r, r, m, m, m,
+	r, r, r, m, m, m,
+	r, r, r, m, m, m,
+	y, y, y, r, m, m,
+	y, y, y, y, w, b,
+	y, y, y, y, g, c,
+
+	/* Red6 submap. */
+	r, r, r, r, m, m,
+	r, r, r, r, m, m,
+	r, r, r, r, m, m,
+	r, r, r, r, m, m,
+	y, y, y, y, r, m,
+	y, y, y, y, y, w,
+
+	/* Grey submap. */
+	k, k, k, k, k, k,
+	k, k, k, k, k, k,
+	w, w, w, w, w, w,
+	w, w, w, w, w, w,
+};
+
+/*
+ * This table is generated from the previous one by setting TC_LIGHT for
+ * entries whose luminosity in the xterm256 color map is 60% or larger.
+ * Thus the previous table is currently not really needed.  It will be
+ * used for different fine tuning of the tables.
+ */
+static const teken_color_t teken_256to16tab[] = {
+	/* xterm normal colors: */
+	k, r, g, y, b, m, c, w,
+
+	/* xterm bright colors: */
+	K, R, G, Y, B, M, C, W,
+
+	/* Red0 submap. */
+	k, b, b, b, b, b,
+	g, c, c, b, b, b,
+	g, c, c, c, b, b,
+	g, g, c, c, c, b,
+	g, g, g, c, c, c,
+	g, g, g, g, c, c,
+
+	/* Red2 submap. */
+	r, m, m, b, b, b,
+	y, K, b, b, B, B,
+	y, g, c, c, B, B,
+	g, g, c, c, C, B,
+	g, G, G, C, C, C,
+	g, G, G, G, C, C,
+
+	/* Red3 submap. */
+	r, m, m, m, b, b,
+	y, r, m, m, B, B,
+	y, y, w, B, B, B,
+	y, y, G, C, C, B,
+	g, G, G, C, C, C,
+	g, G, G, G, C, C,
+
+	/* Red4 submap. */
+	r, r, m, m, m, b,
+	r, r, m, m, M, B,
+	y, y, R, M, M, B,
+	y, y, Y, W, B, B,
+	y, Y, Y, G, C, C,
+	g, G, G, G, C, C,
+
+	/* Red5 submap. */
+	r, r, r, m, m, m,
+	r, R, R, M, M, M,
+	r, R, R, M, M, M,
+	y, Y, Y, R, M, M,
+	y, Y, Y, Y, W, B,
+	y, Y, Y, Y, G, C,
+
+	/* Red6 submap. */
+	r, r, r, r, m, m,
+	r, R, R, R, M, M,
+	r, R, R, R, M, M,
+	r, R, R, R, M, M,
+	y, Y, Y, Y, R, M,
+	y, Y, Y, Y, Y, W,
+
+	/* Grey submap. */
+	k, k, k, k, k, k,
+	K, K, K, K, K, K,
+	w, w, w, w, w, w,
+	W, W, W, W, W, W,
+};
+
+#undef	k
+#undef	b
+#undef	y
+#undef	c
+#undef	g
+#undef	m
+#undef	r
+#undef	w
+#undef	K
+#undef	B
+#undef	Y
+#undef	C
+#undef	G
+#undef	M
+#undef	R
+#undef	W
+
 teken_color_t
 teken_256to8(teken_color_t c)
 {
-	unsigned int r, g, b;
 
-	if (c < 16) {
-		/* Traditional color indices. */
-		return (c % 8);
-	} else if (c >= 244) {
-		/* Upper grayscale colors. */
-		return (TC_WHITE);
-	} else if (c >= 232) {
-		/* Lower grayscale colors. */
-		return (TC_BLACK);
-	}
+	return (teken_256to8tab[c % 256]);
+}
 
-	/* Convert to RGB. */
-	c -= 16;
-	b = c % 6;
-	g = (c / 6) % 6;
-	r = c / 36;
+teken_color_t
+teken_256to16(teken_color_t c)
+{
 
-	if (r < g) {
-		/* Possibly green. */
-		if (g < b)
-			return (TC_BLUE);
-		else if (g > b)
-			return (TC_GREEN);
-		else
-			return (TC_CYAN);
-	} else if (r > g) {
-		/* Possibly red. */
-		if (r < b)
-			return (TC_BLUE);
-		else if (r > b)
-			return (TC_RED);
-		else
-			return (TC_MAGENTA);
-	} else {
-		/* Possibly brown. */
-		if (g < b)
-			return (TC_BLUE);
-		else if (g > b)
-			return (TC_BROWN);
-		else if (r < 3)
-			return (TC_BLACK);
-		else
-			return (TC_WHITE);
-	}
+	return (teken_256to16tab[c % 256]);
 }
 
 static const char * const special_strings_cons25[] = {
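
The new lookup tables above are indexed directly by the 256-color code:
entries 0-15 are the classic and bright colors, 16-231 the 6x6x6 color
cube, and 232-255 the grey ramp.  A small sketch of the cube decomposition
the table comment describes, using the same arithmetic as the open-coded
version this commit removes; the value 196 is only an example input:

#include <stdio.h>

int
main(void)
{
	unsigned int c = 196;	/* illustrative xterm-256 color code */
	unsigned int r, g, b;

	if (c >= 16 && c <= 231) {
		/* Cube index = 16 + 36*r + 6*g + b, components 0..5. */
		b = (c - 16) % 6;
		g = ((c - 16) / 6) % 6;
		r = (c - 16) / 36;
		printf("cube coordinates: r=%u g=%u b=%u\n", r, g, b);
	}
	return (0);
}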

Modified: trunk/sys/teken/teken.h
===================================================================
--- trunk/sys/teken/teken.h	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/teken.h	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/teken.h 262861 2014-03-06 18:30:56Z jhb $
+ * $FreeBSD: stable/11/sys/teken/teken.h 330916 2018-03-14 07:47:26Z eadler $
  */
 
 #ifndef _TEKEN_H_
@@ -57,6 +57,7 @@
 #define	TC_CYAN		6
 #define	TC_WHITE	7
 #define	TC_NCOLORS	8
+#define	TC_LIGHT	8	/* ORed with the others. */
 
 typedef struct {
 	teken_unit_t	tp_row;
@@ -204,6 +205,7 @@
 void	teken_set_cons25(teken_t *);
 
 /* Color conversion. */
+teken_color_t teken_256to16(teken_color_t);
 teken_color_t teken_256to8(teken_color_t);
 
 #endif /* !_TEKEN_H_ */

Modified: trunk/sys/teken/teken_scs.h
===================================================================
--- trunk/sys/teken/teken_scs.h	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/teken_scs.h	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/teken_scs.h 203659 2010-02-08 09:16:59Z ed $
+ * $FreeBSD: stable/11/sys/teken/teken_scs.h 203659 2010-02-08 09:16:59Z ed $
  */
 
 static inline teken_char_t

Modified: trunk/sys/teken/teken_subr.h
===================================================================
--- trunk/sys/teken/teken_subr.h	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/teken_subr.h	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/teken_subr.h 287776 2015-09-14 09:12:28Z ed $
+ * $FreeBSD: stable/11/sys/teken/teken_subr.h 287098 2015-08-24 07:49:27Z ed $
  */
 
 static void teken_subr_cursor_up(teken_t *, unsigned int);

Modified: trunk/sys/teken/teken_subr_compat.h
===================================================================
--- trunk/sys/teken/teken_subr_compat.h	2020-02-08 19:43:52 UTC (rev 12319)
+++ trunk/sys/teken/teken_subr_compat.h	2020-02-08 19:46:22 UTC (rev 12320)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/teken/teken_subr_compat.h 214817 2010-11-05 00:56:21Z ed $
+ * $FreeBSD: stable/11/sys/teken/teken_subr_compat.h 330916 2018-03-14 07:47:26Z eadler $
  */
 
 static void
@@ -41,8 +41,8 @@
 teken_subr_cons25_set_adapter_background(teken_t *t, unsigned int c)
 {
 
-	t->t_defattr.ta_bgcolor = cons25_colors[c % 8];
-	t->t_curattr.ta_bgcolor = cons25_colors[c % 8];
+	t->t_defattr.ta_bgcolor = cons25_colors[c % 8] | (c & 8);
+	t->t_curattr.ta_bgcolor = cons25_colors[c % 8] | (c & 8);
 }
 
 static void
@@ -49,15 +49,8 @@
 teken_subr_cons25_set_adapter_foreground(teken_t *t, unsigned int c)
 {
 
-	t->t_defattr.ta_fgcolor = cons25_colors[c % 8];
-	t->t_curattr.ta_fgcolor = cons25_colors[c % 8];
-	if (c >= 8) {
-		t->t_defattr.ta_format |= TF_BOLD;
-		t->t_curattr.ta_format |= TF_BOLD;
-	} else {
-		t->t_defattr.ta_format &= ~TF_BOLD;
-		t->t_curattr.ta_format &= ~TF_BOLD;
-	}
+	t->t_defattr.ta_fgcolor = cons25_colors[c % 8] | (c & 8);
+	t->t_curattr.ta_fgcolor = cons25_colors[c % 8] | (c & 8);
 }
 
 static const teken_color_t cons25_revcolors[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };


From laffer1 at midnightbsd.org  Sat Feb  8 14:47:27 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:47:27 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12321] trunk/sys/tests: sync with FreeBSD
 11-stable
Message-ID: <202002081947.018JlRIX064801@stargazer.midnightbsd.org>

Revision: 12321
          http://svnweb.midnightbsd.org/src/?rev=12321
Author:   laffer1
Date:     2020-02-08 14:47:26 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Added Paths:
-----------
    trunk/sys/tests/
    trunk/sys/tests/callout_test/
    trunk/sys/tests/callout_test/callout_test.c
    trunk/sys/tests/callout_test.h
    trunk/sys/tests/framework/
    trunk/sys/tests/framework/kern_testfrwk.c
    trunk/sys/tests/kern_testfrwk.h

Added: trunk/sys/tests/callout_test/callout_test.c
===================================================================
--- trunk/sys/tests/callout_test/callout_test.c	                        (rev 0)
+++ trunk/sys/tests/callout_test/callout_test.c	2020-02-08 19:47:26 UTC (rev 12321)
@@ -0,0 +1,284 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Netflix Inc. All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/tests/callout_test/callout_test.c 319168 2017-05-30 02:53:00Z ngie $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/cpuctl.h>
+#include <sys/fcntl.h>
+#include <sys/ioccom.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/pmckern.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <tests/kern_testfrwk.h>
+#include <tests/callout_test.h>
+#include <machine/cpu.h>
+
+MALLOC_DEFINE(M_CALLTMP, "Temp callout Memory", "CalloutTest");
+
+struct callout_run {
+	struct mtx lock;
+	struct callout *co_array;
+	int co_test;
+	int co_number_callouts;
+	int co_return_npa;
+	int co_completed;
+	int callout_waiting;
+	int drain_calls;
+	int cnt_zero;
+	int cnt_one;
+	int index;
+};
+
+static struct callout_run *comaster[MAXCPU];
+
+uint64_t callout_total = 0;
+
+static void execute_the_co_test(struct callout_run *rn);
+
+static void
+co_saydone(void *arg)
+{
+	struct callout_run *rn;
+
+	rn = (struct callout_run *)arg;
+	printf("The callout test is now complete for thread %d\n",
+	    rn->index);
+	printf("number_callouts:%d\n",
+	    rn->co_number_callouts);
+	printf("Callouts that bailed (Not PENDING or ACTIVE cleared):%d\n",
+	    rn->co_return_npa);
+	printf("Callouts that completed:%d\n", rn->co_completed);
+	printf("Drain calls:%d\n", rn->drain_calls);
+	printf("Zero returns:%d non-zero:%d\n",
+	    rn->cnt_zero,
+	    rn->cnt_one);
+
+}
+
+static void
+drainit(void *arg)
+{
+	struct callout_run *rn;
+
+	rn = (struct callout_run *)arg;
+	mtx_lock(&rn->lock);
+	rn->drain_calls++;
+	mtx_unlock(&rn->lock);
+}
+
+static void
+test_callout(void *arg)
+{
+	struct callout_run *rn;
+	int cpu;
+
+	critical_enter();
+	cpu = curcpu;
+	critical_exit();
+	rn = (struct callout_run *)arg;
+	atomic_add_int(&rn->callout_waiting, 1);
+	mtx_lock(&rn->lock);
+	if (callout_pending(&rn->co_array[cpu]) ||
+	    !callout_active(&rn->co_array[cpu])) {
+		rn->co_return_npa++;
+		atomic_subtract_int(&rn->callout_waiting, 1);
+		mtx_unlock(&rn->lock);
+		return;
+	}
+	callout_deactivate(&rn->co_array[cpu]);
+	rn->co_completed++;
+	mtx_unlock(&rn->lock);
+	atomic_subtract_int(&rn->callout_waiting, 1);
+}
+
+void
+execute_the_co_test(struct callout_run *rn)
+{
+	int i, ret, cpu;
+	uint32_t tk_s, tk_e, tk_d;
+
+	mtx_lock(&rn->lock);
+	rn->callout_waiting = 0;
+	for (i = 0; i < rn->co_number_callouts; i++) {
+		if (rn->co_test == 1) {
+			/* start all on spread out cpu's */
+			cpu = i % mp_ncpus;
+			callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn,
+			    cpu, 0);
+		} else {
+			/* Start all on the same CPU */
+			callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn,
+			    rn->index, 0);
+		}
+	}
+	tk_s = ticks;
+	while (rn->callout_waiting != rn->co_number_callouts) {
+		cpu_spinwait();
+		tk_e = ticks;
+		tk_d = tk_e - tk_s;
+		if (tk_d > 100) {
+			break;
+		}
+	}
+	/* OK everyone is waiting and we have the lock */
+	for (i = 0; i < rn->co_number_callouts; i++) {
+		ret = callout_async_drain(&rn->co_array[i], drainit);
+		if (ret) {
+			rn->cnt_one++;
+		} else {
+			rn->cnt_zero++;
+		}
+	}
+	rn->callout_waiting -= rn->cnt_one;
+	mtx_unlock(&rn->lock);
+	/* Now wait until all are done */
+	tk_s = ticks;
+	while (rn->callout_waiting > 0) {
+		cpu_spinwait();
+		tk_e = ticks;
+		tk_d = tk_e - tk_s;
+		if (tk_d > 100) {
+			break;
+		}
+	}
+	co_saydone((void *)rn);
+}
+
+
+static void
+run_callout_test(struct kern_test *test)
+{
+	struct callout_test *u;
+	size_t sz;
+	int i;
+	struct callout_run *rn;
+	int index = test->tot_threads_running;
+
+	u = (struct callout_test *)test->test_options;
+	if (comaster[index] == NULL) {
+		rn = comaster[index] = malloc(sizeof(struct callout_run), M_CALLTMP, M_WAITOK);
+		memset(comaster[index], 0, sizeof(struct callout_run));
+		mtx_init(&rn->lock, "callouttest", NULL, MTX_DUPOK);
+		rn->index = index;
+	} else {
+		rn = comaster[index];
+		rn->co_number_callouts = rn->co_return_npa = 0;
+		rn->co_completed = rn->callout_waiting = 0;
+		rn->drain_calls = rn->cnt_zero = rn->cnt_one = 0;
+		if (rn->co_array) {
+			free(rn->co_array, M_CALLTMP);
+			rn->co_array = NULL;
+		}
+	}
+	rn->co_number_callouts = u->number_of_callouts;
+	rn->co_test = u->test_number;
+	sz = sizeof(struct callout) * rn->co_number_callouts;
+	rn->co_array = malloc(sz, M_CALLTMP, M_WAITOK);
+	for (i = 0; i < rn->co_number_callouts; i++) {
+		callout_init(&rn->co_array[i], CALLOUT_MPSAFE);
+	}
+	execute_the_co_test(rn);
+}
+
+int callout_test_is_loaded = 0;
+
+static void
+cocleanup(void)
+{
+	int i;
+
+	for (i = 0; i < MAXCPU; i++) {
+		if (comaster[i]) {
+			if (comaster[i]->co_array) {
+				free(comaster[i]->co_array, M_CALLTMP);
+				comaster[i]->co_array = NULL;
+			}
+			free(comaster[i], M_CALLTMP);
+			comaster[i] = NULL;
+		}
+	}
+}
+
+static int
+callout_test_modevent(module_t mod, int type, void *data)
+{
+	int err = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = kern_testframework_register("callout_test",
+		    run_callout_test);
+		if (err) {
+			printf("Can't load callout_test err:%d returned\n",
+			    err);
+		} else {
+			memset(comaster, 0, sizeof(comaster));
+			callout_test_is_loaded = 1;
+		}
+		break;
+	case MOD_QUIESCE:
+		err = kern_testframework_deregister("callout_test");
+		if (err == 0) {
+			callout_test_is_loaded = 0;
+			cocleanup();
+		}
+		break;
+	case MOD_UNLOAD:
+		if (callout_test_is_loaded) {
+			err = kern_testframework_deregister("callout_test");
+			if (err == 0) {
+				cocleanup();
+				callout_test_is_loaded = 0;
+			}
+		}
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (err);
+}
+
+static moduledata_t callout_test_mod = {
+	.name = "callout_test",
+	.evhand = callout_test_modevent,
+	.priv = 0
+};
+
+MODULE_DEPEND(callout_test, kern_testframework, 1, 1, 1);
+DECLARE_MODULE(callout_test, callout_test_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);


Property changes on: trunk/sys/tests/callout_test/callout_test.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/tests/callout_test.h
===================================================================
--- trunk/sys/tests/callout_test.h	                        (rev 0)
+++ trunk/sys/tests/callout_test.h	2020-02-08 19:47:26 UTC (rev 12321)
@@ -0,0 +1,35 @@
+/* $MidnightBSD$ */
+#ifndef __callout_test_h__
+#define __callout_test_h__
+/*-
+ * Copyright (c) 2015
+ *	Netflix Incorporated, All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *__FBSDID("$FreeBSD: stable/11/sys/tests/callout_test.h 290663 2015-11-10 14:14:41Z rrs $");
+ *
+ */
+struct callout_test {
+	int number_of_callouts;
+	int test_number;
+};
+#endif


Property changes on: trunk/sys/tests/callout_test.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/tests/framework/kern_testfrwk.c
===================================================================
--- trunk/sys/tests/framework/kern_testfrwk.c	                        (rev 0)
+++ trunk/sys/tests/framework/kern_testfrwk.c	2020-02-08 19:47:26 UTC (rev 12321)
@@ -0,0 +1,342 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015
+ *	Netflix Incorporated, All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/tests/framework/kern_testfrwk.c 319174 2017-05-30 03:10:05Z ngie $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/queue.h>
+#include <tests/kern_testfrwk.h>
+#ifdef SMP
+#include <machine/cpu.h>
+#endif
+
+struct kern_test_list {
+	TAILQ_ENTRY(kern_test_list) next;
+	char name[TEST_NAME_LEN];
+	kerntfunc func;
+};
+
+TAILQ_HEAD(ktestlist, kern_test_list);
+
+struct kern_test_entry {
+	TAILQ_ENTRY(kern_test_entry) next;
+	struct kern_test_list *kt_e;
+	struct kern_test kt_data;
+};
+
+TAILQ_HEAD(ktestqueue, kern_test_entry);
+
+MALLOC_DEFINE(M_KTFRWK, "kern_tfrwk", "Kernel Test Framework");
+struct kern_totfrwk {
+	struct taskqueue *kfrwk_tq;
+	struct task kfrwk_que;
+	struct ktestlist kfrwk_testlist;
+	struct ktestqueue kfrwk_testq;
+	struct mtx kfrwk_mtx;
+	int kfrwk_waiting;
+};
+
+struct kern_totfrwk kfrwk;
+static int ktest_frwk_inited = 0;
+
+#define KTFRWK_MUTEX_INIT() mtx_init(&kfrwk.kfrwk_mtx, "kern_test_frwk", "tfrwk", MTX_DEF)
+
+#define KTFRWK_DESTROY() mtx_destroy(&kfrwk.kfrwk_mtx)
+
+#define KTFRWK_LOCK() mtx_lock(&kfrwk.kfrwk_mtx)
+
+#define KTFRWK_UNLOCK()	mtx_unlock(&kfrwk.kfrwk_mtx)
+
+static void
+kfrwk_task(void *context, int pending)
+{
+	struct kern_totfrwk *tf;
+	struct kern_test_entry *wk;
+	int free_mem = 0;
+	struct kern_test kt_data;
+	kerntfunc ktf;
+
+	memset(&kt_data, 0, sizeof(kt_data));
+	ktf = NULL;
+	tf = (struct kern_totfrwk *)context;
+	KTFRWK_LOCK();
+	wk = TAILQ_FIRST(&tf->kfrwk_testq);
+	if (wk) {
+		wk->kt_data.tot_threads_running--;
+		tf->kfrwk_waiting--;
+		memcpy(&kt_data, &wk->kt_data, sizeof(kt_data));
+		if (wk->kt_data.tot_threads_running == 0) {
+			TAILQ_REMOVE(&tf->kfrwk_testq, wk, next);
+			free_mem = 1;
+		} else {
+			/* Wake one of my colleagues up to help too */
+			taskqueue_enqueue(tf->kfrwk_tq, &tf->kfrwk_que);
+		}
+		if (wk->kt_e) {
+			ktf = wk->kt_e->func;
+		}
+	}
+	KTFRWK_UNLOCK();
+	if (wk && free_mem) {
+		free(wk, M_KTFRWK);
+	}
+	/* Execute the test */
+	if (ktf) {
+		(*ktf) (&kt_data);
+	}
+	/* We are done */
+	atomic_add_int(&tf->kfrwk_waiting, 1);
+}
+
+static int
+kerntest_frwk_init(void)
+{
+	u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+	KTFRWK_MUTEX_INIT();
+	TAILQ_INIT(&kfrwk.kfrwk_testq);
+	TAILQ_INIT(&kfrwk.kfrwk_testlist);
+	/* Now lets start up a number of tasks to do the work */
+	TASK_INIT(&kfrwk.kfrwk_que, 0, kfrwk_task, &kfrwk);
+	kfrwk.kfrwk_tq = taskqueue_create_fast("sbtls_task", M_NOWAIT,
+	    taskqueue_thread_enqueue, &kfrwk.kfrwk_tq);
+	if (kfrwk.kfrwk_tq == NULL) {
+		printf("Can't start taskqueue for Kernel Test Framework\n");
+		panic("Taskqueue init fails for kfrwk");
+	}
+	taskqueue_start_threads(&kfrwk.kfrwk_tq, ncpus, PI_NET, "[kt_frwk task]");
+	kfrwk.kfrwk_waiting = ncpus;
+	ktest_frwk_inited = 1;
+	return (0);
+}
+
+static int
+kerntest_frwk_fini(void)
+{
+	KTFRWK_LOCK();
+	if (!TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) {
+		/* Still modules registered */
+		KTFRWK_UNLOCK();
+		return (EBUSY);
+	}
+	ktest_frwk_inited = 0;
+	KTFRWK_UNLOCK();
+	taskqueue_free(kfrwk.kfrwk_tq);
+	/* Ok lets destroy the mutex on the way out */
+	KTFRWK_DESTROY();
+	return (0);
+}
+
+
+static int kerntest_execute(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_NODE(_kern, OID_AUTO, testfrwk, CTLFLAG_RW, 0, "Kernel Test Framework");
+SYSCTL_PROC(_kern_testfrwk, OID_AUTO, runtest, (CTLTYPE_STRUCT | CTLFLAG_RW),
+    0, 0, kerntest_execute, "IU", "Execute a kernel test");
+
+int
+kerntest_execute(SYSCTL_HANDLER_ARGS)
+{
+	struct kern_test kt;
+	struct kern_test_list *li, *te = NULL;
+	struct kern_test_entry *kte = NULL;
+	int error = 0;
+
+	if (ktest_frwk_inited == 0) {
+		return (ENOENT);
+	}
+	/* Find the entry if possible */
+	error = SYSCTL_IN(req, &kt, sizeof(struct kern_test));
+	if (error) {
+		return (error);
+	}
+	if (kt.num_threads <= 0) {
+		return (EINVAL);
+	}
+	/* Grab some memory */
+	kte = malloc(sizeof(struct kern_test_entry), M_KTFRWK, M_WAITOK);
+	if (kte == NULL) {
+		error = ENOMEM;
+		goto out;
+	}
+	KTFRWK_LOCK();
+	TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) {
+		if (strcmp(li->name, kt.name) == 0) {
+			te = li;
+			break;
+		}
+	}
+	if (te == NULL) {
+		printf("Can't find the test %s\n", kt.name);
+		error = ENOENT;
+		free(kte, M_KTFRWK);
+		goto out;
+	}
+	/* Ok we have a test item to run, can we? */
+	if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) {
+		/* We don't know if there are enough threads */
+		error = EAGAIN;
+		free(kte, M_KTFRWK);
+		goto out;
+	}
+	if (kfrwk.kfrwk_waiting < kt.num_threads) {
+		error = E2BIG;
+		free(kte, M_KTFRWK);
+		goto out;
+	}
+	kt.tot_threads_running = kt.num_threads;
+	/* Ok it looks like we can do it, lets get an entry */
+	kte->kt_e = li;
+	memcpy(&kte->kt_data, &kt, sizeof(kt));
+	TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testq, kte, next);
+	taskqueue_enqueue(kfrwk.kfrwk_tq, &kfrwk.kfrwk_que);
+out:
+	KTFRWK_UNLOCK();
+	return (error);
+}
+
+int
+kern_testframework_register(const char *name, kerntfunc func)
+{
+	int error = 0;
+	struct kern_test_list *li, *te = NULL;
+	int len;
+
+	len = strlen(name);
+	if (len >= TEST_NAME_LEN) {
+		return (E2BIG);
+	}
+	te = malloc(sizeof(struct kern_test_list), M_KTFRWK, M_WAITOK);
+	if (te == NULL) {
+		error = ENOMEM;
+		goto out;
+	}
+	KTFRWK_LOCK();
+	/* First does it already exist? */
+	TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) {
+		if (strcmp(li->name, name) == 0) {
+			error = EALREADY;
+			free(te, M_KTFRWK);
+			goto out;
+		}
+	}
+	/* Ok we can do it, lets add it to the list */
+	te->func = func;
+	strcpy(te->name, name);
+	TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testlist, te, next);
+out:
+	KTFRWK_UNLOCK();
+	return (error);
+}
+
+int
+kern_testframework_deregister(const char *name)
+{
+	struct kern_test_list *li, *te = NULL;
+	u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+	int error = 0;
+
+	KTFRWK_LOCK();
+	/* First does it already exist? */
+	TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) {
+		if (strcmp(li->name, name) == 0) {
+			te = li;
+			break;
+		}
+	}
+	if (te == NULL) {
+		/* It is not registered so no problem */
+		goto out;
+	}
+	if (ncpus != kfrwk.kfrwk_waiting) {
+		/* We are busy executing something -- can't unload */
+		error = EBUSY;
+		goto out;
+	}
+	if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) {
+		/* Something still to execute */
+		error = EBUSY;
+		goto out;
+	}
+	/* Ok we can remove the dude safely */
+	TAILQ_REMOVE(&kfrwk.kfrwk_testlist, te, next);
+	memset(te, 0, sizeof(struct kern_test_list));
+	free(te, M_KTFRWK);
+out:
+	KTFRWK_UNLOCK();
+	return (error);
+}
+
+static int
+kerntest_mod_init(module_t mod, int type, void *data)
+{
+	int err;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = kerntest_frwk_init();
+		break;
+	case MOD_QUIESCE:
+		KTFRWK_LOCK();
+		if (TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) {
+			err = 0;
+		} else {
+			err = EBUSY;
+		}
+		KTFRWK_UNLOCK();
+		break;
+	case MOD_UNLOAD:
+		err = kerntest_frwk_fini();
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	return (err);
+}
+
+static moduledata_t kern_test_framework = {
+	.name = "kernel_testfrwk",
+	.evhand = kerntest_mod_init,
+	.priv = 0
+};
+
+MODULE_VERSION(kern_testframework, 1);
+DECLARE_MODULE(kern_testframework, kern_test_framework, SI_SUB_PSEUDO, SI_ORDER_ANY);


Property changes on: trunk/sys/tests/framework/kern_testfrwk.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/tests/kern_testfrwk.h
===================================================================
--- trunk/sys/tests/kern_testfrwk.h	                        (rev 0)
+++ trunk/sys/tests/kern_testfrwk.h	2020-02-08 19:47:26 UTC (rev 12321)
@@ -0,0 +1,50 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015
+ *	Netflix Incorporated, All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *__FBSDID("$FreeBSD: stable/11/sys/tests/kern_testfrwk.h 290663 2015-11-10 14:14:41Z rrs $");
+ *
+ */
+#ifndef _SYS_KERN_TESTFRWKT_H_
+#define _SYS_KERN_TESTFRWKT_H_
+
+#define TEST_NAME_LEN 32
+#define TEST_OPTION_SPACE 256
+
+struct kern_test {
+	char name[TEST_NAME_LEN];
+	int num_threads;	       	/* Fill in how many threads you want */
+	int tot_threads_running;	/* For framework */
+	uint8_t test_options[TEST_OPTION_SPACE];
+};
+
+
+typedef void (*kerntfunc)(struct kern_test *);
+
+#ifdef _KERNEL
+int kern_testframework_register(const char *name, kerntfunc);
+
+int kern_testframework_deregister(const char *name);
+#endif
+#endif


Property changes on: trunk/sys/tests/kern_testfrwk.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
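
The framework above is driven from userland through the
kern.testfrwk.runtest sysctl, which takes a struct kern_test as a binary
argument.  A minimal sketch of launching the callout test this way,
assuming both the framework and callout_test modules are loaded; the
struct definitions simply mirror the headers added above (redeclared here
because the sketch does not assume they are installed), and the test
parameters are arbitrary:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	TEST_NAME_LEN		32
#define	TEST_OPTION_SPACE	256

/* Mirrors struct kern_test in sys/tests/kern_testfrwk.h above. */
struct kern_test {
	char name[TEST_NAME_LEN];
	int num_threads;
	int tot_threads_running;
	uint8_t test_options[TEST_OPTION_SPACE];
};

/* Mirrors struct callout_test in sys/tests/callout_test.h above. */
struct callout_test {
	int number_of_callouts;
	int test_number;
};

int
main(void)
{
	struct kern_test kt;
	struct callout_test *ct;

	memset(&kt, 0, sizeof(kt));
	strlcpy(kt.name, "callout_test", sizeof(kt.name));
	kt.num_threads = 1;
	ct = (struct callout_test *)kt.test_options;
	ct->number_of_callouts = 32;	/* arbitrary test size */
	ct->test_number = 0;		/* 0: all callouts on one CPU */
	if (sysctlbyname("kern.testfrwk.runtest", NULL, NULL,
	    &kt, sizeof(kt)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("test queued; results appear on the kernel console\n");
	return (0);
}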

From laffer1 at midnightbsd.org  Sat Feb  8 14:49:05 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:49:05 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12322] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002081949.018Jn5fL064902@stargazer.midnightbsd.org>

Revision: 12322
          http://svnweb.midnightbsd.org/src/?rev=12322
Author:   laffer1
Date:     2020-02-08 14:49:04 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Added Paths:
-----------
    trunk/sys/sys/zlib.h
    trunk/sys/sys/zutil.h

Added: trunk/sys/sys/zlib.h
===================================================================
--- trunk/sys/sys/zlib.h	                        (rev 0)
+++ trunk/sys/sys/zlib.h	2020-02-08 19:49:04 UTC (rev 12322)
@@ -0,0 +1,1019 @@
+/* $MidnightBSD$ */
+/* $FreeBSD: stable/11/sys/sys/zlib.h 281855 2015-04-22 14:38:58Z rodrigc $	*/
+
+/*
+ * This file is derived from zlib.h and zconf.h from the zlib-1.0.4
+ * distribution by Jean-loup Gailly and Mark Adler, with some additions
+ * by Paul Mackerras to aid in implementing Deflate compression and
+ * decompression for PPP packets.
+ */
+
+/*
+ *  ==FILEVERSION 971127==
+ *
+ * This marker is used by the Linux installation script to determine
+ * whether an up-to-date version of this file is already installed.
+ */
+
+
+/* +++ zlib.h */
+/*-
+  zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.0.4, Jul 24th, 1996.
+
+  Copyright (C) 1995-1996 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  gzip at prep.ai.mit.edu    madler at alumni.caltech.edu
+*/
+/*
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+#ifndef _ZLIB_H
+#define _ZLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* +++ zconf.h */
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-1996 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h 
+ */
+
+/* From: zconf.h,v 1.20 1996/07/02 15:09:28 me Exp $ */
+
+#ifndef _ZCONF_H
+#define _ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ */
+#ifdef Z_PREFIX
+#  define deflateInit_	z_deflateInit_
+#  define deflate	z_deflate
+#  define deflateEnd	z_deflateEnd
+#  define inflateInit_ 	z_inflateInit_
+#  define inflate	z_inflate
+#  define inflateEnd	z_inflateEnd
+#  define deflateInit2_	z_deflateInit2_
+#  define deflateSetDictionary z_deflateSetDictionary
+#  define deflateCopy	z_deflateCopy
+#  define deflateReset	z_deflateReset
+#  define deflateParams	z_deflateParams
+#  define inflateInit2_	z_inflateInit2_
+#  define inflateSetDictionary z_inflateSetDictionary
+#  define inflateSync	z_inflateSync
+#  define inflateReset	z_inflateReset
+#  define compress	z_compress
+#  define uncompress	z_uncompress
+#  define adler32	z_adler32
+#if 0
+#  define crc32		z_crc32
+#  define get_crc_table z_get_crc_table
+#endif
+
+#  define Byte		z_Byte
+#  define uInt		z_uInt
+#  define uLong		z_uLong
+#  define Bytef	        z_Bytef
+#  define charf		z_charf
+#  define intf		z_intf
+#  define uIntf		z_uIntf
+#  define uLongf	z_uLongf
+#  define voidpf	z_voidpf
+#  define voidp		z_voidp
+#endif
+
+#if (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32)
+#  define WIN32
+#endif
+#if defined(__GNUC__) || defined(WIN32) || defined(__386__) || defined(__i386__)
+#  ifndef __32BIT__
+#    define __32BIT__
+#  endif
+#endif
+#if defined(__MSDOS__) && !defined(MSDOS)
+#  define MSDOS
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#if defined(MSDOS) && !defined(__32BIT__)
+#  define MAXSEG_64K
+#endif
+#ifdef MSDOS
+#  define UNALIGNED_OK
+#endif
+
+#if (defined(MSDOS) || defined(_WINDOWS) || defined(WIN32))  && !defined(STDC)
+#  define STDC
+#endif
+#if (defined(__STDC__) || defined(__cplusplus)) && !defined(STDC)
+#  define STDC
+#endif
+
+#ifndef STDC
+#  ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+#    define const
+#  endif
+#endif
+
+/* Some Mac compilers merge all .h files incorrectly: */
+#if defined(__MWERKS__) || defined(applec) ||defined(THINK_C) ||defined(__SC__)
+#  define NO_DUMMY_DECL
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+#  ifdef MAXSEG_64K
+#    define MAX_MEM_LEVEL 8
+#  else
+#    define MAX_MEM_LEVEL 9
+#  endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2 */
+#ifndef MAX_WBITS
+#  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+            1 << (windowBits+2)   +  1 << (memLevel+9)
+ that is: 128K for windowBits=15  +  128K for memLevel = 8  (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+                        /* Type declarations */
+
+#ifndef OF /* function prototypes */
+#  ifdef STDC
+#    define OF(args)  args
+#  else
+#    define OF(args)  ()
+#  endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h.  If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(__32BIT__)
+   /* MSC small or medium model */
+#  define SMALL_MEDIUM
+#  ifdef _MSC_VER
+#    define FAR __far
+#  else
+#    define FAR far
+#  endif
+#endif
+#if defined(__BORLANDC__) && (defined(__SMALL__) || defined(__MEDIUM__))
+#  ifndef __32BIT__
+#    define SMALL_MEDIUM
+#    define FAR __far
+#  endif
+#endif
+#ifndef FAR
+#   define FAR
+#endif
+
+typedef unsigned char  Byte;  /* 8 bits */
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+#if defined(__BORLANDC__) && defined(SMALL_MEDIUM)
+   /* Borland C/C++ ignores FAR inside typedef */
+#  define Bytef Byte FAR
+#else
+   typedef Byte  FAR Bytef;
+#endif
+typedef char  FAR charf;
+typedef int   FAR intf;
+typedef uInt  FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+   typedef void FAR *voidpf;
+   typedef void     *voidp;
+#else
+   typedef Byte FAR *voidpf;
+   typedef Byte     *voidp;
+#endif
+
+
+/* Compile with -DZLIB_DLL for Windows DLL support */
+#if (defined(_WINDOWS) || defined(WINDOWS)) && defined(ZLIB_DLL)
+#  include <windows.h>
+#  define EXPORT  WINAPI
+#else
+#  define EXPORT
+#endif
+
+#endif /* _ZCONF_H */
+/* --- zconf.h */
+
+#define ZLIB_VERSION "1.0.4P"
+
+/* 
+     The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed
+  data.  This version of the library supports only one compression method
+  (deflation) but other algorithms may be added later and will have the same
+  stream interface.
+
+     For compression the application must provide the output buffer and
+  may optionally provide the input buffer for optimization. For decompression,
+  the application must provide the input buffer and may optionally provide
+  the output buffer for optimization.
+
+     Compression can be done in a single step if the buffers are large
+  enough (for example if an input file is mmap'ed), or can be done by
+  repeated calls of the compression function.  In the latter case, the
+  application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+     The library does not install any signal handler. It is recommended to
+  add at least a handler for SIGSEGV when decompressing; the library checks
+  the consistency of the input data whenever possible but may go nuts
+  for some forms of corrupted input.
+*/
+
+typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
+typedef void   (*free_func)  OF((voidpf opaque, voidpf address));
+
+struct internal_state;
+
+typedef struct z_stream_s {
+    Bytef    *next_in;  /* next input byte */
+    uInt     avail_in;  /* number of bytes available at next_in */
+    uLong    total_in;  /* total nb of input bytes read so far */
+
+    Bytef    *next_out; /* next output byte should be put there */
+    uInt     avail_out; /* remaining free space at next_out */
+    uLong    total_out; /* total nb of bytes output so far */
+
+    const char     *msg; /* last error message, NULL if no error */
+    struct internal_state FAR *state; /* not visible by applications */
+
+    alloc_func zalloc;  /* used to allocate the internal state */
+    free_func  zfree;   /* used to free the internal state */
+    voidpf     opaque;  /* private data object passed to zalloc and zfree */
+
+    int     data_type;  /* best guess about the data type: ascii or binary */
+    uLong   adler;      /* adler32 value of the uncompressed data */
+    uLong   reserved;   /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+   The application must update next_in and avail_in when avail_in has
+   dropped to zero. It must update next_out and avail_out when avail_out
+   has dropped to zero. The application must initialize zalloc, zfree and
+   opaque before calling the init function. All other fields are set by the
+   compression library and must not be updated by the application.
+
+   The opaque value provided by the application will be passed as the first
+   parameter for calls of zalloc and zfree. This can be useful for custom
+   memory management. The compression library attaches no meaning to the
+   opaque value.
+
+   zalloc must return Z_NULL if there is not enough memory for the object.
+   On 16-bit systems, the functions zalloc and zfree must be able to allocate
+   exactly 65536 bytes, but will not be required to allocate more than this
+   if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
+   pointers returned by zalloc for objects of exactly 65536 bytes *must*
+   have their offset normalized to zero. The default allocation function
+   provided by this library ensures this (see zutil.c). To reduce memory
+   requirements and avoid any allocation of 64K objects, at the expense of
+   compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
+
+   The fields total_in and total_out can be used for statistics or
+   progress reports. After compression, total_in holds the total size of
+   the uncompressed data and may be saved for use in the decompressor
+   (particularly if the decompressor wants to decompress everything in
+   a single step).
+*/
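
As an illustration of the zalloc/zfree contract described above, the following sketch
shows a pair of callbacks with the alloc_func and free_func signatures, built on the
standard C allocator.  The names my_zalloc and my_zfree are hypothetical; an
application with no special memory-management needs can simply leave zalloc and
zfree set to Z_NULL and let the init functions install defaults.

    #include <stdlib.h>

    static voidpf my_zalloc(voidpf opaque, uInt items, uInt size)
    {
        (void)opaque;                        /* no per-stream state in this sketch */
        return (voidpf)calloc(items, size);  /* NULL (== Z_NULL) on failure */
    }

    static void my_zfree(voidpf opaque, voidpf address)
    {
        (void)opaque;
        free(address);
    }

A stream would then be prepared with strm.zalloc = my_zalloc, strm.zfree = my_zfree
and strm.opaque = Z_NULL before calling deflateInit or inflateInit.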
+
+                        /* constants */
+
+#define Z_NO_FLUSH      0
+#define Z_PARTIAL_FLUSH 1
+#define Z_PACKET_FLUSH	2
+#define Z_SYNC_FLUSH    3
+#define Z_FULL_FLUSH    4
+#define Z_FINISH        5
+/* Allowed flush values; see deflate() below for details */
+
+#define Z_OK            0
+#define Z_STREAM_END    1
+#define Z_NEED_DICT     2
+#define Z_ERRNO        (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR   (-3)
+#define Z_MEM_ERROR    (-4)
+#define Z_BUF_ERROR    (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative
+ * values are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION         0
+#define Z_BEST_SPEED             1
+#define Z_BEST_COMPRESSION       9
+#define Z_DEFAULT_COMPRESSION  (-1)
+/* compression levels */
+
+#define Z_FILTERED            1
+#define Z_HUFFMAN_ONLY        2
+#define Z_DEFAULT_STRATEGY    0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY   0
+#define Z_ASCII    1
+#define Z_UNKNOWN  2
+/* Possible values of the data_type field */
+
+#define Z_DEFLATED   8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL  0  /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+                        /* basic functions */
+
+extern const char * EXPORT zlibVersion OF((void));
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+   If the first character differs, the library code actually used is
+   not compatible with the zlib.h header file used by the application.
+   This check is automatically made by deflateInit and inflateInit.
+ */
+
+/* 
+extern int EXPORT deflateInit OF((z_streamp strm, int level));
+
+     Initializes the internal stream state for compression. The fields
+   zalloc, zfree and opaque must be initialized before by the caller.
+   If zalloc and zfree are set to Z_NULL, deflateInit updates them to
+   use default allocation functions.
+
+     The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+   1 gives best speed, 9 gives best compression, 0 gives no compression at
+   all (the input data is simply copied a block at a time).
+   Z_DEFAULT_COMPRESSION requests a default compromise between speed and
+   compression (currently equivalent to level 6).
+
+     deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+   Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+   with the version assumed by the caller (ZLIB_VERSION).
+   msg is set to null if there is no error message.  deflateInit does not
+   perform any compression: this will be done by deflate().
+*/
+
+
+extern int EXPORT deflate OF((z_streamp strm, int flush));
+/*
+  Performs one or both of the following actions:
+
+  - Compress more input starting at next_in and update next_in and avail_in
+    accordingly. If not all input can be processed (because there is not
+    enough room in the output buffer), next_in and avail_in are updated and
+    processing will resume at this point for the next call of deflate().
+
+  - Provide more output starting at next_out and update next_out and avail_out
+    accordingly. This action is forced if the parameter flush is non zero.
+    Forcing flush frequently degrades the compression ratio, so this parameter
+    should be set only when necessary (in interactive applications).
+    Some output may be provided even if flush is not set.
+
+  Before the call of deflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming
+  more output, and updating avail_in or avail_out accordingly; avail_out
+  should never be zero before the call. The application can consume the
+  compressed output when it wants, for example when the output buffer is full
+  (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
+  and with zero avail_out, it must be called again after making room in the
+  output buffer because there might be more output pending.
+
+    If the parameter flush is set to Z_PARTIAL_FLUSH, the current compression
+  block is terminated and flushed to the output buffer so that the
+  decompressor can get all input data available so far. For method 9, a future
+  variant on method 8, the current block will be flushed but not terminated.
+  Z_SYNC_FLUSH has the same effect as partial flush except that the compressed
+  output is byte aligned (the compressor can clear its internal bit buffer)
+  and the current block is always terminated; this can be useful if the
+  compressor has to be restarted from scratch after an interruption (in which
+  case the internal state of the compressor may be lost).
+    If flush is set to Z_FULL_FLUSH, the compression block is terminated, a
+  special marker is output and the compression dictionary is discarded; this
+  is useful to allow the decompressor to synchronize if one compressed block
+  has been damaged (see inflateSync below).  Flushing degrades compression and
+  so should be used only when necessary.  Using Z_FULL_FLUSH too often can
+  seriously degrade the compression. If deflate returns with avail_out == 0,
+  this function must be called again with the same value of the flush
+  parameter and more output space (updated avail_out), until the flush is
+  complete (deflate returns with non-zero avail_out).
+
+    If the parameter flush is set to Z_PACKET_FLUSH, the compression
+  block is terminated, and a zero-length stored block is output,
+  omitting the length bytes (the effect of this is that the 3-bit type
+  code 000 for a stored block is output, and the output is then
+  byte-aligned).  This is designed for use at the end of a PPP packet.
+
+    If the parameter flush is set to Z_FINISH, pending input is processed,
+  pending output is flushed and deflate returns with Z_STREAM_END if there
+  was enough output space; if deflate returns with Z_OK, this function must be
+  called again with Z_FINISH and more output space (updated avail_out) but no
+  more input data, until it returns with Z_STREAM_END or an error. After
+  deflate has returned Z_STREAM_END, the only possible operations on the
+  stream are deflateReset or deflateEnd.
+  
+    Z_FINISH can be used immediately after deflateInit if all the compression
+  is to be done in a single step. In this case, avail_out must be at least
+  0.1% larger than avail_in plus 12 bytes.  If deflate does not return
+  Z_STREAM_END, then it must be called again as described above.
+
+    deflate() may update data_type if it can make a good guess about
+  the input data type (Z_ASCII or Z_BINARY). In doubt, the data is considered
+  binary. This field is only for information purposes and does not affect
+  the compression algorithm in any manner.
+
+    deflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if all input has been
+  consumed and all output has been produced (only when flush is set to
+  Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+  if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible.
+*/
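
The calling convention above is easiest to see in code.  The sketch below compresses
a complete in-memory buffer by calling deflate() with Z_FINISH repeatedly until it
returns Z_STREAM_END, draining each chunk of output through a hypothetical
consume_output() sink; the stream is assumed to have been set up with deflateInit()
already.  This only illustrates the loop shape and is not part of the library.

    static int compress_all(z_streamp strm, Bytef *in, uInt inlen,
                            void (*consume_output)(Bytef *buf, uInt len))
    {
        Bytef out[4096];                /* any reasonable chunk size works */
        int   err;

        strm->next_in  = in;
        strm->avail_in = inlen;
        do {
            /* give deflate() fresh output space before every call */
            strm->next_out  = out;
            strm->avail_out = sizeof(out);
            err = deflate(strm, Z_FINISH);
            if (err != Z_OK && err != Z_STREAM_END)
                return err;             /* e.g. Z_STREAM_ERROR */
            consume_output(out, (uInt)(sizeof(out) - strm->avail_out));
        } while (err != Z_STREAM_END);  /* Z_OK means more output is pending */
        return Z_OK;
    }

The caller would follow up with deflateEnd() once Z_STREAM_END has been seen.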
+
+
+extern int EXPORT deflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any
+   pending output.
+
+     deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+   stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+   prematurely (some input or output was discarded). In the error case,
+   msg may be set but then points to a static string (which must not be
+   deallocated).
+*/
+
+
+/* 
+extern int EXPORT inflateInit OF((z_streamp strm));
+
+     Initializes the internal stream state for decompression. The fields
+   zalloc, zfree and opaque must be initialized before by the caller.  If
+   zalloc and zfree are set to Z_NULL, inflateInit updates them to use default
+   allocation functions.
+
+     inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_VERSION_ERROR if the zlib library version is incompatible
+   with the version assumed by the caller.  msg is set to null if there is no
+   error message. inflateInit does not perform any decompression: this will be
+   done by inflate().
+*/
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#define inflate       _zlib104_inflate     /* FreeBSD already has an inflate :-( */
+#endif
+
+extern int EXPORT inflate OF((z_streamp strm, int flush));
+/*
+  Performs one or both of the following actions:
+
+  - Decompress more input starting at next_in and update next_in and avail_in
+    accordingly. If not all input can be processed (because there is not
+    enough room in the output buffer), next_in is updated and processing
+    will resume at this point for the next call of inflate().
+
+  - Provide more output starting at next_out and update next_out and avail_out
+    accordingly.  inflate() provides as much output as possible, until there
+    is no more input data or no more space in the output buffer (see below
+    about the flush parameter).
+
+  Before the call of inflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming
+  more output, and updating the next_* and avail_* values accordingly.
+  The application can consume the uncompressed output when it wants, for
+  example when the output buffer is full (avail_out == 0), or after each
+  call of inflate(). If inflate returns Z_OK and with zero avail_out, it
+  must be called again after making room in the output buffer because there
+  might be more output pending.
+
+    If the parameter flush is set to Z_PARTIAL_FLUSH or Z_PACKET_FLUSH,
+  inflate flushes as much output as possible to the output buffer. The
+  flushing behavior of inflate is not specified for values of the flush
+  parameter other than Z_PARTIAL_FLUSH, Z_PACKET_FLUSH or Z_FINISH, but the
+  current implementation actually flushes as much output as possible
+  anyway.  For Z_PACKET_FLUSH, inflate checks that once all the input data
+  has been consumed, it is expecting to see the length field of a stored
+  block; if not, it returns Z_DATA_ERROR.
+
+    inflate() should normally be called until it returns Z_STREAM_END or an
+  error. However if all decompression is to be performed in a single step
+  (a single call of inflate), the parameter flush should be set to
+  Z_FINISH. In this case all pending input is processed and all pending
+  output is flushed; avail_out must be large enough to hold all the
+  uncompressed data. (The size of the uncompressed data may have been saved
+  by the compressor for this purpose.) The next operation on this stream must
+  be inflateEnd to deallocate the decompression state. The use of Z_FINISH
+  is never required, but can be used to inform inflate that a faster routine
+  may be used for the single inflate() call.
+
+    inflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if the end of the
+  compressed data has been reached and all uncompressed output has been
+  produced, Z_NEED_DICT if a preset dictionary is needed at this point (see
+  inflateSetDictionary below), Z_DATA_ERROR if the input data was corrupted,
+  Z_STREAM_ERROR if the stream structure was inconsistent (for example if
+  next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+  Z_BUF_ERROR if no progress is possible or if there was not enough room in
+  the output buffer when Z_FINISH is used. In the Z_DATA_ERROR case, the
+  application may then call inflateSync to look for a good compression block.
+  In the Z_NEED_DICT case, strm->adler is set to the Adler32 value of the
+  dictionary chosen by the compressor.
+*/
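
A decompression loop has the same shape.  The sketch below assumes the whole
compressed stream is already in memory and that inflateInit() has been called;
deliver_output() is a hypothetical consumer of the uncompressed bytes.

    static int decompress_all(z_streamp strm, Bytef *in, uInt inlen,
                              void (*deliver_output)(Bytef *buf, uInt len))
    {
        Bytef out[4096];
        int   err;

        strm->next_in  = in;
        strm->avail_in = inlen;
        do {
            strm->next_out  = out;
            strm->avail_out = sizeof(out);
            err = inflate(strm, Z_PARTIAL_FLUSH);
            if (err != Z_OK && err != Z_STREAM_END)
                return err;             /* Z_DATA_ERROR, Z_MEM_ERROR, ... */
            deliver_output(out, (uInt)(sizeof(out) - strm->avail_out));
        } while (err != Z_STREAM_END);  /* truncated input ends in Z_BUF_ERROR */
        return Z_OK;
    }

As above, inflateEnd() is left to the caller once the loop has finished.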
+
+
+extern int EXPORT inflateEnd OF((z_streamp strm));
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any
+   pending output.
+
+     inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
+   was inconsistent. In the error case, msg may be set but then points to a
+   static string (which must not be deallocated).
+*/
+
+                        /* Advanced functions */
+
+/*
+    The following functions are needed only in some special applications.
+*/
+
+/*   
+extern int EXPORT deflateInit2 OF((z_streamp strm,
+                                   int  level,
+                                   int  method,
+                                   int  windowBits,
+                                   int  memLevel,
+                                   int  strategy));
+
+     This is another version of deflateInit with more compression options. The
+   fields next_in, zalloc, zfree and opaque must be initialized before by
+   the caller.
+
+     The method parameter is the compression method. It must be Z_DEFLATED in
+   this version of the library. (Method 9 will allow a 64K history buffer and
+   partial block flushes.)
+
+     The windowBits parameter is the base two logarithm of the window size
+   (the size of the history buffer).  It should be in the range 8..15 for this
+   version of the library (the value 16 will be allowed for method 9). Larger
+   values of this parameter result in better compression at the expense of
+   memory usage. The default value is 15 if deflateInit is used instead.
+
+     The memLevel parameter specifies how much memory should be allocated
+   for the internal compression state. memLevel=1 uses minimum memory but
+   is slow and reduces compression ratio; memLevel=9 uses maximum memory
+   for optimal speed. The default value is 8. See zconf.h for total memory
+   usage as a function of windowBits and memLevel.
+
+     The strategy parameter is used to tune the compression algorithm. Use the
+   value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+   filter (or predictor), or Z_HUFFMAN_ONLY to force Huffman encoding only (no
+   string match).  Filtered data consists mostly of small values with a
+   somewhat random distribution. In this case, the compression algorithm is
+   tuned to compress them better. The effect of Z_FILTERED is to force more
+   Huffman coding and less string matching; it is somewhat intermediate
+   between Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. The strategy parameter only affects
+   the compression ratio but not the correctness of the compressed output even
+   if it is not set appropriately.
+
+     If next_in is not null, the library will use this buffer to hold also
+   some history information; the buffer must either hold the entire input
+   data, or have at least 1<<(windowBits+1) bytes and be writable. If next_in
+   is null, the library will allocate its own history buffer (and leave next_in
+   null). next_out need not be provided here but must be provided by the
+   application for the next call of deflate().
+
+     If the history buffer is provided by the application, next_in must
+   never be changed by the application since the compressor maintains
+   information inside this buffer from call to call; the application
+   must provide more input only by increasing avail_in. next_in is always
+   reset by the library in this case.
+
+      deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was
+   not enough memory, Z_STREAM_ERROR if a parameter is invalid (such as
+   an invalid method). msg is set to null if there is no error message.
+   deflateInit2 does not perform any compression: this will be done by
+   deflate(). 
+*/
+                            
+extern int EXPORT deflateSetDictionary OF((z_streamp strm,
+                                           const Bytef *dictionary,
+				           uInt  dictLength));
+/*
+     Initializes the compression dictionary (history buffer) from the given
+   byte sequence without producing any compressed output. This function must
+   be called immediately after deflateInit or deflateInit2, before any call
+   of deflate. The compressor and decompressor must use exactly the same
+   dictionary (see inflateSetDictionary).
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary. Using a
+   dictionary is most useful when the data to be compressed is short and
+   can be predicted with good accuracy; the data can then be compressed better
+   than with the default empty dictionary. In this version of the library,
+   only the last 32K bytes of the dictionary are used.
+     Upon return of this function, strm->adler is set to the Adler32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor. (The Adler32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.)
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (such as NULL dictionary) or the stream state
+   is inconsistent (for example if deflate has already been called for this
+   stream). deflateSetDictionary does not perform any compression: this will
+   be done by deflate(). 
+*/
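
For example, a compressor that knows its input will contain HTTP-like tokens might
seed the history buffer as sketched below.  The dictionary bytes and the function
name are purely illustrative; the Adler-32 value left in strm->adler would have to
be conveyed to the decompressor by some external means.

    static const Bytef sample_dict[] =
        "GET POST HTTP/1.0 Content-Type: Content-Length:";

    static int init_with_dictionary(z_streamp strm)
    {
        int err = deflateInit(strm, Z_DEFAULT_COMPRESSION);

        if (err == Z_OK)
            err = deflateSetDictionary(strm, sample_dict,
                                       (uInt)(sizeof(sample_dict) - 1));
        /* strm->adler now identifies the dictionary for inflateSetDictionary */
        return err;
    }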
+
+extern int EXPORT deflateCopy OF((z_streamp dest,
+                                  z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream.  If
+   the source stream is using an application-supplied history buffer, a new
+   buffer is allocated for the destination stream.  The compressed output
+   buffer is always application-supplied. It's the responsibility of the
+   application to provide the correct values of next_out and avail_out for the
+   next call of deflate.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter. The streams that will be discarded should then be freed
+   by calling deflateEnd.  Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and
+   can consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL). msg is left unchanged in both source and
+   destination.
+*/
+
+extern int EXPORT deflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to deflateEnd followed by deflateInit,
+   but does not free and reallocate all the internal compression state.
+   The stream will keep the same compression level and any other attributes
+   that may have been set by deflateInit2.
+
+      deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+extern int EXPORT deflateParams OF((z_streamp strm, int level, int strategy));
+/*
+     Dynamically update the compression level and compression strategy.
+   This can be used to switch between compression and straight copy of
+   the input data, or to switch to a different kind of input data requiring
+   a different strategy. If the compression level is changed, the input
+   available so far is compressed with the old level (and may be flushed);
+   the new level will take effect only at the next call of deflate().
+
+     Before the call of deflateParams, the stream state must be set as for
+   a call of deflate(), since the currently available input may have to
+   be compressed and flushed. In particular, strm->avail_out must be non-zero.
+
+     deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
+   stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
+   if strm->avail_out was zero.
+*/
+
+extern int EXPORT deflateOutputPending OF((z_streamp strm));
+/*
+     Returns the number of bytes of output which are immediately
+   available from the compressor (i.e. without any further input
+   or flush).
+*/
+
+/*   
+extern int EXPORT inflateInit2 OF((z_streamp strm,
+                                   int  windowBits));
+
+     This is another version of inflateInit with more compression options. The
+   fields next_out, zalloc, zfree and opaque must be initialized before by
+   the caller.
+
+     The windowBits parameter is the base two logarithm of the maximum window
+   size (the size of the history buffer).  It should be in the range 8..15 for
+   this version of the library (the value 16 will be allowed soon). The
+   default value is 15 if inflateInit is used instead. If a compressed stream
+   with a larger window size is given as input, inflate() will return with
+   the error code Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     If next_out is not null, the library will use this buffer for the history
+   buffer; the buffer must either be large enough to hold the entire output
+   data, or have at least 1<<windowBits bytes.  If next_out is null, the
+   library will allocate its own buffer (and leave next_out null). next_in
+   need not be provided here but must be provided by the application for the
+   next call of inflate().
+
+     If the history buffer is provided by the application, next_out must
+   never be changed by the application since the decompressor maintains
+   history information inside this buffer from call to call; the application
+   can only reset next_out to the beginning of the history buffer when
+   avail_out is zero and all output has been consumed.
+
+      inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was
+   not enough memory, Z_STREAM_ERROR if a parameter is invalid (such as
+   windowBits < 8). msg is set to null if there is no error message.
+   inflateInit2 does not perform any decompression: this will be done by
+   inflate().
+*/
+
+extern int EXPORT inflateSetDictionary OF((z_streamp strm,
+				           const Bytef *dictionary,
+					   uInt  dictLength));
+/*
+     Initializes the decompression dictionary (history buffer) from the given
+   uncompressed byte sequence. This function must be called immediately after
+   a call of inflate if this call returned Z_NEED_DICT. The dictionary chosen
+   by the compressor can be determined from the Adler32 value returned by this
+   call of inflate. The compressor and decompressor must use exactly the same
+   dictionary (see deflateSetDictionary).
+
+     inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+   parameter is invalid (such as NULL dictionary) or the stream state is
+   inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+   expected one (incorrect Adler32 value). inflateSetDictionary does not
+   perform any decompression: this will be done by subsequent calls of
+   inflate().
+*/
+
+extern int EXPORT inflateSync OF((z_streamp strm));
+/* 
+    Skips invalid compressed data until the special marker (see deflate()
+  above) can be found, or until all available input is skipped. No output
+  is provided.
+
+    inflateSync returns Z_OK if the special marker has been found, Z_BUF_ERROR
+  if no more input was provided, Z_DATA_ERROR if no marker has been found,
+  or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
+  case, the application may save the current value of total_in which
+  indicates where valid compressed data was found. In the error case, the
+  application may repeatedly call inflateSync, providing more input each time,
+  until success or end of the input data.
+*/
+
+extern int EXPORT inflateReset OF((z_streamp strm));
+/*
+     This function is equivalent to inflateEnd followed by inflateInit,
+   but does not free and reallocate all the internal decompression state.
+   The stream will keep attributes that may have been set by inflateInit2.
+
+      inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+extern int inflateIncomp OF((z_stream *strm));
+/*
+     This function adds the data at next_in (avail_in bytes) to the output
+   history without performing any output.  There must be no pending output,
+   and the decompressor must be expecting to see the start of a block.
+   Calling this function is equivalent to decompressing a stored block
+   containing the data at next_in (except that the data is not output).
+*/
+
+                        /* utility functions */
+
+/*
+     The following utility functions are implemented on top of the
+   basic stream-oriented functions. To simplify the interface, some
+   default options are assumed (compression level, window size,
+   standard memory allocation functions). The source code of these
+   utility functions can easily be modified if you need special options.
+*/
+
+extern int EXPORT compress OF((Bytef *dest,   uLongf *destLen,
+			       const Bytef *source, uLong sourceLen));
+/*
+     Compresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer. Upon entry, destLen is the total
+   size of the destination buffer, which must be at least 0.1% larger than
+   sourceLen plus 12 bytes. Upon exit, destLen is the actual size of the
+   compressed buffer.
+     This function can be used to compress a whole file at once if the
+   input file is mmap'ed.
+     compress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer.
+*/
+
+extern int EXPORT uncompress OF((Bytef *dest,   uLongf *destLen,
+				 const Bytef *source, uLong sourceLen));
+/*
+     Decompresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer. Upon entry, destLen is the total
+   size of the destination buffer, which must be large enough to hold the
+   entire uncompressed data. (The size of the uncompressed data must have
+   been saved previously by the compressor and transmitted to the decompressor
+   by some mechanism outside the scope of this compression library.)
+   Upon exit, destLen is the actual size of the uncompressed data.
+     This function can be used to decompress a whole file at once if the
+   input file is mmap'ed.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer, or Z_DATA_ERROR if the input data was corrupted.
+*/
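
A minimal round trip through these two helpers, respecting the buffer-size rule
stated above (output at least 0.1% larger than the input plus 12 bytes), might look
like the following; roundtrip() is a hypothetical test function, not part of the
library.

    #include <stdlib.h>

    static int roundtrip(const Bytef *src, uLong srclen)
    {
        uLong  bound    = srclen + srclen / 1000 + 12;   /* 0.1% + 12 bytes */
        Bytef *packed   = (Bytef *)malloc(bound);
        Bytef *restored = (Bytef *)malloc(srclen);
        uLongf packedlen = bound, restoredlen = srclen;
        int    err = Z_MEM_ERROR;

        if (packed != NULL && restored != NULL) {
            err = compress(packed, &packedlen, src, srclen);
            if (err == Z_OK)
                err = uncompress(restored, &restoredlen, packed, packedlen);
        }
        free(packed);
        free(restored);
        return err;                     /* Z_OK on a successful round trip */
    }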
+
+
+typedef voidp gzFile;
+
+extern gzFile EXPORT gzopen  OF((const char *path, const char *mode));
+/*
+     Opens a gzip (.gz) file for reading or writing. The mode parameter
+   is as in fopen ("rb" or "wb") but can also include a compression level
+   ("wb9").  gzopen can be used to read a file which is not in gzip format;
+   in this case gzread will directly read from the file without decompression.
+     gzopen returns NULL if the file could not be opened or if there was
+   insufficient memory to allocate the (de)compression state; errno
+   can be checked to distinguish the two cases (if errno is zero, the
+   zlib error is Z_MEM_ERROR).
+*/
+
+extern gzFile EXPORT gzdopen  OF((int fd, const char *mode));
+/*
+     gzdopen() associates a gzFile with the file descriptor fd.  File
+   descriptors are obtained from calls like open, dup, creat, pipe or
+   fileno (if the file has been previously opened with fopen).
+   The mode parameter is as in gzopen.
+     The next call of gzclose on the returned gzFile will also close the
+   file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
+   descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+     gzdopen returns NULL if there was insufficient memory to allocate
+   the (de)compression state.
+*/
+
+extern int EXPORT    gzread  OF((gzFile file, voidp buf, unsigned len));
+/*
+     Reads the given number of uncompressed bytes from the compressed file.
+   If the input file was not in gzip format, gzread copies the given number
+   of bytes into the buffer.
+     gzread returns the number of uncompressed bytes actually read (0 for
+   end of file, -1 for error). */
+
+extern int EXPORT    gzwrite OF((gzFile file, const voidp buf, unsigned len));
+/*
+     Writes the given number of uncompressed bytes into the compressed file.
+   gzwrite returns the number of uncompressed bytes actually written
+   (0 in case of error).
+*/
+
+extern int EXPORT    gzflush OF((gzFile file, int flush));
+/*
+     Flushes all pending output into the compressed file. The parameter
+   flush is as in the deflate() function. The return value is the zlib
+   error number (see function gzerror below). gzflush returns Z_OK if
+   the flush parameter is Z_FINISH and all output could be flushed.
+     gzflush should be called only when strictly necessary because it can
+   degrade compression.
+*/
+
+extern int EXPORT    gzclose OF((gzFile file));
+/*
+     Flushes all pending output if necessary, closes the compressed file
+   and deallocates all the (de)compression state. The return value is the zlib
+   error number (see function gzerror below).
+*/
+
+extern const char * EXPORT gzerror OF((gzFile file, int *errnum));
+/*
+     Returns the error message for the last error which occurred on the
+   given compressed file. errnum is set to zlib error number. If an
+   error occurred in the filesystem and not in the compression library,
+   errnum is set to Z_ERRNO and the application may consult errno
+   to get the exact error code.
+*/
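
Tying the gz* calls together, the sketch below copies an ordinary file into a gzip
file at maximum compression.  gzip_file() and its error handling are illustrative
only and assume a hosted environment with <stdio.h> available.

    #include <stdio.h>

    static int gzip_file(const char *src_path, const char *dst_path)
    {
        char    buf[8192];
        size_t  n;
        int     err = Z_OK;
        FILE   *in  = fopen(src_path, "rb");
        gzFile  out = gzopen(dst_path, "wb9");   /* "wb9" = write, level 9 */

        if (in == NULL || out == NULL) {
            if (in != NULL)
                fclose(in);
            if (out != NULL)
                gzclose(out);
            return Z_ERRNO;
        }
        while ((n = fread(buf, 1, sizeof(buf), in)) > 0) {
            if (gzwrite(out, buf, (unsigned)n) == 0) {
                (void)gzerror(out, &err);        /* fetch the zlib error code */
                break;
            }
        }
        fclose(in);
        if (gzclose(out) != Z_OK && err == Z_OK)
            err = Z_ERRNO;
        return err;
    }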
+
+                        /* checksum functions */
+
+/*
+     These functions are not related to compression but are exported
+   anyway because they might be useful in applications using the
+   compression library.
+*/
+
+extern uLong EXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+
+/*
+     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+   return the updated checksum. If buf is NULL, this function returns
+   the required initial value for the checksum.
+   An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+   much faster. Usage example:
+
+     uLong adler = adler32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+#if 0
+extern uLong EXPORT crc32   OF((uLong crc, const Bytef *buf, uInt len));
+/*
+     Update a running crc with the bytes buf[0..len-1] and return the updated
+   crc. If buf is NULL, this function returns the required initial value
+   for the crc. Pre- and post-conditioning (one's complement) is performed
+   within this function so it shouldn't be done by the application.
+   Usage example:
+
+     uLong crc = crc32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+#endif
+
+
+                        /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+extern int EXPORT deflateInit_ OF((z_streamp strm, int level,
+			           const char *version, int stream_size));
+extern int EXPORT inflateInit_ OF((z_streamp strm,
+				   const char *version, int stream_size));
+extern int EXPORT deflateInit2_ OF((z_streamp strm, int  level, int  method,
+				    int windowBits, int memLevel, int strategy,
+				    const char *version, int stream_size));
+extern int EXPORT inflateInit2_ OF((z_streamp strm, int  windowBits,
+				    const char *version, int stream_size));
+#define deflateInit(strm, level) \
+        deflateInit_((strm), (level),       ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit(strm) \
+        inflateInit_((strm),                ZLIB_VERSION, sizeof(z_stream))
+#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+        deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+		      (strategy),           ZLIB_VERSION, sizeof(z_stream))
+#define inflateInit2(strm, windowBits) \
+        inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+
+#if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL)
+    struct internal_state {int dummy;}; /* hack for buggy compilers */
+#endif
+
+uLongf *get_crc_table OF((void)); /* can be used by asm versions of crc32() */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZLIB_H */
+/* --- zlib.h */


Property changes on: trunk/sys/sys/zlib.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/sys/zutil.h
===================================================================
--- trunk/sys/sys/zutil.h	                        (rev 0)
+++ trunk/sys/sys/zutil.h	2020-02-08 19:49:04 UTC (rev 12322)
@@ -0,0 +1,232 @@
+/* $MidnightBSD$ */
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-1996 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* From: zutil.h,v 1.16 1996/07/24 13:41:13 me Exp $ */
+/* $FreeBSD: stable/11/sys/sys/zutil.h 281855 2015-04-22 14:38:58Z rodrigc $ */
+
+#ifndef _Z_UTIL_H
+#define _Z_UTIL_H
+
+#define ZEXPORT
+
+#ifdef _KERNEL
+#include <sys/zlib.h>
+#else
+#include "zlib.h"
+#endif
+
+#ifdef _KERNEL
+/* Assume this is a *BSD or SVR4 kernel */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/module.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#  define HAVE_MEMCPY
+#  define memcpy(d, s, n)	bcopy((s), (d), (n))
+#  define memset(d, v, n)	bzero((d), (n))
+#  define memcmp		bcmp
+
+#else
+#if defined(__KERNEL__)
+/* Assume this is a Linux kernel */
+#include <linux/string.h>
+#define HAVE_MEMCPY
+
+#else /* not kernel */
+
+#if defined(MSDOS)||defined(VMS)||defined(CRAY)||defined(WIN32)||defined(RISCOS)
+#   include <stddef.h>
+#   include <errno.h>
+#else
+    extern int errno;
+#endif
+#ifdef STDC
+#  include <string.h>
+#  include <stdlib.h>
+#endif
+#endif /* __KERNEL__ */
+#endif /* _KERNEL */
+
+#ifndef local
+#  define local static
+#endif
+/* compile with -Dlocal if your debugger can't find static symbols */
+
+typedef unsigned char  uch;
+typedef uch FAR uchf;
+typedef unsigned short ush;
+typedef ush FAR ushf;
+typedef unsigned long  ulg;
+
+#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
+
+#define ERR_RETURN(strm,err) \
+  return (strm->msg = (const char*)ERR_MSG(err), (err))
+/* To be used only when the state is known to be valid */
+
+        /* common constants */
+
+#ifndef DEF_WBITS
+#  define DEF_WBITS MAX_WBITS
+#endif
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+/* default memLevel */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES    2
+/* The three kinds of block type */
+
+#define MIN_MATCH  3
+#define MAX_MATCH  258
+/* The minimum and maximum match lengths */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+        /* target dependencies */
+
+#ifdef MSDOS
+#  define OS_CODE  0x00
+#  ifdef __TURBOC__
+#    include <alloc.h>
+#  else /* MSC or DJGPP */
+#    include <malloc.h>
+#  endif
+#endif
+
+#ifdef OS2
+#  define OS_CODE  0x06
+#endif
+
+#ifdef WIN32 /* Windows 95 & Windows NT */
+#  define OS_CODE  0x0b
+#endif
+
+#if defined(VAXC) || defined(VMS)
+#  define OS_CODE  0x02
+#  define FOPEN(name, mode) \
+     fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512")
+#endif
+
+#ifdef AMIGA
+#  define OS_CODE  0x01
+#endif
+
+#if defined(ATARI) || defined(atarist)
+#  define OS_CODE  0x05
+#endif
+
+#ifdef MACOS
+#  define OS_CODE  0x07
+#endif
+
+#ifdef __50SERIES /* Prime/PRIMOS */
+#  define OS_CODE  0x0F
+#endif
+
+#ifdef TOPS20
+#  define OS_CODE  0x0a
+#endif
+
+#if defined(_BEOS_) || defined(RISCOS)
+#  define fdopen(fd,mode) NULL /* No fdopen() */
+#endif
+
+        /* Common defaults */
+
+#ifndef OS_CODE
+#  define OS_CODE  0x03  /* assume Unix */
+#endif
+
+#ifndef FOPEN
+#  define FOPEN(name, mode) fopen((name), (mode))
+#endif
+
+         /* functions */
+
+#ifdef HAVE_STRERROR
+   extern char *strerror OF((int));
+#  define zstrerror(errnum) strerror(errnum)
+#else
+#  define zstrerror(errnum) ""
+#endif
+
+#if defined(pyr)
+#  define NO_MEMCPY
+#endif
+#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(_MSC_VER)
+ /* Use our own functions for small and medium model with MSC <= 5.0.
+  * You may have to use the same strategy for Borland C (untested).
+  */
+#  define NO_MEMCPY
+#endif
+#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
+#  define HAVE_MEMCPY
+#endif
+#ifdef HAVE_MEMCPY
+#  ifdef SMALL_MEDIUM /* MSDOS small or medium model */
+#    define zmemcpy _fmemcpy
+#    define zmemcmp _fmemcmp
+#    define zmemzero(dest, len) _fmemset(dest, 0, len)
+#  else
+#    define zmemcpy memcpy
+#    define zmemcmp memcmp
+#    define zmemzero(dest, len) memset(dest, 0, len)
+#  endif
+#else
+   extern void zmemcpy  OF((Bytef* dest, Bytef* source, uInt len));
+   extern int  zmemcmp  OF((Bytef* s1,   Bytef* s2, uInt len));
+   extern void zmemzero OF((Bytef* dest, uInt len));
+#endif
+
+/* Diagnostic functions */
+#ifdef DEBUG_ZLIB
+#  include <stdio.h>
+#  ifndef verbose
+#    define verbose 0
+#  endif
+   extern void z_error    OF((char *m));
+#  define Assert(cond,msg) {if(!(cond)) z_error(msg);}
+#  define Trace(x) fprintf x
+#  define Tracev(x) {if (verbose) fprintf x ;}
+#  define Tracevv(x) {if (verbose>1) fprintf x ;}
+#  define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
+#  define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
+#else
+#  define Assert(cond,msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c,x)
+#  define Tracecv(c,x)
+#endif
+
+
+typedef uLong (*check_func) OF((uLong check, const Bytef *buf, uInt len));
+
+voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size));
+void   zcfree  OF((voidpf opaque, voidpf ptr));
+
+#define ZALLOC(strm, items, size) \
+           (*((strm)->zalloc))((strm)->opaque, (items), (size))
+#define ZFREE(strm, addr)  (*((strm)->zfree))((strm)->opaque, (voidpf)(addr))
+#define TRY_FREE(s, p) {if (p) ZFREE(s, p);}
+
+#endif /* _Z_UTIL_H */
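
The ZALLOC/ZFREE macros above simply route through the function pointers stored in
the z_stream, so ZALLOC(strm, 1, n) ends up in zcalloc() when the application left
zalloc as Z_NULL and the init function installed the defaults.  A plausible userland
definition of those defaults is sketched below; the real zutil.c may differ (for
instance in its MSDOS segment handling), so treat this only as an illustration.

    #include <stdlib.h>

    voidpf zcalloc(voidpf opaque, unsigned items, unsigned size)
    {
        (void)opaque;                        /* unused by the default allocator */
        return (voidpf)calloc(items, size);  /* zeroed memory, Z_NULL on failure */
    }

    void zcfree(voidpf opaque, voidpf ptr)
    {
        (void)opaque;
        free(ptr);
    }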


Property changes on: trunk/sys/sys/zutil.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

From laffer1 at midnightbsd.org  Sat Feb  8 14:49:57 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:49:57 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12323] trunk/sys/sys/watchdog.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081949.018Jnvrx064978@stargazer.midnightbsd.org>

Revision: 12323
          http://svnweb.midnightbsd.org/src/?rev=12323
Author:   laffer1
Date:     2020-02-08 14:49:56 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/watchdog.h

Modified: trunk/sys/sys/watchdog.h
===================================================================
--- trunk/sys/sys/watchdog.h	2020-02-08 19:49:04 UTC (rev 12322)
+++ trunk/sys/sys/watchdog.h	2020-02-08 19:49:56 UTC (rev 12323)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/watchdog.h 247405 2013-02-27 19:03:31Z alfred $
+ * $FreeBSD: stable/11/sys/sys/watchdog.h 331722 2018-03-29 02:50:57Z eadler $
  */
 #ifndef _SYS_WATCHDOG_H
 #define	_SYS_WATCHDOG_H
@@ -111,6 +111,14 @@
 
 u_int	wdog_kern_last_timeout(void);
 int	wdog_kern_pat(u_int utim);
+
+/*
+ * The following function pointer is used to attach a software watchdog
+ * if no hardware watchdog has been attached, and if the software module
+ * has initialized the function pointer.
+ */
+
+extern void (*wdog_software_attach)(void);
 #endif
 
 #endif /* _SYS_WATCHDOG_H */
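
The new wdog_software_attach hook is just a function pointer, so a software watchdog
module would be expected to fill it in at load time, and the framework would call it
when no hardware watchdog claims the event.  The sketch below shows only that shape;
sw_wdog_attach and the SYSINIT glue around it are hypothetical, not taken from the
FreeBSD sources.

    static void
    sw_wdog_attach(void)
    {
        /* arm a callout- or timer-based software watchdog here (hypothetical) */
    }

    static void
    sw_wdog_init(void *arg __unused)
    {
        if (wdog_software_attach == NULL)
            wdog_software_attach = sw_wdog_attach;
    }
    SYSINIT(sw_wdog, SI_SUB_DRIVERS, SI_ORDER_ANY, sw_wdog_init, NULL);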


From laffer1 at midnightbsd.org  Sat Feb  8 14:51:00 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:51:00 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12324] trunk/sys/sys/wait.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081951.018Jp0qN065697@stargazer.midnightbsd.org>

Revision: 12324
          http://svnweb.midnightbsd.org/src/?rev=12324
Author:   laffer1
Date:     2020-02-08 14:51:00 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/wait.h

Modified: trunk/sys/sys/wait.h
===================================================================
--- trunk/sys/sys/wait.h	2020-02-08 19:49:56 UTC (rev 12323)
+++ trunk/sys/sys/wait.h	2020-02-08 19:51:00 UTC (rev 12324)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)wait.h	8.2 (Berkeley) 7/10/94
- * $FreeBSD: stable/10/sys/sys/wait.h 254218 2013-08-11 14:15:01Z jilles $
+ * $FreeBSD: stable/11/sys/sys/wait.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_WAIT_H_
@@ -139,7 +139,19 @@
 #define	WAIT_MYPGRP	0	/* any process in my process group */
 #endif /* __BSD_VISIBLE */
 
+#if defined(_KERNEL) || defined(_WANT_KW_EXITCODE)
+
+/*
+ * Clamp the return code to the low 8 bits of the full 32-bit value.
+ * Should be used in kernel to construct the wait(2)-compatible process
+ * status to usermode.
+ */
+#define	KW_EXITCODE(ret, sig)	W_EXITCODE((ret) & 0xff, (sig))
+
+#endif	/* _KERNEL || _WANT_KW_EXITCODE */
+
 #ifndef _KERNEL
+
 #include <sys/types.h>
 
 __BEGIN_DECLS
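
To make the clamping concrete: assuming the usual FreeBSD definition
W_EXITCODE(ret, sig) == ((ret) << 8 | (sig)), a wide kernel-side return value is
reduced to the 8 bits that wait(2) can actually report.  The snippet is a worked
example only.

    #define _WANT_KW_EXITCODE
    #include <sys/wait.h>

    static int
    make_status(void)
    {
        int status = KW_EXITCODE(0x1234, 0);    /* same as W_EXITCODE(0x34, 0) */

        return (WEXITSTATUS(status));           /* 0x34; high bits discarded */
    }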


From laffer1 at midnightbsd.org  Sat Feb  8 14:52:44 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:52:44 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12325] trunk/sys/sys/vnode.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081952.018JqijP065804@stargazer.midnightbsd.org>

Revision: 12325
          http://svnweb.midnightbsd.org/src/?rev=12325
Author:   laffer1
Date:     2020-02-08 14:52:43 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/vnode.h

Modified: trunk/sys/sys/vnode.h
===================================================================
--- trunk/sys/sys/vnode.h	2020-02-08 19:51:00 UTC (rev 12324)
+++ trunk/sys/sys/vnode.h	2020-02-08 19:52:43 UTC (rev 12325)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
- * $FreeBSD: stable/10/sys/sys/vnode.h 301100 2016-06-01 04:07:33Z kib $
+ * $FreeBSD: stable/11/sys/sys/vnode.h 355443 2019-12-06 11:48:22Z kib $
  */
 
 #ifndef _SYS_VNODE_H_
@@ -78,6 +78,7 @@
  *	c - namecache mutex
  *	f - freelist mutex
  *	i - interlock
+ *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
@@ -163,8 +164,8 @@
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
-	int	v_holdcnt;			/* i prevents recycling. */
-	int	v_usecount;			/* i ref count of users */
+	u_int	v_holdcnt;			/* I prevents recycling. */
+	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
@@ -234,7 +235,6 @@
  *	are required for writing but the status may be checked with either.
  */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
-#define	VI_AGE		0x0040	/* Insert vnode at head of free list */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
@@ -254,6 +254,7 @@
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
+#define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
@@ -303,6 +304,7 @@
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
+#define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
@@ -338,6 +340,8 @@
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
+#define	VCREAT			000400000000 /* creating new file */
+#define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
@@ -372,6 +376,8 @@
 MALLOC_DECLARE(M_VNODE);
 #endif
 
+extern u_int ncsizefactor;
+
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
@@ -393,6 +399,8 @@
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
+#define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
+#define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
@@ -420,7 +428,6 @@
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
-extern	int async_io_version;		/* 0 or POSIX version of AIO i'face */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
@@ -508,7 +515,9 @@
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
- * filesystem using a single-threaded test.
+ * filesystem using a single-threaded test.  Note that the unreliability is
+ * limited to false negatives; efforts were made to ensure that false
+ * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
@@ -576,6 +585,7 @@
 /*
  * Finally, include the default set of vnode operations.
  */
+typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
@@ -582,6 +592,7 @@
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
+#define	VN_OPEN_INVFS		0x00000008
 
 /*
  * Public vnode manipulation functions.
@@ -598,10 +609,13 @@
 struct ucred;
 struct uio;
 struct vattr;
+struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
+int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
+	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(int newhashsize);
 #define	cache_enter(dvp, vp, cnp)					\
@@ -613,9 +627,8 @@
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
-void	cache_purgevfs(struct mount *mp);
+void	cache_purgevfs(struct mount *mp, bool force);
 int	change_dir(struct vnode *vp, struct thread *td);
-int	change_root(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
@@ -651,20 +664,20 @@
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
-void	vdrop(struct vnode *);
-void	vdropl(struct vnode *);
+#define	vdrop(vp)	_vdrop((vp), 0)
+#define	vdropl(vp)	_vdrop((vp), 1)
+void	_vdrop(struct vnode *, bool);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
-void	vhold(struct vnode *);
-void	vholdl(struct vnode *);
+#define	vhold(vp)	_vhold((vp), 0)
+#define	vholdl(vp)	_vhold((vp), 1)
+void	_vhold(struct vnode *, bool);
 void	vinactive(struct vnode *, struct thread *);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
-int	vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length,
-	    int blksize);
+int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
-#define vprint(label, vp) vn_printf((vp), "%s\n", (label))
 int	vrecycle(struct vnode *vp);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
@@ -691,7 +704,7 @@
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
-	    const struct thread *td);
+	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
@@ -730,7 +743,9 @@
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
+void	vnlru_free(int, struct vfsops *);
 int	vop_stdbmap(struct vop_bmap_args *);
+int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
@@ -771,8 +786,6 @@
 void	vop_create_post(void *a, int rc);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
-void	vop_lock_pre(void *a);
-void	vop_lock_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
@@ -787,10 +800,21 @@
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setextattr_post(void *a, int rc);
+void	vop_symlink_post(void *a, int rc);
+
+#ifdef DEBUG_VFS_LOCKS
 void	vop_strategy_pre(void *a);
-void	vop_symlink_post(void *a, int rc);
+void	vop_lock_pre(void *a);
+void	vop_lock_post(void *a, int rc);
 void	vop_unlock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
+#else
+#define	vop_strategy_pre(x)	do { } while (0)
+#define	vop_lock_pre(x)		do { } while (0)
+#define	vop_lock_post(x, y)	do { } while (0)
+#define	vop_unlock_post(x, y)	do { } while (0)
+#define	vop_unlock_pre(x)	do { } while (0)
+#endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
@@ -821,6 +845,8 @@
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
+void	vrefl(struct vnode *vp);
+void	vrefact(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 

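For orientation: the hunk above folds vhold()/vholdl() and vdrop()/vdropl() into single _vhold()/_vdrop() implementations that take a bool saying whether the vnode interlock is already held. A minimal, hypothetical sketch of the unlocked calling pattern (the helper name is illustrative, not part of the tree):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>

/* Hypothetical helper: keep a vnode from being recycled while it is examined. */
static void
example_inspect_vnode(struct vnode *vp)
{

	vhold(vp);		/* expands to _vhold(vp, 0); takes the interlock itself */
	/*
	 * vp cannot be freed here.  vholdl()/vdropl() are the variants for
	 * callers that already hold the vnode interlock.
	 */
	vdrop(vp);		/* expands to _vdrop(vp, 0) */
}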

From laffer1 at midnightbsd.org  Sat Feb  8 14:53:26 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:53:26 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12326] trunk/sys/sys/vmmeter.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081953.018JrQVv065866@stargazer.midnightbsd.org>

Revision: 12326
          http://svnweb.midnightbsd.org/src/?rev=12326
Author:   laffer1
Date:     2020-02-08 14:53:25 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/vmmeter.h

Modified: trunk/sys/sys/vmmeter.h
===================================================================
--- trunk/sys/sys/vmmeter.h	2020-02-08 19:52:43 UTC (rev 12325)
+++ trunk/sys/sys/vmmeter.h	2020-02-08 19:53:25 UTC (rev 12326)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vmmeter.h	8.2 (Berkeley) 7/10/94
- * $FreeBSD: stable/10/sys/sys/vmmeter.h 330047 2018-02-27 01:28:19Z jhb $
+ * $FreeBSD: stable/11/sys/sys/vmmeter.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_VMMETER_H_
@@ -76,9 +76,10 @@
 	u_int v_vnodepgsin;	/* (p) vnode_pager pages paged in */
 	u_int v_vnodepgsout;	/* (p) vnode pager pages paged out */
 	u_int v_intrans;	/* (p) intransit blocking page faults */
-	u_int v_reactivated;	/* (f) pages reactivated from free list */
-	u_int v_pdwakeups;	/* (f) times daemon has awaken from sleep */
+	u_int v_reactivated;	/* (p) pages reactivated by the pagedaemon */
+	u_int v_pdwakeups;	/* (p) times daemon has awakened from sleep */
 	u_int v_pdpages;	/* (p) pages analyzed by daemon */
+	u_int v_pdshortfalls;	/* (p) page reclamation shortfalls */
 
 	u_int v_tcached;	/* (p) total pages cached */
 	u_int v_dfree;		/* (p) pages freed by daemon */
@@ -97,9 +98,8 @@
 	u_int v_active_count;	/* (q) pages active */
 	u_int v_inactive_target; /* (c) pages desired inactive */
 	u_int v_inactive_count;	/* (q) pages inactive */
+	u_int v_laundry_count;	/* (q) pages eligible for laundering */
 	u_int v_cache_count;	/* (f) pages on cache queue */
-	u_int v_cache_min;	/* (c) min pages desired on cache queue */
-	u_int v_cache_max;	/* (c) max pages in cached obj (unused) */
 	u_int v_pageout_free_min;   /* (c) min pages reserved for kernel */
 	u_int v_interrupt_free_min; /* (c) reserved pages for int code */
 	u_int v_free_severe;	/* (c) severe page depletion point */
@@ -117,9 +117,9 @@
 };
 #ifdef _KERNEL
 
-extern struct vmmeter cnt;
+extern struct vmmeter vm_cnt;
 
-extern int vm_pageout_wakeup_thresh;
+extern u_int vm_pageout_wakeup_thresh;
 
 /*
  * Return TRUE if we are under our severe low-free-pages threshold
@@ -127,12 +127,11 @@
  * This routine is typically used at the user<->system interface to determine
  * whether we need to block in order to avoid a low memory deadlock.
  */
-
-static __inline 
-int
+static inline int
 vm_page_count_severe(void)
 {
-    return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
+
+	return (vm_cnt.v_free_severe > vm_cnt.v_free_count);
 }
 
 /*
@@ -142,14 +141,13 @@
  * we can execute potentially very expensive code in terms of memory.  It
  * is also used by the pageout daemon to calculate when to sleep, when
  * to wake waiters up, and when (after making a pass) to become more
- * desparate.
+ * desperate.
  */
-
-static __inline 
-int
+static inline int
 vm_page_count_min(void)
 {
-    return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
+
+	return (vm_cnt.v_free_min > vm_cnt.v_free_count);
 }
 
 /*
@@ -156,12 +154,11 @@
  * Return TRUE if we have not reached our free page target during
  * free page recovery operations.
  */
-
-static __inline 
-int
+static inline int
 vm_page_count_target(void)
 {
-    return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
+
+	return (vm_cnt.v_free_target > vm_cnt.v_free_count);
 }
 
 /*
@@ -168,26 +165,42 @@
  * Return the number of pages we need to free-up or cache
  * A positive number indicates that we do not have enough free pages.
  */
-
-static __inline 
-int
+static inline int
 vm_paging_target(void)
 {
-    return (cnt.v_free_target - (cnt.v_free_count + cnt.v_cache_count));
+
+	return (vm_cnt.v_free_target - vm_cnt.v_free_count);
 }
 
 /*
  * Returns TRUE if the pagedaemon needs to be woken up.
  */
+static inline int
+vm_paging_needed(u_int free_count)
+{
 
-static __inline 
-int
-vm_paging_needed(void)
+	return (free_count < vm_pageout_wakeup_thresh);
+}
+
+/*
+ * Return the number of pages we need to launder.
+ * A positive number indicates that we have a shortfall of clean pages.
+ */
+static inline int
+vm_laundry_target(void)
 {
-    return (cnt.v_free_count + cnt.v_cache_count <
-        (u_int)vm_pageout_wakeup_thresh);
+
+	return (vm_paging_target());
 }
 
+/*
+ * Obtain the value of a per-CPU counter.
+ */
+#define	VM_METER_PCPU_CNT(member)					\
+	vm_meter_cnt(__offsetof(struct vmmeter, member))
+
+u_int	vm_meter_cnt(size_t);
+
 #endif
 
 struct vmtotal {

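With the cache queue removed, the threshold helpers above compare only vm_cnt.v_free_count, and vm_paging_needed() now takes the current free-page count as an argument. A hedged sketch of a kernel caller (the function name is illustrative):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

/* Hypothetical check: should a memory consumer throttle itself? */
static int
example_should_throttle(void)
{

	if (vm_page_count_severe())
		return (1);	/* free pages below v_free_severe */
	if (vm_paging_needed(vm_cnt.v_free_count))
		return (1);	/* below the pagedaemon wakeup threshold */
	return (0);
}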

From laffer1 at midnightbsd.org  Sat Feb  8 14:54:27 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:54:27 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12327] trunk/sys/sys/vdso.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081954.018JsR7s065965@stargazer.midnightbsd.org>

Revision: 12327
          http://svnweb.midnightbsd.org/src/?rev=12327
Author:   laffer1
Date:     2020-02-08 14:54:27 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/vdso.h

Modified: trunk/sys/sys/vdso.h
===================================================================
--- trunk/sys/sys/vdso.h	2020-02-08 19:53:25 UTC (rev 12326)
+++ trunk/sys/sys/vdso.h	2020-02-08 19:54:27 UTC (rev 12327)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
  * All rights reserved.
@@ -22,7 +23,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: stable/11/sys/sys/vdso.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_VDSO_H
@@ -53,6 +54,9 @@
 #define	VDSO_TK_VER_1		0x1
 #define	VDSO_TK_VER_CURR	VDSO_TK_VER_1
 #define	VDSO_TH_ALGO_1		0x1
+#define	VDSO_TH_ALGO_2		0x2
+#define	VDSO_TH_ALGO_3		0x3
+#define	VDSO_TH_ALGO_4		0x4
 
 #ifndef _KERNEL
 
@@ -62,7 +66,7 @@
 
 int __vdso_clock_gettime(clockid_t clock_id, struct timespec *ts);
 int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
-u_int __vdso_gettc(const struct vdso_timehands *vdso_th);
+int __vdso_gettc(const struct vdso_timehands *vdso_th, u_int *tc);
 int __vdso_gettimekeep(struct vdso_timekeep **tk);
 
 #endif
@@ -69,6 +73,14 @@
 
 #ifdef _KERNEL
 
+struct timecounter;
+
+struct vdso_sv_tk {
+	int		sv_timekeep_off;
+	int		sv_timekeep_curr;
+	uint32_t	sv_timekeep_gen;
+};
+
 void timekeep_push_vdso(void);
 
 uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th);
@@ -81,8 +93,11 @@
  * global sysctl enable override is handled by machine-independent code
  * after the cpu_fill_vdso_timehands() call is made.
  */
-uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th);
+uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc);
 
+struct vdso_sv_tk *alloc_sv_tk(void);
+
 #define	VDSO_TH_NUM	4
 
 #ifdef COMPAT_FREEBSD32
@@ -110,7 +125,9 @@
 };
 
 uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32);
-uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32);
+uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
+    struct timecounter *tc);
+struct vdso_sv_tk *alloc_sv_tk_compat32(void);
 
 #endif
 #endif

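The hunk above changes __vdso_gettc() to return an error and hand the counter value back through a pointer, so userland time functions can fall back to the system call when the timecounter cannot be read from user mode. A rough sketch of the new calling convention, loosely modeled on the libc fast path (field use and error handling are a best-effort reconstruction, not the authoritative implementation):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/vdso.h>
#include <errno.h>
#include <stdint.h>

static int
example_binuptime(struct bintime *bt, struct vdso_timekeep *tk)
{
	struct vdso_timehands *th;
	uint32_t gen;
	u_int tc;

	th = &tk->tk_th[tk->tk_current];
	gen = th->th_gen;
	if (__vdso_gettc(th, &tc) != 0)
		return (ENOSYS);	/* caller falls back to the syscall */
	*bt = th->th_offset;
	bintime_addx(bt, th->th_scale *
	    ((tc - th->th_offset_count) & th->th_counter_mask));
	if (gen == 0 || gen != th->th_gen)
		return (EAGAIN);	/* timehands were updated; caller retries */
	return (0);
}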

From laffer1 at midnightbsd.org  Sat Feb  8 14:55:14 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:55:14 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12328] trunk/sys/sys/user.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081955.018JtETg066674@stargazer.midnightbsd.org>

Revision: 12328
          http://svnweb.midnightbsd.org/src/?rev=12328
Author:   laffer1
Date:     2020-02-08 14:55:14 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/user.h

Modified: trunk/sys/sys/user.h
===================================================================
--- trunk/sys/sys/user.h	2020-02-08 19:54:27 UTC (rev 12327)
+++ trunk/sys/sys/user.h	2020-02-08 19:55:14 UTC (rev 12328)
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)user.h	8.2 (Berkeley) 9/23/93
- * $FreeBSD: stable/10/sys/sys/user.h 310121 2016-12-15 16:52:17Z vangyzen $
+ * $FreeBSD: stable/11/sys/sys/user.h 341778 2018-12-10 01:38:48Z kib $
  */
 
 #ifndef _SYS_USER_H_
@@ -85,7 +85,7 @@
  * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and
  * function kvm_proclist in lib/libkvm/kvm_proc.c .
  */
-#define	KI_NSPARE_INT	7
+#define	KI_NSPARE_INT	4
 #define	KI_NSPARE_LONG	12
 #define	KI_NSPARE_PTR	6
 
@@ -172,8 +172,8 @@
 	signed char ki_nice;		/* Process "nice" value */
 	char	ki_lock;		/* Process lock (prevent swap) count */
 	char	ki_rqindex;		/* Run queue index */
-	u_char	ki_oncpu;		/* Which cpu we are on */
-	u_char	ki_lastcpu;		/* Last cpu we were on */
+	u_char	ki_oncpu_old;		/* Which cpu we are on (legacy) */
+	u_char	ki_lastcpu_old;		/* Last cpu we were on (legacy) */
 	char	ki_tdname[TDNAMLEN+1];	/* thread name */
 	char	ki_wmesg[WMESGLEN+1];	/* wchan message */
 	char	ki_login[LOGNAMELEN+1];	/* setlogin name */
@@ -189,6 +189,9 @@
 	 */
 	char	ki_sparestrings[46];	/* spare string space */
 	int	ki_spareints[KI_NSPARE_INT];	/* spare room for growth */
+	int	ki_oncpu;		/* Which cpu we are on */
+	int	ki_lastcpu;		/* Last cpu we were on */
+	int	ki_tracer;		/* Pid of tracing process */
 	int	ki_flag2;		/* P2_* flags */
 	int	ki_fibnum;		/* Default FIB number */
 	u_int	ki_cr_flags;		/* Credential flags */
@@ -257,6 +260,7 @@
 #define	KF_TYPE_SEM	9
 #define	KF_TYPE_PTS	10
 #define	KF_TYPE_PROCDESC	11
+#define	KF_TYPE_DEV	12
 #define	KF_TYPE_UNKNOWN	255
 
 #define	KF_VTYPE_VNON	0
@@ -273,7 +277,7 @@
 #define	KF_FD_TYPE_CWD	-1	/* Current working directory */
 #define	KF_FD_TYPE_ROOT	-2	/* Root directory */
 #define	KF_FD_TYPE_JAIL	-3	/* Jail directory */
-#define	KF_FD_TYPE_TRACE	-4	/* ptrace vnode */
+#define	KF_FD_TYPE_TRACE	-4	/* Ktrace vnode */
 #define	KF_FD_TYPE_TEXT	-5	/* Text vnode */
 #define	KF_FD_TYPE_CTTY	-6	/* Controlling terminal */
 
@@ -556,6 +560,7 @@
 
 int	kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
 	int flags);
+int	kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen);
 int	kern_proc_out(struct proc *p, struct sbuf *sb, int flags);
 int	kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
 	int flags);

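For reference, the widened ki_oncpu/ki_lastcpu fields and the new ki_tracer are carved out of previously spare space, so existing kinfo_proc consumers keep working. A hypothetical userland sketch reading them via the kern.proc.pid sysctl:

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <stdio.h>

/* Hypothetical: print scheduling info for a pid using the new int fields. */
static int
example_print_cpu(pid_t pid)
{
	struct kinfo_proc kp;
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, (int)pid };
	size_t len = sizeof(kp);

	if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1)
		return (-1);
	printf("oncpu=%d lastcpu=%d tracer=%d\n",
	    kp.ki_oncpu, kp.ki_lastcpu, kp.ki_tracer);
	return (0);
}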

From laffer1 at midnightbsd.org  Sat Feb  8 14:56:27 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:56:27 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12329] trunk/sys/sys/umtx.h: sync with
 FreeBSD 11-stable
Message-ID: <202002081956.018JuR8i066770@stargazer.midnightbsd.org>

Revision: 12329
          http://svnweb.midnightbsd.org/src/?rev=12329
Author:   laffer1
Date:     2020-02-08 14:56:26 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/umtx.h

Modified: trunk/sys/sys/umtx.h
===================================================================
--- trunk/sys/sys/umtx.h	2020-02-08 19:55:14 UTC (rev 12328)
+++ trunk/sys/sys/umtx.h	2020-02-08 19:56:26 UTC (rev 12329)
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/umtx.h 233912 2012-04-05 02:24:08Z davidxu $
+ * $FreeBSD: stable/11/sys/sys/umtx.h 331722 2018-03-29 02:50:57Z eadler $
  *
  */
 
@@ -32,20 +32,28 @@
 #define	_SYS_UMTX_H_
 
 #include <sys/_umtx.h>
-#include <sys/limits.h>
 
-#define	UMTX_UNOWNED		0x0
-#define	UMTX_CONTESTED		LONG_MIN
-
+/* Common lock flags */
 #define USYNC_PROCESS_SHARED	0x0001	/* Process shared sync objs */
 
+/* umutex flags */
+#define	UMUTEX_PRIO_INHERIT	0x0004	/* Priority inherited mutex */
+#define	UMUTEX_PRIO_PROTECT	0x0008	/* Priority protect mutex */
+#define	UMUTEX_ROBUST		0x0010	/* Robust mutex */
+#define	UMUTEX_NONCONSISTENT	0x0020	/* Robust locked but not consistent */
+
+/*
+ * The umutex.m_lock values and bits.  The m_owner is the word which
+ * serves as the lock.  Its high bit is the contention indicator and
+ * rest of bits records the owner TID.  TIDs values start with PID_MAX
+ * + 2 and end by INT32_MAX.  The low range [1..PID_MAX] is guaranteed
+ * to be useable as the special markers.
+ */
 #define	UMUTEX_UNOWNED		0x0
 #define	UMUTEX_CONTESTED	0x80000000U
+#define	UMUTEX_RB_OWNERDEAD	(UMUTEX_CONTESTED | 0x10)
+#define	UMUTEX_RB_NOTRECOV	(UMUTEX_CONTESTED | 0x11)
 
-#define	UMUTEX_ERROR_CHECK	0x0002	/* Error-checking mutex */
-#define	UMUTEX_PRIO_INHERIT	0x0004	/* Priority inherited mutex */
-#define	UMUTEX_PRIO_PROTECT	0x0008	/* Priority protect mutex */
-
 /* urwlock flags */
 #define URWLOCK_PREFER_READER	0x0002
 
@@ -58,9 +66,14 @@
 /* _usem flags */
 #define SEM_NAMED	0x0002
 
+/* _usem2 count field */
+#define	USEM_HAS_WAITERS	0x80000000U
+#define	USEM_MAX_COUNT		0x7fffffffU
+#define	USEM_COUNT(c)		((c) & USEM_MAX_COUNT)
+
 /* op code for _umtx_op */
-#define	UMTX_OP_LOCK		0
-#define	UMTX_OP_UNLOCK		1
+#define	UMTX_OP_RESERVED0	0
+#define	UMTX_OP_RESERVED1	1
 #define	UMTX_OP_WAIT		2
 #define	UMTX_OP_WAKE		3
 #define	UMTX_OP_MUTEX_TRYLOCK	4
@@ -78,11 +91,14 @@
 #define	UMTX_OP_WAKE_PRIVATE	16
 #define	UMTX_OP_MUTEX_WAIT	17
 #define	UMTX_OP_MUTEX_WAKE	18	/* deprecated */
-#define	UMTX_OP_SEM_WAIT	19
-#define	UMTX_OP_SEM_WAKE	20
+#define	UMTX_OP_SEM_WAIT	19	/* deprecated */
+#define	UMTX_OP_SEM_WAKE	20	/* deprecated */
 #define	UMTX_OP_NWAKE_PRIVATE   21
 #define	UMTX_OP_MUTEX_WAKE2	22
-#define	UMTX_OP_MAX		23
+#define	UMTX_OP_SEM2_WAIT	23
+#define	UMTX_OP_SEM2_WAKE	24
+#define	UMTX_OP_SHM		25
+#define	UMTX_OP_ROBUST_LISTS	26
 
 /* Flags for UMTX_OP_CV_WAIT */
 #define	CVWAIT_CHECK_UNPARKING	0x01
@@ -93,86 +109,26 @@
 
 #define	UMTX_CHECK_UNPARKING	CVWAIT_CHECK_UNPARKING
 
-#ifndef _KERNEL
+/* Flags for UMTX_OP_SHM */
+#define	UMTX_SHM_CREAT		0x0001
+#define	UMTX_SHM_LOOKUP		0x0002
+#define	UMTX_SHM_DESTROY	0x0004
+#define	UMTX_SHM_ALIVE		0x0008
 
-int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2);
+struct umtx_robust_lists_params {
+	uintptr_t	robust_list_offset;
+	uintptr_t	robust_priv_list_offset;
+	uintptr_t	robust_inact_offset;
+};
 
-/*
- * Old (deprecated) userland mutex system calls.
- */
-int _umtx_lock(struct umtx *mtx);
-int _umtx_unlock(struct umtx *mtx);
+#ifndef _KERNEL
 
-/*
- * Standard api.  Try uncontested acquire/release and asks the
- * kernel to resolve failures.
- */
-static __inline void
-umtx_init(struct umtx *umtx)
-{
-	umtx->u_owner = UMTX_UNOWNED;
-}
+__BEGIN_DECLS
 
-static __inline u_long
-umtx_owner(struct umtx *umtx)
-{
-	return (umtx->u_owner & ~LONG_MIN);
-}
+int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2);
 
-static __inline int
-umtx_lock(struct umtx *umtx, u_long id)
-{
-	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0)
-		if (_umtx_lock(umtx) == -1)
-			return (errno);
-	return (0);
-}
+__END_DECLS
 
-static __inline int
-umtx_trylock(struct umtx *umtx, u_long id)
-{
-	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0)
-		return (EBUSY);
-	return (0);
-}
-
-static __inline int
-umtx_timedlock(struct umtx *umtx, u_long id, const struct timespec *timeout)
-{
-	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id) == 0)
-		if (_umtx_op(umtx, UMTX_OP_LOCK, id, 0,
-		    __DECONST(void *, timeout)) == -1)
-			return (errno);
-	return (0);
-}
-
-static __inline int
-umtx_unlock(struct umtx *umtx, u_long id)
-{
-	if (atomic_cmpset_rel_long(&umtx->u_owner, id, UMTX_UNOWNED) == 0)
-		if (_umtx_unlock(umtx) == -1)
-			return (errno);
-	return (0);
-}
-
-static __inline int
-umtx_wait(u_long *p, long val, const struct timespec *timeout)
-{
-	if (_umtx_op(p, UMTX_OP_WAIT, val, 0,
-	    __DECONST(void *, timeout)) == -1)
-		return (errno);
-	return (0);
-}
-
-/* Wake threads waiting on a user address. */
-static __inline int
-umtx_wake(u_long *p, int nr_wakeup)
-{
-	if (_umtx_op(p, UMTX_OP_WAKE, nr_wakeup, 0, 0) == -1)
-		return (errno);
-	return (0);
-}
-
 #else
 
 /*
@@ -189,7 +145,10 @@
 	TYPE_PI_UMUTEX,
 	TYPE_PP_UMUTEX,
 	TYPE_RWLOCK,
-	TYPE_FUTEX
+	TYPE_FUTEX,
+	TYPE_SHM,
+	TYPE_PI_ROBUST_UMUTEX,
+	TYPE_PP_ROBUST_UMUTEX,
 };
 
 /* Key to represent a unique userland synchronous object */
@@ -228,7 +187,7 @@
 }
 
 int umtx_copyin_timeout(const void *, struct timespec *);
-int umtx_key_get(void *, int, int, struct umtx_key *);
+int umtx_key_get(const void *, int, int, struct umtx_key *);
 void umtx_key_release(struct umtx_key *);
 struct umtx_q *umtxq_alloc(void);
 void umtxq_free(struct umtx_q *);

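The m_owner encoding described in the hunk above is what makes an uncontested userland fast path possible: compare-and-swap the thread id into an unowned lock word and only enter the kernel on contention. A hypothetical sketch (thr_self(2) and the user-visible atomic(9) shims are assumed; this is not the libthr implementation):

#include <sys/types.h>
#include <sys/umtx.h>
#include <sys/thr.h>
#include <machine/atomic.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>

static int
example_umutex_trylock(struct umutex *mtx)
{
	long tid;

	thr_self(&tid);
	if (atomic_cmpset_acq_32((volatile uint32_t *)&mtx->m_owner,
	    UMUTEX_UNOWNED, (uint32_t)tid))
		return (0);		/* uncontested: no system call needed */
	if (_umtx_op(mtx, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL) == -1)
		return (errno);		/* e.g. EBUSY when owned elsewhere */
	return (0);
}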

From laffer1 at midnightbsd.org  Sat Feb  8 14:57:07 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 14:57:07 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12330] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002081957.018Jv70S066845@stargazer.midnightbsd.org>

Revision: 12330
          http://svnweb.midnightbsd.org/src/?rev=12330
Author:   laffer1
Date:     2020-02-08 14:57:06 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/unistd.h
    trunk/sys/sys/unpcb.h

Modified: trunk/sys/sys/unistd.h
===================================================================
--- trunk/sys/sys/unistd.h	2020-02-08 19:56:26 UTC (rev 12329)
+++ trunk/sys/sys/unistd.h	2020-02-08 19:57:06 UTC (rev 12330)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)unistd.h	8.2 (Berkeley) 1/7/94
- * $FreeBSD: stable/10/sys/sys/unistd.h 312171 2017-01-14 11:27:11Z kib $
+ * $FreeBSD: stable/11/sys/sys/unistd.h 353789 2019-10-21 01:24:21Z kevans $
  */
 
 #ifndef _SYS_UNISTD_H_
@@ -51,7 +51,7 @@
  * returns -1, the functions may be stubbed out.
  */
 #define	_POSIX_ADVISORY_INFO		200112L
-#define	_POSIX_ASYNCHRONOUS_IO		0
+#define	_POSIX_ASYNCHRONOUS_IO		200112L
 #define	_POSIX_CHOWN_RESTRICTED		1
 #define	_POSIX_CLOCK_SELECTION		(-1)
 #define	_POSIX_CPUTIME			200112L
@@ -187,11 +187,14 @@
 #define	RFTSIGNUM(flags)	(((flags) >> RFTSIGSHIFT) & RFTSIGMASK)
 #define	RFTSIGFLAGS(signum)	((signum) << RFTSIGSHIFT)
 #define	RFPROCDESC	(1<<28)	/* return a process descriptor */
-#define	RFPPWAIT	(1<<31)	/* parent sleeps until child exits (vfork) */
+/* kernel: parent sleeps until child exits (vfork) */
+#define	RFPPWAIT	(1<<31)
+/* user: vfork(2) semantics, clear signals */
+#define	RFSPAWN		(1U<<31)
 #define	RFFLAGS		(RFFDG | RFPROC | RFMEM | RFNOWAIT | RFCFDG | \
     RFTHREAD | RFSIGSHARE | RFLINUXTHPN | RFSTOPPED | RFHIGHPID | RFTSIGZMB | \
-    RFPROCDESC | RFPPWAIT)
-#define	RFKERNELONLY	(RFSTOPPED | RFHIGHPID | RFPPWAIT | RFPROCDESC)
+    RFPROCDESC | RFSPAWN | RFPPWAIT)
+#define	RFKERNELONLY	(RFSTOPPED | RFHIGHPID | RFPROCDESC)
 
 #endif /* __BSD_VISIBLE */
 

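The unistd.h hunk above separates RFPPWAIT (the kernel-internal vfork() wait semantics) from the new user-visible RFSPAWN bit, which requests vfork()-style semantics with signal state cleared and is aimed at posix_spawn(). The long-standing flags are unchanged; a small hypothetical sketch of plain rfork(2) use:

#include <sys/types.h>
#include <unistd.h>

/* Hypothetical: create a child with its own copy of the descriptor table. */
static pid_t
example_spawn_child(void)
{
	pid_t pid;

	pid = rfork(RFPROC | RFFDG);
	if (pid == 0)
		_exit(0);	/* child */
	return (pid);		/* parent: child pid, or -1 on error */
}
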
Modified: trunk/sys/sys/unpcb.h
===================================================================
--- trunk/sys/sys/unpcb.h	2020-02-08 19:56:26 UTC (rev 12329)
+++ trunk/sys/sys/unpcb.h	2020-02-08 19:57:06 UTC (rev 12330)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)unpcb.h	8.1 (Berkeley) 6/2/93
- * $FreeBSD: stable/10/sys/sys/unpcb.h 305261 2016-09-02 00:14:28Z markj $
+ * $FreeBSD: stable/11/sys/sys/unpcb.h 339067 2018-10-01 17:36:58Z asomers $
  */
 
 #ifndef _SYS_UNPCB_H_
@@ -151,4 +151,13 @@
 };
 #endif /* _SYS_SOCKETVAR_H_ */
 
+#if defined(_KERNEL)
+struct thread;
+
+/* In uipc_usrreq.c */
+void
+unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
+    struct unpcb *server_unp, struct unpcb *listen_unp);
+#endif
+
 #endif /* _SYS_UNPCB_H_ */


From laffer1 at midnightbsd.org  Sat Feb  8 15:00:10 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:00:10 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12331] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002082000.018K0AWi067616@stargazer.midnightbsd.org>

Revision: 12331
          http://svnweb.midnightbsd.org/src/?rev=12331
Author:   laffer1
Date:     2020-02-08 15:00:09 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/tty.h
    trunk/sys/sys/ttycom.h
    trunk/sys/sys/ttydefaults.h
    trunk/sys/sys/ttydisc.h
    trunk/sys/sys/ttyqueue.h
    trunk/sys/sys/turnstile.h
    trunk/sys/sys/types.h
    trunk/sys/sys/ucontext.h
    trunk/sys/sys/ucred.h

Modified: trunk/sys/sys/tty.h
===================================================================
--- trunk/sys/sys/tty.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/tty.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/tty.h 271773 2014-09-18 14:44:47Z grehan $
+ * $FreeBSD: stable/11/sys/sys/tty.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TTY_H_
@@ -63,6 +63,7 @@
 	struct mtx	*t_mtx;		/* TTY lock. */
 	struct mtx	t_mtxobj;	/* Per-TTY lock (when not borrowing). */
 	TAILQ_ENTRY(tty) t_list;	/* (l) TTY list entry. */
+	int		t_drainwait;	/* (t) TIOCDRAIN timeout seconds. */
 	unsigned int	t_flags;	/* (t) Terminal option flags. */
 /* Keep flags in sync with db_show_tty and pstat(8). */
 #define	TF_NOPREFIX	0x00001	/* Don't prepend "tty" to device name. */
@@ -172,11 +173,11 @@
 #define	tty_getlock(tp)		((tp)->t_mtx)
 
 /* Device node creation. */
-void	tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
-    __printflike(3, 4);
 int	tty_makedevf(struct tty *tp, struct ucred *cred, int flags,
     const char *fmt, ...) __printflike(4, 5);
 #define	TTYMK_CLONING		0x1
+#define	tty_makedev(tp, cred, fmt, ...) \
+	(void )tty_makedevf((tp), (cred), 0, (fmt), ## __VA_ARGS__)
 #define	tty_makealias(tp,fmt,...) \
 	make_dev_alias((tp)->t_dev, fmt, ## __VA_ARGS__)
 

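tty_makedev() above becomes a convenience macro over tty_makedevf() (flags 0, return value discarded). A driver that wants to detect cdev creation failure now calls tty_makedevf() directly; a hypothetical attach-path sketch (device name and function are illustrative):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/tty.h>

static int
example_tty_attach(struct tty *tp, int unit)
{
	int error;

	/* "xtty" is an illustrative name, not a real driver. */
	error = tty_makedevf(tp, NULL, 0, "xtty%d", unit);
	if (error != 0)
		return (error);
	/* ... remainder of attach ... */
	return (0);
}
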
Modified: trunk/sys/sys/ttycom.h
===================================================================
--- trunk/sys/sys/ttycom.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ttycom.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ttycom.h	8.1 (Berkeley) 3/28/94
- * $FreeBSD: stable/10/sys/sys/ttycom.h 231095 2012-02-06 18:15:46Z ed $
+ * $FreeBSD: stable/11/sys/sys/ttycom.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_SYS_TTYCOM_H_

Modified: trunk/sys/sys/ttydefaults.h
===================================================================
--- trunk/sys/sys/ttydefaults.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ttydefaults.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ttydefaults.h	8.4 (Berkeley) 1/21/94
- * $FreeBSD: stable/10/sys/sys/ttydefaults.h 249311 2013-04-09 16:16:34Z ed $
+ * $FreeBSD: stable/11/sys/sys/ttydefaults.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*

Modified: trunk/sys/sys/ttydisc.h
===================================================================
--- trunk/sys/sys/ttydisc.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ttydisc.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008 Ed Schouten <ed at FreeBSD.org>
  * All rights reserved.
@@ -26,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/11/sys/sys/ttydisc.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TTYDISC_H_

Modified: trunk/sys/sys/ttyqueue.h
===================================================================
--- trunk/sys/sys/ttyqueue.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ttyqueue.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008 Ed Schouten <ed at FreeBSD.org>
  * All rights reserved.
@@ -26,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/11/sys/sys/ttyqueue.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TTYQUEUE_H_
@@ -69,7 +70,7 @@
 
 #ifdef _KERNEL
 /* Input queue handling routines. */
-void	ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t len);
+int	ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t len);
 void	ttyinq_free(struct ttyinq *ti);
 int	ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio,
     size_t readlen, size_t flushlen);
@@ -136,7 +137,7 @@
 
 /* Output queue handling routines. */
 void	ttyoutq_flush(struct ttyoutq *to);
-void	ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t len);
+int	ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t len);
 void	ttyoutq_free(struct ttyoutq *to);
 size_t	ttyoutq_read(struct ttyoutq *to, void *buf, size_t len);
 int	ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio);

Modified: trunk/sys/sys/turnstile.h
===================================================================
--- trunk/sys/sys/turnstile.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/turnstile.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/turnstile.h 262192 2014-02-18 20:27:17Z jhb $
+ * $FreeBSD: stable/11/sys/sys/turnstile.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TURNSTILE_H_
@@ -34,7 +34,7 @@
  * Turnstile interface.  Non-sleepable locks use a turnstile for the
  * queue of threads blocked on them when they are contested.  Each
  * turnstile contains two sub-queues: one for threads waiting for a
- * shared, or eread, lock, and one for threads waiting for an
+ * shared, or read, lock, and one for threads waiting for an
  * exclusive, or write, lock.
  *
  * A thread calls turnstile_chain_lock() to lock the turnstile chain

Modified: trunk/sys/sys/types.h
===================================================================
--- trunk/sys/sys/types.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/types.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)types.h	8.6 (Berkeley) 2/19/95
- * $FreeBSD: stable/10/sys/sys/types.h 289107 2015-10-10 05:50:42Z kib $
+ * $FreeBSD: stable/11/sys/sys/types.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TYPES_H_
@@ -175,6 +175,11 @@
 #define	_OFF_T_DECLARED
 #endif
 
+#ifndef _OFF64_T_DECLARED
+typedef	__off64_t	off64_t;	/* file offset (alias) */
+#define	_OFF64_T_DECLARED
+#endif
+
 #ifndef _PID_T_DECLARED
 typedef	__pid_t		pid_t;		/* process id */
 #define	_PID_T_DECLARED
@@ -233,6 +238,11 @@
 #define	_USECONDS_T_DECLARED
 #endif
 
+#ifndef _CAP_IOCTL_T_DECLARED
+#define	_CAP_IOCTL_T_DECLARED
+typedef	unsigned long	cap_ioctl_t;
+#endif
+
 #ifndef _CAP_RIGHTS_T_DECLARED
 #define	_CAP_RIGHTS_T_DECLARED
 struct cap_rights;
@@ -241,11 +251,13 @@
 #endif
 
 typedef	__vm_offset_t	vm_offset_t;
-typedef	__vm_ooffset_t	vm_ooffset_t;
+typedef	__int64_t	vm_ooffset_t;
 typedef	__vm_paddr_t	vm_paddr_t;
-typedef	__vm_pindex_t	vm_pindex_t;
+typedef	__uint64_t	vm_pindex_t;
 typedef	__vm_size_t	vm_size_t;
 
+typedef __rman_res_t    rman_res_t;
+
 #ifdef _KERNEL
 typedef	int		boolean_t;
 typedef	struct device	*device_t;

Modified: trunk/sys/sys/ucontext.h
===================================================================
--- trunk/sys/sys/ucontext.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ucontext.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -26,7 +26,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/ucontext.h 278347 2015-02-07 08:47:15Z kib $
+ * $FreeBSD: stable/11/sys/sys/ucontext.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_UCONTEXT_H_
@@ -34,25 +34,9 @@
 
 #include <sys/signal.h>
 #include <machine/ucontext.h>
+#include <sys/_ucontext.h>
 
-typedef struct __ucontext {
-	/*
-	 * Keep the order of the first two fields. Also,
-	 * keep them the first two fields in the structure.
-	 * This way we can have a union with struct
-	 * sigcontext and ucontext_t. This allows us to
-	 * support them both at the same time.
-	 * note: the union is not defined, though.
-	 */
-	sigset_t	uc_sigmask;
-	mcontext_t	uc_mcontext;
-
-	struct __ucontext *uc_link;
-	stack_t		uc_stack;
-	int		uc_flags;
 #define	UCF_SWAPPED	0x00000001	/* Used by swapcontext(3). */
-	int		__spare__[4];
-} ucontext_t;
 
 #if defined(_KERNEL) && defined(COMPAT_FREEBSD4)
 #if defined(__i386__)

Modified: trunk/sys/sys/ucred.h
===================================================================
--- trunk/sys/sys/ucred.h	2020-02-08 19:57:06 UTC (rev 12330)
+++ trunk/sys/sys/ucred.h	2020-02-08 20:00:09 UTC (rev 12331)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ucred.h	8.4 (Berkeley) 1/9/95
- * $FreeBSD: stable/10/sys/sys/ucred.h 303846 2016-08-08 18:31:28Z bdrewery $
+ * $FreeBSD: stable/11/sys/sys/ucred.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_UCRED_H_
@@ -38,6 +38,8 @@
 
 struct loginclass;
 
+#define	XU_NGROUPS	16
+
 /*
  * Credentials.
  *
@@ -65,13 +67,12 @@
 	struct auditinfo_addr	cr_audit;	/* Audit properties. */
 	gid_t	*cr_groups;		/* groups */
 	int	cr_agroups;		/* Available groups */
+	gid_t   cr_smallgroups[XU_NGROUPS];	/* storage for small groups */
 };
 #define	NOCRED	((struct ucred *)0)	/* no credential available */
 #define	FSCRED	((struct ucred *)-1)	/* filesystem credential */
 #endif /* _KERNEL || _WANT_UCRED */
 
-#define	XU_NGROUPS	16
-
 /*
  * Flags for cr_flags.
  */
@@ -106,13 +107,11 @@
 struct ucred	*crcopysafe(struct proc *p, struct ucred *cr);
 struct ucred	*crdup(struct ucred *cr);
 void	crextend(struct ucred *cr, int n);
-void	cred_update_thread(struct thread *td);
 void	proc_set_cred_init(struct proc *p, struct ucred *cr);
 struct ucred	*proc_set_cred(struct proc *p, struct ucred *cr);
 void	crfree(struct ucred *cr);
 struct ucred	*crget(void);
 struct ucred	*crhold(struct ucred *cr);
-int	crshared(struct ucred *cr);
 void	cru2x(struct ucred *cr, struct xucred *xcr);
 void	crsetgroups(struct ucred *cr, int n, gid_t *groups);
 int	groupmember(gid_t gid, struct ucred *cred);


From laffer1 at midnightbsd.org  Sat Feb  8 15:01:04 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:01:04 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12332] trunk/sys/sys/sx.h: sync with FreeBSD
 11-stable
Message-ID: <202002082001.018K146H067722@stargazer.midnightbsd.org>

Revision: 12332
          http://svnweb.midnightbsd.org/src/?rev=12332
Author:   laffer1
Date:     2020-02-08 15:01:03 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sx.h

Modified: trunk/sys/sys/sx.h
===================================================================
--- trunk/sys/sys/sx.h	2020-02-08 20:00:09 UTC (rev 12331)
+++ trunk/sys/sys/sx.h	2020-02-08 20:01:03 UTC (rev 12332)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sx.h 323870 2017-09-21 19:24:11Z marius $
+ * $FreeBSD: stable/11/sys/sys/sx.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_SYS_SX_H_
@@ -87,6 +87,13 @@
 
 #ifdef _KERNEL
 
+#define	sx_recurse	lock_object.lo_data
+
+#define	SX_READ_VALUE(sx)	((sx)->sx_lock)
+
+#define	lv_sx_owner(v) \
+	((v & SX_LOCK_SHARED) ? NULL : (struct thread *)SX_OWNER(v))
+
 /*
  * Function prototypes.  Routines that start with an underscore are not part
  * of the public interface and are wrapped with a macro.
@@ -95,20 +102,22 @@
 #define	sx_init(sx, desc)	sx_init_flags((sx), (desc), 0)
 void	sx_init_flags(struct sx *sx, const char *description, int opts);
 void	sx_destroy(struct sx *sx);
+int	sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF);
 int	sx_try_slock_(struct sx *sx, const char *file, int line);
+int	sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF);
 int	sx_try_xlock_(struct sx *sx, const char *file, int line);
+int	sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF);
 int	sx_try_upgrade_(struct sx *sx, const char *file, int line);
+void	sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF);
 void	sx_downgrade_(struct sx *sx, const char *file, int line);
+int	_sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF);
 int	_sx_slock(struct sx *sx, int opts, const char *file, int line);
 int	_sx_xlock(struct sx *sx, int opts, const char *file, int line);
+void	_sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF);
 void	_sx_sunlock(struct sx *sx, const char *file, int line);
 void	_sx_xunlock(struct sx *sx, const char *file, int line);
-int	_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts,
-	    const char *file, int line);
-int	_sx_slock_hard(struct sx *sx, int opts, const char *file, int line);
-void	_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int
-	    line);
-void	_sx_sunlock_hard(struct sx *sx, const char *file, int line);
+int	_sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF);
+void	_sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	_sx_assert(const struct sx *sx, int what, const char *file, int line);
 #endif
@@ -141,6 +150,7 @@
  * deferred to 'tougher' functions.
  */
 
+#if	(LOCK_DEBUG == 0)
 /* Acquire an exclusive lock. */
 static __inline int
 __sx_xlock(struct sx *sx, struct thread *td, int opts, const char *file,
@@ -147,14 +157,12 @@
     int line)
 {
 	uintptr_t tid = (uintptr_t)td;
+	uintptr_t v = SX_LOCK_UNLOCKED;
 	int error = 0;
 
-	if (sx->sx_lock != SX_LOCK_UNLOCKED ||
-	    !atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid))
-		error = _sx_xlock_hard(sx, tid, opts, file, line);
-	else 
-		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE,
-		    sx, 0, 0, file, line);
+	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) ||
+	    !atomic_fcmpset_acq_ptr(&sx->sx_lock, &v, tid)))
+		error = _sx_xlock_hard(sx, v, opts);
 
 	return (error);
 }
@@ -163,48 +171,15 @@
 static __inline void
 __sx_xunlock(struct sx *sx, struct thread *td, const char *file, int line)
 {
-	uintptr_t tid = (uintptr_t)td;
+	uintptr_t x = (uintptr_t)td;
 
-	if (sx->sx_lock != tid ||
-	    !atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED))
-		_sx_xunlock_hard(sx, tid, file, line);
+	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) ||
+	    !atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, SX_LOCK_UNLOCKED)))
+		_sx_xunlock_hard(sx, x);
 }
+#endif
 
-/* Acquire a shared lock. */
-static __inline int
-__sx_slock(struct sx *sx, int opts, const char *file, int line)
-{
-	uintptr_t x = sx->sx_lock;
-	int error = 0;
-
-	if (!(x & SX_LOCK_SHARED) ||
-	    !atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER))
-		error = _sx_slock_hard(sx, opts, file, line);
-	else
-		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx, 0,
-		    0, file, line);
-
-	return (error);
-}
-
 /*
- * Release a shared lock.  We can just drop a single shared lock so
- * long as we aren't trying to drop the last shared lock when other
- * threads are waiting for an exclusive lock.  This takes advantage of
- * the fact that an unlocked lock is encoded as a shared lock with a
- * count of 0.
- */
-static __inline void
-__sx_sunlock(struct sx *sx, const char *file, int line)
-{
-	uintptr_t x = sx->sx_lock;
-
-	if (x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS) ||
-	    !atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER))
-		_sx_sunlock_hard(sx, file, line);
-}
-
-/*
  * Public interface for lock operations.
  */
 #ifndef LOCK_DEBUG
@@ -217,12 +192,6 @@
 	_sx_xlock((sx), SX_INTERRUPTIBLE, (file), (line))
 #define	sx_xunlock_(sx, file, line)					\
 	_sx_xunlock((sx), (file), (line))
-#define	sx_slock_(sx, file, line)					\
-	(void)_sx_slock((sx), 0, (file), (line))
-#define	sx_slock_sig_(sx, file, line)					\
-	_sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line))
-#define	sx_sunlock_(sx, file, line)					\
-	_sx_sunlock((sx), (file), (line))
 #else
 #define	sx_xlock_(sx, file, line)					\
 	(void)__sx_xlock((sx), curthread, 0, (file), (line))
@@ -230,17 +199,30 @@
 	__sx_xlock((sx), curthread, SX_INTERRUPTIBLE, (file), (line))
 #define	sx_xunlock_(sx, file, line)					\
 	__sx_xunlock((sx), curthread, (file), (line))
+#endif	/* LOCK_DEBUG > 0 || SX_NOINLINE */
+#if	(LOCK_DEBUG > 0)
 #define	sx_slock_(sx, file, line)					\
-	(void)__sx_slock((sx), 0, (file), (line))
+	(void)_sx_slock((sx), 0, (file), (line))
 #define	sx_slock_sig_(sx, file, line)					\
-	__sx_slock((sx), SX_INTERRUPTIBLE, (file), (line))
+	_sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line))
 #define	sx_sunlock_(sx, file, line)					\
-	__sx_sunlock((sx), (file), (line))
-#endif	/* LOCK_DEBUG > 0 || SX_NOINLINE */
+	_sx_sunlock((sx), (file), (line))
 #define	sx_try_slock(sx)	sx_try_slock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_try_xlock(sx)	sx_try_xlock_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_try_upgrade(sx)	sx_try_upgrade_((sx), LOCK_FILE, LOCK_LINE)
 #define	sx_downgrade(sx)	sx_downgrade_((sx), LOCK_FILE, LOCK_LINE)
+#else
+#define	sx_slock_(sx, file, line)					\
+	(void)_sx_slock_int((sx), 0)
+#define	sx_slock_sig_(sx, file, line)					\
+	_sx_slock_int((sx), SX_INTERRUPTIBLE)
+#define	sx_sunlock_(sx, file, line)					\
+	_sx_sunlock_int((sx))
+#define	sx_try_slock(sx)	sx_try_slock_int((sx))
+#define	sx_try_xlock(sx)	sx_try_xlock_int((sx))
+#define	sx_try_upgrade(sx)	sx_try_upgrade_int((sx))
+#define	sx_downgrade(sx)	sx_downgrade_int((sx))
+#endif
 #ifdef INVARIANTS
 #define	sx_assert_(sx, what, file, line)				\
 	_sx_assert((sx), (what), (file), (line))

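The sx.h changes above only rework the inlined fast paths (atomic_fcmpset plus LOCKSTAT probes); the public sx(9) interface is unchanged. For orientation, a minimal usage sketch (names are illustrative):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/sx.h>

static struct sx example_lock;
static int example_counter;

static void
example_init(void)
{

	sx_init(&example_lock, "example");
}

static void
example_bump(void)
{

	sx_xlock(&example_lock);	/* exclusive (write) lock */
	example_counter++;
	sx_xunlock(&example_lock);
}

static int
example_read(void)
{
	int v;

	sx_slock(&example_lock);	/* shared (read) lock */
	v = example_counter;
	sx_sunlock(&example_lock);
	return (v);
}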

From laffer1 at midnightbsd.org  Sat Feb  8 15:01:56 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:01:56 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12333] trunk/sys/sys/time.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082001.018K1u3r067791@stargazer.midnightbsd.org>

Revision: 12333
          http://svnweb.midnightbsd.org/src/?rev=12333
Author:   laffer1
Date:     2020-02-08 15:01:56 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/time.h

Modified: trunk/sys/sys/time.h
===================================================================
--- trunk/sys/sys/time.h	2020-02-08 20:01:03 UTC (rev 12332)
+++ trunk/sys/sys/time.h	2020-02-08 20:01:56 UTC (rev 12333)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)time.h	8.5 (Berkeley) 5/4/95
- * $FreeBSD: stable/10/sys/sys/time.h 304894 2016-08-27 10:56:04Z kib $
+ * $FreeBSD: stable/11/sys/sys/time.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TIME_H_
@@ -129,7 +129,7 @@
 #define	SBT_1M	(SBT_1S * 60)
 #define	SBT_1MS	(SBT_1S / 1000)
 #define	SBT_1US	(SBT_1S / 1000000)
-#define	SBT_1NS	(SBT_1S / 1000000000)
+#define	SBT_1NS	(SBT_1S / 1000000000) /* beware rounding, see nstosbt() */
 #define	SBT_MAX	0x7fffffffffffffffLL
 
 static __inline int
@@ -156,6 +156,53 @@
 	return (_bt);
 }
 
+/*
+ * Decimal<->sbt conversions.  Multiplying or dividing by SBT_1NS results in
+ * large roundoff errors which sbttons() and nstosbt() avoid.  Millisecond and
+ * microsecond functions are also provided for completeness.
+ */
+static __inline int64_t
+sbttons(sbintime_t _sbt)
+{
+
+	return ((1000000000 * _sbt) >> 32);
+}
+
+static __inline sbintime_t
+nstosbt(int64_t _ns)
+{
+
+	return ((_ns * (((uint64_t)1 << 63) / 500000000)) >> 32);
+}
+
+static __inline int64_t
+sbttous(sbintime_t _sbt)
+{
+
+	return ((1000000 * _sbt) >> 32);
+}
+
+static __inline sbintime_t
+ustosbt(int64_t _us)
+{
+
+	return ((_us * (((uint64_t)1 << 63) / 500000)) >> 32);
+}
+
+static __inline int64_t
+sbttoms(sbintime_t _sbt)
+{
+
+	return ((1000 * _sbt) >> 32);
+}
+
+static __inline sbintime_t
+mstosbt(int64_t _ms)
+{
+
+	return ((_ms * (((uint64_t)1 << 63) / 500)) >> 32);
+}
+
 /*-
  * Background information:
  *
@@ -211,7 +258,7 @@
 	struct timespec _ts;
 
 	_ts.tv_sec = _sbt >> 32;
-	_ts.tv_nsec = ((uint64_t)1000000000 * (uint32_t)_sbt) >> 32;
+	_ts.tv_nsec = sbttons((uint32_t)_sbt);
 	return (_ts);
 }
 
@@ -219,8 +266,7 @@
 tstosbt(struct timespec _ts)
 {
 
-	return (((sbintime_t)_ts.tv_sec << 32) +
-	    (_ts.tv_nsec * (((uint64_t)1 << 63) / 500000000) >> 32));
+	return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec));
 }
 
 static __inline struct timeval
@@ -229,7 +275,7 @@
 	struct timeval _tv;
 
 	_tv.tv_sec = _sbt >> 32;
-	_tv.tv_usec = ((uint64_t)1000000 * (uint32_t)_sbt) >> 32;
+	_tv.tv_usec = sbttous((uint32_t)_sbt);
 	return (_tv);
 }
 
@@ -237,8 +283,7 @@
 tvtosbt(struct timeval _tv)
 {
 
-	return (((sbintime_t)_tv.tv_sec << 32) +
-	    (_tv.tv_usec * (((uint64_t)1 << 63) / 500000) >> 32));
+	return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec));
 }
 #endif /* __BSD_VISIBLE */
 
@@ -373,8 +418,6 @@
 
 extern volatile time_t	time_second;
 extern volatile time_t	time_uptime;
-extern struct bintime boottimebin;
-extern struct timeval boottime;
 extern struct bintime tc_tick_bt;
 extern sbintime_t tc_tick_sbt;
 extern struct bintime tick_bt;
@@ -386,6 +429,8 @@
 extern sbintime_t sbt_timethreshold;
 extern sbintime_t sbt_tickthreshold;
 
+extern volatile int rtc_generation;
+
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
  *
@@ -399,7 +444,7 @@
  * Functions containing "up" return time relative to boot and
  * should be used for calculating time intervals.
  *
- * Functions without "up" returns GMT time.
+ * Functions without "up" return UTC time.
  *
  * Functions with the "get" prefix returns a less precise result
  * much faster than the functions without "get" prefix and should
@@ -441,6 +486,9 @@
 void	getnanotime(struct timespec *tsp);
 void	getmicrotime(struct timeval *tvp);
 
+void	getboottime(struct timeval *boottime);
+void	getboottimebin(struct bintime *boottimebin);
+
 /* Other functions */
 int	itimerdecr(struct itimerval *itp, int usec);
 int	itimerfix(struct timeval *tv);

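The point of the new helpers above: sbintime_t is a 32.32 fixed-point value, so converting nanoseconds by multiplying with SBT_1NS (integer division truncates 2^32/10^9, about 4.29, down to 4) drifts by roughly 7%, while nstosbt()/sbttons() and friends keep the full 64-bit scaling. A small sketch:

#include <sys/time.h>

/* Illustrative: build a 1.75 s timeout as an sbintime_t without drift. */
static sbintime_t
example_timeout_sbt(void)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
	sbintime_t sbt;

	sbt = tstosbt(ts);	/* 1.5 s in 32.32 fixed point */
	sbt += mstosbt(250);	/* + 250 ms, scaled precisely */
	return (sbt);
}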

From laffer1 at midnightbsd.org  Sat Feb  8 15:02:26 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:02:26 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12334] trunk/sys/sys/systm.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082002.018K2QO8067963@stargazer.midnightbsd.org>

Revision: 12334
          http://svnweb.midnightbsd.org/src/?rev=12334
Author:   laffer1
Date:     2020-02-08 15:02:25 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/systm.h

Modified: trunk/sys/sys/systm.h
===================================================================
--- trunk/sys/sys/systm.h	2020-02-08 20:01:56 UTC (rev 12333)
+++ trunk/sys/sys/systm.h	2020-02-08 20:02:25 UTC (rev 12334)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)systm.h	8.7 (Berkeley) 3/29/95
- * $FreeBSD: stable/10/sys/sys/systm.h 303433 2016-07-28 11:51:20Z kib $
+ * $FreeBSD: stable/11/sys/sys/systm.h 354405 2019-11-06 18:02:18Z mav $
  */
 
 #ifndef _SYS_SYSTM_H_
@@ -46,6 +46,8 @@
 #include <sys/queue.h>
 #include <sys/stdint.h>		/* for people using printf mainly */
 
+__NULLABILITY_PRAGMA_PUSH
+
 extern int cold;		/* nonzero if we are doing a cold boot */
 extern int suspend_blocked;	/* block suspend due to pending shutdown */
 extern int rebooting;		/* kern_reboot() has been called. */
@@ -75,9 +77,9 @@
  * Keep in sync with vm_guest_sysctl_names[].
  */
 enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV,
-		VM_GUEST_VMWARE, VM_LAST };
+		VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_LAST };
 
-#if defined(WITNESS) || defined(INVARIANTS)
+#if defined(WITNESS) || defined(INVARIANT_SUPPORT)
 void	kassert_panic(const char *fmt, ...)  __printflike(1, 2);
 #endif
 
@@ -84,12 +86,12 @@
 #ifdef	INVARIANTS		/* The option is always available */
 #define	KASSERT(exp,msg) do {						\
 	if (__predict_false(!(exp)))					\
-		kassert_panic msg;						\
+		kassert_panic msg;					\
 } while (0)
 #define	VNASSERT(exp, vp, msg) do {					\
 	if (__predict_false(!(exp))) {					\
 		vn_printf(vp, "VNASSERT failed\n");			\
-		kassert_panic msg;						\
+		kassert_panic msg;					\
 	}								\
 } while (0)
 #else
@@ -127,9 +129,20 @@
  * Otherwise, the kernel will deadlock since the scheduler isn't
  * going to run the thread that holds any lock we need.
  */
-#define	SCHEDULER_STOPPED() __predict_false(curthread->td_stopsched)
+#define	SCHEDULER_STOPPED_TD(td)  ({					\
+	MPASS((td) == curthread);					\
+	__predict_false((td)->td_stopsched);				\
+})
+#define	SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread)
 
 /*
+ * Align variables.
+ */
+#define	__read_mostly		__section(".data.read_mostly")
+#define	__read_frequently	__section(".data.read_frequently")
+#define	__exclusive_cache_line	__aligned(CACHE_LINE_SIZE) \
+				    __section(".data.exclusive_cache_line")
+/*
  * XXX the hints declarations are even more misplaced than most declarations
  * in this file, since they are needed in one file (per arch) and only used
  * in two files.
@@ -136,11 +149,10 @@
  * XXX most of these variables should be const.
  */
 extern int osreldate;
-extern int envmode;
-extern int hintmode;		/* 0 = off. 1 = config, 2 = fallback */
-extern int dynamic_kenv;
+extern bool dynamic_kenv;
 extern struct mtx kenv_lock;
 extern char *kern_envp;
+extern char *md_envp;
 extern char static_env[];
 extern char static_hints[];	/* by config for now */
 
@@ -149,11 +161,15 @@
 extern const void *zero_region;	/* address space maps to a zeroed page	*/
 
 extern int unmapped_buf_allowed;
-extern int iosize_max_clamp;
-extern int devfs_iosize_max_clamp;
-#define	IOSIZE_MAX	(iosize_max_clamp ? INT_MAX : SSIZE_MAX)
-#define	DEVFS_IOSIZE_MAX	(devfs_iosize_max_clamp ? INT_MAX : SSIZE_MAX)
 
+#ifdef __LP64__
+#define	IOSIZE_MAX		iosize_max()
+#define	DEVFS_IOSIZE_MAX	devfs_iosize_max()
+#else
+#define	IOSIZE_MAX		SSIZE_MAX
+#define	DEVFS_IOSIZE_MAX	SSIZE_MAX
+#endif
+
 /*
  * General function declarations.
  */
@@ -186,6 +202,8 @@
 #define	HASH_WAITOK	0x00000002
 
 void	*phashinit(int count, struct malloc_type *type, u_long *nentries);
+void	*phashinit_flags(int count, struct malloc_type *type, u_long *nentries,
+    int flags);
 void	g_waitidle(void);
 
 void	panic(const char *, ...) __dead2 __printflike(1, 2);
@@ -208,6 +226,7 @@
 	    __va_list) __printflike(1, 0);
 void	log(int, const char *, ...) __printflike(2, 3);
 void	log_console(struct uio *);
+void	vlog(int, const char *, __va_list) __printflike(2, 0);
 int	asprintf(char **ret, struct malloc_type *mtp, const char *format, 
 	    ...) __printflike(3, 4);
 int	printf(const char *, ...) __printflike(1, 2);
@@ -221,12 +240,12 @@
 int	vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0);
 int	vsprintf(char *buf, const char *, __va_list) __printflike(2, 0);
 int	ttyprintf(struct tty *, const char *, ...) __printflike(2, 3);
-int	sscanf(const char *, char const *, ...) __nonnull(1) __nonnull(2);
-int	vsscanf(const char *, char const *, __va_list) __nonnull(1) __nonnull(2);
-long	strtol(const char *, char **, int) __nonnull(1);
-u_long	strtoul(const char *, char **, int) __nonnull(1);
-quad_t	strtoq(const char *, char **, int) __nonnull(1);
-u_quad_t strtouq(const char *, char **, int) __nonnull(1);
+int	sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3);
+int	vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list)  __scanflike(2, 0);
+long	strtol(const char *, char **, int);
+u_long	strtoul(const char *, char **, int);
+quad_t	strtoq(const char *, char **, int);
+u_quad_t strtouq(const char *, char **, int);
 void	tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4);
 void	vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0);
 void	hexdump(const void *ptr, int length, const char *hdr, int flags);
@@ -237,32 +256,27 @@
 #define	HD_OMIT_CHARS	(1 << 18)
 
 #define ovbcopy(f, t, l) bcopy((f), (t), (l))
-void	bcopy(const void *from, void *to, size_t len) __nonnull(1) __nonnull(2);
-void	bzero(void *buf, size_t len) __nonnull(1);
-#define bzero(buf, len) ({				\
-	if (__builtin_constant_p(len) && (len) <= 64)	\
-		__builtin_memset((buf), 0, (len));	\
-	else						\
-		bzero((buf), (len));			\
-})
+void	bcopy(const void * _Nonnull from, void * _Nonnull to, size_t len);
+void	bzero(void * _Nonnull buf, size_t len);
+void	explicit_bzero(void * _Nonnull, size_t);
 
-void	*memcpy(void *to, const void *from, size_t len) __nonnull(1) __nonnull(2);
-void	*memmove(void *dest, const void *src, size_t n) __nonnull(1) __nonnull(2);
+void	*memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len);
+void	*memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n);
 
-int	copystr(const void * __restrict kfaddr, void * __restrict kdaddr,
-	    size_t len, size_t * __restrict lencopied)
-	    __nonnull(1) __nonnull(2);
-int	copyinstr(const void * __restrict udaddr, void * __restrict kaddr,
-	    size_t len, size_t * __restrict lencopied)
-	    __nonnull(1) __nonnull(2);
-int	copyin(const void * __restrict udaddr, void * __restrict kaddr,
-	    size_t len) __nonnull(1) __nonnull(2);
-int	copyin_nofault(const void * __restrict udaddr, void * __restrict kaddr,
-	    size_t len) __nonnull(1) __nonnull(2);
-int	copyout(const void * __restrict kaddr, void * __restrict udaddr,
-	    size_t len) __nonnull(1) __nonnull(2);
-int	copyout_nofault(const void * __restrict kaddr, void * __restrict udaddr,
-	    size_t len) __nonnull(1) __nonnull(2);
+int	copystr(const void * _Nonnull __restrict kfaddr,
+	    void * _Nonnull __restrict kdaddr, size_t len,
+	    size_t * __restrict lencopied);
+int	copyinstr(const void * __restrict udaddr,
+	    void * _Nonnull __restrict kaddr, size_t len,
+	    size_t * __restrict lencopied);
+int	copyin(const void * __restrict udaddr,
+	    void * _Nonnull __restrict kaddr, size_t len);
+int	copyin_nofault(const void * __restrict udaddr,
+	    void * _Nonnull __restrict kaddr, size_t len);
+int	copyout(const void * _Nonnull __restrict kaddr,
+	    void * __restrict udaddr, size_t len);
+int	copyout_nofault(const void * _Nonnull __restrict kaddr,
+	    void * __restrict udaddr, size_t len);
 
 int	fubyte(volatile const void *base);
 long	fuword(volatile const void *base);
@@ -304,11 +318,12 @@
 void	stopprofclock(struct proc *);
 void	cpu_startprofclock(void);
 void	cpu_stopprofclock(void);
+void	suspendclock(void);
+void	resumeclock(void);
 sbintime_t 	cpu_idleclock(void);
 void	cpu_activeclock(void);
 void	cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt);
 void	cpu_et_frequency(struct eventtimer *et, uint64_t newfreq);
-extern int	cpu_deepest_sleep;
 extern int	cpu_disable_c2_sleep;
 extern int	cpu_disable_c3_sleep;
 
@@ -316,7 +331,7 @@
 int	cr_canseesocket(struct ucred *cred, struct socket *so);
 int	cr_canseeinpcb(struct ucred *cred, struct inpcb *inp);
 
-char	*getenv(const char *name);
+char	*kern_getenv(const char *name);
 void	freeenv(char *env);
 int	getenv_int(const char *name, int *data);
 int	getenv_uint(const char *name, unsigned int *data);
@@ -323,11 +338,18 @@
 int	getenv_long(const char *name, long *data);
 int	getenv_ulong(const char *name, unsigned long *data);
 int	getenv_string(const char *name, char *data, int size);
+int	getenv_int64(const char *name, int64_t *data);
+int	getenv_uint64(const char *name, uint64_t *data);
 int	getenv_quad(const char *name, quad_t *data);
-int	setenv(const char *name, const char *value);
-int	unsetenv(const char *name);
+int	kern_setenv(const char *name, const char *value);
+int	kern_unsetenv(const char *name);
 int	testenv(const char *name);
 
+int	getenv_array(const char *name, void *data, int size, int *psize,
+    int type_size, bool allow_signed);
+#define	GETENV_UNSIGNED	false	/* negative numbers not allowed */
+#define	GETENV_SIGNED	true	/* negative numbers allowed */
+
 typedef uint64_t (cpu_tick_f)(void);
 void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var);
 extern cpu_tick_f *cpu_ticks;
@@ -369,7 +391,6 @@
 static __inline intrmask_t	splimp(void)		{ return 0; }
 static __inline intrmask_t	splnet(void)		{ return 0; }
 static __inline intrmask_t	spltty(void)		{ return 0; }
-static __inline intrmask_t	splvm(void)		{ return 0; }
 static __inline void		splx(intrmask_t ipl __unused)	{ return; }
 
 /*
@@ -376,8 +397,8 @@
  * Common `proc' functions are declared here so that proc.h can be included
  * less often.
  */
-int	_sleep(void *chan, struct lock_object *lock, int pri, const char *wmesg,
-	   sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1);
+int	_sleep(void * _Nonnull chan, struct lock_object *lock, int pri,
+	   const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags);
 #define	msleep(chan, mtx, pri, wmesg, timo)				\
 	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg),		\
 	    tick_sbt * (timo), 0, C_HARDCLOCK)
@@ -384,8 +405,8 @@
 #define	msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags)		\
 	_sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr),	\
 	    (flags))
-int	msleep_spin_sbt(void *chan, struct mtx *mtx, const char *wmesg,
-	    sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1);
+int	msleep_spin_sbt(void * _Nonnull chan, struct mtx *mtx,
+	    const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags);
 #define	msleep_spin(chan, mtx, wmesg, timo)				\
 	msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo),	\
 	    0, C_HARDCLOCK)
@@ -393,13 +414,16 @@
 	    int flags);
 #define	pause(wmesg, timo)						\
 	pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK)
+#define	pause_sig(wmesg, timo)						\
+	pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH)
 #define	tsleep(chan, pri, wmesg, timo)					\
 	_sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo),		\
 	    0, C_HARDCLOCK)
 #define	tsleep_sbt(chan, pri, wmesg, bt, pr, flags)			\
 	_sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags))
-void	wakeup(void *chan) __nonnull(1);
-void	wakeup_one(void *chan) __nonnull(1);
+void	wakeup(void * chan);
+void	wakeup_one(void * chan);
+void	wakeup_any(void * chan);
 
 /*
  * Common `struct cdev *' stuff are declared here to avoid #include poisoning
@@ -409,6 +433,11 @@
 dev_t dev2udev(struct cdev *x);
 const char *devtoname(struct cdev *cdev);
 
+#ifdef __LP64__
+size_t	devfs_iosize_max(void);
+size_t	iosize_max(void);
+#endif
+
 int poll_no_poll(int events);
 
 /* XXX: Should be void nanodelay(u_int nsec); */
@@ -419,7 +448,6 @@
 
 struct root_hold_token *root_mount_hold(const char *identifier);
 void root_mount_rel(struct root_hold_token *h);
-void root_mount_wait(void);
 int root_mounted(void);
 
 
@@ -439,8 +467,27 @@
 
 void	intr_prof_stack_use(struct thread *td, struct trapframe *frame);
 
-extern void (*softdep_ast_cleanup)(void);
-
 void counted_warning(unsigned *counter, const char *msg);
 
+/*
+ * APIs to manage deprecation and obsolescence.
+ */
+struct device;
+void _gone_in(int major, const char *msg);
+void _gone_in_dev(struct device *dev, int major, const char *msg);
+#ifdef NO_OBSOLETE_CODE
+#define __gone_ok(m, msg)					 \
+	_Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version),	 \
+	    "Obsolete code" msg);
+#else
+#define	__gone_ok(m, msg)
+#endif
+#define gone_in(major, msg)		__gone_ok(major, msg) _gone_in(major, msg)
+#define gone_in_dev(dev, major, msg)	__gone_ok(major, msg) _gone_in_dev(dev, major, msg)
+#define	gone_by_fcp101_dev(dev)						\
+	gone_in_dev((dev), 13,						\
+	    "see https://github.com/freebsd/fcp/blob/master/fcp-0101.md")
+
+__NULLABILITY_PRAGMA_POP
+
 #endif /* !_SYS_SYSTM_H_ */

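Two renames and additions above worth noting: the in-kernel environment accessors are now kern_getenv()/kern_setenv()/kern_unsetenv() (so they no longer shadow the libc names), and gone_in()/gone_in_dev() give drivers a standard way to announce deprecation. A hypothetical sketch (tunable and driver names are made up):

#include <sys/param.h>
#include <sys/systm.h>

static void
example_check_tunable(device_t dev)
{
	char *val;

	val = kern_getenv("hw.example.mode");	/* was getenv() in the kernel */
	if (val != NULL) {
		/* ... parse val ... */
		freeenv(val);
	}
	gone_in_dev(dev, 13, "example(4) is deprecated");
}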

From laffer1 at midnightbsd.org  Sat Feb  8 15:02:45 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:02:45 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12335] trunk/sys/sys/stat.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082002.018K2jV3068019@stargazer.midnightbsd.org>

Revision: 12335
          http://svnweb.midnightbsd.org/src/?rev=12335
Author:   laffer1
Date:     2020-02-08 15:02:44 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/stat.h

Modified: trunk/sys/sys/stat.h
===================================================================
--- trunk/sys/sys/stat.h	2020-02-08 20:02:25 UTC (rev 12334)
+++ trunk/sys/sys/stat.h	2020-02-08 20:02:44 UTC (rev 12335)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)stat.h	8.12 (Berkeley) 6/16/95
- * $FreeBSD: stable/10/sys/sys/stat.h 293474 2016-01-09 14:20:23Z dchagin $
+ * $FreeBSD: stable/11/sys/sys/stat.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_STAT_H_
@@ -348,12 +348,12 @@
 #endif
 int	stat(const char * __restrict, struct stat * __restrict);
 mode_t	umask(mode_t);
-#if __BSD_VISIBLE || __POSIX_VISIBLE >= 200809
+#if __POSIX_VISIBLE >= 200809
 int	fstatat(int, const char *, struct stat *, int);
 int	mkdirat(int, const char *, mode_t);
 int	mkfifoat(int, const char *, mode_t);
 #endif
-#if __BSD_VISIBLE || __XSI_VISIBLE >= 700
+#if __XSI_VISIBLE >= 700
 int	mknodat(int, const char *, mode_t, dev_t);
 #endif
 __END_DECLS


From laffer1 at midnightbsd.org  Sat Feb  8 15:03:36 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:03:36 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12336] trunk/sys/sys/smp.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082003.018K3aHi068089@stargazer.midnightbsd.org>

Revision: 12336
          http://svnweb.midnightbsd.org/src/?rev=12336
Author:   laffer1
Date:     2020-02-08 15:03:36 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/smp.h

Modified: trunk/sys/sys/smp.h
===================================================================
--- trunk/sys/sys/smp.h	2020-02-08 20:02:44 UTC (rev 12335)
+++ trunk/sys/sys/smp.h	2020-02-08 20:03:36 UTC (rev 12336)
@@ -7,7 +7,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: stable/10/sys/sys/smp.h 331910 2018-04-03 07:52:06Z avg $
+ * $FreeBSD: stable/11/sys/sys/smp.h 331909 2018-04-03 07:31:22Z avg $
  */
 
 #ifndef _SYS_SMP_H_
@@ -18,9 +18,52 @@
 #ifndef LOCORE
 
 #include <sys/cpuset.h>
+#include <sys/queue.h>
 
 /*
- * Topology of a NUMA or HTT system.
+ * Types of nodes in the topological tree.
+ */
+typedef enum {
+	/* No node has this type; can be used in topo API calls. */
+	TOPO_TYPE_DUMMY,
+	/* Processing unit aka computing unit aka logical CPU. */
+	TOPO_TYPE_PU,
+	/* Physical subdivision of a package. */
+	TOPO_TYPE_CORE,
+	/* CPU L1/L2/L3 cache. */
+	TOPO_TYPE_CACHE,
+	/* Package aka chip, equivalent to socket. */
+	TOPO_TYPE_PKG,
+	/* NUMA node. */
+	TOPO_TYPE_NODE,
+	/* Other logical or physical grouping of PUs. */
+	/* E.g. PUs on the same die, or PUs sharing an FPU. */
+	TOPO_TYPE_GROUP,
+	/* The whole system. */
+	TOPO_TYPE_SYSTEM
+} topo_node_type;
+
+/* Hardware identifier of a topology component. */
+typedef	unsigned int hwid_t;
+/* Logical CPU identifier. */
+typedef	int cpuid_t;
+
+/* A node in the topology. */
+struct topo_node {
+	struct topo_node			*parent;
+	TAILQ_HEAD(topo_children, topo_node)	children;
+	TAILQ_ENTRY(topo_node)			siblings;
+	cpuset_t				cpuset;
+	topo_node_type				type;
+	uintptr_t				subtype;
+	hwid_t					hwid;
+	cpuid_t					id;
+	int					nchildren;
+	int					cpu_count;
+};
+
+/*
+ * Scheduling topology of a NUMA or SMP system.
  *
  * The top level topology is an array of pointers to groups.  Each group
  * contains a bitmask of cpus in its group or subgroups.  It may also
@@ -53,6 +96,8 @@
 #define	CG_SHARE_L2	2
 #define	CG_SHARE_L3	3
 
+#define MAX_CACHE_LEVELS	CG_SHARE_L3
+
 /*
  * Behavior modifiers for load balancing and affinity.
  */
@@ -61,10 +106,29 @@
 #define	CG_FLAG_THREAD	(CG_FLAG_HTT | CG_FLAG_SMT)	/* Any threading. */
 
 /*
- * Convenience routines for building topologies.
+ * Convenience routines for building and traversing topologies.
  */
 #ifdef SMP
+void topo_init_node(struct topo_node *node);
+void topo_init_root(struct topo_node *root);
+struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype);
+struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid,
+    topo_node_type type, uintptr_t subtype);
+void topo_promote_child(struct topo_node *child);
+struct topo_node * topo_next_node(struct topo_node *top,
+    struct topo_node *node);
+struct topo_node * topo_next_nonchild_node(struct topo_node *top,
+    struct topo_node *node);
+void topo_set_pu_id(struct topo_node *node, cpuid_t id);
+int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count,
+    int *cores_per_pkg, int *thrs_per_core);
+
+#define	TOPO_FOREACH(i, root)	\
+	for (i = root; i != NULL; i = topo_next_node(root, i))
+
 struct cpu_group *smp_topo(void);
+struct cpu_group *smp_topo_alloc(u_int count);
 struct cpu_group *smp_topo_none(void);
 struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags);
 struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share,
@@ -89,6 +153,7 @@
 extern volatile int smp_started;
 
 extern cpuset_t all_cpus;
+extern cpuset_t cpuset_domain[MAXMEMDOM]; 	/* CPUs in each NUMA domain. */
 
 /*
  * Macro allowing us to determine whether a CPU is absent at any given
@@ -179,7 +244,14 @@
 
 int	quiesce_all_cpus(const char *, int);
 int	quiesce_cpus(cpuset_t, const char *, int);
+/*
+ * smp_no_rendevous_barrier was renamed to smp_no_rendezvous_barrier
+ * in __FreeBSD_version 1101508, with the old name remaining in 11.x
+ * as an alias for compatibility.  The old name will be gone in 12.0
+ * (__FreeBSD_version >= 1200028).
+ */
 void	smp_no_rendevous_barrier(void *);
+void	smp_no_rendezvous_barrier(void *);
 void	smp_rendezvous(void (*)(void *), 
 		       void (*)(void *),
 		       void (*)(void *),
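
A sketch of walking the new topology tree with TOPO_FOREACH to count logical
CPUs.  How the root node is obtained is machine dependent and outside this
header, so it is simply passed in here; everything else (struct topo_node,
TOPO_TYPE_PU, topo_next_node()) is declared in the diff above.

/* Count processing units (logical CPUs) below a topology subtree. */
static int
count_pus(struct topo_node *root)
{
	struct topo_node *node;
	int pus;

	pus = 0;
	TOPO_FOREACH(node, root) {
		if (node->type == TOPO_TYPE_PU)
			pus++;
	}
	return (pus);
}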


From laffer1 at midnightbsd.org  Sat Feb  8 15:04:23 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:04:23 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12337] trunk/sys/sys/timeet.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082004.018K4NxK068152@stargazer.midnightbsd.org>

Revision: 12337
          http://svnweb.midnightbsd.org/src/?rev=12337
Author:   laffer1
Date:     2020-02-08 15:04:23 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/timeet.h

Modified: trunk/sys/sys/timeet.h
===================================================================
--- trunk/sys/sys/timeet.h	2020-02-08 20:03:36 UTC (rev 12336)
+++ trunk/sys/sys/timeet.h	2020-02-08 20:04:23 UTC (rev 12337)
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/timeet.h 266347 2014-05-17 20:10:12Z ian $
+ * $FreeBSD: stable/11/sys/sys/timeet.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_TIMEEC_H_
@@ -54,7 +54,7 @@
 struct eventtimer {
 	SLIST_ENTRY(eventtimer)	et_all;
 		/* Pointer to the next event timer. */
-	char			*et_name;
+	const char		*et_name;
 		/* Name of the event timer. */
 	int			et_flags;
 		/* Set of capabilities flags: */


From laffer1 at midnightbsd.org  Sat Feb  8 15:04:47 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:04:47 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12338] trunk/sys/sys/timetc.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082004.018K4lbI068211@stargazer.midnightbsd.org>

Revision: 12338
          http://svnweb.midnightbsd.org/src/?rev=12338
Author:   laffer1
Date:     2020-02-08 15:04:46 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/timetc.h

Modified: trunk/sys/sys/timetc.h
===================================================================
--- trunk/sys/sys/timetc.h	2020-02-08 20:04:23 UTC (rev 12337)
+++ trunk/sys/sys/timetc.h	2020-02-08 20:04:46 UTC (rev 12338)
@@ -7,7 +7,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: stable/10/sys/sys/timetc.h 280973 2015-04-02 01:02:42Z jhb $
+ * $FreeBSD: stable/11/sys/sys/timetc.h 305866 2016-09-16 10:04:28Z kib $
  */
 
 #ifndef _SYS_TIMETC_H_
@@ -29,8 +29,14 @@
  */
 
 struct timecounter;
+struct vdso_timehands;
+struct vdso_timehands32;
 typedef u_int timecounter_get_t(struct timecounter *);
 typedef void timecounter_pps_t(struct timecounter *);
+typedef uint32_t timecounter_fill_vdso_timehands_t(struct vdso_timehands *,
+    struct timecounter *);
+typedef uint32_t timecounter_fill_vdso_timehands32_t(struct vdso_timehands32 *,
+    struct timecounter *);
 
 struct timecounter {
 	timecounter_get_t	*tc_get_timecount;
@@ -50,7 +56,7 @@
 		/* This mask should mask off any unimplemented bits. */
 	uint64_t		tc_frequency;
 		/* Frequency of the counter in Hz. */
-	char			*tc_name;
+	const char		*tc_name;
 		/* Name of the timecounter. */
 	int			tc_quality;
 		/*
@@ -69,6 +75,8 @@
 		/* Pointer to the timecounter's private parts. */
 	struct timecounter	*tc_next;
 		/* Pointer to the next timecounter. */
+	timecounter_fill_vdso_timehands_t *tc_fill_vdso_timehands;
+	timecounter_fill_vdso_timehands32_t *tc_fill_vdso_timehands32;
 };
 
 extern struct timecounter *timecounter;
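
A sketch of a timecounter wiring up the new vdso hook.  The stub below simply
declines, on the assumption that a zero return tells the timekeeping code
that userland cannot read this counter directly and must fall back to the
system call; a real driver would instead fill in the machine-dependent
struct vdso_timehands fields.  The "mytc" names are made up.

static u_int
mytc_get_timecount(struct timecounter *tc)
{

	return (0);	/* placeholder; a real driver reads hardware here */
}

static uint32_t
mytc_fill_vdso_timehands(struct vdso_timehands *vdso_th,
    struct timecounter *tc)
{

	return (0);	/* decline: no userland fast path for this counter */
}

static struct timecounter mytc_timecounter = {
	.tc_get_timecount = mytc_get_timecount,
	.tc_counter_mask = ~0u,
	.tc_name = "mytc",
	.tc_quality = 800,
	.tc_fill_vdso_timehands = mytc_fill_vdso_timehands,
};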


From laffer1 at midnightbsd.org  Sat Feb  8 15:06:07 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:06:07 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12339] trunk/sys/sys/sysproto.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082006.018K67tU068928@stargazer.midnightbsd.org>

Revision: 12339
          http://svnweb.midnightbsd.org/src/?rev=12339
Author:   laffer1
Date:     2020-02-08 15:06:06 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sysproto.h

Modified: trunk/sys/sys/sysproto.h
===================================================================
--- trunk/sys/sys/sysproto.h	2020-02-08 20:04:46 UTC (rev 12338)
+++ trunk/sys/sys/sysproto.h	2020-02-08 20:06:06 UTC (rev 12339)
@@ -1,8 +1,9 @@
+/* $MidnightBSD$ */
 /*
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $MidnightBSD$
+ * $FreeBSD: stable/11/sys/sys/sysproto.h 330964 2018-03-15 02:20:06Z eadler $
  */
 
 #ifndef _SYS_SYSPROTO_H_
@@ -182,7 +183,7 @@
 struct dup_args {
 	char fd_l_[PADL_(u_int)]; u_int fd; char fd_r_[PADR_(u_int)];
 };
-struct pipe_args {
+struct freebsd10_pipe_args {
 	register_t dummy;
 };
 struct getegid_args {
@@ -531,20 +532,6 @@
 	char a3_l_[PADL_(int)]; int a3; char a3_r_[PADR_(int)];
 	char a4_l_[PADL_(int)]; int a4; char a4_r_[PADR_(int)];
 };
-struct freebsd6_pread_args {
-	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
-	char buf_l_[PADL_(void *)]; void * buf; char buf_r_[PADR_(void *)];
-	char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
-};
-struct freebsd6_pwrite_args {
-	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
-	char buf_l_[PADL_(const void *)]; const void * buf; char buf_r_[PADR_(const void *)];
-	char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
-};
 struct setfib_args {
 	char fibnum_l_[PADL_(int)]; int fibnum; char fibnum_r_[PADR_(int)];
 };
@@ -594,31 +581,6 @@
 	char count_l_[PADL_(u_int)]; u_int count; char count_r_[PADR_(u_int)];
 	char basep_l_[PADL_(long *)]; long * basep; char basep_r_[PADR_(long *)];
 };
-struct freebsd6_mmap_args {
-	char addr_l_[PADL_(caddr_t)]; caddr_t addr; char addr_r_[PADR_(caddr_t)];
-	char len_l_[PADL_(size_t)]; size_t len; char len_r_[PADR_(size_t)];
-	char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)];
-	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
-	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char pos_l_[PADL_(off_t)]; off_t pos; char pos_r_[PADR_(off_t)];
-};
-struct freebsd6_lseek_args {
-	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
-	char whence_l_[PADL_(int)]; int whence; char whence_r_[PADR_(int)];
-};
-struct freebsd6_truncate_args {
-	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)];
-};
-struct freebsd6_ftruncate_args {
-	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
-	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
-	char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)];
-};
 struct sysctl_args {
 	char name_l_[PADL_(int *)]; int * name; char name_r_[PADR_(int *)];
 	char namelen_l_[PADL_(u_int)]; u_int namelen; char namelen_r_[PADR_(u_int)];
@@ -736,6 +698,12 @@
 struct ffclock_getestimate_args {
 	char cest_l_[PADL_(struct ffclock_estimate *)]; struct ffclock_estimate * cest; char cest_r_[PADR_(struct ffclock_estimate *)];
 };
+struct clock_nanosleep_args {
+	char clock_id_l_[PADL_(clockid_t)]; clockid_t clock_id; char clock_id_r_[PADR_(clockid_t)];
+	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+	char rqtp_l_[PADL_(const struct timespec *)]; const struct timespec * rqtp; char rqtp_r_[PADR_(const struct timespec *)];
+	char rmtp_l_[PADL_(struct timespec *)]; struct timespec * rmtp; char rmtp_r_[PADR_(struct timespec *)];
+};
 struct clock_getcpuclockid2_args {
 	char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)];
 	char which_l_[PADL_(int)]; int which; char which_r_[PADR_(int)];
@@ -882,18 +850,6 @@
 struct aio_error_args {
 	char aiocbp_l_[PADL_(struct aiocb *)]; struct aiocb * aiocbp; char aiocbp_r_[PADR_(struct aiocb *)];
 };
-struct oaio_read_args {
-	char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)];
-};
-struct oaio_write_args {
-	char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)];
-};
-struct olio_listio_args {
-	char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)];
-	char acb_list_l_[PADL_(struct oaiocb *const *)]; struct oaiocb *const * acb_list; char acb_list_r_[PADR_(struct oaiocb *const *)];
-	char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)];
-	char sig_l_[PADL_(struct osigevent *)]; struct osigevent * sig; char sig_r_[PADR_(struct osigevent *)];
-};
 struct yield_args {
 	register_t dummy;
 };
@@ -1157,7 +1113,7 @@
 struct getfsstat_args {
 	char buf_l_[PADL_(struct statfs *)]; struct statfs * buf; char buf_r_[PADR_(struct statfs *)];
 	char bufsize_l_[PADL_(long)]; long bufsize; char bufsize_r_[PADR_(long)];
-	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+	char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)];
 };
 struct statfs_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
@@ -1300,12 +1256,6 @@
 	char id_l_[PADL_(long)]; long id; char id_r_[PADR_(long)];
 	char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)];
 };
-struct _umtx_lock_args {
-	char umtx_l_[PADL_(struct umtx *)]; struct umtx * umtx; char umtx_r_[PADR_(struct umtx *)];
-};
-struct _umtx_unlock_args {
-	char umtx_l_[PADL_(struct umtx *)]; struct umtx * umtx; char umtx_r_[PADR_(struct umtx *)];
-};
 struct jail_attach_args {
 	char jid_l_[PADL_(int)]; int jid; char jid_r_[PADR_(int)];
 };
@@ -1834,6 +1784,19 @@
 	char times_l_[PADL_(struct timespec *)]; struct timespec * times; char times_r_[PADR_(struct timespec *)];
 	char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)];
 };
+struct numa_getaffinity_args {
+	char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)];
+	char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)];
+	char policy_l_[PADL_(struct vm_domain_policy_entry *)]; struct vm_domain_policy_entry * policy; char policy_r_[PADR_(struct vm_domain_policy_entry *)];
+};
+struct numa_setaffinity_args {
+	char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)];
+	char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)];
+	char policy_l_[PADL_(const struct vm_domain_policy_entry *)]; const struct vm_domain_policy_entry * policy; char policy_r_[PADR_(const struct vm_domain_policy_entry *)];
+};
+struct fdatasync_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+};
 int	nosys(struct thread *, struct nosys_args *);
 void	sys_sys_exit(struct thread *, struct sys_exit_args *);
 int	sys_fork(struct thread *, struct fork_args *);
@@ -1870,7 +1833,6 @@
 int	sys_kill(struct thread *, struct kill_args *);
 int	sys_getppid(struct thread *, struct getppid_args *);
 int	sys_dup(struct thread *, struct dup_args *);
-int	sys_pipe(struct thread *, struct pipe_args *);
 int	sys_getegid(struct thread *, struct getegid_args *);
 int	sys_profil(struct thread *, struct profil_args *);
 int	sys_ktrace(struct thread *, struct ktrace_args *);
@@ -1946,8 +1908,6 @@
 int	sys_semsys(struct thread *, struct semsys_args *);
 int	sys_msgsys(struct thread *, struct msgsys_args *);
 int	sys_shmsys(struct thread *, struct shmsys_args *);
-int	freebsd6_pread(struct thread *, struct freebsd6_pread_args *);
-int	freebsd6_pwrite(struct thread *, struct freebsd6_pwrite_args *);
 int	sys_setfib(struct thread *, struct setfib_args *);
 int	sys_ntp_adjtime(struct thread *, struct ntp_adjtime_args *);
 int	sys_setgid(struct thread *, struct setgid_args *);
@@ -1961,10 +1921,6 @@
 int	sys_getrlimit(struct thread *, struct __getrlimit_args *);
 int	sys_setrlimit(struct thread *, struct __setrlimit_args *);
 int	sys_getdirentries(struct thread *, struct getdirentries_args *);
-int	freebsd6_mmap(struct thread *, struct freebsd6_mmap_args *);
-int	freebsd6_lseek(struct thread *, struct freebsd6_lseek_args *);
-int	freebsd6_truncate(struct thread *, struct freebsd6_truncate_args *);
-int	freebsd6_ftruncate(struct thread *, struct freebsd6_ftruncate_args *);
 int	sys___sysctl(struct thread *, struct sysctl_args *);
 int	sys_mlock(struct thread *, struct mlock_args *);
 int	sys_munlock(struct thread *, struct munlock_args *);
@@ -1992,6 +1948,7 @@
 int	sys_ffclock_getcounter(struct thread *, struct ffclock_getcounter_args *);
 int	sys_ffclock_setestimate(struct thread *, struct ffclock_setestimate_args *);
 int	sys_ffclock_getestimate(struct thread *, struct ffclock_getestimate_args *);
+int	sys_clock_nanosleep(struct thread *, struct clock_nanosleep_args *);
 int	sys_clock_getcpuclockid2(struct thread *, struct clock_getcpuclockid2_args *);
 int	sys_ntp_gettime(struct thread *, struct ntp_gettime_args *);
 int	sys_minherit(struct thread *, struct minherit_args *);
@@ -2029,9 +1986,6 @@
 int	sys_aio_suspend(struct thread *, struct aio_suspend_args *);
 int	sys_aio_cancel(struct thread *, struct aio_cancel_args *);
 int	sys_aio_error(struct thread *, struct aio_error_args *);
-int	sys_oaio_read(struct thread *, struct oaio_read_args *);
-int	sys_oaio_write(struct thread *, struct oaio_write_args *);
-int	sys_olio_listio(struct thread *, struct olio_listio_args *);
 int	sys_yield(struct thread *, struct yield_args *);
 int	sys_mlockall(struct thread *, struct mlockall_args *);
 int	sys_munlockall(struct thread *, struct munlockall_args *);
@@ -2123,8 +2077,6 @@
 int	sys_thr_exit(struct thread *, struct thr_exit_args *);
 int	sys_thr_self(struct thread *, struct thr_self_args *);
 int	sys_thr_kill(struct thread *, struct thr_kill_args *);
-int	sys__umtx_lock(struct thread *, struct _umtx_lock_args *);
-int	sys__umtx_unlock(struct thread *, struct _umtx_unlock_args *);
 int	sys_jail_attach(struct thread *, struct jail_attach_args *);
 int	sys_extattr_list_fd(struct thread *, struct extattr_list_fd_args *);
 int	sys_extattr_list_file(struct thread *, struct extattr_list_file_args *);
@@ -2230,6 +2182,9 @@
 int	sys_ppoll(struct thread *, struct ppoll_args *);
 int	sys_futimens(struct thread *, struct futimens_args *);
 int	sys_utimensat(struct thread *, struct utimensat_args *);
+int	sys_numa_getaffinity(struct thread *, struct numa_getaffinity_args *);
+int	sys_numa_setaffinity(struct thread *, struct numa_setaffinity_args *);
+int	sys_fdatasync(struct thread *, struct fdatasync_args *);
 
 #ifdef COMPAT_43
 
@@ -2408,7 +2363,7 @@
 struct freebsd4_getfsstat_args {
 	char buf_l_[PADL_(struct ostatfs *)]; struct ostatfs * buf; char buf_r_[PADR_(struct ostatfs *)];
 	char bufsize_l_[PADL_(long)]; long bufsize; char bufsize_r_[PADR_(long)];
-	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+	char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)];
 };
 struct freebsd4_statfs_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
@@ -2466,6 +2421,66 @@
 
 #ifdef COMPAT_FREEBSD6
 
+struct freebsd6_pread_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char buf_l_[PADL_(void *)]; void * buf; char buf_r_[PADR_(void *)];
+	char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
+};
+struct freebsd6_pwrite_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char buf_l_[PADL_(const void *)]; const void * buf; char buf_r_[PADR_(const void *)];
+	char nbyte_l_[PADL_(size_t)]; size_t nbyte; char nbyte_r_[PADR_(size_t)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
+};
+struct freebsd6_mmap_args {
+	char addr_l_[PADL_(caddr_t)]; caddr_t addr; char addr_r_[PADR_(caddr_t)];
+	char len_l_[PADL_(size_t)]; size_t len; char len_r_[PADR_(size_t)];
+	char prot_l_[PADL_(int)]; int prot; char prot_r_[PADR_(int)];
+	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char pos_l_[PADL_(off_t)]; off_t pos; char pos_r_[PADR_(off_t)];
+};
+struct freebsd6_lseek_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char offset_l_[PADL_(off_t)]; off_t offset; char offset_r_[PADR_(off_t)];
+	char whence_l_[PADL_(int)]; int whence; char whence_r_[PADR_(int)];
+};
+struct freebsd6_truncate_args {
+	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)];
+};
+struct freebsd6_ftruncate_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char pad_l_[PADL_(int)]; int pad; char pad_r_[PADR_(int)];
+	char length_l_[PADL_(off_t)]; off_t length; char length_r_[PADR_(off_t)];
+};
+struct freebsd6_aio_read_args {
+	char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)];
+};
+struct freebsd6_aio_write_args {
+	char aiocbp_l_[PADL_(struct oaiocb *)]; struct oaiocb * aiocbp; char aiocbp_r_[PADR_(struct oaiocb *)];
+};
+struct freebsd6_lio_listio_args {
+	char mode_l_[PADL_(int)]; int mode; char mode_r_[PADR_(int)];
+	char acb_list_l_[PADL_(struct oaiocb *const *)]; struct oaiocb *const * acb_list; char acb_list_r_[PADR_(struct oaiocb *const *)];
+	char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)];
+	char sig_l_[PADL_(struct osigevent *)]; struct osigevent * sig; char sig_r_[PADR_(struct osigevent *)];
+};
+int	freebsd6_pread(struct thread *, struct freebsd6_pread_args *);
+int	freebsd6_pwrite(struct thread *, struct freebsd6_pwrite_args *);
+int	freebsd6_mmap(struct thread *, struct freebsd6_mmap_args *);
+int	freebsd6_lseek(struct thread *, struct freebsd6_lseek_args *);
+int	freebsd6_truncate(struct thread *, struct freebsd6_truncate_args *);
+int	freebsd6_ftruncate(struct thread *, struct freebsd6_ftruncate_args *);
+int	freebsd6_aio_read(struct thread *, struct freebsd6_aio_read_args *);
+int	freebsd6_aio_write(struct thread *, struct freebsd6_aio_write_args *);
+int	freebsd6_lio_listio(struct thread *, struct freebsd6_lio_listio_args *);
 
 #endif /* COMPAT_FREEBSD6 */
 
@@ -2494,11 +2509,18 @@
 
 #endif /* COMPAT_FREEBSD7 */
 
+
+#ifdef COMPAT_FREEBSD10
+
+int	freebsd10_pipe(struct thread *, struct freebsd10_pipe_args *);
+
+#endif /* COMPAT_FREEBSD10 */
+
 #define	SYS_AUE_syscall	AUE_NULL
 #define	SYS_AUE_exit	AUE_EXIT
 #define	SYS_AUE_fork	AUE_FORK
-#define	SYS_AUE_read	AUE_NULL
-#define	SYS_AUE_write	AUE_NULL
+#define	SYS_AUE_read	AUE_READ
+#define	SYS_AUE_write	AUE_WRITE
 #define	SYS_AUE_open	AUE_OPEN_RWTC
 #define	SYS_AUE_close	AUE_CLOSE
 #define	SYS_AUE_wait4	AUE_WAIT4
@@ -2535,7 +2557,7 @@
 #define	SYS_AUE_getppid	AUE_GETPPID
 #define	SYS_AUE_olstat	AUE_LSTAT
 #define	SYS_AUE_dup	AUE_DUP
-#define	SYS_AUE_pipe	AUE_PIPE
+#define	SYS_AUE_freebsd10_pipe	AUE_PIPE
 #define	SYS_AUE_getegid	AUE_GETEGID
 #define	SYS_AUE_profil	AUE_PROFILE
 #define	SYS_AUE_ktrace	AUE_KTRACE
@@ -2698,6 +2720,7 @@
 #define	SYS_AUE_ffclock_getcounter	AUE_NULL
 #define	SYS_AUE_ffclock_setestimate	AUE_NULL
 #define	SYS_AUE_ffclock_getestimate	AUE_NULL
+#define	SYS_AUE_clock_nanosleep	AUE_NULL
 #define	SYS_AUE_clock_getcpuclockid2	AUE_NULL
 #define	SYS_AUE_ntp_gettime	AUE_NULL
 #define	SYS_AUE_minherit	AUE_MINHERIT
@@ -2736,9 +2759,9 @@
 #define	SYS_AUE_aio_suspend	AUE_NULL
 #define	SYS_AUE_aio_cancel	AUE_NULL
 #define	SYS_AUE_aio_error	AUE_NULL
-#define	SYS_AUE_oaio_read	AUE_NULL
-#define	SYS_AUE_oaio_write	AUE_NULL
-#define	SYS_AUE_olio_listio	AUE_NULL
+#define	SYS_AUE_freebsd6_aio_read	AUE_NULL
+#define	SYS_AUE_freebsd6_aio_write	AUE_NULL
+#define	SYS_AUE_freebsd6_lio_listio	AUE_NULL
 #define	SYS_AUE_yield	AUE_NULL
 #define	SYS_AUE_mlockall	AUE_MLOCKALL
 #define	SYS_AUE_munlockall	AUE_MUNLOCKALL
@@ -2833,8 +2856,6 @@
 #define	SYS_AUE_thr_exit	AUE_NULL
 #define	SYS_AUE_thr_self	AUE_NULL
 #define	SYS_AUE_thr_kill	AUE_NULL
-#define	SYS_AUE__umtx_lock	AUE_NULL
-#define	SYS_AUE__umtx_unlock	AUE_NULL
 #define	SYS_AUE_jail_attach	AUE_NULL
 #define	SYS_AUE_extattr_list_fd	AUE_EXTATTR_LIST_FD
 #define	SYS_AUE_extattr_list_file	AUE_EXTATTR_LIST_FILE
@@ -2940,6 +2961,9 @@
 #define	SYS_AUE_ppoll	AUE_POLL
 #define	SYS_AUE_futimens	AUE_FUTIMES
 #define	SYS_AUE_utimensat	AUE_FUTIMESAT
+#define	SYS_AUE_numa_getaffinity	AUE_NULL
+#define	SYS_AUE_numa_setaffinity	AUE_NULL
+#define	SYS_AUE_fdatasync	AUE_FSYNC
 
 #undef PAD_
 #undef PADL_
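
Each args structure above pairs with a sys_*() handler that takes the current
thread and the decoded arguments.  As a sketch (not the actual MidnightBSD
implementation), the new fdatasync entry would plug in roughly like this,
delegating to the kern_fsync() helper declared in syscallsubr.h:

int
sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
{

	/* fullsync == false: flush file data, not necessarily all metadata. */
	return (kern_fsync(td, uap->fd, false));
}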


From laffer1 at midnightbsd.org  Sat Feb  8 15:07:15 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:07:15 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12340] trunk/sys/sys/taskqueue.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082007.018K7Fth069005@stargazer.midnightbsd.org>

Revision: 12340
          http://svnweb.midnightbsd.org/src/?rev=12340
Author:   laffer1
Date:     2020-02-08 15:07:14 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/taskqueue.h

Modified: trunk/sys/sys/taskqueue.h
===================================================================
--- trunk/sys/sys/taskqueue.h	2020-02-08 20:06:06 UTC (rev 12339)
+++ trunk/sys/sys/taskqueue.h	2020-02-08 20:07:14 UTC (rev 12340)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/taskqueue.h 315268 2017-03-14 16:00:33Z hselasky $
+ * $FreeBSD: stable/11/sys/sys/taskqueue.h 341154 2018-11-28 17:00:18Z markj $
  */
 
 #ifndef _SYS_TASKQUEUE_H_
@@ -37,8 +37,10 @@
 #include <sys/queue.h>
 #include <sys/_task.h>
 #include <sys/_callout.h>
+#include <sys/_cpuset.h>
 
 struct taskqueue;
+struct taskqgroup;
 struct thread;
 
 struct timeout_task {
@@ -55,6 +57,7 @@
 #define	TASKQUEUE_CALLBACK_TYPE_MIN	TASKQUEUE_CALLBACK_TYPE_INIT
 #define	TASKQUEUE_CALLBACK_TYPE_MAX	TASKQUEUE_CALLBACK_TYPE_SHUTDOWN
 #define	TASKQUEUE_NUM_CALLBACKS		TASKQUEUE_CALLBACK_TYPE_MAX + 1
+#define	TASKQUEUE_NAMELEN		32
 
 typedef void (*taskqueue_callback_fn)(void *context);
 
@@ -72,9 +75,14 @@
 				    void *context);
 int	taskqueue_start_threads(struct taskqueue **tqp, int count, int pri,
 				const char *name, ...) __printflike(4, 5);
+int	taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count,
+	    int pri, cpuset_t *mask, const char *name, ...) __printflike(5, 6);
 int	taskqueue_enqueue(struct taskqueue *queue, struct task *task);
 int	taskqueue_enqueue_timeout(struct taskqueue *queue,
 	    struct timeout_task *timeout_task, int ticks);
+int	taskqueue_enqueue_timeout_sbt(struct taskqueue *queue,
+	    struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr,
+	    int flags);
 int	taskqueue_poll_is_busy(struct taskqueue *queue, struct task *task);
 int	taskqueue_cancel(struct taskqueue *queue, struct task *task,
 	    u_int *pendp);
@@ -84,6 +92,7 @@
 void	taskqueue_drain_timeout(struct taskqueue *queue,
 	    struct timeout_task *timeout_task);
 void	taskqueue_drain_all(struct taskqueue *queue);
+void	taskqueue_quiesce(struct taskqueue *queue);
 void	taskqueue_free(struct taskqueue *queue);
 void	taskqueue_run(struct taskqueue *queue);
 void	taskqueue_block(struct taskqueue *queue);
@@ -142,7 +151,7 @@
 	init;								\
 }									\
 									\
-SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND,		\
+SYSINIT(taskqueue_##name, SI_SUB_TASKQ, SI_ORDER_SECOND,		\
 	taskqueue_define_##name, NULL);					\
 									\
 struct __hack
@@ -167,7 +176,7 @@
 	init;								\
 }									\
 									\
-SYSINIT(taskqueue_##name, SI_SUB_CONFIGURE, SI_ORDER_SECOND,		\
+SYSINIT(taskqueue_##name, SI_SUB_TASKQ, SI_ORDER_SECOND,		\
 	taskqueue_define_##name, NULL);					\
 									\
 struct __hack
@@ -197,7 +206,6 @@
  * from a fast interrupt handler context.
  */
 TASKQUEUE_DECLARE(fast);
-int	taskqueue_enqueue_fast(struct taskqueue *queue, struct task *task);
 struct taskqueue *taskqueue_create_fast(const char *name, int mflags,
 				    taskqueue_enqueue_fn enqueue,
 				    void *context);
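
A sketch of the new taskqueue_start_threads_cpuset() in use, pinning a
private queue's single worker thread to CPU 0.  The queue name and task
handler are made up; taskqueue_thread_enqueue and TASK_INIT come from this
header, while M_WAITOK, PWAIT and the CPU_* macros assume the usual
<sys/malloc.h>, <sys/priority.h> and <sys/cpuset.h> includes.

static struct taskqueue *example_tq;
static struct task example_task;

static void
example_task_fn(void *arg, int pending)
{
	/* deferred work goes here */
}

static void
example_tq_init(void)
{
	cpuset_t mask;

	example_tq = taskqueue_create("example", M_WAITOK,
	    taskqueue_thread_enqueue, &example_tq);
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	taskqueue_start_threads_cpuset(&example_tq, 1, PWAIT, &mask,
	    "example taskq");
	TASK_INIT(&example_task, 0, example_task_fn, NULL);
	taskqueue_enqueue(example_tq, &example_task);
}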


From laffer1 at midnightbsd.org  Sat Feb  8 15:07:52 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:07:52 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12341] trunk/sys/sys/syslimits.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082007.018K7qGg069072@stargazer.midnightbsd.org>

Revision: 12341
          http://svnweb.midnightbsd.org/src/?rev=12341
Author:   laffer1
Date:     2020-02-08 15:07:51 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/syslimits.h

Modified: trunk/sys/sys/syslimits.h
===================================================================
--- trunk/sys/sys/syslimits.h	2020-02-08 20:07:14 UTC (rev 12340)
+++ trunk/sys/sys/syslimits.h	2020-02-08 20:07:51 UTC (rev 12341)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -27,13 +28,13 @@
  * SUCH DAMAGE.
  *
  *	@(#)syslimits.h	8.1 (Berkeley) 6/2/93
- * $MidnightBSD$
+ * $FreeBSD: stable/11/sys/sys/syslimits.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_SYSLIMITS_H_
 #define _SYS_SYSLIMITS_H_
 
-#if !defined(_KERNEL) && !defined(_LIMITS_H_) && !defined(_SYS_PARAM_H_)
+#if !defined(_STANDALONE) && !defined(_KERNEL) && !defined(_LIMITS_H_) && !defined(_SYS_PARAM_H_)
 #ifndef _SYS_CDEFS_H_
 #error this file needs sys/cdefs.h as a prerequisite
 #endif


From laffer1 at midnightbsd.org  Sat Feb  8 15:08:43 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:08:43 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12342] trunk/sys/sys/sem.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082008.018K8hoi069149@stargazer.midnightbsd.org>

Revision: 12342
          http://svnweb.midnightbsd.org/src/?rev=12342
Author:   laffer1
Date:     2020-02-08 15:08:42 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sem.h

Modified: trunk/sys/sys/sem.h
===================================================================
--- trunk/sys/sys/sem.h	2020-02-08 20:07:51 UTC (rev 12341)
+++ trunk/sys/sys/sem.h	2020-02-08 20:08:42 UTC (rev 12342)
@@ -1,5 +1,5 @@
 /* $MidnightBSD$ */
-/* $FreeBSD: stable/10/sys/sys/sem.h 224016 2011-07-14 14:18:14Z bz $ */
+/* $FreeBSD: stable/11/sys/sys/sem.h 347995 2019-05-20 16:31:45Z kib $ */
 /*	$NetBSD: sem.h,v 1.5 1994/06/29 06:45:15 cgd Exp $	*/
 
 /*
@@ -11,6 +11,9 @@
 #ifndef _SYS_SEM_H_
 #define _SYS_SEM_H_
 
+#ifdef _WANT_SYSVSEM_INTERNALS
+#define	_WANT_SYSVIPC_INTERNALS
+#endif
 #include <sys/ipc.h>
 
 #ifndef _PID_T_DECLARED
@@ -38,7 +41,7 @@
 	long		sem_pad1;	/* SVABI/386 says I need this here */
 	time_t		sem_ctime;	/* last change time */
     					/* Times measured in secs since */
-    					/* 00:00:00 GMT, Jan. 1, 1970 */
+    					/* 00:00:00 UTC, Jan. 1, 1970, without leap seconds */
 	long		sem_pad2;	/* SVABI/386 says I need this here */
 	long		sem_pad3[4];	/* SVABI/386 says I need this here */
 };
@@ -51,7 +54,7 @@
 	time_t		sem_otime;	/* last operation time */
 	time_t		sem_ctime;	/* last change time */
     					/* Times measured in secs since */
-    					/* 00:00:00 GMT, Jan. 1, 1970 */
+    					/* 00:00:00 UTC, Jan. 1, 1970, without leap seconds */
 };
 
 /*
@@ -102,8 +105,7 @@
 #define SEM_A		IPC_W	/* alter permission */
 #define SEM_R		IPC_R	/* read permission */
 
-#ifdef _KERNEL
-
+#if defined(_KERNEL) || defined(_WANT_SYSVSEM_INTERNALS)
 /*
  * semaphore info struct
  */
@@ -118,7 +120,6 @@
 		semvmx,		/* semaphore maximum value */
 		semaem;		/* adjust on exit max value */
 };
-extern struct seminfo	seminfo;
 
 /*
  * Kernel wrapper for the user-level structure
@@ -132,13 +133,16 @@
 /* internal "mode" bits */
 #define	SEM_ALLOC	01000	/* semaphore is allocated */
 #define	SEM_DEST	02000	/* semaphore will be destroyed on last detach */
+#endif
 
+#ifdef _KERNEL
+extern struct seminfo	seminfo;
 /*
  * Process sem_undo vectors at proc exit.
  */
 void	semexit(struct proc *p);
 
-#else /* ! _KERNEL */
+#else /* !_KERNEL */
 
 __BEGIN_DECLS
 #if __BSD_VISIBLE
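
The new _WANT_SYSVSEM_INTERNALS knob lets userland tools that need the
formerly _KERNEL-only definitions (struct seminfo and friends) opt in
explicitly.  A sketch of a consumer; the printed value is a placeholder, and
a real tool would obtain the limits via sysctl instead:

#define	_WANT_SYSVSEM_INTERNALS
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

#include <stdio.h>

int
main(void)
{
	struct seminfo si = { 0 };	/* visible thanks to the knob above */

	printf("semvmx placeholder: %d\n", si.semvmx);
	return (0);
}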


From laffer1 at midnightbsd.org  Sat Feb  8 15:08:58 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:08:58 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12343] trunk/sys/sys/stack.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082008.018K8wgq069202@stargazer.midnightbsd.org>

Revision: 12343
          http://svnweb.midnightbsd.org/src/?rev=12343
Author:   laffer1
Date:     2020-02-08 15:08:57 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/stack.h

Modified: trunk/sys/sys/stack.h
===================================================================
--- trunk/sys/sys/stack.h	2020-02-08 20:08:42 UTC (rev 12342)
+++ trunk/sys/sys/stack.h	2020-02-08 20:08:57 UTC (rev 12343)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/stack.h 227581 2011-11-16 19:06:55Z pjd $
+ * $FreeBSD: stable/11/sys/sys/stack.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_STACK_H_
@@ -57,9 +57,10 @@
 #define	CTRSTACK(m, st, depth, cheap)
 #endif
 
-/* MD Routine. */
+/* MD Routines. */
 struct thread;
 void		 stack_save(struct stack *);
 void		 stack_save_td(struct stack *, struct thread *);
+int		 stack_save_td_running(struct stack *, struct thread *);
 
 #endif
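
A sketch of the stack-capture routines in use.  stack_zero() and
stack_print() are assumed from the portions of this header not shown in the
diff; only stack_save_td_running() is new here, and the sketch assumes it
returns 0 when the target thread's stack could be captured while running on
another CPU.

static void
show_stacks(struct thread *other_td)
{
	struct stack st;

	stack_zero(&st);
	stack_save(&st);		/* current thread */
	stack_print(&st);

	if (stack_save_td_running(&st, other_td) == 0)
		stack_print(&st);
}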


From laffer1 at midnightbsd.org  Sat Feb  8 15:09:30 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sat, 8 Feb 2020 15:09:30 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12344] trunk/sys/sys/sysent.h: sync with
 FreeBSD 11-stable
Message-ID: <202002082009.018K9UHA069258@stargazer.midnightbsd.org>

Revision: 12344
          http://svnweb.midnightbsd.org/src/?rev=12344
Author:   laffer1
Date:     2020-02-08 15:09:29 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sysent.h

Modified: trunk/sys/sys/sysent.h
===================================================================
--- trunk/sys/sys/sysent.h	2020-02-08 20:08:57 UTC (rev 12343)
+++ trunk/sys/sys/sysent.h	2020-02-08 20:09:29 UTC (rev 12344)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sysent.h 303395 2016-07-27 16:27:41Z julian $
+ * $FreeBSD: stable/11/sys/sys/sysent.h 346815 2019-04-28 13:16:54Z dchagin $
  */
 
 #ifndef _SYS_SYSENT_H_
@@ -39,19 +39,19 @@
 struct sysent;
 struct thread;
 struct ksiginfo;
+struct syscall_args;
 
+enum systrace_probe_t {
+	SYSTRACE_ENTRY,
+	SYSTRACE_RETURN,
+};
+
 typedef	int	sy_call_t(struct thread *, void *);
 
-/* Used by the machine dependent syscall() code. */
-typedef	void (*systrace_probe_func_t)(u_int32_t, int, struct sysent *, void *,
-    int);
+typedef	void	(*systrace_probe_func_t)(struct syscall_args *,
+		    enum systrace_probe_t, int);
+typedef	void	(*systrace_args_func_t)(int, void *, uint64_t *, int *);
 
-/*
- * Used by loaded syscalls to convert arguments to a DTrace array
- * of 64-bit arguments.
- */
-typedef	void (*systrace_args_func_t)(int, void *, u_int64_t *, int *);
-
 extern systrace_probe_func_t	systrace_probe_func;
 
 struct sysent {			/* system call table */
@@ -77,9 +77,14 @@
 #define	SY_THR_ABSENT	0x4
 #define	SY_THR_INCR	0x8
 
+#ifdef KLD_MODULE
+#define	SY_THR_STATIC_KLD	0
+#else
+#define	SY_THR_STATIC_KLD	SY_THR_STATIC
+#endif
+
 struct image_params;
 struct __sigset;
-struct syscall_args;
 struct trapframe;
 struct vnode;
 
@@ -87,10 +92,8 @@
 	int		sv_size;	/* number of entries */
 	struct sysent	*sv_table;	/* pointer to sysent */
 	u_int		sv_mask;	/* optional mask to index */
-	int		sv_sigsize;	/* size of signal translation table */
-	int		*sv_sigtbl;	/* signal translation table */
 	int		sv_errsize;	/* size of errno translation table */
-	int 		*sv_errtbl;	/* errno translation table */
+	const int 	*sv_errtbl;	/* errno translation table */
 	int		(*sv_transtrap)(int, int);
 					/* translate trap-to-signal mapping */
 	int		(*sv_fixup)(register_t **, struct image_params *);
@@ -99,8 +102,6 @@
 			    		/* send signal */
 	char 		*sv_sigcode;	/* start of sigtramp code */
 	int 		*sv_szsigcode;	/* size of sigtramp code */
-	void		(*sv_prepsyscall)(struct trapframe *, int *, u_int *,
-			    caddr_t *);
 	char		*sv_name;	/* name of binary type */
 	int		(*sv_coredump)(struct thread *, struct vnode *, off_t, int);
 					/* function to dump core, or NULL */
@@ -119,27 +120,28 @@
 	u_long		*sv_maxssiz;
 	u_int		sv_flags;
 	void		(*sv_set_syscall_retval)(struct thread *, int);
-	int		(*sv_fetch_syscall_args)(struct thread *, struct
-			    syscall_args *);
+	int		(*sv_fetch_syscall_args)(struct thread *);
 	const char	**sv_syscallnames;
+	vm_offset_t	sv_timekeep_base;
 	vm_offset_t	sv_shared_page_base;
 	vm_offset_t	sv_shared_page_len;
 	vm_offset_t	sv_sigcode_base;
-	vm_offset_t	sv_timekeep_base;
-	int		sv_timekeep_off;
-	int		sv_timekeep_curr;
-	uint32_t	sv_timekeep_gen;
 	void		*sv_shared_page_obj;
 	void		(*sv_schedtail)(struct thread *);
 	void		(*sv_thread_detach)(struct thread *);
 	int		(*sv_trap)(struct thread *);
+	u_long		*sv_hwcap;	/* Value passed in AT_HWCAP. */
+	u_long		*sv_hwcap2;	/* Value passed in AT_HWCAP2. */
 };
 
-#define	SV_ILP32	0x000100
-#define	SV_LP64		0x000200
-#define	SV_IA32		0x004000
-#define	SV_AOUT		0x008000
-#define	SV_SHP		0x010000
+#define	SV_ILP32	0x000100	/* 32-bit executable. */
+#define	SV_LP64		0x000200	/* 64-bit executable. */
+#define	SV_IA32		0x004000	/* Intel 32-bit executable. */
+#define	SV_AOUT		0x008000	/* a.out executable. */
+#define	SV_SHP		0x010000	/* Shared page. */
+#define	SV_CAPSICUM	0x020000	/* Force cap_enter() on startup. */
+#define	SV_TIMEKEEP	0x040000	/* Shared page timehands. */
+#define	SV_HWCAP	0x080000	/* sv_hwcap field is valid. */
 
 #define	SV_ABI_MASK	0xff
 #define	SV_ABI_ERRNO(p, e)	((p)->p_sysent->sv_errsize <= 0 ? e :	\
@@ -151,6 +153,7 @@
 /* same as ELFOSABI_XXX, to prevent header pollution */
 #define	SV_ABI_LINUX	3
 #define	SV_ABI_FREEBSD 	9
+#define	SV_ABI_CLOUDABI	17
 #define	SV_ABI_UNDEF	255
 
 #ifdef _KERNEL
@@ -158,7 +161,7 @@
 extern struct sysent sysent[];
 extern const char *syscallnames[];
 
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
 extern int i386_read_exec;
 #endif
 
@@ -172,6 +175,7 @@
 	int	*offset;		/* offset into sysent */
 	struct sysent *new_sysent;	/* new sysent */
 	struct sysent old_sysent;	/* old sysent */
+	int	flags;			/* flags for syscall_register */
 };
 
 /* separate initialization vector so it can be used in a substructure */
@@ -230,33 +234,39 @@
 	int syscall_no;
 	int registered;
 };
-#define SYSCALL_INIT_HELPER(syscallname) {			\
+#define SYSCALL_INIT_HELPER_F(syscallname, flags) {		\
     .new_sysent = {						\
 	.sy_narg = (sizeof(struct syscallname ## _args )	\
 	    / sizeof(register_t)),				\
 	.sy_call = (sy_call_t *)& sys_ ## syscallname,		\
-	.sy_auevent = SYS_AUE_##syscallname			\
+	.sy_auevent = SYS_AUE_##syscallname,			\
+	.sy_flags = (flags)					\
     },								\
     .syscall_no = SYS_##syscallname				\
 }
-#define SYSCALL_INIT_HELPER_COMPAT(syscallname) {		\
+#define SYSCALL_INIT_HELPER_COMPAT_F(syscallname, flags) {	\
     .new_sysent = {						\
 	.sy_narg = (sizeof(struct syscallname ## _args )	\
 	    / sizeof(register_t)),				\
 	.sy_call = (sy_call_t *)& syscallname,			\
-	.sy_auevent = SYS_AUE_##syscallname			\
+	.sy_auevent = SYS_AUE_##syscallname,			\
+	.sy_flags = (flags)					\
     },								\
     .syscall_no = SYS_##syscallname				\
 }
+#define SYSCALL_INIT_HELPER(syscallname)			\
+    SYSCALL_INIT_HELPER_F(syscallname, 0)
+#define SYSCALL_INIT_HELPER_COMPAT(syscallname)			\
+    SYSCALL_INIT_HELPER_COMPAT_F(syscallname, 0)
 #define SYSCALL_INIT_LAST {					\
     .syscall_no = NO_SYSCALL					\
 }
 
 int	syscall_register(int *offset, struct sysent *new_sysent,
-	    struct sysent *old_sysent);
+	    struct sysent *old_sysent, int flags);
 int	syscall_deregister(int *offset, struct sysent *old_sysent);
 int	syscall_module_handler(struct module *mod, int what, void *arg);
-int	syscall_helper_register(struct syscall_helper_data *sd);
+int	syscall_helper_register(struct syscall_helper_data *sd, int flags);
 int	syscall_helper_unregister(struct syscall_helper_data *sd);
 
 struct proc;
@@ -275,6 +285,7 @@
 int shared_page_fill(int size, int align, const void *data);
 void shared_page_write(int base, int size, const void *data);
 void exec_sysvec_init(void *param);
+void exec_inittk(void);
 
 #define INIT_SYSENTVEC(name, sv)					\
     SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY,				\
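
A sketch of a loadable module registering a syscall with the updated
syscall_helper_register(), which now takes a flags argument.  The "mysyscall"
name (and the SYS_mysyscall number, struct mysyscall_args and sys_mysyscall
handler that SYSCALL_INIT_HELPER expands to) is hypothetical; only
SY_THR_STATIC_KLD and the helper machinery come from this header.

static struct syscall_helper_data mysyscall_helpers[] = {
	SYSCALL_INIT_HELPER(mysyscall),
	SYSCALL_INIT_LAST
};

static int
mysyscall_mod_init(void)
{

	return (syscall_helper_register(mysyscall_helpers,
	    SY_THR_STATIC_KLD));
}

static void
mysyscall_mod_fini(void)
{

	(void)syscall_helper_unregister(mysyscall_helpers);
}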


From laffer1 at midnightbsd.org  Sun Feb  9 11:49:32 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 11:49:32 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12345] trunk/sys/sys/spigenio.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091649.019GnWGP073936@stargazer.midnightbsd.org>

Revision: 12345
          http://svnweb.midnightbsd.org/src/?rev=12345
Author:   laffer1
Date:     2020-02-09 11:49:31 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Added Paths:
-----------
    trunk/sys/sys/spigenio.h

Added: trunk/sys/sys/spigenio.h
===================================================================
--- trunk/sys/sys/spigenio.h	                        (rev 0)
+++ trunk/sys/sys/spigenio.h	2020-02-09 16:49:31 UTC (rev 12345)
@@ -0,0 +1,55 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	$FreeBSD: stable/11/sys/sys/spigenio.h 332942 2018-04-24 17:00:08Z ian $
+ */
+
+#ifndef _SYS_SPIGENIO_H_
+#define _SYS_SPIGENIO_H_
+
+#include <sys/_iovec.h>
+
+struct spigen_transfer {
+	struct iovec st_command; /* master to slave */
+	struct iovec st_data;    /* slave to master and/or master to slave */
+};
+
+struct spigen_transfer_mmapped {
+	size_t stm_command_length; /* at offset 0 in mmap(2) area */
+	size_t stm_data_length;    /* at offset stm_command_length */
+};
+
+#define SPIGENIOC_BASE     'S'
+#define SPIGENIOC_TRANSFER 	   _IOW(SPIGENIOC_BASE, 0, \
+	    struct spigen_transfer)
+#define SPIGENIOC_TRANSFER_MMAPPED _IOW(SPIGENIOC_BASE, 1, \
+	    struct spigen_transfer_mmapped)
+#define SPIGENIOC_GET_CLOCK_SPEED  _IOR(SPIGENIOC_BASE, 2, uint32_t)
+#define SPIGENIOC_SET_CLOCK_SPEED  _IOW(SPIGENIOC_BASE, 3, uint32_t)
+#define SPIGENIOC_GET_SPI_MODE     _IOR(SPIGENIOC_BASE, 4, uint32_t)
+#define SPIGENIOC_SET_SPI_MODE     _IOW(SPIGENIOC_BASE, 5, uint32_t)
+
+#endif /* !_SYS_SPIGENIO_H_ */


Property changes on: trunk/sys/sys/spigenio.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
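
A userland sketch of the spigen ioctl interface added above, sending one
command byte and reading three bytes back.  The /dev/spigen0 path and the
0x9f command byte are assumptions; only the structures and ioctl names come
from the new header.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/spigenio.h>
#include <sys/uio.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int
main(void)
{
	struct spigen_transfer st;
	uint8_t cmd = 0x9f;	/* assumed command byte */
	uint8_t data[3];
	int fd;

	fd = open("/dev/spigen0", O_RDWR);	/* device path is an assumption */
	if (fd == -1)
		err(1, "open");

	st.st_command.iov_base = &cmd;
	st.st_command.iov_len = sizeof(cmd);
	st.st_data.iov_base = data;
	st.st_data.iov_len = sizeof(data);
	if (ioctl(fd, SPIGENIOC_TRANSFER, &st) == -1)
		err(1, "SPIGENIOC_TRANSFER");

	close(fd);
	return (0);
}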

From laffer1 at midnightbsd.org  Sun Feb  9 12:03:30 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:03:30 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12346] trunk/sys/sys/syscallsubr.h: sync
 with FreeBSD 11-stable
Message-ID: <202002091703.019H3UID076491@stargazer.midnightbsd.org>

Revision: 12346
          http://svnweb.midnightbsd.org/src/?rev=12346
Author:   laffer1
Date:     2020-02-09 12:03:29 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/syscallsubr.h

Modified: trunk/sys/sys/syscallsubr.h
===================================================================
--- trunk/sys/sys/syscallsubr.h	2020-02-09 16:49:31 UTC (rev 12345)
+++ trunk/sys/sys/syscallsubr.h	2020-02-09 17:03:29 UTC (rev 12346)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/syscallsubr.h 321009 2017-07-15 14:48:31Z dchagin $
+ * $FreeBSD: stable/11/sys/sys/syscallsubr.h 356634 2020-01-11 15:06:06Z kevans $
  */
 
 #ifndef _SYS_SYSCALLSUBR_H_
@@ -34,8 +34,10 @@
 #include <sys/socket.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
+#include <sys/_cpuset.h>
 
 struct file;
+struct filecaps;
 enum idtype;
 struct itimerval;
 struct image_args;
@@ -59,6 +61,8 @@
 struct sched_param;
 struct __wrusage;
 
+typedef int (*mmap_check_fp_fn)(struct file *, int, int, int);
+
 int	kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg,
 	    u_int buflen, u_int path_max);
 int	kern_accept(struct thread *td, int s, struct sockaddr **name,
@@ -65,8 +69,6 @@
 	    socklen_t *namelen, struct file **fp);
 int	kern_accept4(struct thread *td, int s, struct sockaddr **name,
 	    socklen_t *namelen, int flags, struct file **fp);
-int	kern_access(struct thread *td, char *path, enum uio_seg pathseg,
-	    int flags);
 int	kern_accessat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int flags, int mode);
 int	kern_adjtime(struct thread *td, struct timeval *delta,
@@ -73,14 +75,11 @@
 	    struct timeval *olddelta);
 int	kern_alternate_path(struct thread *td, const char *prefix, const char *path,
 	    enum uio_seg pathseg, char **pathbuf, int create, int dirfd);
-int	kern_bind(struct thread *td, int fd, struct sockaddr *sa);
+int	kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa);
 int	kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds,
 	    size_t ncmds);
+int	kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights);
 int	kern_chdir(struct thread *td, char *path, enum uio_seg pathseg);
-int	kern_chmod(struct thread *td, char *path, enum uio_seg pathseg,
-	    int mode);
-int	kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
-	    int gid);
 int	kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
 	    clockid_t *clk_id);
 int	kern_clock_getres(struct thread *td, clockid_t clock_id,
@@ -87,12 +86,23 @@
 	    struct timespec *ts);
 int	kern_clock_gettime(struct thread *td, clockid_t clock_id,
 	    struct timespec *ats);
+int	kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
+	    const struct timespec *rqtp, struct timespec *rmtp);
 int	kern_clock_settime(struct thread *td, clockid_t clock_id,
 	    struct timespec *ats);
 int	kern_close(struct thread *td, int fd);
-int	kern_connect(struct thread *td, int fd, struct sockaddr *sa);
-int	kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg,
-	    int flags);
+int	kern_connectat(struct thread *td, int dirfd, int fd,
+	    struct sockaddr *sa);
+int	kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
+	    cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
+int	kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
+	    cpuwhich_t which, id_t id, size_t cpusetsize,
+	    const cpuset_t *maskp);
+int	kern_cpuset_getid(struct thread *td, cpulevel_t level,
+	    cpuwhich_t which, id_t id, cpusetid_t *setid);
+int	kern_cpuset_setid(struct thread *td, cpuwhich_t which,
+	    id_t id, cpusetid_t setid);
+int	kern_dup(struct thread *td, u_int mode, int flags, int old, int new);
 int	kern_execve(struct thread *td, struct image_args *args,
 	    struct mac *mac_p);
 int	kern_fchmodat(struct thread *td, int fd, char *path,
@@ -103,8 +113,10 @@
 int	kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg);
 int	kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf);
 int	kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf);
+int	kern_fpathconf(struct thread *td, int fd, int name);
 int	kern_fstat(struct thread *td, int fd, struct stat *sbp);
 int	kern_fstatfs(struct thread *td, int fd, struct statfs *buf);
+int	kern_fsync(struct thread *td, int fd, bool fullsync);
 int	kern_ftruncate(struct thread *td, int fd, off_t length);
 int	kern_futimes(struct thread *td, int fd, struct timeval *tptr,
 	    enum uio_seg tptrseg);
@@ -113,9 +125,9 @@
 int	kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
 	    long *basep, ssize_t *residp, enum uio_seg bufseg);
 int	kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
-	    enum uio_seg bufseg, int flags);
-int	kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups);
+	    size_t *countp, enum uio_seg bufseg, int mode);
 int	kern_getitimer(struct thread *, u_int, struct itimerval *);
+int	kern_getppid(struct thread *);
 int	kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
 	    socklen_t *alen);
 int	kern_getrusage(struct thread *td, int who, struct rusage *rup);
@@ -129,52 +141,56 @@
 int	kern_jail_set(struct thread *td, struct uio *options, int flags);
 int	kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
 	    struct kevent_copyops *k_ops, const struct timespec *timeout);
+int	kern_kevent_anonymous(struct thread *td, int nevents,
+	    struct kevent_copyops *k_ops);
 int	kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
 	    int nevents, struct kevent_copyops *k_ops,
 	    const struct timespec *timeout);
-int	kern_kqueue(struct thread *td, int flags);
+int	kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
 int	kern_kldload(struct thread *td, const char *file, int *fileid);
 int	kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
 int	kern_kldunload(struct thread *td, int fileid, int flags);
-int	kern_lchown(struct thread *td, char *path, enum uio_seg pathseg,
-	    int uid, int gid);
-int	kern_link(struct thread *td, char *path, char *link,
-	    enum uio_seg segflg);
 int	kern_linkat(struct thread *td, int fd1, int fd2, char *path1,
 	    char *path2, enum uio_seg segflg, int follow);
-int	kern_lstat(struct thread *td, char *path, enum uio_seg pathseg,
-	    struct stat *sbp);
+int	kern_listen(struct thread *td, int s, int backlog);
+int	kern_lseek(struct thread *td, int fd, off_t offset, int whence);
 int	kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
 	    struct timeval *tptr, enum uio_seg tptrseg);
-int	kern_mkdir(struct thread *td, char *path, enum uio_seg segflg,
-	    int mode);
+int	kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav);
+int	kern_mincore(struct thread *td, uintptr_t addr, size_t len, char *vec);
 int	kern_mkdirat(struct thread *td, int fd, char *path,
 	    enum uio_seg segflg, int mode);
-int	kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg,
-	    int mode);
 int	kern_mkfifoat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int mode);
-int	kern_mknod(struct thread *td, char *path, enum uio_seg pathseg,
-	    int mode, int dev);
 int	kern_mknodat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int mode, int dev);
+int	kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr,
+	    size_t len);
+int	kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot,
+	    int flags, int fd, off_t pos);
+int	kern_mmap_fpcheck(struct thread *td, uintptr_t addr, size_t len,
+	    int prot, int flags, int fd, off_t pos,
+	    mmap_check_fp_fn check_fp_fn);
+int	kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
 int	kern_msgctl(struct thread *, int, int, struct msqid_ds *);
 int	kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
 int	kern_msgsnd(struct thread *, int, const void *, size_t, int, long);
+int	kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags);
+int	kern_munlock(struct thread *td, uintptr_t addr, size_t size);
+int	kern_munmap(struct thread *td, uintptr_t addr, size_t size);
 int     kern_nanosleep(struct thread *td, struct timespec *rqt,
 	    struct timespec *rmt);
 int	kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
 	    long *ploff);
-int	kern_open(struct thread *td, char *path, enum uio_seg pathseg,
-	    int flags, int mode);
 int	kern_openat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int flags, int mode);
 int	kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg,
 	    int name, u_long flags);
-int	kern_pipe(struct thread *td, int fildes[2]);
-int	kern_pipe2(struct thread *td, int fildes[2], int flags);
+int	kern_pipe(struct thread *td, int fildes[2], int flags,
+	    struct filecaps *fcaps1, struct filecaps *fcaps2);
 int	kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
 	    struct timespec *tsp, sigset_t *uset);
+int	kern_posix_error(struct thread *td, int error);
 int	kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
 	    int advice);
 int	kern_posix_fallocate(struct thread *td, int fd, off_t offset,
@@ -181,24 +197,23 @@
 	    off_t len);
 int	kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com,
 	    void *data);
+int	kern_pread(struct thread *td, int fd, void *buf, size_t nbyte,
+	    off_t offset);
 int	kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset);
 int	kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
 	    fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);
 int	kern_ptrace(struct thread *td, int req, pid_t pid, void *addr,
 	    int data);
+int	kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
+	    off_t offset);
 int	kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset);
-int	kern_readlink(struct thread *td, char *path, enum uio_seg pathseg,
-	    char *buf, enum uio_seg bufseg, size_t count);
 int	kern_readlinkat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count);
 int	kern_readv(struct thread *td, int fd, struct uio *auio);
 int	kern_recvit(struct thread *td, int s, struct msghdr *mp,
 	    enum uio_seg fromseg, struct mbuf **controlp);
-int	kern_rename(struct thread *td, char *from, char *to,
-	    enum uio_seg pathseg);
 int	kern_renameat(struct thread *td, int oldfd, char *old, int newfd,
 	    char *new, enum uio_seg pathseg);
-int	kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg);
 int	kern_rmdirat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg);
 int	kern_sched_getparam(struct thread *td, struct thread *targettd,
@@ -229,11 +244,14 @@
 	    void *optval, enum uio_seg valseg, socklen_t valsize);
 int	kern_settimeofday(struct thread *td, struct timeval *tv,
 	    struct timezone *tzp);
+int	kern_shm_open(struct thread *td, const char *userpath, int flags,
+	    mode_t mode, struct filecaps *fcaps);
 int	kern_shmat(struct thread *td, int shmid, const void *shmaddr,
 	    int shmflg);
 int	kern_shmctl(struct thread *td, int shmid, int cmd, void *buf,
 	    size_t *bufsz);
-int	kern_sigaction(struct thread *td, int sig, struct sigaction *act,
+int	kern_shutdown(struct thread *td, int s, int how);
+int	kern_sigaction(struct thread *td, int sig, const struct sigaction *act,
 	    struct sigaction *oact, int flags);
 int	kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss);
 int	kern_sigprocmask(struct thread *td, int how,
@@ -241,19 +259,14 @@
 int	kern_sigsuspend(struct thread *td, sigset_t mask);
 int	kern_sigtimedwait(struct thread *td, sigset_t waitset,
 	    struct ksiginfo *ksi, struct timespec *timeout);
-int	kern_stat(struct thread *td, char *path, enum uio_seg pathseg,
-	    struct stat *sbp);
 int	kern_sigqueue(struct thread *td, pid_t pid, int signum,
 	    union sigval *value);
+int	kern_socket(struct thread *td, int domain, int type, int protocol);
 int	kern_statat(struct thread *td, int flag, int fd, char *path,
-	    enum uio_seg pathseg, struct stat *sbp);
-int	kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
 	    enum uio_seg pathseg, struct stat *sbp,
 	    void (*hook)(struct vnode *vp, struct stat *sbp));
 int	kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
 	    struct statfs *buf);
-int	kern_symlink(struct thread *td, char *path, char *link,
-	    enum uio_seg segflg);
 int	kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
 	    enum uio_seg segflg);
 int	kern_ktimer_create(struct thread *td, clockid_t clock_id,
@@ -270,11 +283,8 @@
 int	kern_thr_suspend(struct thread *td, struct timespec *tsp);
 int	kern_truncate(struct thread *td, char *path, enum uio_seg pathseg,
 	    off_t length);
-int	kern_unlink(struct thread *td, char *path, enum uio_seg pathseg);
 int	kern_unlinkat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, ino_t oldinum);
-int	kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
-	    struct timeval *tptr, enum uio_seg tptrseg);
 int	kern_utimesat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg);
 int	kern_utimensat(struct thread *td, int fd, char *path,
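
For reference, a minimal kernel-side sketch (not part of this change; the
compat_* wrapper names are hypothetical) of how callers adapt to the revised
prototypes above: the removed non-"at" helpers are replaced by their *at
counterparts with AT_FDCWD, and the added struct filecaps arguments may be
passed as NULL when no capability restrictions apply.

#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/syscallsubr.h>

static int
compat_open_example(struct thread *td, char *upath, int flags, int mode)
{
	/* kern_open() is gone; route through kern_openat() with AT_FDCWD. */
	return (kern_openat(td, AT_FDCWD, upath, UIO_USERSPACE, flags, mode));
}

static int
compat_kqueue_example(struct thread *td)
{
	/* kern_kqueue() now takes a struct filecaps *; NULL means no caps. */
	return (kern_kqueue(td, 0, NULL));
}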


From laffer1 at midnightbsd.org  Sun Feb  9 12:05:27 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:05:27 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12347] trunk/sys/sys/timex.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091705.019H5RcO077226@stargazer.midnightbsd.org>

Revision: 12347
          http://svnweb.midnightbsd.org/src/?rev=12347
Author:   laffer1
Date:     2020-02-09 12:05:26 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/timex.h

Modified: trunk/sys/sys/timex.h
===================================================================
--- trunk/sys/sys/timex.h	2020-02-09 17:03:29 UTC (rev 12346)
+++ trunk/sys/sys/timex.h	2020-02-09 17:05:26 UTC (rev 12347)
@@ -3,6 +3,7 @@
  ***********************************************************************
  *								       *
  * Copyright (c) David L. Mills 1993-2001			       *
+ * Copyright (c) Poul-Henning Kamp 2000-2001                           *
  *								       *
  * Permission to use, copy, modify, and distribute this software and   *
  * its documentation for any purpose and without fee is hereby	       *
@@ -16,94 +17,30 @@
  * purpose. It is provided "as is" without express or implied	       *
  * warranty.							       *
  *								       *
- **********************************************************************/
-
-/*
- * Modification history timex.h
+ ***********************************************************************
  *
- * 16 Aug 00	David L. Mills
- *	API Version 4. Added MOD_TAI and tai member of ntptimeval
- *	structure.
+ * $FreeBSD: stable/11/sys/sys/timex.h 298981 2016-05-03 15:14:17Z pfg $
  *
- * 17 Nov 98	David L. Mills
- *	Revised for nanosecond kernel and user interface.
- *
- * 26 Sep 94	David L. Mills
- *	Added defines for hybrid phase/frequency-lock loop.
- *
- * 19 Mar 94	David L. Mills
- *	Moved defines from kernel routines to header file and added new
- *	defines for PPS phase-lock loop.
- *
- * 20 Feb 94	David L. Mills
- *	Revised status codes and structures for external clock and PPS
- *	signal discipline.
- *
- * 28 Nov 93	David L. Mills
- *	Adjusted parameters to improve stability and increase poll
- *	interval.
- *
- * 17 Sep 93    David L. Mills
- *      Created file
- *
- * $FreeBSD: stable/10/sys/sys/timex.h 250889 2013-05-21 21:50:11Z ed $
- */
-/*
  * This header file defines the Network Time Protocol (NTP) interfaces
- * for user and daemon application programs. These are implemented using
- * defined syscalls and data structures and require specific kernel
- * support.
+ * for user and daemon application programs.
  *
- * The original precision time kernels developed from 1993 have an
- * ultimate resolution of one microsecond; however, the most recent
- * kernels have an ultimate resolution of one nanosecond. In these
- * kernels, a ntp_adjtime() syscalls can be used to determine which
- * resolution is in use and to select either one at any time. The
- * resolution selected affects the scaling of certain fields in the
- * ntp_gettime() and ntp_adjtime() syscalls, as described below.
+ * This file was originally created 17 Sep 93 by David L. Mills, Professor
+ * of University of Delaware, building on work which had already been ongoing
+ * for a decade and a half at that point in time.
  *
- * NAME
- *	ntp_gettime - NTP user application interface
+ * In 2000 the APIs got a upgrade from microseconds to nanoseconds,
+ * a joint work between Poul-Henning Kamp and David L. Mills.
  *
- * SYNOPSIS
- *	#include <sys/timex.h>
- *
- *	int ntp_gettime(struct ntptimeval *ntv);
- *
- * DESCRIPTION
- *	The time returned by ntp_gettime() is in a timespec structure,
- *	but may be in either microsecond (seconds and microseconds) or
- *	nanosecond (seconds and nanoseconds) format. The particular
- *	format in use is determined by the STA_NANO bit of the status
- *	word returned by the ntp_adjtime() syscall.
- *
- * NAME
- *	ntp_adjtime - NTP daemon application interface
- *
- * SYNOPSIS
- *	#include <sys/timex.h>
- *	#include <sys/syscall.h>
- *
- *	int syscall(SYS_ntp_adjtime, tptr);
- *	int SYS_ntp_adjtime;
- *	struct timex *tptr;
- *
- * DESCRIPTION
- *	Certain fields of the timex structure are interpreted in either
- *	microseconds or nanoseconds according to the state of the
- *	STA_NANO bit in the status word. See the description below for
- *	further information.
  */
+
 #ifndef _SYS_TIMEX_H_
 #define _SYS_TIMEX_H_ 1
-#define NTP_API		4	/* NTP API version */
 
+#define NTP_API		4		/* NTP API version */
+
 #ifdef __MidnightBSD__
 #include <sys/_timespec.h>
 #endif /* __MidnightBSD__ */
-#ifndef MSDOS			/* Microsoft specific */
-#include <sys/syscall.h>
-#endif /* MSDOS */
 
 /*
  * The following defines establish the performance envelope of the
@@ -114,98 +51,93 @@
  * mode. Between these two limits the operating mode is selected by the
  * STA_FLL bit in the status word.
  */
-#define MAXPHASE	500000000L /* max phase error (ns) */
-#define MAXFREQ		500000L	/* max freq error (ns/s) */
-#define MINSEC		256	/* min FLL update interval (s) */
-#define MAXSEC		2048	/* max PLL update interval (s) */
-#define NANOSECOND	1000000000L /* nanoseconds in one second */
-#define SCALE_PPM	(65536 / 1000) /* crude ns/s to scaled PPM */
-#define MAXTC		10	/* max time constant */
 
+#define MAXPHASE	500000000L	/* max phase error (ns) */
+#define MAXFREQ		500000L		/* max freq error (ns/s) */
+#define MINSEC		256		/* min FLL update interval (s) */
+#define MAXSEC		2048		/* max PLL update interval (s) */
+#define NANOSECOND	1000000000L	/* nanoseconds in one second */
+#define SCALE_PPM	(65536 / 1000)	/* crude ns/s to scaled PPM */
+#define MAXTC		10		/* max time constant */
+
 /*
- * The following defines and structures define the user interface for
- * the ntp_gettime() and ntp_adjtime() syscalls.
- *
  * Control mode codes (timex.modes)
  */
-#define MOD_OFFSET	0x0001	/* set time offset */
-#define MOD_FREQUENCY	0x0002	/* set frequency offset */
-#define MOD_MAXERROR	0x0004	/* set maximum time error */
-#define MOD_ESTERROR	0x0008	/* set estimated time error */
-#define MOD_STATUS	0x0010	/* set clock status bits */
-#define MOD_TIMECONST	0x0020	/* set PLL time constant */
-#define MOD_PPSMAX	0x0040	/* set PPS maximum averaging time */
-#define MOD_TAI		0x0080	/* set TAI offset */
-#define	MOD_MICRO	0x1000	/* select microsecond resolution */
-#define	MOD_NANO	0x2000	/* select nanosecond resolution */
-#define MOD_CLKB	0x4000	/* select clock B */
-#define MOD_CLKA	0x8000	/* select clock A */
+#define MOD_OFFSET	0x0001		/* set time offset */
+#define MOD_FREQUENCY	0x0002		/* set frequency offset */
+#define MOD_MAXERROR	0x0004		/* set maximum time error */
+#define MOD_ESTERROR	0x0008		/* set estimated time error */
+#define MOD_STATUS	0x0010		/* set clock status bits */
+#define MOD_TIMECONST	0x0020		/* set PLL time constant */
+#define MOD_PPSMAX	0x0040		/* set PPS maximum averaging time */
+#define MOD_TAI		0x0080		/* set TAI offset */
+#define	MOD_MICRO	0x1000		/* select microsecond resolution */
+#define	MOD_NANO	0x2000		/* select nanosecond resolution */
+#define MOD_CLKB	0x4000		/* select clock B */
+#define MOD_CLKA	0x8000		/* select clock A */
 
 /*
  * Status codes (timex.status)
  */
-#define STA_PLL		0x0001	/* enable PLL updates (rw) */
-#define STA_PPSFREQ	0x0002	/* enable PPS freq discipline (rw) */
-#define STA_PPSTIME	0x0004	/* enable PPS time discipline (rw) */
-#define STA_FLL		0x0008	/* enable FLL mode (rw) */
-#define STA_INS		0x0010	/* insert leap (rw) */
-#define STA_DEL		0x0020	/* delete leap (rw) */
-#define STA_UNSYNC	0x0040	/* clock unsynchronized (rw) */
-#define STA_FREQHOLD	0x0080	/* hold frequency (rw) */
-#define STA_PPSSIGNAL	0x0100	/* PPS signal present (ro) */
-#define STA_PPSJITTER	0x0200	/* PPS signal jitter exceeded (ro) */
-#define STA_PPSWANDER	0x0400	/* PPS signal wander exceeded (ro) */
-#define STA_PPSERROR	0x0800	/* PPS signal calibration error (ro) */
-#define STA_CLOCKERR	0x1000	/* clock hardware fault (ro) */
-#define STA_NANO	0x2000	/* resolution (0 = us, 1 = ns) (ro) */
-#define STA_MODE	0x4000	/* mode (0 = PLL, 1 = FLL) (ro) */
-#define STA_CLK		0x8000	/* clock source (0 = A, 1 = B) (ro) */
+#define STA_PLL		0x0001		/* enable PLL updates (rw) */
+#define STA_PPSFREQ	0x0002		/* enable PPS freq discipline (rw) */
+#define STA_PPSTIME	0x0004		/* enable PPS time discipline (rw) */
+#define STA_FLL		0x0008		/* enable FLL mode (rw) */
+#define STA_INS		0x0010		/* insert leap (rw) */
+#define STA_DEL		0x0020		/* delete leap (rw) */
+#define STA_UNSYNC	0x0040		/* clock unsynchronized (rw) */
+#define STA_FREQHOLD	0x0080		/* hold frequency (rw) */
+#define STA_PPSSIGNAL	0x0100		/* PPS signal present (ro) */
+#define STA_PPSJITTER	0x0200		/* PPS signal jitter exceeded (ro) */
+#define STA_PPSWANDER	0x0400		/* PPS signal wander exceeded (ro) */
+#define STA_PPSERROR	0x0800		/* PPS signal calibration error (ro) */
+#define STA_CLOCKERR	0x1000		/* clock hardware fault (ro) */
+#define STA_NANO	0x2000		/* resolution (0 = us, 1 = ns) (ro) */
+#define STA_MODE	0x4000		/* mode (0 = PLL, 1 = FLL) (ro) */
+#define STA_CLK		0x8000		/* clock source (0 = A, 1 = B) (ro) */
 
 #define STA_RONLY (STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | \
     STA_PPSERROR | STA_CLOCKERR | STA_NANO | STA_MODE | STA_CLK)
 
 /*
- * Clock states (time_state)
+ * Clock states (ntptimeval.time_state)
  */
-#define TIME_OK		0	/* no leap second warning */
-#define TIME_INS	1	/* insert leap second warning */
-#define TIME_DEL	2	/* delete leap second warning */
-#define TIME_OOP	3	/* leap second in progress */
-#define TIME_WAIT	4	/* leap second has occured */
-#define TIME_ERROR	5	/* error (see status word) */
+#define TIME_OK		0		/* no leap second warning */
+#define TIME_INS	1		/* insert leap second warning */
+#define TIME_DEL	2		/* delete leap second warning */
+#define TIME_OOP	3		/* leap second in progress */
+#define TIME_WAIT	4		/* leap second has occurred */
+#define TIME_ERROR	5		/* error (see status word) */
 
 /*
- * NTP user interface (ntp_gettime()) - used to read kernel clock values
- *
- * Note: The time member is in microseconds if STA_NANO is zero and
- * nanoseconds if not.
+ * NTP user interface -- ntp_gettime(2) - used to read kernel clock values
  */
 struct ntptimeval {
-	struct timespec time;	/* current time (ns) (ro) */
-	long maxerror;		/* maximum error (us) (ro) */
-	long esterror;		/* estimated error (us) (ro) */
-	long tai;		/* TAI offset */
-	int time_state;		/* time status */
+	struct timespec time;		/* current time (ns) (ro) */
+	long maxerror;			/* maximum error (us) (ro) */
+	long esterror;			/* estimated error (us) (ro) */
+	long tai;			/* TAI offset */
+	int time_state;			/* time status */
 };
 
 /*
- * NTP daemon interface (ntp_adjtime()) - used to discipline CPU clock
- * oscillator and determine status.
+ * NTP daemon interface -- ntp_adjtime(2) -- used to discipline CPU clock
+ * oscillator and control/determine status.
  *
  * Note: The offset, precision and jitter members are in microseconds if
  * STA_NANO is zero and nanoseconds if not.
  */
 struct timex {
-	unsigned int modes;	/* clock mode bits (wo) */
-	long	offset;		/* time offset (ns/us) (rw) */
-	long	freq;		/* frequency offset (scaled PPM) (rw) */
-	long	maxerror;	/* maximum error (us) (rw) */
-	long	esterror;	/* estimated error (us) (rw) */
-	int	status;		/* clock status bits (rw) */
-	long	constant;	/* poll interval (log2 s) (rw) */
-	long	precision;	/* clock precision (ns/us) (ro) */
-	long	tolerance;	/* clock frequency tolerance (scaled
-				 * PPM) (ro) */
+	unsigned int modes;		/* clock mode bits (wo) */
+	long	offset;			/* time offset (ns/us) (rw) */
+	long	freq;			/* frequency offset (scaled PPM) (rw) */
+	long	maxerror;		/* maximum error (us) (rw) */
+	long	esterror;		/* estimated error (us) (rw) */
+	int	status;			/* clock status bits (rw) */
+	long	constant;		/* poll interval (log2 s) (rw) */
+	long	precision;		/* clock precision (ns/us) (ro) */
+	long	tolerance;		/* clock frequency tolerance (scaled
+				 	 * PPM) (ro) */
 	/*
 	 * The following read-only structure members are implemented
 	 * only if the PPS signal discipline is configured in the
@@ -212,14 +144,14 @@
 	 * kernel. They are included in all configurations to insure
 	 * portability.
 	 */
-	long	ppsfreq;	/* PPS frequency (scaled PPM) (ro) */
-	long	jitter;		/* PPS jitter (ns/us) (ro) */
-	int	shift;		/* interval duration (s) (shift) (ro) */
-	long	stabil;		/* PPS stability (scaled PPM) (ro) */
-	long	jitcnt;		/* jitter limit exceeded (ro) */
-	long	calcnt;		/* calibration intervals (ro) */
-	long	errcnt;		/* calibration errors (ro) */
-	long	stbcnt;		/* stability limit exceeded (ro) */
+	long	ppsfreq;		/* PPS frequency (scaled PPM) (ro) */
+	long	jitter;			/* PPS jitter (ns/us) (ro) */
+	int	shift;			/* interval duration (s) (shift) (ro) */
+	long	stabil;			/* PPS stability (scaled PPM) (ro) */
+	long	jitcnt;			/* jitter limit exceeded (ro) */
+	long	calcnt;			/* calibration intervals (ro) */
+	long	errcnt;			/* calibration errors (ro) */
+	long	stbcnt;			/* stability limit exceeded (ro) */
 };
 
 #ifdef __MidnightBSD__
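
For context, a short userland sketch (not from this commit) of how the
ntp_gettime(2) and ntp_adjtime(2) interfaces declared above are typically
consumed: passing modes = 0 to ntp_adjtime() performs a read-only query, and
STA_NANO selects between microsecond and nanosecond interpretation of the
offset field.

#include <sys/timex.h>
#include <stdio.h>

int
main(void)
{
	struct ntptimeval ntv;
	struct timex tx = { .modes = 0 };	/* read-only query */

	if (ntp_gettime(&ntv) < 0)
		return (1);
	if (ntv.time_state == TIME_ERROR)
		printf("clock is unsynchronized\n");

	if (ntp_adjtime(&tx) < 0)
		return (1);
	if (tx.status & STA_NANO)
		printf("offset: %ld ns\n", tx.offset);
	else
		printf("offset: %ld us\n", tx.offset);
	return (0);
}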


From laffer1 at midnightbsd.org  Sun Feb  9 12:06:02 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:06:02 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12348] trunk/sys/sys/sysctl.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091706.019H62nM077308@stargazer.midnightbsd.org>

Revision: 12348
          http://svnweb.midnightbsd.org/src/?rev=12348
Author:   laffer1
Date:     2020-02-09 12:06:01 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sysctl.h

Modified: trunk/sys/sys/sysctl.h
===================================================================
--- trunk/sys/sys/sysctl.h	2020-02-09 17:05:26 UTC (rev 12347)
+++ trunk/sys/sys/sysctl.h	2020-02-09 17:06:01 UTC (rev 12348)
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)sysctl.h	8.1 (Berkeley) 6/2/93
- * $FreeBSD: stable/10/sys/sys/sysctl.h 324749 2017-10-19 08:00:34Z avg $
+ * $FreeBSD: stable/11/sys/sys/sysctl.h 354762 2019-11-16 00:33:02Z scottl $
  */
 
 #ifndef _SYS_SYSCTL_H_
@@ -74,6 +74,12 @@
 #define	CTLTYPE_LONG	7	/* name describes a long */
 #define	CTLTYPE_ULONG	8	/* name describes an unsigned long */
 #define	CTLTYPE_U64	9	/* name describes an unsigned 64-bit number */
+#define	CTLTYPE_U8	0xa	/* name describes an unsigned 8-bit number */
+#define	CTLTYPE_U16	0xb	/* name describes an unsigned 16-bit number */
+#define	CTLTYPE_S8	0xc	/* name describes a signed 8-bit number */
+#define	CTLTYPE_S16	0xd	/* name describes a signed 16-bit number */
+#define	CTLTYPE_S32	0xe	/* name describes a signed 32-bit number */
+#define	CTLTYPE_U32	0xf	/* name describes an unsigned 32-bit number */
 
 #define	CTLFLAG_RD	0x80000000	/* Allow reads of variable */
 #define	CTLFLAG_WR	0x40000000	/* Allow writes to the variable */
@@ -85,7 +91,7 @@
 #define	CTLFLAG_DYN	0x02000000	/* Dynamic oid - can be freed */
 #define	CTLFLAG_SKIP	0x01000000	/* Skip this sysctl when listing */
 #define	CTLMASK_SECURE	0x00F00000	/* Secure level */
-#define	CTLFLAG_TUN	0x00080000	/* Tunable variable */
+#define	CTLFLAG_TUN	0x00080000	/* Default value is loaded from getenv() */
 #define	CTLFLAG_RDTUN	(CTLFLAG_RD|CTLFLAG_TUN)
 #define	CTLFLAG_RWTUN	(CTLFLAG_RW|CTLFLAG_TUN)
 #define	CTLFLAG_MPSAFE	0x00040000	/* Handler is MP safe */
@@ -133,7 +139,7 @@
 #endif
 
 #define	SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1,	\
-	intptr_t arg2, struct sysctl_req *req
+	intmax_t arg2, struct sysctl_req *req
 
 /* definitions for sysctl_req 'lock' member */
 #define	REQ_UNWIRED	1
@@ -140,7 +146,7 @@
 #define	REQ_WIRED	2
 
 /* definitions for sysctl_req 'flags' member */
-#if defined(__amd64__) || defined(__ia64__) || defined(__powerpc64__) ||\
+#if defined(__amd64__) || defined(__powerpc64__) ||\
     (defined(__mips__) && defined(__mips_n64))
 #define	SCTL_MASK32	1	/* 32 bit emulation */
 #endif
@@ -171,12 +177,13 @@
  * be hidden behind it, expanded by the handler.
  */
 struct sysctl_oid {
+	struct sysctl_oid_list oid_children;
 	struct sysctl_oid_list *oid_parent;
 	SLIST_ENTRY(sysctl_oid) oid_link;
 	int		 oid_number;
 	u_int		 oid_kind;
 	void		*oid_arg1;
-	intptr_t	 oid_arg2;
+	intmax_t	 oid_arg2;
 	const char	*oid_name;
 	int		(*oid_handler)(SYSCTL_HANDLER_ARGS);
 	const char	*oid_fmt;
@@ -187,18 +194,26 @@
 
 #define	SYSCTL_IN(r, p, l)	(r->newfunc)(r, p, l)
 #define	SYSCTL_OUT(r, p, l)	(r->oldfunc)(r, p, l)
+#define	SYSCTL_OUT_STR(r, p)	(r->oldfunc)(r, p, strlen(p) + 1)
 
+int sysctl_handle_bool(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_8(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_16(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_32(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
 int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_long(SYSCTL_HANDLER_ARGS);
-int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_string(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS);
 
 int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS);
 
+int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS);
+
 int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS);
@@ -212,15 +227,16 @@
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
 /* Declare a static oid to allow child oids to be added to it. */
-#define	SYSCTL_DECL(name)						\
-	extern struct sysctl_oid_list sysctl_##name##_children
+#define	SYSCTL_DECL(name)			\
+	extern struct sysctl_oid sysctl__##name
 
 /* Hide these in macros. */
-#define	SYSCTL_CHILDREN(oid_ptr)					\
-	(struct sysctl_oid_list *)(oid_ptr)->oid_arg1
-#define	SYSCTL_PARENT(oid_ptr)			NULL	/* not supported */
-#define	SYSCTL_CHILDREN_SET(oid_ptr, val)	(oid_ptr)->oid_arg1 = (val)
-#define	SYSCTL_STATIC_CHILDREN(oid_name)	(&sysctl_##oid_name##_children)
+#define	SYSCTL_CHILDREN(oid_ptr)		(&(oid_ptr)->oid_children)
+#define	SYSCTL_PARENT(oid_ptr)					\
+    (((oid_ptr)->oid_parent != &sysctl__children) ?		\
+	__containerof((oid_ptr)->oid_parent, struct sysctl_oid,	\
+	oid_children) : (struct sysctl_oid *)NULL)
+#define	SYSCTL_STATIC_CHILDREN(oid_name)	(&sysctl__##oid_name.oid_children)
 
 /* === Structs and macros related to context handling. === */
 
@@ -233,7 +249,7 @@
 TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
 
 #define	SYSCTL_NODE_CHILDREN(parent, name) \
-	sysctl_##parent##_##name##_children
+	sysctl__##parent##_##name.oid_children
 
 #ifndef NO_SYSCTL_DESCR
 #define	__DESCR(d) d
@@ -241,44 +257,52 @@
 #define	__DESCR(d) ""
 #endif
 
-/* This constructs a "raw" MIB oid. */
-#define	SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr)\
-	static struct sysctl_oid sysctl__##parent##_##name = {		\
-		&sysctl_##parent##_children,				\
-		{ NULL },						\
-		nbr,							\
-		kind,							\
-		a1,							\
-		a2,							\
-		#name,							\
-		handler,						\
-		fmt,							\
-		0,							\
-		0,							\
-		__DESCR(descr)						\
-		};							\
-	DATA_SET(sysctl_set, sysctl__##parent##_##name)
+/* This macro is only for internal use */
+#define	SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr) \
+	struct sysctl_oid id = {					\
+		.oid_parent = (parent_child_head),			\
+		.oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \
+		.oid_number = (nbr),					\
+		.oid_kind = (kind),					\
+		.oid_arg1 = (a1),					\
+		.oid_arg2 = (a2),					\
+		.oid_name = (name),					\
+		.oid_handler = (handler),				\
+		.oid_fmt = (fmt),					\
+		.oid_descr = __DESCR(descr)				\
+	};								\
+	DATA_SET(sysctl_set, id)
 
+/* This constructs a static "raw" MIB oid. */
+#define	SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
+    static SYSCTL_OID_RAW(sysctl__##parent##_##name, \
+	SYSCTL_CHILDREN(&sysctl__##parent), \
+	nbr, #name, kind, a1, a2, handler, fmt, descr)
+
+/* This constructs a global "raw" MIB oid. */
+#define	SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
+    SYSCTL_OID_RAW(sysctl__##parent##_##name, \
+	SYSCTL_CHILDREN(&sysctl__##parent),	\
+	nbr, #name, kind, a1, a2, handler, fmt, descr)
+
 #define	SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 	sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr))
 
 /* This constructs a root node from which other nodes can hang. */
-#define	SYSCTL_ROOT_NODE(nbr, name, access, handler, descr)		\
-	SYSCTL_NODE(, nbr, name, access, handler, descr);		\
+#define	SYSCTL_ROOT_NODE(nbr, name, access, handler, descr)	\
+	SYSCTL_OID_RAW(sysctl___##name, &sysctl__children,	\
+	    nbr, #name, CTLTYPE_NODE|(access), NULL, 0,		\
+	    handler, "N", descr);				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
 /* This constructs a node from which other oids can hang. */
-#define	SYSCTL_NODE(parent, nbr, name, access, handler, descr)		    \
-	struct sysctl_oid_list SYSCTL_NODE_CHILDREN(parent, name);	    \
-	SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|(access),		    \
-	    (void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, "N", descr); \
+#define	SYSCTL_NODE(parent, nbr, name, access, handler, descr)		\
+	SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access),	\
+	    NULL, 0, handler, "N", descr);				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
-#define	SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr) \
-	SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(), nbr, name, access, handler, descr)
-
 #define	SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
@@ -287,6 +311,15 @@
 	    NULL, 0, handler, "N", __DESCR(descr));			\
 })
 
+#define	SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr)	\
+({									\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE);	\
+	sysctl_add_oid(ctx, &sysctl__children, nbr, name,		\
+	    CTLTYPE_NODE|(access),					\
+	    NULL, 0, handler, "N", __DESCR(descr));			\
+})
+
 /* Oid for a string.  len can be 0 to indicate '\0' termination. */
 #define	SYSCTL_STRING(parent, nbr, name, access, arg, len, descr)	\
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access),		\
@@ -303,6 +336,202 @@
 	    __arg, len, sysctl_handle_string, "A", __DESCR(descr));	\
 })
 
+/* Oid for a constant '\0' terminated string. */
+#define	SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr)	\
+	SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access),		\
+	    __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \
+	CTASSERT(!(access & CTLFLAG_WR));				\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING)
+
+#define	SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \
+({									\
+	char *__arg = __DECONST(char *, arg);				\
+	CTASSERT(!(access & CTLFLAG_WR));				\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING);	\
+	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access),	\
+	    __arg, 0, sysctl_handle_string, "A", __DESCR(descr));	\
+})
+
+/* Oid for a bool.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_BOOL_PTR ((bool *)NULL)
+#define	SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_bool, "CU", descr);		\
+	CTASSERT(((access) & CTLTYPE) == 0 &&			\
+	    sizeof(bool) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \
+({									\
+	bool *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0);				\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr));	\
+})
+
+/* Oid for a signed 8-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_S8_PTR ((int8_t *)NULL)
+#define	SYSCTL_S8(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_8, "C", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \
+	    sizeof(int8_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	int8_t *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_8, "C", __DESCR(descr));	\
+})
+
+/* Oid for an unsigned 8-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_U8_PTR ((uint8_t *)NULL)
+#define	SYSCTL_U8(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_8, "CU", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \
+	    sizeof(uint8_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	uint8_t *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_8, "CU", __DESCR(descr));	\
+})
+
+/* Oid for a signed 16-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_S16_PTR ((int16_t *)NULL)
+#define	SYSCTL_S16(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_16, "S", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \
+	    sizeof(int16_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	int16_t *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_16, "S", __DESCR(descr));	\
+})
+
+/* Oid for an unsigned 16-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_U16_PTR ((uint16_t *)NULL)
+#define	SYSCTL_U16(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_16, "SU", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \
+	    sizeof(uint16_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	uint16_t *__ptr = (ptr);					\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_16, "SU", __DESCR(descr));	\
+})
+
+/* Oid for a signed 32-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_S32_PTR ((int32_t *)NULL)
+#define	SYSCTL_S32(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_32, "I", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \
+	    sizeof(int32_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	int32_t *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_32, "I", __DESCR(descr));	\
+})
+
+/* Oid for an unsigned 32-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_U32_PTR ((uint32_t *)NULL)
+#define	SYSCTL_U32(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_32, "IU", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \
+	    sizeof(uint32_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	uint32_t *__ptr = (ptr);					\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_32, "IU", __DESCR(descr));	\
+})
+
+/* Oid for a signed 64-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_S64_PTR ((int64_t *)NULL)
+#define	SYSCTL_S64(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_64, "Q", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
+	    sizeof(int64_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	int64_t *__ptr = (ptr);						\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_64, "Q", __DESCR(descr));	\
+})
+
+/* Oid for an unsigned 64-bit int.  If ptr is NULL, val is returned. */
+#define	SYSCTL_NULL_U64_PTR ((uint64_t *)NULL)
+#define	SYSCTL_U64(parent, nbr, name, access, ptr, val, descr)	\
+	SYSCTL_OID(parent, nbr, name,				\
+	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
+	    ptr, val, sysctl_handle_64, "QU", descr);		\
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \
+	    sizeof(uint64_t) == sizeof(*(ptr)))
+
+#define	SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr)	\
+({									\
+	uint64_t *__ptr = (ptr);					\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, val, sysctl_handle_64, "QU", __DESCR(descr));	\
+})
+
 /* Oid for an int.  If ptr is SYSCTL_NULL_INT_PTR, val is returned. */
 #define	SYSCTL_NULL_INT_PTR ((int *)NULL)
 #define	SYSCTL_INT(parent, nbr, name, access, ptr, val, descr)	\
@@ -309,9 +538,9 @@
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_int, "I", descr);		\
-	CTASSERT(((access) & CTLTYPE) == 0 ||			\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);	\
-	CTASSERT(sizeof(int) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \
+	    sizeof(int) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
@@ -329,9 +558,9 @@
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_int, "IU", descr);		\
-	CTASSERT(((access) & CTLTYPE) == 0 ||			\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT);\
-	CTASSERT(sizeof(unsigned) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \
+	    sizeof(unsigned) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \
 ({									\
@@ -349,9 +578,9 @@
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_LONG | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_long, "L", descr);		\
-	CTASSERT(((access) & CTLTYPE) == 0 ||			\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG);\
-	CTASSERT(sizeof(long) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \
+	    sizeof(long) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
@@ -369,9 +598,9 @@
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access),			\
 	    ptr, val, sysctl_handle_long, "LU", descr);			\
-	CTASSERT(((access) & CTLTYPE) == 0 ||				\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG);	\
-	CTASSERT(sizeof(unsigned long) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) &&	\
+	    sizeof(unsigned long) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
@@ -389,9 +618,9 @@
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "Q", descr);		\
-	CTASSERT(((access) & CTLTYPE) == 0 ||			\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);	\
-	CTASSERT(sizeof(int64_t) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||			\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
+	    sizeof(int64_t) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
@@ -408,9 +637,9 @@
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	     ptr, val, sysctl_handle_64, "QU", descr);			\
-	CTASSERT(((access) & CTLTYPE) == 0 ||				\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
-	CTASSERT(sizeof(uint64_t) == sizeof(*(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
+	    sizeof(uint64_t) == sizeof(*(ptr)))
 
 #define	SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
@@ -426,9 +655,9 @@
 #define	SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	struct sysctl_oid *__ret;					\
-	CTASSERT(sizeof(uint64_t) == sizeof(*(ptr)) ||			\
-	    sizeof(unsigned) == sizeof(*(ptr)));			\
-	CTASSERT(((access) & CTLTYPE) == 0);				\
+	CTASSERT((sizeof(uint64_t) == sizeof(*(ptr)) ||			\
+	    sizeof(unsigned) == sizeof(*(ptr))) &&			\
+	    ((access) & CTLTYPE) == 0);					\
 	if (sizeof(uint64_t) == sizeof(*(ptr))) {			\
 		__ret = sysctl_add_oid(ctx, parent, nbr, name,		\
 		    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
@@ -448,10 +677,10 @@
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	    (ptr), 0, sysctl_handle_counter_u64, "QU", descr);		\
-	CTASSERT(((access) & CTLTYPE) == 0 ||				\
-	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
-	CTASSERT(sizeof(counter_u64_t) == sizeof(*(ptr)));		\
-	CTASSERT(sizeof(uint64_t) == sizeof(**(ptr)))
+	CTASSERT((((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
+	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
+	    sizeof(uint64_t) == sizeof(**(ptr)))
 
 #define	SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
@@ -463,6 +692,28 @@
 	    __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr));	\
 })
 
+/* Oid for an array of counter(9)s.  The pointer and length must be non zero. */
+#define	SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \
+	SYSCTL_OID(parent, nbr, name,					\
+	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
+	    (ptr), (len), sysctl_handle_counter_u64_array, "S", descr);	\
+	CTASSERT((((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) &&	\
+	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
+	    sizeof(uint64_t) == sizeof(**(ptr)))
+
+#define	SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access,	\
+    ptr, len, descr)							\
+({									\
+	counter_u64_t *__ptr = (ptr);					\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
+	    __ptr, len, sysctl_handle_counter_u64_array, "S",		\
+	    __DESCR(descr));						\
+})
+
 /* Oid for an opaque object.  Specified by a pointer and a length. */
 #define	SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr)	\
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access),		\
@@ -544,6 +795,24 @@
 	    __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr));	\
 })
 
+/* OID expressing a struct timeval as seconds */
+#define	SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr)	\
+	SYSCTL_OID(parent, nbr, name,					\
+	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
+	    (ptr), 0, sysctl_sec_to_timeval, "I", descr);		\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
+#define	SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \
+({									\
+	struct timeval *__ptr = (ptr);					\
+	CTASSERT(((access) & CTLTYPE) == 0 ||				\
+	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
+	sysctl_add_oid(ctx, parent, nbr, name,				\
+	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
+	    __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr),	\
+	    NULL);							\
+})
+
 /*
  * A macro to generate a read-only sysctl to indicate the presence of optional
  * kernel features.
@@ -567,7 +836,6 @@
 #define	CTL_MACHDEP	7		/* machine dependent */
 #define	CTL_USER	8		/* user-level */
 #define	CTL_P1003_1B	9		/* POSIX 1003.1B */
-#define	CTL_MAXID	10		/* number of valid top-level ids */
 
 /*
  * CTL_KERN identifiers
@@ -609,7 +877,6 @@
 #define	KERN_IOV_MAX		35	/* int: value of UIO_MAXIOV */
 #define	KERN_HOSTUUID		36	/* string: host UUID identifier */
 #define	KERN_ARND		37	/* int: from arc4rand() */
-#define	KERN_MAXID		38	/* number of valid kern ids */
 /*
  * KERN_PROC subtypes
  */
@@ -644,6 +911,8 @@
 #define	KERN_PROC_UMASK		39	/* process umask */
 #define	KERN_PROC_OSREL		40	/* osreldate for process binary */
 #define	KERN_PROC_SIGTRAMP	41	/* signal trampoline location */
+#define	KERN_PROC_CWD		42	/* process current working directory */
+#define	KERN_PROC_NFDS		43	/* number of open file descriptors */
 
 /*
  * KERN_IPC identifiers
@@ -671,7 +940,6 @@
 #define	HW_FLOATINGPT	10		/* int: has HW floating point? */
 #define	HW_MACHINE_ARCH	11		/* string: machine architecture */
 #define	HW_REALMEM	12		/* int: 'real' memory */
-#define	HW_MAXID	13		/* number of valid hw ids */
 
 /*
  * CTL_USER definitions
@@ -696,7 +964,6 @@
 #define	USER_POSIX2_UPE		18	/* int: POSIX2_UPE */
 #define	USER_STREAM_MAX		19	/* int: POSIX2_STREAM_MAX */
 #define	USER_TZNAME_MAX		20	/* int: POSIX2_TZNAME_MAX */
-#define	USER_MAXID		21	/* number of valid user ids */
 
 #define	CTL_P1003_1B_ASYNCHRONOUS_IO		1	/* boolean */
 #define	CTL_P1003_1B_MAPPED_FILES		2	/* boolean */
@@ -752,6 +1019,7 @@
 SYSCTL_DECL(_hw_bus_devices);
 SYSCTL_DECL(_hw_bus_info);
 SYSCTL_DECL(_machdep);
+SYSCTL_DECL(_machdep_mitigations);
 SYSCTL_DECL(_user);
 SYSCTL_DECL(_compat);
 SYSCTL_DECL(_regression);
@@ -766,7 +1034,7 @@
 /* Dynamic oid handling */
 struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid_list *parent, int nbr, const char *name, int kind,
-	    void *arg1, intptr_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS),
+	    void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS),
 	    const char *fmt, const char *descr);
 int	sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del,
 	    int recurse);
@@ -794,8 +1062,8 @@
 	    size_t *retval, int flags);
 int	sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
 	    int *nindx, struct sysctl_req *req);
-void	sysctl_lock(void);
-void	sysctl_unlock(void);
+void	sysctl_wlock(void);
+void	sysctl_wunlock(void);
 int	sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
 
 struct sbuf;
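
For context, a hedged kernel-side sketch (not part of this commit; the
"example" subtree and its leaves are hypothetical) showing the reworked
static-OID macros in use.  SYSCTL_NODE() now hangs its children directly off
the parent's oid_children list, and the new fixed-width and bool leaves route
through sysctl_handle_8()/sysctl_handle_bool().

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

SYSCTL_NODE(_debug, OID_AUTO, example, CTLFLAG_RW, 0,
    "hypothetical example subtree");

static int example_value = 42;
SYSCTL_INT(_debug_example, OID_AUTO, value, CTLFLAG_RWTUN,
    &example_value, 0, "integer leaf, also loadable as a tunable");

static bool example_enabled = true;
SYSCTL_BOOL(_debug_example, OID_AUTO, enabled, CTLFLAG_RW,
    &example_enabled, 0, "boolean leaf handled by sysctl_handle_bool");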


From laffer1 at midnightbsd.org  Sun Feb  9 12:07:19 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:07:19 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12349] trunk/sys/sys/syscall.mk: sync with
 FreeBSD 11-stable
Message-ID: <202002091707.019H7J60077410@stargazer.midnightbsd.org>

Revision: 12349
          http://svnweb.midnightbsd.org/src/?rev=12349
Author:   laffer1
Date:     2020-02-09 12:07:18 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/syscall.mk

Modified: trunk/sys/sys/syscall.mk
===================================================================
--- trunk/sys/sys/syscall.mk	2020-02-09 17:06:01 UTC (rev 12348)
+++ trunk/sys/sys/syscall.mk	2020-02-09 17:07:18 UTC (rev 12349)
@@ -18,7 +18,6 @@
 	chmod.o \
 	chown.o \
 	break.o \
-	freebsd4_getfsstat.o \
 	getpid.o \
 	mount.o \
 	unmount.o \
@@ -39,7 +38,7 @@
 	kill.o \
 	getppid.o \
 	dup.o \
-	pipe.o \
+	freebsd10_pipe.o \
 	getegid.o \
 	profil.o \
 	ktrace.o \
@@ -108,20 +107,13 @@
 	quotactl.o \
 	nlm_syscall.o \
 	nfssvc.o \
-	freebsd4_statfs.o \
-	freebsd4_fstatfs.o \
 	lgetfh.o \
 	getfh.o \
-	freebsd4_getdomainname.o \
-	freebsd4_setdomainname.o \
-	freebsd4_uname.o \
 	sysarch.o \
 	rtprio.o \
 	semsys.o \
 	msgsys.o \
 	shmsys.o \
-	freebsd6_pread.o \
-	freebsd6_pwrite.o \
 	setfib.o \
 	ntp_adjtime.o \
 	setgid.o \
@@ -135,11 +127,7 @@
 	getrlimit.o \
 	setrlimit.o \
 	getdirentries.o \
-	freebsd6_mmap.o \
 	__syscall.o \
-	freebsd6_lseek.o \
-	freebsd6_truncate.o \
-	freebsd6_ftruncate.o \
 	__sysctl.o \
 	mlock.o \
 	munlock.o \
@@ -170,6 +158,7 @@
 	ffclock_getcounter.o \
 	ffclock_setestimate.o \
 	ffclock_getestimate.o \
+	clock_nanosleep.o \
 	clock_getcpuclockid2.o \
 	ntp_gettime.o \
 	minherit.o \
@@ -190,7 +179,6 @@
 	nlstat.o \
 	preadv.o \
 	pwritev.o \
-	freebsd4_fhstatfs.o \
 	fhopen.o \
 	fhstat.o \
 	modnext.o \
@@ -210,9 +198,6 @@
 	aio_suspend.o \
 	aio_cancel.o \
 	aio_error.o \
-	oaio_read.o \
-	oaio_write.o \
-	olio_listio.o \
 	yield.o \
 	mlockall.o \
 	munlockall.o \
@@ -226,15 +211,12 @@
 	sched_get_priority_min.o \
 	sched_rr_get_interval.o \
 	utrace.o \
-	freebsd4_sendfile.o \
 	kldsym.o \
 	jail.o \
 	nnpfs_syscall.o \
 	sigprocmask.o \
 	sigsuspend.o \
-	freebsd4_sigaction.o \
 	sigpending.o \
-	freebsd4_sigreturn.o \
 	sigtimedwait.o \
 	sigwaitinfo.o \
 	__acl_get_file.o \
@@ -307,8 +289,6 @@
 	thr_exit.o \
 	thr_self.o \
 	thr_kill.o \
-	_umtx_lock.o \
-	_umtx_unlock.o \
 	jail_attach.o \
 	extattr_list_fd.o \
 	extattr_list_file.o \
@@ -413,4 +393,7 @@
 	procctl.o \
 	ppoll.o \
 	futimens.o \
-	utimensat.o
+	utimensat.o \
+	numa_getaffinity.o \
+	numa_setaffinity.o \
+	fdatasync.o


From laffer1 at midnightbsd.org  Sun Feb  9 12:08:07 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:08:07 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12350] trunk/sys/sys/syscall.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091708.019H87Ur077472@stargazer.midnightbsd.org>

Revision: 12350
          http://svnweb.midnightbsd.org/src/?rev=12350
Author:   laffer1
Date:     2020-02-09 12:08:07 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/syscall.h

Modified: trunk/sys/sys/syscall.h
===================================================================
--- trunk/sys/sys/syscall.h	2020-02-09 17:07:18 UTC (rev 12349)
+++ trunk/sys/sys/syscall.h	2020-02-09 17:08:07 UTC (rev 12350)
@@ -1,8 +1,8 @@
+/* $MidnightBSD$ */
 /*
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $MidnightBSD$
  */
 
 #define	SYS_syscall	0
@@ -23,7 +23,7 @@
 #define	SYS_chmod	15
 #define	SYS_chown	16
 #define	SYS_break	17
-#define	SYS_freebsd4_getfsstat	18
+				/* 18 is freebsd4 getfsstat */
 				/* 19 is old lseek */
 #define	SYS_getpid	20
 #define	SYS_mount	21
@@ -47,7 +47,7 @@
 #define	SYS_getppid	39
 				/* 40 is old lstat */
 #define	SYS_dup	41
-#define	SYS_pipe	42
+#define	SYS_freebsd10_pipe	42
 #define	SYS_getegid	43
 #define	SYS_profil	44
 #define	SYS_ktrace	45
@@ -156,20 +156,20 @@
 #define	SYS_nlm_syscall	154
 #define	SYS_nfssvc	155
 				/* 156 is old getdirentries */
-#define	SYS_freebsd4_statfs	157
-#define	SYS_freebsd4_fstatfs	158
+				/* 157 is freebsd4 statfs */
+				/* 158 is freebsd4 fstatfs */
 #define	SYS_lgetfh	160
 #define	SYS_getfh	161
-#define	SYS_freebsd4_getdomainname	162
-#define	SYS_freebsd4_setdomainname	163
-#define	SYS_freebsd4_uname	164
+				/* 162 is freebsd4 getdomainname */
+				/* 163 is freebsd4 setdomainname */
+				/* 164 is freebsd4 uname */
 #define	SYS_sysarch	165
 #define	SYS_rtprio	166
 #define	SYS_semsys	169
 #define	SYS_msgsys	170
 #define	SYS_shmsys	171
-#define	SYS_freebsd6_pread	173
-#define	SYS_freebsd6_pwrite	174
+				/* 173 is freebsd6 pread */
+				/* 174 is freebsd6 pwrite */
 #define	SYS_setfib	175
 #define	SYS_ntp_adjtime	176
 #define	SYS_setgid	181
@@ -183,11 +183,11 @@
 #define	SYS_getrlimit	194
 #define	SYS_setrlimit	195
 #define	SYS_getdirentries	196
-#define	SYS_freebsd6_mmap	197
+				/* 197 is freebsd6 mmap */
 #define	SYS___syscall	198
-#define	SYS_freebsd6_lseek	199
-#define	SYS_freebsd6_truncate	200
-#define	SYS_freebsd6_ftruncate	201
+				/* 199 is freebsd6 lseek */
+				/* 200 is freebsd6 truncate */
+				/* 201 is freebsd6 ftruncate */
 #define	SYS___sysctl	202
 #define	SYS_mlock	203
 #define	SYS_munlock	204
@@ -218,6 +218,7 @@
 #define	SYS_ffclock_getcounter	241
 #define	SYS_ffclock_setestimate	242
 #define	SYS_ffclock_getestimate	243
+#define	SYS_clock_nanosleep	244
 #define	SYS_clock_getcpuclockid2	247
 #define	SYS_ntp_gettime	248
 #define	SYS_minherit	250
@@ -238,7 +239,7 @@
 #define	SYS_nlstat	280
 #define	SYS_preadv	289
 #define	SYS_pwritev	290
-#define	SYS_freebsd4_fhstatfs	297
+				/* 297 is freebsd4 fhstatfs */
 #define	SYS_fhopen	298
 #define	SYS_fhstat	299
 #define	SYS_modnext	300
@@ -259,9 +260,9 @@
 #define	SYS_aio_suspend	315
 #define	SYS_aio_cancel	316
 #define	SYS_aio_error	317
-#define	SYS_oaio_read	318
-#define	SYS_oaio_write	319
-#define	SYS_olio_listio	320
+				/* 318 is freebsd6 aio_read */
+				/* 319 is freebsd6 aio_write */
+				/* 320 is freebsd6 lio_listio */
 #define	SYS_yield	321
 				/* 322 is obsolete thr_sleep */
 				/* 323 is obsolete thr_wakeup */
@@ -277,15 +278,15 @@
 #define	SYS_sched_get_priority_min	333
 #define	SYS_sched_rr_get_interval	334
 #define	SYS_utrace	335
-#define	SYS_freebsd4_sendfile	336
+				/* 336 is freebsd4 sendfile */
 #define	SYS_kldsym	337
 #define	SYS_jail	338
 #define	SYS_nnpfs_syscall	339
 #define	SYS_sigprocmask	340
 #define	SYS_sigsuspend	341
-#define	SYS_freebsd4_sigaction	342
+				/* 342 is freebsd4 sigaction */
 #define	SYS_sigpending	343
-#define	SYS_freebsd4_sigreturn	344
+				/* 344 is freebsd4 sigreturn */
 #define	SYS_sigtimedwait	345
 #define	SYS_sigwaitinfo	346
 #define	SYS___acl_get_file	347
@@ -358,8 +359,6 @@
 #define	SYS_thr_exit	431
 #define	SYS_thr_self	432
 #define	SYS_thr_kill	433
-#define	SYS__umtx_lock	434
-#define	SYS__umtx_unlock	435
 #define	SYS_jail_attach	436
 #define	SYS_extattr_list_fd	437
 #define	SYS_extattr_list_file	438
@@ -466,4 +465,7 @@
 #define	SYS_ppoll	545
 #define	SYS_futimens	546
 #define	SYS_utimensat	547
-#define	SYS_MAXSYSCALL	548
+#define	SYS_numa_getaffinity	548
+#define	SYS_numa_setaffinity	549
+#define	SYS_fdatasync	550
+#define	SYS_MAXSYSCALL	551
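
For context, a small userland sketch (not from this commit): the SYS_*
numbers are normally consumed by libc's generated stubs, but syscall(2) can
invoke one directly.  The newly assigned fdatasync slot (SYS_fdatasync, 550)
is used here purely as an illustration, and /tmp/example is a placeholder
path.

#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/example", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (1);
	/* Equivalent to calling the libc wrapper fdatasync(fd). */
	if (syscall(SYS_fdatasync, fd) != 0) {
		close(fd);
		return (1);
	}
	close(fd);
	return (0);
}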


From laffer1 at midnightbsd.org  Sun Feb  9 12:49:22 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:49:22 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12351] trunk/sys/sys/stdint.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091749.019HnMe3083984@stargazer.midnightbsd.org>

Revision: 12351
          http://svnweb.midnightbsd.org/src/?rev=12351
Author:   laffer1
Date:     2020-02-09 12:49:21 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/stdint.h

Modified: trunk/sys/sys/stdint.h
===================================================================
--- trunk/sys/sys/stdint.h	2020-02-09 17:08:07 UTC (rev 12350)
+++ trunk/sys/sys/stdint.h	2020-02-09 17:49:21 UTC (rev 12351)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/stdint.h 291134 2015-11-21 16:21:27Z kib $
+ * $FreeBSD: stable/11/sys/sys/stdint.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_STDINT_H_
@@ -67,4 +67,11 @@
 #define	WCHAR_MIN	__WCHAR_MIN
 #define	WCHAR_MAX	__WCHAR_MAX
 
+#if __EXT1_VISIBLE
+/* ISO/IEC 9899:2011 K.3.4.4 */
+#ifndef RSIZE_MAX
+#define RSIZE_MAX (SIZE_MAX >> 1)
+#endif
+#endif /* __EXT1_VISIBLE */
+
 #endif /* !_SYS_STDINT_H_ */
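
For context, a hedged userland sketch of the newly exposed RSIZE_MAX
(ISO/IEC 9899:2011 K.3.4.4): it is only visible when the Annex K extensions
are requested, and its usual role is to reject object sizes that are almost
certainly negative values converted to size_t.

#define __STDC_WANT_LIB_EXT1__ 1	/* request Annex K visibility */
#include <stdint.h>
#include <stddef.h>

static int
size_is_sane(size_t n)
{
#ifdef RSIZE_MAX
	return (n <= RSIZE_MAX);
#else
	return (1);			/* Annex K not visible; no check */
#endif
}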


From laffer1 at midnightbsd.org  Sun Feb  9 12:50:32 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:50:32 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12352] trunk/sys/sys/sockio.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091750.019HoWxS084700@stargazer.midnightbsd.org>

Revision: 12352
          http://svnweb.midnightbsd.org/src/?rev=12352
Author:   laffer1
Date:     2020-02-09 12:50:31 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sockio.h

Modified: trunk/sys/sys/sockio.h
===================================================================
--- trunk/sys/sys/sockio.h	2020-02-09 17:49:21 UTC (rev 12351)
+++ trunk/sys/sys/sockio.h	2020-02-09 17:50:31 UTC (rev 12352)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)sockio.h	8.1 (Berkeley) 3/28/94
- * $FreeBSD: stable/10/sys/sys/sockio.h 324462 2017-10-10 02:35:04Z sephe $
+ * $FreeBSD: stable/11/sys/sys/sockio.h 352649 2019-09-24 06:36:25Z kib $
  */
 
 #ifndef _SYS_SOCKIO_H_
@@ -51,28 +51,28 @@
 #define	SIOCGETSGCNT	_IOWR('r', 16, struct sioc_sg_req) /* get s,g pkt cnt */
 
 #define	SIOCSIFADDR	 _IOW('i', 12, struct ifreq)	/* set ifnet address */
-#define	OSIOCGIFADDR	_IOWR('i', 13, struct ifreq)	/* get ifnet address */
+/*	OSIOCGIFADDR	_IOWR('i', 13, struct ifreq)	4.3BSD */
 #define	SIOCGIFADDR	_IOWR('i', 33, struct ifreq)	/* get ifnet address */
 #define	SIOCSIFDSTADDR	 _IOW('i', 14, struct ifreq)	/* set p-p address */
-#define	OSIOCGIFDSTADDR	_IOWR('i', 15, struct ifreq)	/* get p-p address */
+/*	OSIOCGIFDSTADDR	_IOWR('i', 15, struct ifreq)	4.3BSD */
 #define	SIOCGIFDSTADDR	_IOWR('i', 34, struct ifreq)	/* get p-p address */
 #define	SIOCSIFFLAGS	 _IOW('i', 16, struct ifreq)	/* set ifnet flags */
 #define	SIOCGIFFLAGS	_IOWR('i', 17, struct ifreq)	/* get ifnet flags */
-#define	OSIOCGIFBRDADDR	_IOWR('i', 18, struct ifreq)	/* get broadcast addr */
+/*	OSIOCGIFBRDADDR	_IOWR('i', 18, struct ifreq)	4.3BSD */
 #define	SIOCGIFBRDADDR	_IOWR('i', 35, struct ifreq)	/* get broadcast addr */
 #define	SIOCSIFBRDADDR	 _IOW('i', 19, struct ifreq)	/* set broadcast addr */
-#define	OSIOCGIFCONF	_IOWR('i', 20, struct ifconf)	/* get ifnet list */
+/*	OSIOCGIFCONF	_IOWR('i', 20, struct ifconf)	4.3BSD */
 #define	SIOCGIFCONF	_IOWR('i', 36, struct ifconf)	/* get ifnet list */
-#define	OSIOCGIFNETMASK	_IOWR('i', 21, struct ifreq)	/* get net addr mask */
+/*	OSIOCGIFNETMASK	_IOWR('i', 21, struct ifreq)	4.3BSD */
 #define	SIOCGIFNETMASK	_IOWR('i', 37, struct ifreq)	/* get net addr mask */
 #define	SIOCSIFNETMASK	 _IOW('i', 22, struct ifreq)	/* set net addr mask */
 #define	SIOCGIFMETRIC	_IOWR('i', 23, struct ifreq)	/* get IF metric */
 #define	SIOCSIFMETRIC	 _IOW('i', 24, struct ifreq)	/* set IF metric */
 #define	SIOCDIFADDR	 _IOW('i', 25, struct ifreq)	/* delete IF addr */
-#define	OSIOCAIFADDR	 _IOW('i', 26, struct oifaliasreq)/* add/chg IF alias */
-#define	SIOCALIFADDR	 _IOW('i', 27, struct if_laddrreq) /* add IF addr */
-#define	SIOCGLIFADDR	_IOWR('i', 28, struct if_laddrreq) /* get IF addr */
-#define	SIOCDLIFADDR	 _IOW('i', 29, struct if_laddrreq) /* delete IF addr */
+#define	OSIOCAIFADDR	 _IOW('i', 26, struct oifaliasreq) /* FreeBSD 9.x */
+/*	SIOCALIFADDR	 _IOW('i', 27, struct if_laddrreq) KAME */
+/*	SIOCGLIFADDR	_IOWR('i', 28, struct if_laddrreq) KAME */
+/*	SIOCDLIFADDR	 _IOW('i', 29, struct if_laddrreq) KAME */
 #define	SIOCSIFCAP	 _IOW('i', 30, struct ifreq)	/* set IF features */
 #define	SIOCGIFCAP	_IOWR('i', 31, struct ifreq)	/* get IF features */
 #define	SIOCGIFINDEX	_IOWR('i', 32, struct ifreq)	/* get IF index */
@@ -104,8 +104,8 @@
 #define	SIOCGIFPSRCADDR	_IOWR('i', 71, struct ifreq)	/* get gif psrc addr */
 #define	SIOCGIFPDSTADDR	_IOWR('i', 72, struct ifreq)	/* get gif pdst addr */
 #define	SIOCDIFPHYADDR	 _IOW('i', 73, struct ifreq)	/* delete gif addrs */
-#define	SIOCSLIFPHYADDR	 _IOW('i', 74, struct if_laddrreq) /* set gif addrs */
-#define	SIOCGLIFPHYADDR	_IOWR('i', 75, struct if_laddrreq) /* get gif addrs */
+/*	SIOCSLIFPHYADDR	 _IOW('i', 74, struct if_laddrreq) KAME */
+/*	SIOCGLIFPHYADDR	_IOWR('i', 75, struct if_laddrreq) KAME */
 
 #define	SIOCGPRIVATE_0	_IOWR('i', 80, struct ifreq)	/* device private 0 */
 #define	SIOCGPRIVATE_1	_IOWR('i', 81, struct ifreq)	/* device private 1 */
@@ -139,4 +139,9 @@
 #define	SIOCGIFRSSHASH	_IOWR('i', 151, struct ifrsshash)/* get the current RSS
 							type/func settings */
 
+#define	SIOCGLANPCP	_IOWR('i', 152, struct ifreq)	/* Get (V)LAN PCP */
+#define	SIOCSLANPCP	 _IOW('i', 153, struct ifreq)	/* Set (V)LAN PCP */
+
+#define	SIOCGIFDOWNREASON	_IOWR('i', 154, struct ifdownreason)
+
 #endif /* !_SYS_SOCKIO_H_ */
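
The 4.3BSD and KAME request codes are retired to comments here, while SIOCGLANPCP/SIOCSLANPCP and SIOCGIFDOWNREASON are appended at the end of the range. For context, a minimal sketch of issuing one of the long-standing requests, SIOCGIFFLAGS, from userland; the interface name "em0" is illustrative:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    struct ifreq ifr;
    int s;

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        err(1, "socket");

    memset(&ifr, 0, sizeof(ifr));
    strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));  /* illustrative name */

    /* SIOCGIFFLAGS fills ifr.ifr_flags with the interface flags. */
    if (ioctl(s, SIOCGIFFLAGS, &ifr) == -1)
        err(1, "SIOCGIFFLAGS");

    printf("%s flags: 0x%x\n", ifr.ifr_name, ifr.ifr_flags & 0xffff);
    close(s);
    return (0);
}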


From laffer1 at midnightbsd.org  Sun Feb  9 12:50:44 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:50:44 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12353] trunk/sys/sys/shm.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091750.019HoiV4084753@stargazer.midnightbsd.org>

Revision: 12353
          http://svnweb.midnightbsd.org/src/?rev=12353
Author:   laffer1
Date:     2020-02-09 12:50:44 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/shm.h

Modified: trunk/sys/sys/shm.h
===================================================================
--- trunk/sys/sys/shm.h	2020-02-09 17:50:31 UTC (rev 12352)
+++ trunk/sys/sys/shm.h	2020-02-09 17:50:44 UTC (rev 12353)
@@ -1,5 +1,5 @@
 /* $MidnightBSD$ */
-/* $FreeBSD: stable/10/sys/sys/shm.h 328294 2018-01-23 18:22:41Z jhb $ */
+/* $FreeBSD: stable/11/sys/sys/shm.h 347995 2019-05-20 16:31:45Z kib $ */
 /*	$NetBSD: shm.h,v 1.15 1994/06/29 06:45:17 cgd Exp $	*/
 
 /*-
@@ -41,11 +41,17 @@
 #define _SYS_SHM_H_
 
 #include <sys/cdefs.h>
+#ifdef _WANT_SYSVSHM_INTERNALS
+#define	_WANT_SYSVIPC_INTERNALS
+#endif
 #include <sys/ipc.h>
 #include <sys/_types.h>
 
+#include <machine/param.h>
+
 #define SHM_RDONLY  010000  /* Attach read-only (else read-write) */
 #define SHM_RND     020000  /* Round attach address to SHMLBA */
+#define	SHM_REMAP   030000  /* Unmap before mapping */
 #define SHMLBA      PAGE_SIZE /* Segment low boundary address multiple */
 
 /* "official" access mode definitions; somewhat braindead since you have
@@ -102,9 +108,7 @@
 	time_t          shm_ctime;	/* time of last change by shmctl() */
 };
 
-#ifdef _KERNEL
-#include <vm/vm.h>
-
+#if defined(_KERNEL) || defined(_WANT_SYSVSHM_INTERNALS)
 /*
  * System 5 style catch-all structure for shared memory constants that
  * might be of interest to user programs.  Do we really want/need this?
@@ -117,6 +121,8 @@
 	u_long	shmall;		/* max amount of shared memory (pages) */
 };
 
+struct vm_object;
+
 /* 
  * Add a kernel wrapper to the shmid_ds struct so that private info (like the
  * MAC label) can be added to it, without changing the user interface.
@@ -123,13 +129,12 @@
  */
 struct shmid_kernel {
 	struct shmid_ds u;
-	vm_object_t object;
+	struct vm_object *object;
 	struct label *label;	/* MAC label */
 	struct ucred *cred;	/* creator's credendials */
 };
+#endif
 
-extern struct shminfo	shminfo;
-
 struct shm_info {
 	int used_ids;
 	unsigned long shm_tot;
@@ -139,12 +144,15 @@
 	unsigned long swap_successes;
 };
 
-struct thread;
+#ifdef _KERNEL
 struct proc;
 struct vmspace;
 
+extern struct shminfo	shminfo;
+
 void	shmexit(struct vmspace *);
 void	shmfork(struct proc *, struct proc *);
+
 #else /* !_KERNEL */
 
 #include <sys/cdefs.h>
@@ -164,6 +172,6 @@
 int shmdt(const void *);
 __END_DECLS
 
-#endif /* !_KERNEL */
+#endif /* _KERNEL */
 
 #endif /* !_SYS_SHM_H_ */
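
Besides the _WANT_SYSVSHM_INTERNALS split, this adds SHM_REMAP ("unmap before mapping") next to the existing SHM_RDONLY/SHM_RND attach flags. A minimal userland round trip through the SysV interface these definitions serve; IPC_PRIVATE is used so no key needs to be invented:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    int shmid;
    char *p;

    shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
    if (shmid == -1)
        err(1, "shmget");

    p = shmat(shmid, NULL, 0);          /* read-write attach */
    if (p == (char *)-1)
        err(1, "shmat");
    strcpy(p, "hello");

    /* A second attach could pass SHM_RDONLY, SHM_RND or the new SHM_REMAP. */
    printf("%s\n", p);

    shmdt(p);
    shmctl(shmid, IPC_RMID, NULL);
    return (0);
}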


From laffer1 at midnightbsd.org  Sun Feb  9 12:50:57 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:50:57 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12354] trunk/sys/sys/socketvar.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091750.019HovSY084797@stargazer.midnightbsd.org>

Revision: 12354
          http://svnweb.midnightbsd.org/src/?rev=12354
Author:   laffer1
Date:     2020-02-09 12:50:57 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/socketvar.h

Modified: trunk/sys/sys/socketvar.h
===================================================================
--- trunk/sys/sys/socketvar.h	2020-02-09 17:50:44 UTC (rev 12353)
+++ trunk/sys/sys/socketvar.h	2020-02-09 17:50:57 UTC (rev 12354)
@@ -29,7 +29,7 @@
  *
  *	@(#)socketvar.h	8.3 (Berkeley) 2/19/95
  *
- * $FreeBSD: stable/10/sys/sys/socketvar.h 321021 2017-07-15 17:28:03Z dchagin $
+ * $FreeBSD: stable/11/sys/sys/socketvar.h 338617 2018-09-12 18:52:18Z sobomax $
  */
 
 #ifndef _SYS_SOCKETVAR_H_
@@ -39,6 +39,7 @@
 #include <sys/selinfo.h>		/* for struct selinfo */
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/osd.h>
 #include <sys/_sx.h>
 #include <sys/sockbuf.h>
 #include <sys/sockstate.h>
@@ -64,7 +65,6 @@
  * (a) constant after allocation, no locking required.
  * (b) locked by SOCK_LOCK(so).
  * (c) locked by SOCKBUF_LOCK(&so->so_rcv).
- * (d) locked by SOCKBUF_LOCK(&so->so_snd).
  * (e) locked by ACCEPT_LOCK().
  * (f) not locked since integer reads/writes are atomic.
  * (g) used only as a sleep/wakeup address, no value.
@@ -95,16 +95,15 @@
 	TAILQ_HEAD(, socket) so_incomp;	/* (e) queue of partial unaccepted connections */
 	TAILQ_HEAD(, socket) so_comp;	/* (e) queue of complete unaccepted connections */
 	TAILQ_ENTRY(socket) so_list;	/* (e) list of unaccepted connections */
-	u_short	so_qlen;		/* (e) number of unaccepted connections */
-	u_short	so_incqlen;		/* (e) number of unaccepted incomplete
+	u_int	so_qlen;		/* (e) number of unaccepted connections */
+	u_int	so_incqlen;		/* (e) number of unaccepted incomplete
 					   connections */
-	u_short	so_qlimit;		/* (e) max number queued connections */
+	u_int	so_qlimit;		/* (e) max number queued connections */
 	short	so_timeo;		/* (g) connection timeout */
 	u_short	so_error;		/* (f) error affecting connection */
 	struct	sigio *so_sigio;	/* [sg] information for async I/O or
 					   out of band data (SIGURG) */
 	u_long	so_oobmark;		/* (c) chars to oob mark */
-	TAILQ_HEAD(, aiocblist) so_aiojobq; /* AIO ops waiting on socket */
 
 	struct sockbuf so_rcv, so_snd;
 
@@ -119,6 +118,7 @@
 		void	*so_accept_filter_arg;	/* saved filter args */
 		char	*so_accept_filter_str;	/* saved user args */
 	} *so_accf;
+	struct	osd	osd;		/* Object Specific extensions */
 	/*
 	 * so_fibnum, so_user_cookie and friends can be used to attach
 	 * some user-specified metadata to a socket, which then can be
@@ -127,6 +127,11 @@
 	 */
 	int so_fibnum;		/* routing domain for this socket */
 	uint32_t so_user_cookie;
+
+	int so_ts_clock;	/* type of the clock used for timestamps */
+
+	void *so_pspare[2];	/* packet pacing / general use */
+	int so_ispare[2];	/* packet pacing / general use */
 };
 
 /*
@@ -171,9 +176,9 @@
 	caddr_t	so_pcb;		/* another convenient handle */
 	int	xso_protocol;
 	int	xso_family;
-	u_short	so_qlen;
-	u_short	so_incqlen;
-	u_short	so_qlimit;
+	u_int	so_qlen;
+	u_int	so_incqlen;
+	u_int	so_qlimit;
 	short	so_timeo;
 	u_short	so_error;
 	pid_t	so_pgid;
@@ -207,7 +212,7 @@
 
 /* can we read something from so? */
 #define	soreadabledata(so) \
-    ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
+    (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \
 	!TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error)
 #define	soreadable(so) \
 	(soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE))
@@ -294,11 +299,32 @@
 MALLOC_DECLARE(M_SONAME);
 #endif
 
+/*
+ * Socket specific helper hook point identifiers
+ * Do not leave holes in the sequence, hook registration is a loop.
+ */
+#define HHOOK_SOCKET_OPT		0
+#define HHOOK_SOCKET_CREATE		1
+#define HHOOK_SOCKET_RCV 		2
+#define HHOOK_SOCKET_SND		3
+#define HHOOK_FILT_SOREAD		4
+#define HHOOK_FILT_SOWRITE		5
+#define HHOOK_SOCKET_CLOSE		6
+#define HHOOK_SOCKET_LAST		HHOOK_SOCKET_CLOSE
+
+struct socket_hhook_data {
+	struct socket	*so;
+	struct mbuf	*m;
+	void		*hctx;		/* hook point specific data*/
+	int		status;
+};
+
 extern int	maxsockets;
 extern u_long	sb_max;
 extern so_gen_t so_gencnt;
 
 struct file;
+struct filecaps;
 struct filedesc;
 struct mbuf;
 struct sockaddr;
@@ -316,12 +342,14 @@
 /*
  * From uipc_socket and friends
  */
-int	sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type);
 int	getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len);
 int	getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
-	    struct file **fpp, u_int *fflagp);
+	    struct file **fpp, u_int *fflagp, struct filecaps *havecaps);
 void	soabort(struct socket *so);
 int	soaccept(struct socket *so, struct sockaddr **nam);
+void	soaio_enqueue(struct task *task);
+void	soaio_rcv(void *context, int pending);
+void	soaio_snd(void *context, int pending);
 int	socheckuid(struct socket *so, uid_t uid);
 int	sobind(struct socket *so, struct sockaddr *nam, struct thread *td);
 int	sobindat(int fd, struct socket *so, struct sockaddr *nam,
@@ -376,6 +404,7 @@
 void	soupcall_set(struct socket *so, int which,
 	    int (*func)(struct socket *, void *, int), void *arg);
 void	sowakeup(struct socket *so, struct sockbuf *sb);
+void	sowakeup_aio(struct socket *so, struct sockbuf *sb);
 int	selsocket(struct socket *so, int events, struct timeval *tv,
 	    struct thread *td);
 


From laffer1 at midnightbsd.org  Sun Feb  9 12:51:23 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:51:23 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12355] trunk/sys/sys/socket.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091751.019HpNAL084848@stargazer.midnightbsd.org>

Revision: 12355
          http://svnweb.midnightbsd.org/src/?rev=12355
Author:   laffer1
Date:     2020-02-09 12:51:22 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/socket.h

Modified: trunk/sys/sys/socket.h
===================================================================
--- trunk/sys/sys/socket.h	2020-02-09 17:50:57 UTC (rev 12354)
+++ trunk/sys/sys/socket.h	2020-02-09 17:51:22 UTC (rev 12355)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)socket.h	8.4 (Berkeley) 2/21/94
- * $FreeBSD: stable/10/sys/sys/socket.h 254925 2013-08-26 18:16:05Z jhb $
+ * $FreeBSD: stable/11/sys/sys/socket.h 338617 2018-09-12 18:52:18Z sobomax $
  */
 
 #ifndef _SYS_SOCKET_H_
@@ -85,6 +85,16 @@
 #endif
 #endif
 
+#ifndef _UINT32_T_DECLARED
+typedef	__uint32_t	uint32_t;
+#define	_UINT32_T_DECLARED
+#endif
+
+#ifndef _UINTPTR_T_DECLARED
+typedef	__uintptr_t	uintptr_t;
+#define	_UINTPTR_T_DECLARED
+#endif
+
 /*
  * Types
  */
@@ -149,8 +159,18 @@
 #define	SO_USER_COOKIE	0x1015		/* user cookie (dummynet etc.) */
 #define	SO_PROTOCOL	0x1016		/* get socket protocol (Linux name) */
 #define	SO_PROTOTYPE	SO_PROTOCOL	/* alias for SO_PROTOCOL (SunOS name) */
+#define	SO_TS_CLOCK	0x1017		/* clock type used for SO_TIMESTAMP */
 #endif
 
+#if __BSD_VISIBLE
+#define	SO_TS_REALTIME_MICRO	0	/* microsecond resolution, realtime */
+#define	SO_TS_BINTIME		1	/* sub-nanosecond resolution, realtime */
+#define	SO_TS_REALTIME		2	/* nanosecond resolution, realtime */
+#define	SO_TS_MONOTONIC		3	/* nanosecond resolution, monotonic */
+#define	SO_TS_DEFAULT		SO_TS_REALTIME_MICRO
+#define	SO_TS_CLOCK_MAX		SO_TS_MONOTONIC
+#endif
+
 /*
  * Space reserved for new socket options added by third-party vendors.
  * This range applies to all socket option levels.  New socket options
@@ -367,9 +387,8 @@
  * Second level is protocol family.
  * Third level is protocol number.
  *
- * Further levels are defined by the individual families below.
+ * Further levels are defined by the individual families.
  */
-#define NET_MAXID	AF_MAX
 
 /*
  * PF_ROUTE - Routing table
@@ -385,14 +404,12 @@
 #define	NET_RT_IFMALIST	4		/* return multicast address list */
 #define	NET_RT_IFLISTL	5		/* Survey interface list, using 'l'en
 					 * versions of msghdr structs. */
-#define	NET_RT_MAXID	6
-
 #endif /* __BSD_VISIBLE */
 
 /*
  * Maximum queue length specifiable by listen.
  */
-#define	SOMAXCONN	256
+#define	SOMAXCONN	128
 
 /*
  * Message header for recvmsg and sendmsg calls.
@@ -425,9 +442,11 @@
 #define	MSG_NBIO	0x4000		/* FIONBIO mode, used by fifofs */
 #define	MSG_COMPAT      0x8000		/* used in sendit() */
 #define	MSG_CMSG_CLOEXEC 0x40000	/* make received fds close-on-exec */
+#define	MSG_WAITFORONE	0x80000		/* for recvmmsg() */
 #endif
 #ifdef _KERNEL
 #define	MSG_SOCALLBCK   0x10000		/* for use by socket callbacks - soreceive (TCP) */
+#define	MSG_MORETOCOME	0x100000	/* additional data pending */
 #endif
 
 /*
@@ -493,7 +512,7 @@
 
 /* given pointer to struct cmsghdr, return pointer to next cmsghdr */
 #define	CMSG_NXTHDR(mhdr, cmsg)	\
-	((char *)(cmsg) == NULL ? CMSG_FIRSTHDR(mhdr) : \
+	((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \
 	    ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \
 	  _ALIGN(sizeof(struct cmsghdr)) > \
 	    (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \
@@ -508,7 +527,7 @@
 #define	CMSG_FIRSTHDR(mhdr) \
 	((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \
 	 (struct cmsghdr *)(mhdr)->msg_control : \
-	 (struct cmsghdr *)NULL)
+	 (struct cmsghdr *)0)
 
 #if __BSD_VISIBLE
 /* RFC 2292 additions */
@@ -526,6 +545,8 @@
 #define	SCM_TIMESTAMP	0x02		/* timestamp (struct timeval) */
 #define	SCM_CREDS	0x03		/* process creds (struct cmsgcred) */
 #define	SCM_BINTIME	0x04		/* timestamp (struct bintime) */
+#define	SCM_REALTIME	0x05		/* timestamp (struct timespec) */
+#define	SCM_MONOTONIC	0x06		/* timestamp (struct timespec) */
 #endif
 
 #if __BSD_VISIBLE
@@ -581,12 +602,22 @@
  * Sendfile-specific flag(s)
  */
 #define	SF_NODISKIO     0x00000001
-#define	SF_MNOWAIT	0x00000002
+#define	SF_MNOWAIT	0x00000002	/* obsolete */
 #define	SF_SYNC		0x00000004
+#define	SF_NOCACHE	0x00000010
+#define	SF_FLAGS(rh, flags)	(((rh) << 16) | (flags))
 
 #ifdef _KERNEL
-#define	SFK_COMPAT	0x00000001
+#define	SF_READAHEAD(flags)	((flags) >> 16)
 #endif /* _KERNEL */
+
+/*
+ * Sendmmsg/recvmmsg specific structure(s)
+ */
+struct mmsghdr {
+	struct msghdr	msg_hdr;		/* message header */
+	ssize_t		msg_len;		/* message length */
+};
 #endif /* __BSD_VISIBLE */
 
 #ifndef	_KERNEL
@@ -609,6 +640,11 @@
 ssize_t	recv(int, void *, size_t, int);
 ssize_t	recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict);
 ssize_t	recvmsg(int, struct msghdr *, int);
+#if __BSD_VISIBLE
+struct timespec;
+ssize_t	recvmmsg(int, struct mmsghdr * __restrict, size_t, int,
+    const struct timespec * __restrict);
+#endif
 ssize_t	send(int, const void *, size_t, int);
 ssize_t	sendto(int, const void *,
 	    size_t, int, const struct sockaddr *, socklen_t);
@@ -615,6 +651,7 @@
 ssize_t	sendmsg(int, const struct msghdr *, int);
 #if __BSD_VISIBLE
 int	sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int);
+ssize_t	sendmmsg(int, struct mmsghdr * __restrict, size_t, int);
 int	setfib(int);
 #endif
 int	setsockopt(int, int, int, const void *, socklen_t);
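
This update adds the SO_TS_CLOCK option (selecting the clock behind SO_TIMESTAMP control messages) and the sendmmsg/recvmmsg batch calls with struct mmsghdr. A rough sketch of both, based only on the declarations above; error handling is omitted and "s" is assumed to be an already-bound datagram socket:

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static void
example(int s)
{
    int clk = SO_TS_MONOTONIC;
    struct mmsghdr msgs[8];
    struct iovec iov[8];
    char bufs[8][1500];
    int i;

    /* Choose which clock SO_TIMESTAMP control messages report. */
    (void)setsockopt(s, SOL_SOCKET, SO_TS_CLOCK, &clk, sizeof(clk));

    memset(msgs, 0, sizeof(msgs));
    for (i = 0; i < 8; i++) {
        iov[i].iov_base = bufs[i];
        iov[i].iov_len = sizeof(bufs[i]);
        msgs[i].msg_hdr.msg_iov = &iov[i];
        msgs[i].msg_hdr.msg_iovlen = 1;
    }

    /* MSG_WAITFORONE: return once at least one datagram has arrived. */
    (void)recvmmsg(s, msgs, 8, MSG_WAITFORONE, NULL);
    /* Each msgs[i].msg_len now holds that datagram's length. */
}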


From laffer1 at midnightbsd.org  Sun Feb  9 12:51:39 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:51:39 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12356] trunk/sys/sys/sockbuf.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091751.019Hpdsc084897@stargazer.midnightbsd.org>

Revision: 12356
          http://svnweb.midnightbsd.org/src/?rev=12356
Author:   laffer1
Date:     2020-02-09 12:51:38 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sockbuf.h

Modified: trunk/sys/sys/sockbuf.h
===================================================================
--- trunk/sys/sys/sockbuf.h	2020-02-09 17:51:22 UTC (rev 12355)
+++ trunk/sys/sys/sockbuf.h	2020-02-09 17:51:38 UTC (rev 12356)
@@ -29,7 +29,7 @@
  *
  *	@(#)socketvar.h	8.3 (Berkeley) 2/19/95
  *
- * $FreeBSD: stable/10/sys/sys/sockbuf.h 279930 2015-03-12 17:07:45Z sjg $
+ * $FreeBSD: stable/11/sys/sys/sockbuf.h 337975 2018-08-17 16:04:20Z markj $
  */
 #ifndef _SYS_SOCKBUF_H_
 #define _SYS_SOCKBUF_H_
@@ -37,6 +37,7 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_sx.h>
+#include <sys/_task.h>
 
 #define	SB_MAX		(2*1024*1024)	/* default for max chars in sockbuf */
 
@@ -54,6 +55,7 @@
 #define	SB_IN_TOE	0x400		/* socket buffer is in the middle of an operation */
 #define	SB_AUTOSIZE	0x800		/* automatically size socket buffer */
 #define	SB_STOP		0x1000		/* backpressure indicator */
+#define	SB_AIO_RUNNING	0x2000		/* AIO operation running */
 
 #define	SBS_CANTSENDMORE	0x0010	/* can't send more data to peer */
 #define	SBS_CANTRCVMORE		0x0020	/* can't receive more data from peer */
@@ -78,31 +80,38 @@
 
 /*
  * Variables for socket buffering.
+ *
+ * Locking key to struct sockbuf:
+ * (a) locked by SOCKBUF_LOCK().
  */
 struct	sockbuf {
 	struct	selinfo sb_sel;	/* process selecting read/write */
 	struct	mtx sb_mtx;	/* sockbuf lock */
 	struct	sx sb_sx;	/* prevent I/O interlacing */
-	short	sb_state;	/* (c/d) socket state on sockbuf */
+	short	sb_state;	/* (a) socket state on sockbuf */
 #define	sb_startzero	sb_mb
-	struct	mbuf *sb_mb;	/* (c/d) the mbuf chain */
-	struct	mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */
-	struct	mbuf *sb_lastrecord;	/* (c/d) first mbuf of last
+	struct	mbuf *sb_mb;	/* (a) the mbuf chain */
+	struct	mbuf *sb_mbtail; /* (a) the last mbuf in the chain */
+	struct	mbuf *sb_lastrecord;	/* (a) first mbuf of last
 					 * record in socket buffer */
-	struct	mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */
-	u_int	sb_sndptroff;	/* (c/d) byte offset of ptr into chain */
-	u_int	sb_cc;		/* (c/d) actual chars in buffer */
-	u_int	sb_hiwat;	/* (c/d) max actual char count */
-	u_int	sb_mbcnt;	/* (c/d) chars of mbufs used */
-	u_int   sb_mcnt;        /* (c/d) number of mbufs in buffer */
-	u_int   sb_ccnt;        /* (c/d) number of clusters in buffer */
-	u_int	sb_mbmax;	/* (c/d) max chars of mbufs to use */
-	u_int	sb_ctl;		/* (c/d) non-data chars in buffer */
-	int	sb_lowat;	/* (c/d) low water mark */
-	sbintime_t	sb_timeo;	/* (c/d) timeout for read/write */
-	short	sb_flags;	/* (c/d) flags, see below */
-	int	(*sb_upcall)(struct socket *, void *, int); /* (c/d) */
-	void	*sb_upcallarg;	/* (c/d) */
+	struct	mbuf *sb_sndptr; /* (a) pointer into mbuf chain */
+	struct	mbuf *sb_fnrdy;	/* (a) pointer to first not ready buffer */
+	u_int	sb_sndptroff;	/* (a) byte offset of ptr into chain */
+	u_int	sb_acc;		/* (a) available chars in buffer */
+	u_int	sb_ccc;		/* (a) claimed chars in buffer */
+	u_int	sb_hiwat;	/* (a) max actual char count */
+	u_int	sb_mbcnt;	/* (a) chars of mbufs used */
+	u_int   sb_mcnt;        /* (a) number of mbufs in buffer */
+	u_int   sb_ccnt;        /* (a) number of clusters in buffer */
+	u_int	sb_mbmax;	/* (a) max chars of mbufs to use */
+	u_int	sb_ctl;		/* (a) non-data chars in buffer */
+	int	sb_lowat;	/* (a) low water mark */
+	sbintime_t	sb_timeo;	/* (a) timeout for read/write */
+	short	sb_flags;	/* (a) flags, see below */
+	int	(*sb_upcall)(struct socket *, void *, int); /* (a) */
+	void	*sb_upcallarg;	/* (a) */
+	TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */
+	struct	task sb_aiotask; /* AIO task */
 };
 
 #ifdef _KERNEL
@@ -121,10 +130,17 @@
 #define	SOCKBUF_LOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED)
 #define	SOCKBUF_UNLOCK_ASSERT(_sb)	mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED)
 
-void	sbappend(struct sockbuf *sb, struct mbuf *m);
-void	sbappend_locked(struct sockbuf *sb, struct mbuf *m);
-void	sbappendstream(struct sockbuf *sb, struct mbuf *m);
-void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m);
+/*
+ * Socket buffer private mbuf(9) flags.
+ */
+#define	M_NOTREADY	M_PROTO1	/* m_data not populated yet */
+#define	M_BLOCKED	M_PROTO2	/* M_NOTREADY in front of m */
+#define	M_NOTAVAIL	(M_NOTREADY | M_BLOCKED)
+
+void	sbappend(struct sockbuf *sb, struct mbuf *m, int flags);
+void	sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags);
+void	sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags);
+void	sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags);
 int	sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
@@ -131,13 +147,12 @@
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_nospacecheck_locked(struct sockbuf *sb,
 	    const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control);
-int	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
+void	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
-int	sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+void	sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
 void	sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
 void	sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0);
-void	sbcheck(struct sockbuf *sb);
 void	sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n);
 struct mbuf *
 	sbcreatecontrol(caddr_t p, int size, int type, int level);
@@ -165,58 +180,61 @@
 int	sbwait(struct sockbuf *sb);
 int	sblock(struct sockbuf *sb, int flags);
 void	sbunlock(struct sockbuf *sb);
+void	sballoc(struct sockbuf *, struct mbuf *);
+void	sbfree(struct sockbuf *, struct mbuf *);
+int	sbready(struct sockbuf *, struct mbuf *, int);
 
 /*
+ * Return how much data is available to be taken out of socket
+ * buffer right now.
+ */
+static inline u_int
+sbavail(struct sockbuf *sb)
+{
+
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+	return (sb->sb_acc);
+}
+
+/*
+ * Return how much data sits there in the socket buffer
+ * It might be that some data is not yet ready to be read.
+ */
+static inline u_int
+sbused(struct sockbuf *sb)
+{
+
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+	return (sb->sb_ccc);
+}
+
+/*
  * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
  * This is problematical if the fields are unsigned, as the space might
- * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
- * overflow and return 0.  Should use "lmin" but it doesn't exist now.
+ * still be negative (ccc > hiwat or mbcnt > mbmax).
  */
-static __inline
-long
+static inline long
 sbspace(struct sockbuf *sb)
 {
 	int bleft, mleft;		/* size should match sockbuf fields */
 
+#if 0
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+
 	if (sb->sb_flags & SB_STOP)
 		return(0);
-	bleft = sb->sb_hiwat - sb->sb_cc;
+
+	bleft = sb->sb_hiwat - sb->sb_ccc;
 	mleft = sb->sb_mbmax - sb->sb_mbcnt;
-	return((bleft < mleft) ? bleft : mleft);
-}
 
-/* adjust counters in sb reflecting allocation of m */
-#define	sballoc(sb, m) { \
-	(sb)->sb_cc += (m)->m_len; \
-	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
-		(sb)->sb_ctl += (m)->m_len; \
-	(sb)->sb_mbcnt += MSIZE; \
-	(sb)->sb_mcnt += 1; \
-	if ((m)->m_flags & M_EXT) { \
-		(sb)->sb_mbcnt += (m)->m_ext.ext_size; \
-		(sb)->sb_ccnt += 1; \
-	} \
+	return ((bleft < mleft) ? bleft : mleft);
 }
 
-/* adjust counters in sb reflecting freeing of m */
-#define	sbfree(sb, m) { \
-	(sb)->sb_cc -= (m)->m_len; \
-	if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \
-		(sb)->sb_ctl -= (m)->m_len; \
-	(sb)->sb_mbcnt -= MSIZE; \
-	(sb)->sb_mcnt -= 1; \
-	if ((m)->m_flags & M_EXT) { \
-		(sb)->sb_mbcnt -= (m)->m_ext.ext_size; \
-		(sb)->sb_ccnt -= 1; \
-	} \
-	if ((sb)->sb_sndptr == (m)) { \
-		(sb)->sb_sndptr = NULL; \
-		(sb)->sb_sndptroff = 0; \
-	} \
-	if ((sb)->sb_sndptroff != 0) \
-		(sb)->sb_sndptroff -= (m)->m_len; \
-}
-
 #define SB_EMPTY_FIXUP(sb) do {						\
 	if ((sb)->sb_mb == NULL) {					\
 		(sb)->sb_mbtail = NULL;					\
@@ -226,13 +244,15 @@
 
 #ifdef SOCKBUF_DEBUG
 void	sblastrecordchk(struct sockbuf *, const char *, int);
+void	sblastmbufchk(struct sockbuf *, const char *, int);
+void	sbcheck(struct sockbuf *, const char *, int);
 #define	SBLASTRECORDCHK(sb)	sblastrecordchk((sb), __FILE__, __LINE__)
-
-void	sblastmbufchk(struct sockbuf *, const char *, int);
 #define	SBLASTMBUFCHK(sb)	sblastmbufchk((sb), __FILE__, __LINE__)
+#define	SBCHECK(sb)		sbcheck((sb), __FILE__, __LINE__)
 #else
-#define	SBLASTRECORDCHK(sb)      /* nothing */
-#define	SBLASTMBUFCHK(sb)        /* nothing */
+#define	SBLASTRECORDCHK(sb)	do {} while (0)
+#define	SBLASTMBUFCHK(sb)	do {} while (0)
+#define	SBCHECK(sb)		do {} while (0)
 #endif /* SOCKBUF_DEBUG */
 
 #endif /* _KERNEL */
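
The old sb_cc counter is split into sb_acc ("available") and sb_ccc ("claimed") so that M_NOTREADY mbufs are charged against buffer space without being offered to readers; sbavail() reports the former, sbused() the latter, and sbspace() now keys off sb_ccc. A toy userland model of that accounting, only to illustrate the relationship; the field values are made up and this is not kernel code:

#include <stdio.h>

struct toy_sb {
    unsigned sb_acc;    /* bytes ready for the reader */
    unsigned sb_ccc;    /* bytes claimed, incl. not-ready mbufs */
    unsigned sb_hiwat;  /* byte high-water mark */
    unsigned sb_mbcnt;  /* mbuf storage in use */
    unsigned sb_mbmax;  /* mbuf storage limit */
};

static long
toy_sbspace(const struct toy_sb *sb)
{
    long bleft = (long)sb->sb_hiwat - (long)sb->sb_ccc;
    long mleft = (long)sb->sb_mbmax - (long)sb->sb_mbcnt;

    return (bleft < mleft ? bleft : mleft);
}

int
main(void)
{
    /* 4k readable, another 2k claimed but not yet ready. */
    struct toy_sb sb = { 4096, 6144, 65536, 8192, 131072 };

    printf("avail %u used %u space %ld\n",
        sb.sb_acc, sb.sb_ccc, toy_sbspace(&sb));
    return (0);
}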


From laffer1 at midnightbsd.org  Sun Feb  9 12:52:06 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 12:52:06 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12357] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002091752.019Hq6ij084969@stargazer.midnightbsd.org>

Revision: 12357
          http://svnweb.midnightbsd.org/src/?rev=12357
Author:   laffer1
Date:     2020-02-09 12:52:06 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sleepqueue.h
    trunk/sys/sys/slicer.h

Modified: trunk/sys/sys/sleepqueue.h
===================================================================
--- trunk/sys/sys/sleepqueue.h	2020-02-09 17:51:38 UTC (rev 12356)
+++ trunk/sys/sys/sleepqueue.h	2020-02-09 17:52:06 UTC (rev 12357)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sleepqueue.h 274139 2014-11-05 16:24:57Z lwhsu $
+ * $FreeBSD: stable/11/sys/sys/sleepqueue.h 354405 2019-11-06 18:02:18Z mav $
  */
 
 #ifndef _SYS_SLEEPQUEUE_H_
@@ -84,6 +84,7 @@
 #define	SLEEPQ_SX		0x03		/* Used by an sx lock. */
 #define	SLEEPQ_LK		0x04		/* Used by a lockmgr. */
 #define	SLEEPQ_INTERRUPTIBLE	0x100		/* Sleep is interruptible. */
+#define	SLEEPQ_UNFAIR		0x200		/* Unfair wakeup order. */
 
 void	init_sleepqueues(void);
 int	sleepq_abort(struct thread *td, int intrval);
@@ -91,11 +92,14 @@
 	    int flags, int queue);
 struct sleepqueue *sleepq_alloc(void);
 int	sleepq_broadcast(void *wchan, int flags, int pri, int queue);
+void	sleepq_chains_remove_matching(bool (*matches)(struct thread *));
 void	sleepq_free(struct sleepqueue *sq);
 void	sleepq_lock(void *wchan);
 struct sleepqueue *sleepq_lookup(void *wchan);
 void	sleepq_release(void *wchan);
 void	sleepq_remove(struct thread *td, void *wchan);
+int	sleepq_remove_matching(struct sleepqueue *sq, int queue,
+	    bool (*matches)(struct thread *), int pri);
 int	sleepq_signal(void *wchan, int flags, int pri, int queue);
 void	sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt,
 	    sbintime_t pr, int flags);
@@ -108,5 +112,11 @@
 void	sleepq_wait(void *wchan, int pri);
 int	sleepq_wait_sig(void *wchan, int pri);
 
+#ifdef STACK
+struct sbuf;
+int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
+    int *count_stacks_printed);
+#endif
+
 #endif	/* _KERNEL */
 #endif	/* !_SYS_SLEEPQUEUE_H_ */
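
sleepq_remove_matching(), sleepq_chains_remove_matching() and SLEEPQ_UNFAIR extend the low-level sleep queue interface that msleep()/wakeup() are built on. Roughly, the wait and wake sides pair up as in the sketch below; this is a simplified view of the sleepqueue(9) pattern, with the interlock and return-value handling that real consumers need omitted:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sleepqueue.h>

/* "wchan" is any stable kernel address used as the wait channel. */
static void
example_wait(void *wchan)
{
    sleepq_lock(wchan);
    sleepq_add(wchan, NULL, "example", SLEEPQ_SLEEP, 0);
    sleepq_wait(wchan, 0);      /* blocks until signalled */
}

static void
example_wake_one(void *wchan)
{
    sleepq_lock(wchan);
    sleepq_signal(wchan, SLEEPQ_SLEEP, 0, 0);
    sleepq_release(wchan);
}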

Modified: trunk/sys/sys/slicer.h
===================================================================
--- trunk/sys/sys/slicer.h	2020-02-09 17:51:38 UTC (rev 12356)
+++ trunk/sys/sys/slicer.h	2020-02-09 17:52:06 UTC (rev 12357)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/slicer.h 318159 2017-05-10 21:42:16Z marius $
+ * $FreeBSD: stable/11/sys/sys/slicer.h 346557 2019-04-22 15:04:11Z ian $
  */
 
 #ifndef _FLASH_SLICER_H_
@@ -57,7 +57,7 @@
 #define	FLASH_SLICES_TYPE_SPI		2
 #define	FLASH_SLICES_TYPE_MMC		3
 
-/* Use NULL for deregistering a slicer */
+/* Use NULL and set force to true for deregistering a slicer */
 void flash_register_slicer(flash_slicer_t slicer, u_int type, bool force);
 
 #endif /* _KERNEL */
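
Per the updated comment, deregistration now means passing a NULL slicer together with force set to true. A one-line sketch, using the SPI type constant defined above:

#include <sys/types.h>
#include <sys/slicer.h>

/* Sketch: drop a previously registered SPI-flash slicer. */
static void
example_deregister(void)
{
    flash_register_slicer(NULL, FLASH_SLICES_TYPE_SPI, true);
}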


From laffer1 at midnightbsd.org  Sun Feb  9 13:26:51 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:26:51 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12358] trunk/sys/sys/signal.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091826.019IQpod090805@stargazer.midnightbsd.org>

Revision: 12358
          http://svnweb.midnightbsd.org/src/?rev=12358
Author:   laffer1
Date:     2020-02-09 13:26:51 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/signal.h

Modified: trunk/sys/sys/signal.h
===================================================================
--- trunk/sys/sys/signal.h	2020-02-09 17:52:06 UTC (rev 12357)
+++ trunk/sys/sys/signal.h	2020-02-09 18:26:51 UTC (rev 12358)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)signal.h	8.4 (Berkeley) 5/4/95
- * $FreeBSD: stable/10/sys/sys/signal.h 233519 2012-03-26 19:12:09Z rmh $
+ * $FreeBSD: stable/11/sys/sys/signal.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_SIGNAL_H_
@@ -46,6 +46,23 @@
 #include <machine/_limits.h>	/* __MINSIGSTKSZ */
 #include <machine/signal.h>	/* sig_atomic_t; trap codes; sigcontext */
 
+#if __POSIX_VISIBLE >= 200809
+
+#include <sys/_pthreadtypes.h>
+#include <sys/_timespec.h>
+
+#ifndef _SIZE_T_DECLARED
+typedef	__size_t	size_t;
+#define	_SIZE_T_DECLARED
+#endif
+
+#ifndef _UID_T_DECLARED
+typedef	__uid_t		uid_t;
+#define	_UID_T_DECLARED
+#endif
+
+#endif /* __POSIX_VISIBLE >= 200809 */
+
 /*
  * System defined signals.
  */
@@ -158,9 +175,22 @@
 	int     sigval_int;
 	void    *sigval_ptr;
 };
+
+#if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__))
+union sigval32 {
+	int	sival_int;
+	uint32_t sival_ptr;
+	/* 6.0 compatibility */
+	int	sigval_int;
+	uint32_t sigval_ptr;
+};
 #endif
+#endif
 
 #if __POSIX_VISIBLE >= 199309
+
+struct pthread_attr;
+
 struct sigevent {
 	int	sigev_notify;		/* Notification type */
 	int	sigev_signo;		/* Signal number */
@@ -169,7 +199,7 @@
 		__lwpid_t	_threadid;
 		struct {
 			void (*_function)(union sigval);
-			void *_attribute; /* pthread_attr_t * */
+			struct pthread_attr **_attribute;
 		} _sigev_thread;
 		unsigned short _kevent_flags;
 		long __spare__[8];
@@ -191,6 +221,7 @@
 #define	SIGEV_KEVENT	3		/* Generate a kevent. */
 #define	SIGEV_THREAD_ID	4		/* Send signal to a kernel thread. */
 #endif
+
 #endif /* __POSIX_VISIBLE >= 199309 */
 
 #if __POSIX_VISIBLE >= 199309 || __XSI_VISIBLE
@@ -236,6 +267,38 @@
 #define si_mqd		_reason._mesgq._mqd
 #define si_band		_reason._poll._band
 
+#if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__))
+struct siginfo32 {
+	int	si_signo;		/* signal number */
+	int	si_errno;		/* errno association */
+	int	si_code;		/* signal code */
+	__pid_t	si_pid;			/* sending process */
+	__uid_t	si_uid;			/* sender's ruid */
+	int	si_status;		/* exit value */
+	uint32_t si_addr;		/* faulting instruction */
+	union sigval32 si_value;	/* signal value */
+	union	{
+		struct {
+			int	_trapno;/* machine specific trap code */
+		} _fault;
+		struct {
+			int	_timerid;
+			int	_overrun;
+		} _timer;
+		struct {
+			int	_mqd;
+		} _mesgq;
+		struct {
+			int32_t	_band;		/* band event for SIGPOLL */
+		} _poll;			/* was this ever used ? */
+		struct {
+			int32_t	__spare1__;
+			int	__spare2__[7];
+		} __spare__;
+	} _reason;
+};
+#endif
+
 /** si_code **/
 /* codes for SIGILL */
 #define ILL_ILLOPC 	1	/* Illegal opcode.			*/
@@ -271,6 +334,7 @@
 #define TRAP_BRKPT	1	/* Process breakpoint.			*/
 #define TRAP_TRACE	2	/* Process trace trap.			*/
 #define	TRAP_DTRACE	3	/* DTrace induced trap.			*/
+#define	TRAP_CAP	4	/* Capabilities protective trap.	*/
 
 /* codes for SIGCHLD */
 #define CLD_EXITED	1	/* Child has exited			*/
@@ -355,18 +419,10 @@
 #endif
 
 #if __XSI_VISIBLE
-/*
- * Structure used in sigaltstack call.
- */
 #if __BSD_VISIBLE
-typedef	struct sigaltstack {
-#else
-typedef	struct {
+#define	__stack_t sigaltstack
 #endif
-	char	*ss_sp;			/* signal stack base */
-	__size_t ss_size;		/* signal stack length */
-	int	ss_flags;		/* SS_DISABLE and/or SS_ONSTACK */
-} stack_t;
+typedef	struct __stack_t stack_t;
 
 #define	SS_ONSTACK	0x0001	/* take signal on alternate stack */
 #define	SS_DISABLE	0x0004	/* disable taking signals on alternate stack */
@@ -374,6 +430,17 @@
 #define	SIGSTKSZ	(MINSIGSTKSZ + 32768)	/* recommended stack size */
 #endif
 
+/*
+ * Structure used in sigaltstack call.  Its definition is always
+ * needed for __ucontext.  If __BSD_VISIBLE is defined, the structure
+ * tag is actually sigaltstack.
+ */
+struct __stack_t {
+	void	*ss_sp;			/* signal stack base */
+	__size_t ss_size;		/* signal stack length */
+	int	ss_flags;		/* SS_DISABLE and/or SS_ONSTACK */
+};
+
 #if __BSD_VISIBLE
 /*
  * 4.3 compatibility:
@@ -407,8 +474,7 @@
  * Structure used in sigstack call.
  */
 struct sigstack {
-	/* XXX ss_sp's type should be `void *'. */
-	char	*ss_sp;			/* signal stack pointer */
+	void	*ss_sp;			/* signal stack pointer */
 	int	ss_onstack;		/* current status */
 };
 #endif
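
The alternate-stack structure is now defined once as struct __stack_t (tagged sigaltstack when __BSD_VISIBLE) and both it and struct sigstack use void *ss_sp. The usual userland use of these fields, for reference:

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>

static void
handler(int sig)
{
    /* Runs on the alternate stack because of SA_ONSTACK. */
    (void)sig;
}

int
main(void)
{
    stack_t ss;
    struct sigaction sa;

    ss.ss_sp = malloc(SIGSTKSZ);
    if (ss.ss_sp == NULL)
        err(1, "malloc");
    ss.ss_size = SIGSTKSZ;
    ss.ss_flags = 0;
    if (sigaltstack(&ss, NULL) == -1)
        err(1, "sigaltstack");

    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = handler;
    sa.sa_flags = SA_ONSTACK;
    sigemptyset(&sa.sa_mask);
    if (sigaction(SIGSEGV, &sa, NULL) == -1)
        err(1, "sigaction");

    return (0);
}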


From laffer1 at midnightbsd.org  Sun Feb  9 13:28:48 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:28:48 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12359] trunk/sys/sys/signalvar.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091828.019ISmK8090921@stargazer.midnightbsd.org>

Revision: 12359
          http://svnweb.midnightbsd.org/src/?rev=12359
Author:   laffer1
Date:     2020-02-09 13:28:48 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/signalvar.h

Modified: trunk/sys/sys/signalvar.h
===================================================================
--- trunk/sys/sys/signalvar.h	2020-02-09 18:26:51 UTC (rev 12358)
+++ trunk/sys/sys/signalvar.h	2020-02-09 18:28:48 UTC (rev 12359)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)signalvar.h	8.6 (Berkeley) 2/19/95
- * $FreeBSD: stable/10/sys/sys/signalvar.h 315949 2017-03-25 13:33:23Z badger $
+ * $FreeBSD: stable/11/sys/sys/signalvar.h 353789 2019-10-21 01:24:21Z kevans $
  */
 
 #ifndef _SYS_SIGNALVAR_H_
@@ -200,6 +200,7 @@
 	return (1);
 }
 
+#ifdef COMPAT_FREEBSD6
 struct osigevent {
 	int	sigev_notify;		/* Notification type */
 	union {
@@ -208,6 +209,7 @@
 	} __sigev_u;
 	union sigval sigev_value;	/* Signal value */
 };
+#endif
 
 typedef struct ksiginfo {
 	TAILQ_ENTRY(ksiginfo)	ksi_link;
@@ -326,9 +328,41 @@
 #define	SIGPROCMASK_PROC_LOCKED	0x0002
 #define	SIGPROCMASK_PS_LOCKED	0x0004
 
+/*
+ * Modes for sigdeferstop().  Manages behaviour of
+ * thread_suspend_check() in the region delimited by
+ * sigdeferstop()/sigallowstop().  Must be restored to
+ * SIGDEFERSTOP_OFF before returning to userspace.
+ */
+#define	SIGDEFERSTOP_NOP	0 /* continue doing whatever is done now */
+#define	SIGDEFERSTOP_OFF	1 /* stop ignoring STOPs */
+#define	SIGDEFERSTOP_SILENT	2 /* silently ignore STOPs */
+#define	SIGDEFERSTOP_EINTR	3 /* ignore STOPs, return EINTR */
+#define	SIGDEFERSTOP_ERESTART	4 /* ignore STOPs, return ERESTART */
+
+#define	SIGDEFERSTOP_VAL_NCHG	(-1) /* placeholder indicating no state change */
+int	sigdeferstop_impl(int mode);
+void	sigallowstop_impl(int prev);
+
+static inline int
+sigdeferstop(int mode)
+{
+
+	if (mode == SIGDEFERSTOP_NOP)
+		return (SIGDEFERSTOP_VAL_NCHG);
+	return (sigdeferstop_impl(mode));
+}
+
+static inline void
+sigallowstop(int prev)
+{
+
+	if (prev == SIGDEFERSTOP_VAL_NCHG)
+		return;
+	sigallowstop_impl(prev);
+}
+
 int	cursig(struct thread *td);
-int	sigdeferstop(void);
-int	sigallowstop(void);
 void	execsigs(struct proc *p);
 void	gsignal(int pgid, int sig, ksiginfo_t *ksi);
 void	killproc(struct proc *p, char *why);
@@ -346,6 +380,7 @@
 void	sigacts_free(struct sigacts *ps);
 struct sigacts *sigacts_hold(struct sigacts *ps);
 int	sigacts_shared(struct sigacts *ps);
+void	sig_drop_caught(struct proc *p);
 void	sigexit(struct thread *td, int sig) __dead2;
 int	sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **);
 int	sig_ffs(sigset_t *set);
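
sigdeferstop() now takes a mode and returns a cookie that must be handed back to sigallowstop(), with an inline fast path that skips the call for SIGDEFERSTOP_NOP. A kernel-side sketch of the intended bracketing; the protected region is illustrative:

#include <sys/param.h>
#include <sys/signalvar.h>

static void
example_nostop_region(void)
{
    int stop_state;

    /* Silently ignore stop signals across the region below. */
    stop_state = sigdeferstop(SIGDEFERSTOP_SILENT);

    /* ... work that must not be suspended by a stop signal ... */

    sigallowstop(stop_state);
}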


From laffer1 at midnightbsd.org  Sun Feb  9 13:30:16 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:30:16 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12360] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002091830.019IUGsh091647@stargazer.midnightbsd.org>

Revision: 12360
          http://svnweb.midnightbsd.org/src/?rev=12360
Author:   laffer1
Date:     2020-02-09 13:30:15 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sf_buf.h
    trunk/sys/sys/sglist.h

Modified: trunk/sys/sys/sf_buf.h
===================================================================
--- trunk/sys/sys/sf_buf.h	2020-02-09 18:28:48 UTC (rev 12359)
+++ trunk/sys/sys/sf_buf.h	2020-02-09 18:30:15 UTC (rev 12360)
@@ -1,5 +1,6 @@
 /* $MidnightBSD$ */
 /*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius at FreeBSD.org>
  * Copyright (c) 2003-2004 Alan L. Cox <alc at cs.rice.edu>
  * All rights reserved.
  *
@@ -24,13 +25,166 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sf_buf.h 255786 2013-09-22 13:36:52Z glebius $
+ * $FreeBSD: stable/11/sys/sys/sf_buf.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_SF_BUF_H_
 #define _SYS_SF_BUF_H_
 
+struct sfstat {				/* sendfile statistics */
+	uint64_t	sf_syscalls;	/* times sendfile was called */
+	uint64_t	sf_noiocnt;	/* times sendfile didn't require I/O */
+	uint64_t	sf_iocnt;	/* times sendfile had to do disk I/O */
+	uint64_t	sf_pages_read;	/* pages read as part of a request */
+	uint64_t	sf_pages_valid;	/* pages were valid for a request */
+	uint64_t	sf_rhpages_requested;	/* readahead pages requested */
+	uint64_t	sf_rhpages_read;	/* readahead pages read */
+	uint64_t	sf_busy;	/* times aborted on a busy page */
+	uint64_t	sf_allocfail;	/* times sfbuf allocation failed */
+	uint64_t	sf_allocwait;	/* times sfbuf allocation had to wait */
+};
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_page.h>
+
 /*
+ * Sf_bufs, or sendfile(2) buffers provide a vm_page that is mapped
+ * into kernel address space. Note, that they aren't used only
+ * by sendfile(2)!
+ *
+ * Sf_bufs could be implemented as a feature of vm_page_t, but that
+ * would require growth of the structure. That's why they are implemented
+ * as a separate hash indexed by vm_page address. Implementation lives in
+ * kern/subr_sfbuf.c. Meanwhile, most 64-bit machines have a physical map,
+ * so they don't require this hash at all, thus ignore subr_sfbuf.c.
+ *
+ * Different 32-bit architectures demand different requirements on sf_buf
+ * hash and functions. They request features in machine/vmparam.h, which
+ * enable parts of this file. They can also optionally provide helpers in
+ * machine/sf_buf.h
+ *
+ * Defines are:
+ * SFBUF		This machine requires sf_buf hash.
+ * 			subr_sfbuf.c should be compiled.
+ * SFBUF_CPUSET		This machine can perform SFB_CPUPRIVATE mappings,
+ *			that do no invalidate cache on the rest of CPUs.
+ * SFBUF_NOMD		This machine doesn't have machine/sf_buf.h
+ *
+ * SFBUF_OPTIONAL_DIRECT_MAP	Value of this define is used as boolean
+ *				variable that tells whether machine is
+ *				capable of direct map or not at runtime.
+ * SFBUF_MAP		This machine provides its own sf_buf_map() and
+ *			sf_buf_unmap().
+ * SFBUF_PROCESS_PAGE	This machine provides sf_buf_process_page()
+ *			function.
+ */
+
+#ifdef SFBUF
+#if defined(SMP) && defined(SFBUF_CPUSET)
+#include <sys/_cpuset.h>
+#endif
+#include <sys/queue.h>
+
+struct sf_buf {
+	LIST_ENTRY(sf_buf)	list_entry;	/* list of buffers */
+	TAILQ_ENTRY(sf_buf)	free_entry;	/* list of buffers */
+	vm_page_t		m;		/* currently mapped page */
+	vm_offset_t		kva;		/* va of mapping */
+	int			ref_count;	/* usage of this mapping */
+#if defined(SMP) && defined(SFBUF_CPUSET)
+	cpuset_t		cpumask;	/* where mapping is valid */
+#endif
+};
+#else /* ! SFBUF */
+struct sf_buf;
+#endif /* SFBUF */
+
+#ifndef SFBUF_NOMD
+#include <machine/sf_buf.h>
+#endif
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+#include <machine/md_var.h>
+#endif
+
+#ifdef SFBUF
+struct sf_buf *sf_buf_alloc(struct vm_page *, int);
+void sf_buf_free(struct sf_buf *);
+void sf_buf_ref(struct sf_buf *);
+
+static inline vm_offset_t
+sf_buf_kva(struct sf_buf *sf)
+{
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return (SFBUF_PHYS_DMAP(VM_PAGE_TO_PHYS((vm_page_t)sf)));
+#endif
+
+        return (sf->kva);
+}
+
+static inline vm_page_t
+sf_buf_page(struct sf_buf *sf)
+{
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return ((vm_page_t)sf);
+#endif
+
+        return (sf->m);
+}
+
+#ifndef SFBUF_MAP
+#include <vm/pmap.h>
+
+static inline void
+sf_buf_map(struct sf_buf *sf, int flags)
+{
+
+	pmap_qenter(sf->kva, &sf->m, 1);
+}
+
+static inline int
+sf_buf_unmap(struct sf_buf *sf)
+{
+
+	return (0);
+}
+#endif /* SFBUF_MAP */
+
+#if defined(SMP) && defined(SFBUF_CPUSET)
+void sf_buf_shootdown(struct sf_buf *, int);
+#endif
+
+#ifdef SFBUF_PROCESS_PAGE
+boolean_t sf_buf_process_page(vm_page_t, void (*)(struct sf_buf *));
+#endif
+
+#else /* ! SFBUF */
+
+static inline struct sf_buf *
+sf_buf_alloc(struct vm_page *m, int pri)
+{
+
+	return ((struct sf_buf *)m);
+}
+
+static inline void
+sf_buf_free(struct sf_buf *sf)
+{
+}
+
+static inline void
+sf_buf_ref(struct sf_buf *sf)
+{
+}
+#endif /* SFBUF */
+
+/*
  * Options to sf_buf_alloc() are specified through its flags argument.  This
  * argument's value should be the result of a bitwise or'ing of one or more
  * of the following values.
@@ -41,20 +195,6 @@
 #define	SFB_DEFAULT	0
 #define	SFB_NOWAIT	4		/* Return NULL if all bufs are used. */
 
-struct vm_page;
-
-struct sfstat {				/* sendfile statistics */
-	uint64_t	sf_iocnt;	/* times sendfile had to do disk I/O */
-	uint64_t	sf_allocfail;	/* times sfbuf allocation failed */
-	uint64_t	sf_allocwait;	/* times sfbuf allocation had to wait */
-};
-
-#ifdef _KERNEL
-#include <machine/sf_buf.h>
-#include <sys/systm.h>
-#include <sys/counter.h>
-struct mbuf;	/* for sf_buf_mext() */
-
 extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 #define	SFSTAT_ADD(name, val)	\
     counter_u64_add(sfstat[offsetof(struct sfstat, name) / sizeof(uint64_t)],\
@@ -61,7 +201,4 @@
 	(val))
 #define	SFSTAT_INC(name)	SFSTAT_ADD(name, 1)
 #endif /* _KERNEL */
-
-int	sf_buf_mext(struct mbuf *mb, void *addr, void *args);
-
 #endif /* !_SYS_SF_BUF_H_ */
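
The sf_buf interface now lives largely in this header, with direct-map machines bypassing the hash entirely. A simplified sketch of mapping one vm_page into kernel VA and dropping the mapping again, using the allocation flags defined above; error handling is minimal:

#include <sys/param.h>
#include <sys/sf_buf.h>

static void
example_touch_page(vm_page_t m)
{
    struct sf_buf *sf;
    char *va;

    sf = sf_buf_alloc(m, SFB_NOWAIT);   /* NULL if no buffers are free */
    if (sf == NULL)
        return;
    va = (char *)sf_buf_kva(sf);
    va[0] = 0;                          /* illustrative access */
    sf_buf_free(sf);
}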

Modified: trunk/sys/sys/sglist.h
===================================================================
--- trunk/sys/sys/sglist.h	2020-02-09 18:28:48 UTC (rev 12359)
+++ trunk/sys/sys/sglist.h	2020-02-09 18:30:15 UTC (rev 12360)
@@ -28,7 +28,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sglist.h 260856 2014-01-18 18:36:41Z bryanv $
+ * $FreeBSD: stable/11/sys/sys/sglist.h 345039 2019-03-11 22:48:51Z jhb $
  */
 
 /*
@@ -89,13 +89,18 @@
 int	sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
 int	sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
 	    size_t len);
+int	sglist_append_sglist(struct sglist *sg, struct sglist *source,
+	    size_t offset, size_t length);
 int	sglist_append_uio(struct sglist *sg, struct uio *uio);
 int	sglist_append_user(struct sglist *sg, void *buf, size_t len,
 	    struct thread *td);
+int	sglist_append_vmpages(struct sglist *sg, vm_page_t *m, size_t pgoff,
+	    size_t len);
 struct sglist *sglist_build(void *buf, size_t len, int mflags);
 struct sglist *sglist_clone(struct sglist *sg, int mflags);
 int	sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
 int	sglist_count(void *buf, size_t len);
+int	sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
 void	sglist_free(struct sglist *sg);
 int	sglist_join(struct sglist *first, struct sglist *second);
 size_t	sglist_length(struct sglist *sg);
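
sglist_append_sglist(), sglist_append_vmpages() and sglist_count_vmpages() extend the scatter/gather list API. A brief sketch of building and releasing a list that describes a wired kernel buffer; the function names here are illustrative and M_WAITOK is the usual malloc(9) flag:

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/sglist.h>

static struct sglist *
example_describe(void *buf, size_t len)
{
    struct sglist *sg;

    sg = sglist_build(buf, len, M_WAITOK);  /* count, allocate, append */
    /* ... hand the segments to a driver, or append more ranges ... */
    return (sg);
}

static void
example_done(struct sglist *sg)
{
    sglist_free(sg);    /* drops the reference taken at build time */
}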


From laffer1 at midnightbsd.org  Sun Feb  9 13:31:04 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:31:04 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12361] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002091831.019IV4Du091736@stargazer.midnightbsd.org>

Revision: 12361
          http://svnweb.midnightbsd.org/src/?rev=12361
Author:   laffer1
Date:     2020-02-09 13:31:04 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sdt.h
    trunk/sys/sys/seq.h

Modified: trunk/sys/sys/sdt.h
===================================================================
--- trunk/sys/sys/sdt.h	2020-02-09 18:30:15 UTC (rev 12360)
+++ trunk/sys/sys/sdt.h	2020-02-09 18:31:04 UTC (rev 12361)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sdt.h 289795 2015-10-23 07:37:44Z avg $
+ * $FreeBSD: stable/11/sys/sys/sdt.h 331722 2018-03-29 02:50:57Z eadler $
  *
  * Statically Defined Tracing (SDT) definitions.
  *
@@ -81,6 +81,8 @@
 #include <sys/cdefs.h>
 #include <sys/linker_set.h>
 
+extern volatile bool sdt_probes_enabled;
+
 #ifndef KDTRACE_HOOKS
 
 #define SDT_PROVIDER_DEFINE(prov)
@@ -162,10 +164,12 @@
 	extern struct sdt_probe sdt_##prov##_##mod##_##func##_##name[1]
 
 #define SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4)	do {	\
-	if (sdt_##prov##_##mod##_##func##_##name->id)				\
+	if (__predict_false(sdt_probes_enabled)) {				\
+		if (__predict_false(sdt_##prov##_##mod##_##func##_##name->id))	\
 		(*sdt_probe_func)(sdt_##prov##_##mod##_##func##_##name->id,	\
 		    (uintptr_t) arg0, (uintptr_t) arg1, (uintptr_t) arg2,	\
 		    (uintptr_t) arg3, (uintptr_t) arg4);			\
+	} \
 } while (0)
 
 #define SDT_PROBE_ARGTYPE(prov, mod, func, name, num, type, xtype)		\

Modified: trunk/sys/sys/seq.h
===================================================================
--- trunk/sys/sys/seq.h	2020-02-09 18:30:15 UTC (rev 12360)
+++ trunk/sys/sys/seq.h	2020-02-09 18:31:04 UTC (rev 12361)
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/seq.h 273109 2014-10-14 21:19:23Z mjg $
+ * $FreeBSD: stable/11/sys/sys/seq.h 312714 2017-01-24 19:39:24Z mjg $
  */
 
 #ifndef _SYS_SEQ_H_
@@ -60,7 +60,6 @@
  * 		lobj = gobj;
  * 		if (seq_consistent(&gobj->seq, seq))
  * 			break;
- * 		cpu_spinwait();
  * 	}
  * 	foo(lobj);
  */		
@@ -70,25 +69,6 @@
 
 #include <machine/cpu.h>
 
-/*
- * This is a temporary hack until memory barriers are cleaned up.
- *
- * atomic_load_acq_int at least on amd64 provides a full memory barrier,
- * in a way which affects perforance.
- *
- * Hack below covers all architectures and avoids most of the penalty at least
- * on amd64.
- */
-static __inline int
-atomic_load_acq_rmb_int(volatile u_int *p)
-{
-	volatile u_int v;
-
-	v = *p;
-	atomic_load_acq_int(&v);
-	return (v);
-}
-
 static __inline bool
 seq_in_modify(seq_t seqp)
 {
@@ -101,7 +81,8 @@
 {
 
 	MPASS(!seq_in_modify(*seqp));
-	atomic_add_acq_int(seqp, 1);
+	*seqp += 1;
+	atomic_thread_fence_rel();
 }
 
 static __inline void
@@ -108,17 +89,17 @@
 seq_write_end(seq_t *seqp)
 {
 
-	atomic_add_rel_int(seqp, 1);
+	atomic_store_rel_int(seqp, *seqp + 1);
 	MPASS(!seq_in_modify(*seqp));
 }
 
 static __inline seq_t
-seq_read(seq_t *seqp)
+seq_read(const seq_t *seqp)
 {
 	seq_t ret;
 
 	for (;;) {
-		ret = atomic_load_acq_rmb_int(seqp);
+		ret = atomic_load_acq_int(__DECONST(seq_t *, seqp));
 		if (seq_in_modify(ret)) {
 			cpu_spinwait();
 			continue;
@@ -130,17 +111,18 @@
 }
 
 static __inline seq_t
-seq_consistent(seq_t *seqp, seq_t oldseq)
+seq_consistent_nomb(const seq_t *seqp, seq_t oldseq)
 {
 
-	return (atomic_load_acq_rmb_int(seqp) == oldseq);
+	return (*seqp == oldseq);
 }
 
 static __inline seq_t
-seq_consistent_nomb(seq_t *seqp, seq_t oldseq)
+seq_consistent(const seq_t *seqp, seq_t oldseq)
 {
 
-	return (*seqp == oldseq);
+	atomic_thread_fence_acq();
+	return (seq_consistent_nomb(seqp, oldseq));
 }
 
 #endif	/* _KERNEL */
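
The temporary atomic_load_acq_rmb_int() hack is gone: writers now pair a plain increment with a release fence, and readers pair seq_read()/seq_consistent() with an acquire fence. Adapted from the usage comment in the header, the pattern looks like the sketch below; the writer is assumed to be serialized externally:

#include <sys/param.h>
#include <sys/seq.h>

/* Writer: bracket the update so readers can detect it. */
static void
example_update(seq_t *seqp, int *datap, int newval)
{
    seq_write_begin(seqp);      /* count becomes odd: modify in progress */
    *datap = newval;
    seq_write_end(seqp);        /* count becomes even again */
}

/* Reader: retry until a consistent snapshot is observed. */
static int
example_read(const seq_t *seqp, const int *datap)
{
    seq_t seq;
    int val;

    for (;;) {
        seq = seq_read(seqp);   /* spins while a write is in flight */
        val = *datap;
        if (seq_consistent(seqp, seq))
            break;
    }
    return (val);
}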


From laffer1 at midnightbsd.org  Sun Feb  9 13:33:36 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:33:36 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12362] trunk/sys/sys/param.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091833.019IXa5V091877@stargazer.midnightbsd.org>

Revision: 12362
          http://svnweb.midnightbsd.org/src/?rev=12362
Author:   laffer1
Date:     2020-02-09 13:33:35 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/param.h

Modified: trunk/sys/sys/param.h
===================================================================
--- trunk/sys/sys/param.h	2020-02-09 18:31:04 UTC (rev 12361)
+++ trunk/sys/sys/param.h	2020-02-09 18:33:35 UTC (rev 12362)
@@ -55,7 +55,7 @@
 
 /* Version of FreeBSD we're compatible with */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1004501	/* Master, propagated to newvers */
+#define __FreeBSD_version 1103507	/* Master, propagated to newvers */
 
 /*
  * It is tempting to use this macro in userland code when we want to enable
@@ -70,12 +70,17 @@
 #define __MidnightBSD_kernel__
 
 #if defined(_KERNEL) || defined(IN_RTLD)
-#define	P_OSREL_SIGWAIT		700000
-#define	P_OSREL_SIGSEGV		700004
-#define	P_OSREL_MAP_ANON	800104
+#define	P_OSREL_SIGWAIT			700000
+#define	P_OSREL_SIGSEGV			700004
+#define	P_OSREL_MAP_ANON		800104
+#define	P_OSREL_MAP_FSTRICT		1100036
+#define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 #define	P_OSREL_MAP_GUARD		1200035
+#define	P_OSREL_MAP_GUARD_11		1101501
+#define	P_OSREL_WRFSBASE		1200041
+#define	P_OSREL_WRFSBASE_11		1101503
 
-#define	P_OSREL_MAJOR(x)	((x) / 100000)
+#define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
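
__FreeBSD_version moves from the 10.x range (1004501) to 1103507, and the P_OSREL_* list gains the 11-branch values used for per-binary compatibility decisions. The usual compile-time check against the version macro, for reference:

#include <sys/param.h>

#if __FreeBSD_version >= 1100000
/* Code paths that rely on 11.x-era interfaces. */
#else
/* Fallbacks for older __FreeBSD_version values. */
#endif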


From laffer1 at midnightbsd.org  Sun Feb  9 13:34:07 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:34:07 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12363] trunk/sys/sys/select.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091834.019IY7Jm091927@stargazer.midnightbsd.org>

Revision: 12363
          http://svnweb.midnightbsd.org/src/?rev=12363
Author:   laffer1
Date:     2020-02-09 13:34:06 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/select.h

Modified: trunk/sys/sys/select.h
===================================================================
--- trunk/sys/sys/select.h	2020-02-09 18:33:35 UTC (rev 12362)
+++ trunk/sys/sys/select.h	2020-02-09 18:34:06 UTC (rev 12363)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/select.h 154090 2006-01-06 22:12:46Z marcel $
+ * $FreeBSD: stable/11/sys/sys/select.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS_SELECT_H_
@@ -57,7 +57,7 @@
  * be enough for most uses.
  */
 #ifndef	FD_SETSIZE
-#define	FD_SETSIZE	1024U
+#define	FD_SETSIZE	1024
 #endif
 
 #define	_NFDBITS	(sizeof(__fd_mask) * 8)	/* bits per mask */
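
FD_SETSIZE loses its unsigned suffix (1024U becomes 1024), avoiding signed/unsigned surprises when it is compared against plain int descriptors. The usual select() pattern it sizes, for reference:

#include <sys/select.h>
#include <err.h>

/* Wait until fd (which must be below FD_SETSIZE) is readable. */
static void
wait_readable(int fd)
{
    fd_set rset;

    FD_ZERO(&rset);
    FD_SET(fd, &rset);
    if (select(fd + 1, &rset, NULL, NULL, NULL) == -1)
        err(1, "select");
}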


From laffer1 at midnightbsd.org  Sun Feb  9 13:34:17 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:34:17 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12364] trunk/sys/sys/sched.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091834.019IYHlM091974@stargazer.midnightbsd.org>

Revision: 12364
          http://svnweb.midnightbsd.org/src/?rev=12364
Author:   laffer1
Date:     2020-02-09 13:34:16 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/sched.h

Modified: trunk/sys/sys/sched.h
===================================================================
--- trunk/sys/sys/sched.h	2020-02-09 18:34:06 UTC (rev 12363)
+++ trunk/sys/sys/sched.h	2020-02-09 18:34:16 UTC (rev 12364)
@@ -57,7 +57,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/sched.h 253604 2013-07-24 09:45:31Z avg $
+ * $FreeBSD: stable/11/sys/sys/sched.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SCHED_H_
@@ -91,6 +91,7 @@
  * priorities inherited from their procs, and use up cpu time.
  */
 void	sched_exit_thread(struct thread *td, struct thread *child);
+u_int	sched_estcpu(struct thread *td);
 void	sched_fork_thread(struct thread *td, struct thread *child);
 void	sched_lend_prio(struct thread *td, u_char prio);
 void	sched_lend_user_prio(struct thread *td, u_char pri);
@@ -103,7 +104,6 @@
 void	sched_user_prio(struct thread *td, u_char prio);
 void	sched_userret(struct thread *td);
 void	sched_wakeup(struct thread *td);
-void	sched_preempt(struct thread *td);
 #ifdef	RACCT
 #ifdef	SCHED_4BSD
 fixpt_t	sched_pctcpu_delta(struct thread *td);
@@ -115,8 +115,8 @@
  */
 void	sched_add(struct thread *td, int flags);
 void	sched_clock(struct thread *td);
+void	sched_preempt(struct thread *td);
 void	sched_rem(struct thread *td);
-void	sched_tick(int cnt);
 void	sched_relinquish(struct thread *td);
 struct thread *sched_choose(void);
 void	sched_idletd(void *);
@@ -223,6 +223,7 @@
  */
 #ifndef _KERNEL
 #include <sys/cdefs.h>
+#include <sys/_timespec.h>
 #include <sys/_types.h>
 
 #ifndef _PID_T_DECLARED
@@ -230,8 +231,6 @@
 #define _PID_T_DECLARED
 #endif
 
-struct timespec;
-
 __BEGIN_DECLS
 int     sched_get_priority_max(int);
 int     sched_get_priority_min(int);
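
The userland prototypes now pull in <sys/_timespec.h> directly instead of forward-declaring struct timespec. Querying the POSIX scheduling priority range through these declarations, for reference:

#include <sched.h>
#include <stdio.h>

int
main(void)
{
    int lo = sched_get_priority_min(SCHED_FIFO);
    int hi = sched_get_priority_max(SCHED_FIFO);

    printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
    return (0);
}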


From laffer1 at midnightbsd.org  Sun Feb  9 13:34:37 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:34:37 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12365] trunk/sys/sys/rwlock.h: sync with
 FreeBSD 11-stable
Message-ID: <202002091834.019IYbF3092027@stargazer.midnightbsd.org>

Revision: 12365
          http://svnweb.midnightbsd.org/src/?rev=12365
Author:   laffer1
Date:     2020-02-09 13:34:36 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/rwlock.h

Modified: trunk/sys/sys/rwlock.h
===================================================================
--- trunk/sys/sys/rwlock.h	2020-02-09 18:34:16 UTC (rev 12364)
+++ trunk/sys/sys/rwlock.h	2020-02-09 18:34:36 UTC (rev 12365)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/rwlock.h 323870 2017-09-21 19:24:11Z marius $
+ * $FreeBSD: stable/11/sys/sys/rwlock.h 343420 2019-01-25 11:01:11Z kib $
  */
 
 #ifndef _SYS_RWLOCK_H_
@@ -59,13 +59,14 @@
 #define	RW_LOCK_READ_WAITERS	0x02
 #define	RW_LOCK_WRITE_WAITERS	0x04
 #define	RW_LOCK_WRITE_SPINNER	0x08
+#define	RW_LOCK_WRITER_RECURSED	0x10
 #define	RW_LOCK_FLAGMASK						\
 	(RW_LOCK_READ | RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS |	\
-	RW_LOCK_WRITE_SPINNER)
+	RW_LOCK_WRITE_SPINNER | RW_LOCK_WRITER_RECURSED)
 #define	RW_LOCK_WAITERS		(RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)
 
 #define	RW_OWNER(x)		((x) & ~RW_LOCK_FLAGMASK)
-#define	RW_READERS_SHIFT	4
+#define	RW_READERS_SHIFT	5
 #define	RW_READERS(x)		(RW_OWNER((x)) >> RW_READERS_SHIFT)
 #define	RW_READERS_LOCK(x)	((x) << RW_READERS_SHIFT | RW_LOCK_READ)
 #define	RW_ONE_READER		(1 << RW_READERS_SHIFT)
@@ -77,6 +78,8 @@
 
 #define	rw_recurse	lock_object.lo_data
 
+#define	RW_READ_VALUE(x)	((x)->rw_lock)
+
 /* Very simple operations on rw_lock. */
 
 /* Try to obtain a write lock once. */
@@ -83,10 +86,16 @@
 #define	_rw_write_lock(rw, tid)						\
 	atomic_cmpset_acq_ptr(&(rw)->rw_lock, RW_UNLOCKED, (tid))
 
+#define	_rw_write_lock_fetch(rw, vp, tid)				\
+	atomic_fcmpset_acq_ptr(&(rw)->rw_lock, vp, (tid))
+
 /* Release a write lock quickly if there are no waiters. */
 #define	_rw_write_unlock(rw, tid)					\
 	atomic_cmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED)
 
+#define	_rw_write_unlock_fetch(rw, tid)					\
+	atomic_fcmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED)
+
 /*
  * Full lock operations that are suitable to be inlined in non-debug
  * kernels.  If the lock cannot be acquired or released trivially then
@@ -96,22 +105,20 @@
 /* Acquire a write lock. */
 #define	__rw_wlock(rw, tid, file, line) do {				\
 	uintptr_t _tid = (uintptr_t)(tid);				\
-						                        \
-	if ((rw)->rw_lock != RW_UNLOCKED || !_rw_write_lock((rw), _tid))\
-		_rw_wlock_hard((rw), _tid, (file), (line));		\
-	else 								\
-		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, \
-		    rw, 0, 0, (file), (line));				\
+	uintptr_t _v = RW_UNLOCKED;					\
+									\
+	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__acquire) ||	\
+	    !_rw_write_lock_fetch((rw), &_v, _tid)))			\
+		_rw_wlock_hard((rw), _v, (file), (line));		\
 } while (0)
 
 /* Release a write lock. */
 #define	__rw_wunlock(rw, tid, file, line) do {				\
-	uintptr_t _tid = (uintptr_t)(tid);				\
+	uintptr_t _v = (uintptr_t)(tid);				\
 									\
-	if ((rw)->rw_recurse)						\
-		(rw)->rw_recurse--;					\
-	else if (!_rw_write_unlock((rw), _tid))				\
-		_rw_wunlock_hard((rw), _tid, (file), (line));		\
+	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__release) ||	\
+	    !_rw_write_unlock_fetch((rw), &_v)))			\
+		_rw_wunlock_hard((rw), _v, (file), (line));		\
 } while (0)
 
 /*
@@ -122,19 +129,24 @@
 void	_rw_init_flags(volatile uintptr_t *c, const char *name, int opts);
 void	_rw_destroy(volatile uintptr_t *c);
 void	rw_sysinit(void *arg);
-void	rw_sysinit_flags(void *arg);
 int	_rw_wowned(const volatile uintptr_t *c);
 void	_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line);
+int	__rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 int	__rw_try_wlock(volatile uintptr_t *c, const char *file, int line);
 void	_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line);
+void	__rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 void	__rw_rlock(volatile uintptr_t *c, const char *file, int line);
+int	__rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 int	__rw_try_rlock(volatile uintptr_t *c, const char *file, int line);
+void	_rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 void	_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line);
-void	__rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
-	    int line);
-void	__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid,
-	    const char *file, int line);
+void	__rw_wlock_hard(volatile uintptr_t *c, uintptr_t v
+	    LOCK_FILE_LINE_ARG_DEF);
+void	__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v
+	    LOCK_FILE_LINE_ARG_DEF);
+int	__rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 int	__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line);
+void	__rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF);
 void	__rw_downgrade(volatile uintptr_t *c, const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	__rw_assert(const volatile uintptr_t *c, int what, const char *file,
@@ -160,20 +172,38 @@
 	__rw_try_wlock(&(rw)->rw_lock, f, l)
 #define	_rw_wunlock(rw, f, l)						\
 	_rw_wunlock_cookie(&(rw)->rw_lock, f, l)
+#define	_rw_try_rlock(rw, f, l)						\
+	__rw_try_rlock(&(rw)->rw_lock, f, l)
+#if LOCK_DEBUG > 0
 #define	_rw_rlock(rw, f, l)						\
 	__rw_rlock(&(rw)->rw_lock, f, l)
-#define	_rw_try_rlock(rw, f, l)						\
-	__rw_try_rlock(&(rw)->rw_lock, f, l)
 #define	_rw_runlock(rw, f, l)						\
 	_rw_runlock_cookie(&(rw)->rw_lock, f, l)
-#define	_rw_wlock_hard(rw, t, f, l)					\
-	__rw_wlock_hard(&(rw)->rw_lock, t, f, l)
-#define	_rw_wunlock_hard(rw, t, f, l)					\
-	__rw_wunlock_hard(&(rw)->rw_lock, t, f, l)
+#else
+#define	_rw_rlock(rw, f, l)						\
+	__rw_rlock_int((struct rwlock *)rw)
+#define	_rw_runlock(rw, f, l)						\
+	_rw_runlock_cookie_int((struct rwlock *)rw)
+#endif
+#if LOCK_DEBUG > 0
+#define	_rw_wlock_hard(rw, v, f, l)					\
+	__rw_wlock_hard(&(rw)->rw_lock, v, f, l)
+#define	_rw_wunlock_hard(rw, v, f, l)					\
+	__rw_wunlock_hard(&(rw)->rw_lock, v, f, l)
 #define	_rw_try_upgrade(rw, f, l)					\
 	__rw_try_upgrade(&(rw)->rw_lock, f, l)
 #define	_rw_downgrade(rw, f, l)						\
 	__rw_downgrade(&(rw)->rw_lock, f, l)
+#else
+#define	_rw_wlock_hard(rw, v, f, l)					\
+	__rw_wlock_hard(&(rw)->rw_lock, v)
+#define	_rw_wunlock_hard(rw, v, f, l)					\
+	__rw_wunlock_hard(&(rw)->rw_lock, v)
+#define	_rw_try_upgrade(rw, f, l)					\
+	__rw_try_upgrade_int(rw)
+#define	_rw_downgrade(rw, f, l)						\
+	__rw_downgrade_int(rw)
+#endif
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	_rw_assert(rw, w, f, l)						\
 	__rw_assert(&(rw)->rw_lock, w, f, l)
@@ -212,23 +242,19 @@
 	_sleep((chan), &(rw)->lock_object, (pri), (wmesg),		\
 	    tick_sbt * (timo), 0, C_HARDCLOCK)
 
-#define	rw_initialized(rw)	lock_initalized(&(rw)->lock_object)
+#define	rw_initialized(rw)	lock_initialized(&(rw)->lock_object)
 
 struct rw_args {
 	void		*ra_rw;
 	const char 	*ra_desc;
-};
-
-struct rw_args_flags {
-	void		*ra_rw;
-	const char 	*ra_desc;
 	int		ra_flags;
 };
 
-#define	RW_SYSINIT(name, rw, desc)					\
+#define	RW_SYSINIT_FLAGS(name, rw, desc, flags)				\
 	static struct rw_args name##_args = {				\
 		(rw),							\
 		(desc),							\
+		(flags),						\
 	};								\
 	SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    rw_sysinit, &name##_args);					\
@@ -235,18 +261,8 @@
 	SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock))
 
+#define	RW_SYSINIT(name, rw, desc)	RW_SYSINIT_FLAGS(name, rw, desc, 0)
 
-#define	RW_SYSINIT_FLAGS(name, rw, desc, flags)				\
-	static struct rw_args_flags name##_args = {			\
-		(rw),							\
-		(desc),							\
-		(flags),						\
-	};								\
-	SYSINIT(name##_rw_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
-	    rw_sysinit_flags, &name##_args);				\
-	SYSUNINIT(name##_rw_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
-	    _rw_destroy, __DEVOLATILE(void *, &(rw)->rw_lock))
-
 /*
  * Options passed to rw_init_flags().
  */
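
The inline fast paths now go through atomic_fcmpset_acq_ptr()/atomic_fcmpset_rel_ptr() via _rw_write_lock_fetch()/_rw_write_unlock_fetch(): unlike the old cmpset variants, fcmpset writes the lock word it observed back through the value pointer on failure, so __rw_wlock()/__rw_wunlock() can pass that value straight to _rw_wlock_hard()/_rw_wunlock_hard() without re-reading the lock. The LOCKSTAT_PROFILE_ENABLED() test forces the hard path whenever lockstat probes are active, and RW_SYSINIT() is now simply RW_SYSINIT_FLAGS() with a flags value of 0. A user-space analogue of the fetch-style fast path, sketched with C11 atomics standing in for the kernel primitives (the names below are hypothetical):

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define	MY_UNLOCKED	((uintptr_t)1)	/* stand-in for RW_UNLOCKED */

/* Slow-path stand-in: keep retrying with the last observed lock word. */
static void
my_wlock_hard(_Atomic uintptr_t *lockp, uintptr_t v, uintptr_t tid)
{
	for (;;) {
		if (v == MY_UNLOCKED &&
		    atomic_compare_exchange_weak_explicit(lockp, &v, tid,
		    memory_order_acquire, memory_order_relaxed))
			return;
		sched_yield();
		v = atomic_load_explicit(lockp, memory_order_relaxed);
	}
}

static void
my_wlock(_Atomic uintptr_t *lockp, uintptr_t tid)
{
	uintptr_t v = MY_UNLOCKED;

	/*
	 * Like _rw_write_lock_fetch(): a failed compare-exchange leaves the
	 * observed lock word in v, so the slow path starts from that value
	 * instead of issuing another read.
	 */
	if (!atomic_compare_exchange_strong_explicit(lockp, &v, tid,
	    memory_order_acquire, memory_order_relaxed))
		my_wlock_hard(lockp, v, tid);
}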


From laffer1 at midnightbsd.org  Sun Feb  9 13:38:04 2020
From: laffer1 at midnightbsd.org (laffer1 at midnightbsd.org)
Date: Sun, 9 Feb 2020 13:38:04 -0500 (EST)
Subject: [Midnightbsd-cvs] src [12366] trunk/sys/sys: sync with FreeBSD
 11-stable
Message-ID: <202002091838.019Ic4Un092848@stargazer.midnightbsd.org>

Revision: 12366
          http://svnweb.midnightbsd.org/src/?rev=12366
Author:   laffer1
Date:     2020-02-09 13:38:04 -0500 (Sun, 09 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/sys/_bitset.h
    trunk/sys/sys/rman.h
    trunk/sys/sys/rmlock.h

Modified: trunk/sys/sys/_bitset.h
===================================================================
--- trunk/sys/sys/_bitset.h	2020-02-09 18:34:36 UTC (rev 12365)
+++ trunk/sys/sys/_bitset.h	2020-02-09 18:38:04 UTC (rev 12366)
@@ -27,7 +27,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/_bitset.h 290739 2015-11-13 01:37:08Z markj $
+ * $FreeBSD: stable/11/sys/sys/_bitset.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _SYS__BITSET_H_
@@ -37,26 +37,23 @@
  * Macros addressing word and bit within it, tuned to make compiler
  * optimize cases when SETSIZE fits into single machine word.
  */
-#define	_BITSET_BITS		(sizeof(long) * NBBY)
+#define	_BITSET_BITS		(sizeof(long) * 8)
 
-#define	__bitset_words(_s)	(howmany(_s, _BITSET_BITS))
+#define	__howmany(x, y)	(((x) + ((y) - 1)) / (y))
 
-#define	__bitset_mask(_s, n)						\
-	(1L << ((__bitset_words((_s)) == 1) ?				\
-	    (__size_t)(n) : ((n) % _BITSET_BITS)))
+#define	__bitset_words(_s)	(__howmany(_s, _BITSET_BITS))
 
-#define	__bitset_word(_s, n)						\
-	((__bitset_words((_s)) == 1) ? 0 : ((n) / _BITSET_BITS))
-
 #define	BITSET_DEFINE(t, _s)						\
 struct t {								\
         long    __bits[__bitset_words((_s))];				\
 }
 
-#define	BITSET_T_INITIALIZER(x)						\
-	{ .__bits = { x } }
+/*
+ * Helper to declare a bitset without its size being a constant.
+ *
+ * Sadly we cannot declare a bitset struct with '__bits[]', because it's
+ * the only member of the struct and the compiler complains.
+ */
+#define BITSET_DEFINE_VAR(t)	BITSET_DEFINE(t, 1)
 
-#define	BITSET_FSET(n)							\
-	[ 0 ... ((n) - 1) ] = (-1L)
-
 #endif /* !_SYS__BITSET_H_ */
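
A short illustrative sketch of the two declaration macros (the definitions are copied from the hunk above; the allocator is a hypothetical example of the pattern BITSET_DEFINE_VAR() is intended for, where the real size is only known at run time):

#include <stdlib.h>

#define	_BITSET_BITS		(sizeof(long) * 8)
#define	__howmany(x, y)		(((x) + ((y) - 1)) / (y))
#define	__bitset_words(_s)	(__howmany(_s, _BITSET_BITS))

#define	BITSET_DEFINE(t, _s)						\
struct t {								\
	long	__bits[__bitset_words((_s))];				\
}
#define	BITSET_DEFINE_VAR(t)	BITSET_DEFINE(t, 1)

BITSET_DEFINE(fixedset, 256);	/* array sized at compile time: 256 bits */
BITSET_DEFINE_VAR(varset);	/* declared with one word, sized at run time */

static struct varset *
varset_alloc(size_t nbits)
{
	/* Over-allocate the nominal one-word struct to hold nbits bits. */
	return (calloc(__bitset_words(nbits), sizeof(long)));
}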

Modified: trunk/sys/sys/rman.h
===================================================================
--- trunk/sys/sys/rman.h	2020-02-09 18:34:36 UTC (rev 12365)
+++ trunk/sys/sys/rman.h	2020-02-09 18:38:04 UTC (rev 12366)
@@ -27,7 +27,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/rman.h 303254 2016-07-24 04:38:50Z jhb $
+ * $FreeBSD: stable/11/sys/sys/rman.h 300317 2016-05-20 17:57:47Z jhb $
  */
 
 #ifndef _SYS_RMAN_H_
@@ -48,6 +48,7 @@
 #define	RF_FIRSTSHARE	0x0020	/* first in sharing list */
 #define	RF_PREFETCHABLE	0x0040	/* resource is prefetchable */
 #define	RF_OPTIONAL	0x0080	/* for bus_alloc_resources() */
+#define	RF_UNMAPPED	0x0100	/* don't map resource when activating */
 
 #define	RF_ALIGNMENT_SHIFT	10 /* alignment size bit starts bit 10 */
 #define	RF_ALIGNMENT_MASK	(0x003F << RF_ALIGNMENT_SHIFT)
@@ -62,6 +63,10 @@
  */
 #define RM_TEXTLEN	32
 
+#define	RM_MAX_END	(~(rman_res_t)0)
+
+#define	RMAN_IS_DEFAULT_RANGE(s,e)	((s) == 0 && (e) == RM_MAX_END)
+
 /*
  * Userspace-exported structures.
  */
@@ -71,8 +76,8 @@
 	uintptr_t	r_device;		/* device owning this resource */
 	char		r_devname[RM_TEXTLEN];	/* device name XXX obsolete */
 
-	u_long		r_start;		/* offset in resource space */
-	u_long		r_size;			/* size in resource space */
+	rman_res_t	r_start;		/* offset in resource space */
+	rman_res_t	r_size;			/* size in resource space */
 	u_int		r_flags;		/* RF_* flags */
 };
 
@@ -80,8 +85,8 @@
 	uintptr_t	rm_handle;		/* rman uniquifier */
 	char		rm_descr[RM_TEXTLEN];	/* rman description */
 
-	u_long		rm_start;		/* base of managed region */
-	u_long		rm_size;		/* size of managed region */
+	rman_res_t	rm_start;		/* base of managed region */
+	rman_res_t	rm_size;		/* size of managed region */
 	enum rman_type	rm_type;		/* region type */
 };
 
@@ -102,6 +107,7 @@
 };
 
 struct resource_i;
+struct resource_map;
 
 TAILQ_HEAD(resource_head, resource_i);
 
@@ -109,8 +115,8 @@
 	struct	resource_head 	rm_list;
 	struct	mtx *rm_mtx;	/* mutex used to protect rm_list */
 	TAILQ_ENTRY(rman)	rm_link; /* link in list of all rmans */
-	u_long	rm_start;	/* index of globally first entry */
-	u_long	rm_end;		/* index of globally last entry */
+	rman_res_t	rm_start;	/* index of globally first entry */
+	rman_res_t	rm_end;	/* index of globally last entry */
 	enum	rman_type rm_type; /* what type of resource this is */
 	const	char *rm_descr;	/* text description of this resource */
 };
@@ -117,38 +123,40 @@
 TAILQ_HEAD(rman_head, rman);
 
 int	rman_activate_resource(struct resource *r);
-int	rman_adjust_resource(struct resource *r, u_long start, u_long end);
-int	rman_first_free_region(struct rman *rm, u_long *start, u_long *end);
+int	rman_adjust_resource(struct resource *r, rman_res_t start, rman_res_t end);
+int	rman_first_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end);
 bus_space_handle_t rman_get_bushandle(struct resource *);
 bus_space_tag_t rman_get_bustag(struct resource *);
-u_long	rman_get_end(struct resource *);
+rman_res_t	rman_get_end(struct resource *);
 struct device *rman_get_device(struct resource *);
 u_int	rman_get_flags(struct resource *);
+void	rman_get_mapping(struct resource *, struct resource_map *);
 int	rman_get_rid(struct resource *);
-u_long	rman_get_size(struct resource *);
-u_long	rman_get_start(struct resource *);
+rman_res_t	rman_get_size(struct resource *);
+rman_res_t	rman_get_start(struct resource *);
 void   *rman_get_virtual(struct resource *);
 int	rman_deactivate_resource(struct resource *r);
 int	rman_fini(struct rman *rm);
 int	rman_init(struct rman *rm);
 int	rman_init_from_resource(struct rman *rm, struct resource *r);
-int	rman_last_free_region(struct rman *rm, u_long *start, u_long *end);
+int	rman_last_free_region(struct rman *rm, rman_res_t *start, rman_res_t *end);
 uint32_t rman_make_alignment_flags(uint32_t size);
-int	rman_manage_region(struct rman *rm, u_long start, u_long end);
+int	rman_manage_region(struct rman *rm, rman_res_t start, rman_res_t end);
 int	rman_is_region_manager(struct resource *r, struct rman *rm);
 int	rman_release_resource(struct resource *r);
-struct resource *rman_reserve_resource(struct rman *rm, u_long start,
-					u_long end, u_long count,
+struct resource *rman_reserve_resource(struct rman *rm, rman_res_t start,
+					rman_res_t end, rman_res_t count,
 					u_int flags, struct device *dev);
-struct resource *rman_reserve_resource_bound(struct rman *rm, u_long start,
-					u_long end, u_long count, u_long bound,
+struct resource *rman_reserve_resource_bound(struct rman *rm, rman_res_t start,
+					rman_res_t end, rman_res_t count, rman_res_t bound,
 					u_int flags, struct device *dev);
 void	rman_set_bushandle(struct resource *_r, bus_space_handle_t _h);
 void	rman_set_bustag(struct resource *_r, bus_space_tag_t _t);
 void	rman_set_device(struct resource *_r, struct device *_dev);
-void	rman_set_end(struct resource *_r, u_long _end);
+void	rman_set_end(struct resource *_r, rman_res_t _end);
+void	rman_set_mapping(struct resource *, struct resource_map *);
 void	rman_set_rid(struct resource *_r, int _rid);
-void	rman_set_start(struct resource *_r, u_long _start);
+void	rman_set_start(struct resource *_r, rman_res_t _start);
 void	rman_set_virtual(struct resource *_r, void *_v);
 
 extern	struct rman_head rman_head;
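
With the switch to rman_res_t, resource ranges use a fixed-width type independent of the platform's long, and RM_MAX_END/RMAN_IS_DEFAULT_RANGE() spell out the "anywhere" range. A minimal kernel-side sketch using the prototypes above (the "foo" names are hypothetical, the usual driver headers are assumed, and error handling is omitted):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <machine/bus.h>
#include <sys/rman.h>

static struct rman foo_rman;

static void
foo_rman_setup(void)
{
	foo_rman.rm_type = RMAN_ARRAY;
	foo_rman.rm_descr = "foo I/O addresses";
	rman_init(&foo_rman);
	rman_manage_region(&foo_rman, 0x1000, 0x1fff);
}

static struct resource *
foo_alloc_any(struct device *dev, rman_res_t count, u_int flags)
{
	/*
	 * 0..RM_MAX_END is the default range: the caller does not care
	 * where the allocation lands, only that it is count units long.
	 * (RMAN_IS_DEFAULT_RANGE() tests for exactly this pair.)
	 */
	return (rman_reserve_resource(&foo_rman, 0, RM_MAX_END, count,
	    flags, dev));
}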

Modified: trunk/sys/sys/rmlock.h
===================================================================
--- trunk/sys/sys/rmlock.h	2020-02-09 18:34:36 UTC (rev 12365)
+++ trunk/sys/sys/rmlock.h	2020-02-09 18:38:04 UTC (rev 12366)
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/sys/rmlock.h 323870 2017-09-21 19:24:11Z marius $
+ * $FreeBSD: stable/11/sys/sys/rmlock.h 343420 2019-01-25 11:01:11Z kib $
  */
 
 #ifndef _SYS_RMLOCK_H_
@@ -53,7 +53,6 @@
 void	rm_destroy(struct rmlock *rm);
 int	rm_wowned(const struct rmlock *rm);
 void	rm_sysinit(void *arg);
-void	rm_sysinit_flags(void *arg);
 
 void	_rm_wlock_debug(struct rmlock *rm, const char *file, int line);
 void	_rm_wunlock_debug(struct rmlock *rm, const char *file, int line);
@@ -102,18 +101,14 @@
 struct rm_args {
 	struct rmlock	*ra_rm;
 	const char 	*ra_desc;
+	int		ra_flags;
 };
 
-struct rm_args_flags {
-	struct rmlock	*ra_rm;
-	const char 	*ra_desc;
-	int		ra_opts;
-};
-
-#define	RM_SYSINIT(name, rm, desc)       				\
+#define	RM_SYSINIT_FLAGS(name, rm, desc, flags)				\
 	static struct rm_args name##_args = {				\
 		(rm),							\
 		(desc),							\
+		(flags),						\
 	};								\
 	SYSINIT(name##_rm_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    rm_sysinit, &name##_args);					\
@@ -120,18 +115,8 @@
 	SYSUNINIT(name##_rm_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
 	    rm_destroy, (rm))
 
+#define	RM_SYSINIT(name, rm, desc)	RM_SYSINIT_FLAGS(name, rm, desc, 0)
 
-#define	RM_SYSINIT_FLAGS(name, rm, desc, opts)       			\
-	static struct rm_args name##_args = {				\
-		(rm),							\
-		(desc),							\
-                (opts),							\
-	};								\
-	SYSINIT(name##_rm_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
-	    rm_sysinit_flags, &name##_args);				\
-	SYSUNINIT(name##_rm_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE,	\
-	    rm_destroy, (rm))
-
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 #define	RA_LOCKED		LA_LOCKED
 #define	RA_RLOCKED		LA_SLOCKED